Subject: Multiple bugs handling non-ASCII characters
HTML-Parser fails to handle non-ASCII characters in the HTML being parsed. With the exception of decode_entities(), it neither examines nor copies the UTF8 flag of its input. In UNICODE_ENTITIES mode, decode_entities() fails to convert the ISO-8859-1 text that follows a Unicode entity to UTF-8, leading to a result that is not utf8::valid(). In addition, hparser.c has hash lookup code that is not UTF-8 safe.
The attached patch fixes all this.
Common subdirectories: HTML-Parser-3.36/blib and HTML-Parser-3.36-work/blib
Common subdirectories: HTML-Parser-3.36/eg and HTML-Parser-3.36-work/eg
Common subdirectories: HTML-Parser-3.36/hints and HTML-Parser-3.36-work/hints
diff -u HTML-Parser-3.36/hparser.c HTML-Parser-3.36-work/hparser.c
--- HTML-Parser-3.36/hparser.c 2004-04-01 03:56:37.000000000 -0800
+++ HTML-Parser-3.36-work/hparser.c 2004-07-19 14:54:50.000000000 -0700
@@ -111,7 +111,7 @@
static void
report_event(PSTATE* p_state,
event_id_t event,
- char *beg, char *end,
+ char *beg, char *end, U32 utf8,
token_pos_t *tokens, int num_tokens,
SV* self
)
@@ -196,10 +196,13 @@
if (event == E_START || event == E_END) {
SV* tagname = p_state->tmp;
- U32 hash;
assert(num_tokens >= 1);
sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg);
+ if (utf8)
+ SvUTF8_on(tagname);
+ else
+ SvUTF8_off(tagname);
if (!CASE_SENSITIVE(p_state))
sv_lower(aTHX_ tagname);
@@ -215,10 +218,8 @@
goto IGNORE_EVENT;
}
- PERL_HASH(hash, SvPVX(tagname), SvCUR(tagname));
-
if (p_state->ignore_elements &&
- hv_fetch_ent(p_state->ignore_elements, tagname, 0, hash))
+ hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
{
p_state->ignoring_element = newSVsv(tagname);
p_state->ignore_depth = 1;
@@ -226,12 +227,12 @@
}
if (p_state->ignore_tags &&
- hv_fetch_ent(p_state->ignore_tags, tagname, 0, hash))
+ hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
{
goto IGNORE_EVENT;
}
if (p_state->report_tags &&
- !hv_fetch_ent(p_state->report_tags, tagname, 0, hash))
+ !hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
{
goto IGNORE_EVENT;
}
@@ -272,8 +273,18 @@
p_state->pend_text_column = column;
p_state->pend_text_is_cdata = p_state->is_cdata;
sv_setpvn(p_state->pend_text, "", 0);
+ if (!utf8)
+ SvUTF8_off(p_state->pend_text);
+ }
+ if (utf8 && !SvUTF8(p_state->pend_text))
+ sv_utf8_upgrade(p_state->pend_text);
+ if (utf8 || !SvUTF8(p_state->pend_text)) {
+ sv_catpvn(p_state->pend_text, beg, end - beg);
+ }
+ else {
+ SV *tmp = NULL;
+ sv_catpvn_utf8_upgrade(p_state->pend_text, beg, end - beg, tmp);
}
- sv_catpvn(p_state->pend_text, beg, end - beg);
return;
}
else if (p_state->pend_text && SvOK(p_state->pend_text)) {
@@ -327,6 +338,8 @@
for (i = 0; i < num_tokens; i++) {
if (tokens[i].beg) {
prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg);
+ if (utf8)
+ SvUTF8_on(prev_token);
av_push(av, prev_token);
}
else { /* boolean */
@@ -366,6 +379,8 @@
if (num_tokens >= 1) {
arg = sv_2mortal(newSVpvn(tokens[0].beg,
tokens[0].end - tokens[0].beg));
+ if (utf8)
+ SvUTF8_on(arg);
if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0)
sv_lower(aTHX_ arg);
if (argcode == ARG_TAG && event != E_START) {
@@ -392,6 +407,8 @@
tokens[i].end-tokens[i].beg);
SV* attrval;
+ if (utf8)
+ SvUTF8_on(attrname);
if (tokens[i+1].beg) {
char *beg = tokens[i+1].beg;
STRLEN len = tokens[i+1].end - beg;
@@ -400,6 +417,8 @@
beg++; len -= 2;
}
attrval = newSVpvn(beg, len);
+ if (utf8)
+ SvUTF8_on(attrval);
if (!p_state->attr_encoded)
decode_entities(aTHX_ attrval, p_state->entity2char);
}
@@ -414,10 +433,8 @@
sv_lower(aTHX_ attrname);
if (argcode == ARG_ATTR) {
- U32 hash;
- PERL_HASH(hash, SvPVX(attrname), SvCUR(attrname));
- if (hv_exists_ent(hv, attrname, hash) ||
- !hv_store_ent(hv, attrname, attrval, hash)) {
+ if (hv_exists_ent(hv, attrname, 0) ||
+ !hv_store_ent(hv, attrname, attrval, 0)) {
SvREFCNT_dec(attrval);
}
SvREFCNT_dec(attrname);
@@ -446,6 +463,8 @@
for (i = 1; i < num_tokens; i += 2) {
SV* attrname = newSVpvn(tokens[i].beg,
tokens[i].end-tokens[i].beg);
+ if (utf8)
+ SvUTF8_on(attrname);
if (!CASE_SENSITIVE(p_state))
sv_lower(aTHX_ attrname);
av_push(av, attrname);
@@ -456,11 +475,15 @@
case ARG_TEXT:
arg = sv_2mortal(newSVpvn(beg, end - beg));
+ if (utf8)
+ SvUTF8_on(arg);
break;
case ARG_DTEXT:
if (event == E_TEXT) {
arg = sv_2mortal(newSVpvn(beg, end - beg));
+ if (utf8)
+ SvUTF8_on(arg);
if (!p_state->is_cdata)
decode_entities(aTHX_ arg, p_state->entity2char);
}
@@ -475,6 +498,7 @@
case ARG_SKIPPED_TEXT:
arg = sv_2mortal(p_state->skipped_text);
p_state->skipped_text = newSVpvn("", 0);
+ SvUTF8_off(p_state->skipped_text);
break;
case ARG_OFFSET:
@@ -506,6 +530,8 @@
{
int len = (unsigned char)s[1];
arg = sv_2mortal(newSVpvn(s+2, len));
+ if (SvUTF8(h->argspec))
+ SvUTF8_on(arg);
s += len + 1;
}
break;
@@ -565,7 +591,15 @@
if (p_state->skipped_text) {
if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text))
flush_pending_text(p_state, self);
- sv_catpvn(p_state->skipped_text, beg, end - beg);
+ if (utf8 && !SvUTF8(p_state->skipped_text))
+ sv_utf8_upgrade(p_state->skipped_text);
+ if (utf8 || !SvUTF8(p_state->skipped_text)) {
+ sv_catpvn(p_state->skipped_text, beg, end - beg);
+ }
+ else {
+ SV *tmp = NULL;
+ sv_catpvn_utf8_upgrade(p_state->skipped_text, beg, end - beg, tmp);
+ }
}
return;
}
@@ -580,6 +614,9 @@
char *s = SvPV(src, len);
char *end = s + len;
+ if (SvUTF8(src))
+ SvUTF8_on(argspec);
+
while (isHSPACE(*s))
s++;
@@ -708,8 +745,8 @@
p_state->column = p_state->pend_text_column;
report_event(p_state, E_TEXT,
- SvPVX(old_pend_text), SvEND(old_pend_text),
- 0, 0, self);
+ SvPVX(old_pend_text), SvEND(old_pend_text),
+ SvUTF8(old_pend_text), 0, 0, self);
SvOK_off(old_pend_text);
p_state->unbroken_text = old_unbroken_text;
@@ -744,7 +781,7 @@
}
static char*
-parse_comment(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg;
@@ -771,7 +808,7 @@
/* we are done recognizing all comments, make callbacks */
report_event(p_state, E_COMMENT,
- beg - 4, s,
+ beg - 4, s, utf8,
tokens, num_tokens,
self);
FREE_TOKENS;
@@ -808,7 +845,7 @@
token.end = s;
if (s < end) {
s++;
- report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self);
+ report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
return s;
}
else {
@@ -832,7 +869,7 @@
if (*s == '>') {
s++;
/* yup */
- report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self);
+ report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
return s;
}
}
@@ -900,7 +937,7 @@
static char*
-parse_marked_section(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
dTHX;
char *s = beg;
@@ -915,6 +952,7 @@
while (isHNAME_FIRST(*s)) {
char *name_start = s;
char *name_end;
+ SV *name;
s++;
while (isHNAME_CHAR(*s))
s++;
@@ -926,8 +964,10 @@
if (!tokens)
tokens = newAV();
- av_push(tokens, sv_lower(aTHX_ newSVpvn(name_start,
- name_end - name_start)));
+ name = newSVpvn(name_start, name_end - name_start);
+ if (utf8)
+ SvUTF8_on(name);
+ av_push(tokens, sv_lower(aTHX_ name));
}
if (*s == '-') {
s++;
@@ -965,7 +1005,7 @@
p_state->ms_stack = newAV();
av_push(p_state->ms_stack, newRV_noinc((SV*)tokens));
marked_section_update(p_state);
- report_event(p_state, E_NONE, beg, s, 0, 0, self);
+ report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self);
return s;
}
@@ -981,7 +1021,7 @@
static char*
-parse_decl(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg + 2;
@@ -999,7 +1039,7 @@
/* yes, two dashes seen */
s++;
- tmp = parse_comment(p_state, s, end, self);
+ tmp = parse_comment(p_state, s, end, utf8, self);
return (tmp == s) ? beg : tmp;
}
@@ -1008,7 +1048,7 @@
/* marked section */
char *tmp;
s++;
- tmp = parse_marked_section(p_state, s, end, self);
+ tmp = parse_marked_section(p_state, s, end, utf8, self);
if (!tmp)
goto DECL_FAIL;
return (tmp == s) ? beg : tmp;
@@ -1021,7 +1061,7 @@
token.beg = s;
token.end = s;
s++;
- report_event(p_state, E_COMMENT, beg, s, &token, 1, self);
+ report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
return s;
}
@@ -1112,7 +1152,7 @@
goto PREMATURE;
if (*s == '>') {
s++;
- report_event(p_state, E_DECLARATION, beg, s, tokens, num_tokens, self);
+ report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self);
FREE_TOKENS;
return s;
}
@@ -1138,7 +1178,7 @@
token.beg = beg + 2;
token.end = s;
s++;
- report_event(p_state, E_COMMENT, beg, s, &token, 1, self);
+ report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
return s;
}
else {
@@ -1148,7 +1188,7 @@
static char*
-parse_start(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg;
int empty_tag = 0; /* XML feature */
@@ -1249,9 +1289,9 @@
if (*s == '>') {
s++;
/* done */
- report_event(p_state, E_START, beg, s, tokens, num_tokens, self);
+ report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self);
if (empty_tag)
- report_event(p_state, E_END, s, s, tokens, 1, self);
+ report_event(p_state, E_END, s, s, utf8, tokens, 1, self);
if (!p_state->xml_mode) {
/* find out if this start tag should put us into literal_mode
@@ -1298,7 +1338,7 @@
static char*
-parse_end(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg+2;
hctype_t name_first, name_char;
@@ -1330,7 +1370,7 @@
if (*s == '>') {
s++;
/* a complete end tag has been recognized */
- report_event(p_state, E_END, beg, s, &tagname, 1, self);
+ report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self);
return s;
}
}
@@ -1345,7 +1385,7 @@
token.beg = beg + 2;
token.end = s;
s++;
- report_event(p_state, E_COMMENT, beg, s, &token, 1, self);
+ report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
return s;
}
else {
@@ -1357,7 +1397,7 @@
static char*
-parse_process(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg + 2; /* skip '<?' */
/* processing instruction */
@@ -1377,7 +1417,7 @@
}
/* a complete processing instruction seen */
- report_event(p_state, E_PROCESS, beg, s,
+ report_event(p_state, E_PROCESS, beg, s, utf8,
&token_pos, 1, self);
return s;
}
@@ -1389,7 +1429,7 @@
#ifdef USE_PFUNC
static char*
-parse_null(PSTATE* p_state, char *beg, char *end, SV* self)
+parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
return 0;
}
@@ -1400,7 +1440,7 @@
#endif /* USE_PFUNC */
static char*
-parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, SV* self)
+parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg;
char *t = beg;
@@ -1447,9 +1487,9 @@
if (*s == '>') {
s++;
if (t != end_text)
- report_event(p_state, E_TEXT, t, end_text,
+ report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
- report_event(p_state, E_END, end_text, s,
+ report_event(p_state, E_END, end_text, s, utf8,
&end_token, 1, self);
p_state->literal_mode = 0;
p_state->is_cdata = 0;
@@ -1472,9 +1512,9 @@
s++;
/* marked section end */
if (t != end_text)
- report_event(p_state, E_TEXT, t, end_text,
+ report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
- report_event(p_state, E_NONE, end_text, s, 0, 0, self);
+ report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self);
t = s;
SvREFCNT_dec(av_pop(p_state->ms_stack));
marked_section_update(p_state);
@@ -1499,9 +1539,9 @@
s++;
if (*s == '>') {
s++;
- report_event(p_state, E_TEXT, t, end_text,
+ report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
- report_event(p_state, E_NONE, end_text, s,
+ report_event(p_state, E_NONE, end_text, s, utf8,
0, 0, self);
t = s;
SvREFCNT_dec(av_pop(p_state->ms_stack));
@@ -1515,7 +1555,7 @@
}
if (s != t) {
if (*s == '<') {
- report_event(p_state, E_TEXT, t, s, 0, 0, self);
+ report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
t = s;
}
else {
@@ -1534,7 +1574,7 @@
}
s++;
if (s != t)
- report_event(p_state, E_TEXT, t, s, 0, 0, self);
+ report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
break;
}
}
@@ -1546,16 +1586,16 @@
s++;
#ifdef USE_PFUNC
- new_pos = parsefunc[(unsigned char)*s](p_state, t, end, self);
+ new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self);
#else
if (isHNAME_FIRST(*s))
- new_pos = parse_start(p_state, t, end, self);
+ new_pos = parse_start(p_state, t, end, utf8, self);
else if (*s == '/')
- new_pos = parse_end(p_state, t, end, self);
+ new_pos = parse_end(p_state, t, end, utf8, self);
else if (*s == '!')
- new_pos = parse_decl(p_state, t, end, self);
+ new_pos = parse_decl(p_state, t, end, utf8, self);
else if (*s == '?')
- new_pos = parse_process(p_state, t, end, self);
+ new_pos = parse_process(p_state, t, end, utf8, self);
else
new_pos = 0;
#endif /* USE_PFUNC */
@@ -1587,6 +1627,7 @@
SV* self)
{
char *s, *beg, *end;
+ U32 utf8;
STRLEN len;
if (!chunk) {
@@ -1601,14 +1642,14 @@
if (*s == '<') {
/* try to parse with comments terminated with a plain '>' first */
p_state->no_dash_dash_comment_end = 1;
- s = parse_buf(aTHX_ p_state, s, end, self);
+ s = parse_buf(aTHX_ p_state, s, end, SvUTF8(p_state->buf), self);
}
if (*s == '<') {
/* some kind of unterminated markup. Report rest as as comment */
token_pos_t token;
token.beg = s + 1;
token.end = end;
- report_event(p_state, E_COMMENT, s, end, &token, 1, self);
+ report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
SvREFCNT_dec(p_state->buf);
p_state->buf = 0;
}
@@ -1619,7 +1660,7 @@
else {
/* report rest as text */
REST_IS_TEXT:
- report_event(p_state, E_TEXT, s, end, 0, 0, self);
+ report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
SvREFCNT_dec(p_state->buf);
p_state->buf = 0;
}
@@ -1632,7 +1673,7 @@
SvREFCNT_dec(p_state->ignoring_element);
p_state->ignoring_element = 0;
}
- report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, self);
+ report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);
/* reset state */
p_state->offset = 0;
@@ -1647,18 +1688,20 @@
if (p_state->buf && SvOK(p_state->buf)) {
sv_catsv(p_state->buf, chunk);
beg = SvPV(p_state->buf, len);
+ utf8 = SvUTF8(p_state->buf);
}
else {
beg = SvPV(chunk, len);
+ utf8 = SvUTF8(chunk);
if (p_state->offset == 0)
- report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, self);
+ report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, 0, self);
}
if (!len)
return; /* nothing to do */
end = beg + len;
- s = parse_buf(aTHX_ p_state, beg, end, self);
+ s = parse_buf(aTHX_ p_state, beg, end, utf8, self);
if (s == end || p_state->eof) {
if (p_state->buf) {
@@ -1669,13 +1712,21 @@
/* need to keep rest in buffer */
if (p_state->buf) {
/* chop off some chars at the beginning */
- if (SvOK(p_state->buf))
+ if (SvOK(p_state->buf)) {
sv_chop(p_state->buf, s);
- else
+ }
+ else {
sv_setpvn(p_state->buf, s, end - s);
+ if (utf8)
+ SvUTF8_on(p_state->buf);
+ else
+ SvUTF8_off(p_state->buf);
+ }
}
else {
p_state->buf = newSVpv(s, end - s);
+ if (utf8)
+ SvUTF8_on(p_state->buf);
}
}
return;
Only in HTML-Parser-3.36-work/: hparser.c~
Common subdirectories: HTML-Parser-3.36/lib and HTML-Parser-3.36-work/lib
diff -u HTML-Parser-3.36/mkpfunc HTML-Parser-3.36-work/mkpfunc
--- HTML-Parser-3.36/mkpfunc 2001-03-29 23:11:00.000000000 -0800
+++ HTML-Parser-3.36-work/mkpfunc 2004-07-19 14:29:52.000000000 -0700
@@ -4,7 +4,7 @@
print "/* This file is autogenerated by $progname */\n";
-print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, SV* self);\n";
+print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n";
print "static PFUNC parsefunc[] = {\n";
for my $c (0..255) {
Only in HTML-Parser-3.36-work/: mkpfunc~
Binary files HTML-Parser-3.36/Parser.o and HTML-Parser-3.36-work/Parser.o differ
diff -u HTML-Parser-3.36/pfunc.h HTML-Parser-3.36-work/pfunc.h
--- HTML-Parser-3.36/pfunc.h 2004-07-19 15:10:11.000000000 -0700
+++ HTML-Parser-3.36-work/pfunc.h 2004-07-19 14:51:33.000000000 -0700
@@ -1,5 +1,5 @@
/* This file is autogenerated by mkpfunc */
-typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, SV* self);
+typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);
static PFUNC parsefunc[] = {
parse_null, /* 0 */
parse_null, /* 1 */
Only in HTML-Parser-3.36-work/: pfunc.h~
Common subdirectories: HTML-Parser-3.36/t and HTML-Parser-3.36-work/t
diff -u HTML-Parser-3.36/util.c HTML-Parser-3.36-work/util.c
--- HTML-Parser-3.36/util.c 2003-08-15 07:38:37.000000000 -0700
+++ HTML-Parser-3.36-work/util.c 2004-07-19 16:15:40.000000000 -0700
@@ -185,6 +185,16 @@
t = SvPVX(sv) + len;
}
Safefree(ustr);
+ /* upgrade the part we have yet to process */
+ old_len = len = end - s;
+ ustr = bytes_to_utf8(s, &len);
+ grow = len - old_len;
+ if (grow) {
+ grow_gap(aTHX_ sv, grow, &t, &s, &end);
+ s -= grow;
+ Copy(ustr, s, len, char);
+ }
+ Safefree(ustr);
}
SvUTF8_on(sv);
}
Only in HTML-Parser-3.36-work/: util.c~