This queue is for tickets about the HTML-Parser CPAN distribution.

Report information
The Basics
Id:
7014
Status:
resolved
Priority:
Low/Low
Queue:

People
Owner:
Nobody in particular
Requestors:
jgmyers [...] proofpoint.com
Cc:
AdminCc:

BugTracker
Severity:
Normal
Broken in:
3.36
Fixed in:
(no value)



Subject: multiple bugs handling non-ASCII characters
HTML-Parser fails to handle non-ASCII characters in the HTML file being parsed. It fails to examine or copy the UTF8 flag, with the exception of decode_entities(). Following a unicode entity, decode_entities() in UNICODE_ENTITIES mode fails to convert ISO-8859-1 to UTF-8, leading to a result that is not utf8::valid(). hparser.c has hash lookup code that is not UTF8 safe. The attached patch fixes all this.
Common subdirectories: HTML-Parser-3.36/blib and HTML-Parser-3.36-work/blib Common subdirectories: HTML-Parser-3.36/eg and HTML-Parser-3.36-work/eg Common subdirectories: HTML-Parser-3.36/hints and HTML-Parser-3.36-work/hints diff -u HTML-Parser-3.36/hparser.c HTML-Parser-3.36-work/hparser.c --- HTML-Parser-3.36/hparser.c 2004-04-01 03:56:37.000000000 -0800 +++ HTML-Parser-3.36-work/hparser.c 2004-07-19 14:54:50.000000000 -0700 @@ -111,7 +111,7 @@ static void report_event(PSTATE* p_state, event_id_t event, - char *beg, char *end, + char *beg, char *end, U32 utf8, token_pos_t *tokens, int num_tokens, SV* self ) @@ -196,10 +196,13 @@ if (event == E_START || event == E_END) { SV* tagname = p_state->tmp; - U32 hash; assert(num_tokens >= 1); sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); + if (utf8) + SvUTF8_on(tagname); + else + SvUTF8_off(tagname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ tagname); @@ -215,10 +218,8 @@ goto IGNORE_EVENT; } - PERL_HASH(hash, SvPVX(tagname), SvCUR(tagname)); - if (p_state->ignore_elements && - hv_fetch_ent(p_state->ignore_elements, tagname, 0, hash)) + hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) { p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; @@ -226,12 +227,12 @@ } if (p_state->ignore_tags && - hv_fetch_ent(p_state->ignore_tags, tagname, 0, hash)) + hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } if (p_state->report_tags && - !hv_fetch_ent(p_state->report_tags, tagname, 0, hash)) + !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } @@ -272,8 +273,18 @@ p_state->pend_text_column = column; p_state->pend_text_is_cdata = p_state->is_cdata; sv_setpvn(p_state->pend_text, "", 0); + if (!utf8) + SvUTF8_off(p_state->pend_text); + } + if (utf8 && !SvUTF8(p_state->pend_text)) + sv_utf8_upgrade(p_state->pend_text); + if (utf8 || !SvUTF8(p_state->pend_text)) { + sv_catpvn(p_state->pend_text, beg, end - beg); + } + else { + SV *tmp = NULL; 
+ sv_catpvn_utf8_upgrade(p_state->pend_text, beg, end - beg, tmp); } - sv_catpvn(p_state->pend_text, beg, end - beg); return; } else if (p_state->pend_text && SvOK(p_state->pend_text)) { @@ -327,6 +338,8 @@ for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(prev_token); av_push(av, prev_token); } else { /* boolean */ @@ -366,6 +379,8 @@ if (num_tokens >= 1) { arg = sv_2mortal(newSVpvn(tokens[0].beg, tokens[0].end - tokens[0].beg)); + if (utf8) + SvUTF8_on(arg); if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) sv_lower(aTHX_ arg); if (argcode == ARG_TAG && event != E_START) { @@ -392,6 +407,8 @@ tokens[i].end-tokens[i].beg); SV* attrval; + if (utf8) + SvUTF8_on(attrname); if (tokens[i+1].beg) { char *beg = tokens[i+1].beg; STRLEN len = tokens[i+1].end - beg; @@ -400,6 +417,8 @@ beg++; len -= 2; } attrval = newSVpvn(beg, len); + if (utf8) + SvUTF8_on(attrval); if (!p_state->attr_encoded) decode_entities(aTHX_ attrval, p_state->entity2char); } @@ -414,10 +433,8 @@ sv_lower(aTHX_ attrname); if (argcode == ARG_ATTR) { - U32 hash; - PERL_HASH(hash, SvPVX(attrname), SvCUR(attrname)); - if (hv_exists_ent(hv, attrname, hash) || - !hv_store_ent(hv, attrname, attrval, hash)) { + if (hv_exists_ent(hv, attrname, 0) || + !hv_store_ent(hv, attrname, attrval, 0)) { SvREFCNT_dec(attrval); } SvREFCNT_dec(attrname); @@ -446,6 +463,8 @@ for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(attrname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); av_push(av, attrname); @@ -456,11 +475,15 @@ case ARG_TEXT: arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); break; case ARG_DTEXT: if (event == E_TEXT) { arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); if (!p_state->is_cdata) decode_entities(aTHX_ arg, p_state->entity2char); } @@ -475,6 +498,7 @@ case 
ARG_SKIPPED_TEXT: arg = sv_2mortal(p_state->skipped_text); p_state->skipped_text = newSVpvn("", 0); + SvUTF8_off(p_state->skipped_text); break; case ARG_OFFSET: @@ -506,6 +530,8 @@ { int len = (unsigned char)s[1]; arg = sv_2mortal(newSVpvn(s+2, len)); + if (SvUTF8(h->argspec)) + SvUTF8_on(arg); s += len + 1; } break; @@ -565,7 +591,15 @@ if (p_state->skipped_text) { if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); - sv_catpvn(p_state->skipped_text, beg, end - beg); + if (utf8 && !SvUTF8(p_state->skipped_text)) + sv_utf8_upgrade(p_state->skipped_text); + if (utf8 || !SvUTF8(p_state->skipped_text)) { + sv_catpvn(p_state->skipped_text, beg, end - beg); + } + else { + SV *tmp = NULL; + sv_catpvn_utf8_upgrade(p_state->skipped_text, beg, end - beg, tmp); + } } return; } @@ -580,6 +614,9 @@ char *s = SvPV(src, len); char *end = s + len; + if (SvUTF8(src)) + SvUTF8_on(argspec); + while (isHSPACE(*s)) s++; @@ -708,8 +745,8 @@ p_state->column = p_state->pend_text_column; report_event(p_state, E_TEXT, - SvPVX(old_pend_text), SvEND(old_pend_text), - 0, 0, self); + SvPVX(old_pend_text), SvEND(old_pend_text), + SvUTF8(old_pend_text), 0, 0, self); SvOK_off(old_pend_text); p_state->unbroken_text = old_unbroken_text; @@ -744,7 +781,7 @@ } static char* -parse_comment(PSTATE* p_state, char *beg, char *end, SV* self) +parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; @@ -771,7 +808,7 @@ /* we are done recognizing all comments, make callbacks */ report_event(p_state, E_COMMENT, - beg - 4, s, + beg - 4, s, utf8, tokens, num_tokens, self); FREE_TOKENS; @@ -808,7 +845,7 @@ token.end = s; if (s < end) { s++; - report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } else { @@ -832,7 +869,7 @@ if (*s == '>') { s++; /* yup */ - report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self); + report_event(p_state, 
E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } } @@ -900,7 +937,7 @@ static char* -parse_marked_section(PSTATE* p_state, char *beg, char *end, SV* self) +parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { dTHX; char *s = beg; @@ -915,6 +952,7 @@ while (isHNAME_FIRST(*s)) { char *name_start = s; char *name_end; + SV *name; s++; while (isHNAME_CHAR(*s)) s++; @@ -926,8 +964,10 @@ if (!tokens) tokens = newAV(); - av_push(tokens, sv_lower(aTHX_ newSVpvn(name_start, - name_end - name_start))); + name = newSVpvn(name_start, name_end - name_start); + if (utf8) + SvUTF8_on(name); + av_push(tokens, sv_lower(aTHX_ name)); } if (*s == '-') { s++; @@ -965,7 +1005,7 @@ p_state->ms_stack = newAV(); av_push(p_state->ms_stack, newRV_noinc((SV*)tokens)); marked_section_update(p_state); - report_event(p_state, E_NONE, beg, s, 0, 0, self); + report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self); return s; } @@ -981,7 +1021,7 @@ static char* -parse_decl(PSTATE* p_state, char *beg, char *end, SV* self) +parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; @@ -999,7 +1039,7 @@ /* yes, two dashes seen */ s++; - tmp = parse_comment(p_state, s, end, self); + tmp = parse_comment(p_state, s, end, utf8, self); return (tmp == s) ? beg : tmp; } @@ -1008,7 +1048,7 @@ /* marked section */ char *tmp; s++; - tmp = parse_marked_section(p_state, s, end, self); + tmp = parse_marked_section(p_state, s, end, utf8, self); if (!tmp) goto DECL_FAIL; return (tmp == s) ? 
beg : tmp; @@ -1021,7 +1061,7 @@ token.beg = s; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } @@ -1112,7 +1152,7 @@ goto PREMATURE; if (*s == '>') { s++; - report_event(p_state, E_DECLARATION, beg, s, tokens, num_tokens, self); + report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self); FREE_TOKENS; return s; } @@ -1138,7 +1178,7 @@ token.beg = beg + 2; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { @@ -1148,7 +1188,7 @@ static char* -parse_start(PSTATE* p_state, char *beg, char *end, SV* self) +parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; int empty_tag = 0; /* XML feature */ @@ -1249,9 +1289,9 @@ if (*s == '>') { s++; /* done */ - report_event(p_state, E_START, beg, s, tokens, num_tokens, self); + report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self); if (empty_tag) - report_event(p_state, E_END, s, s, tokens, 1, self); + report_event(p_state, E_END, s, s, utf8, tokens, 1, self); if (!p_state->xml_mode) { /* find out if this start tag should put us into literal_mode @@ -1298,7 +1338,7 @@ static char* -parse_end(PSTATE* p_state, char *beg, char *end, SV* self) +parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg+2; hctype_t name_first, name_char; @@ -1330,7 +1370,7 @@ if (*s == '>') { s++; /* a complete end tag has been recognized */ - report_event(p_state, E_END, beg, s, &tagname, 1, self); + report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self); return s; } } @@ -1345,7 +1385,7 @@ token.beg = beg + 2; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { @@ -1357,7 +1397,7 @@ static char* 
-parse_process(PSTATE* p_state, char *beg, char *end, SV* self) +parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; /* skip '<?' */ /* processing instruction */ @@ -1377,7 +1417,7 @@ } /* a complete processing instruction seen */ - report_event(p_state, E_PROCESS, beg, s, + report_event(p_state, E_PROCESS, beg, s, utf8, &token_pos, 1, self); return s; } @@ -1389,7 +1429,7 @@ #ifdef USE_PFUNC static char* -parse_null(PSTATE* p_state, char *beg, char *end, SV* self) +parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { return 0; } @@ -1400,7 +1440,7 @@ #endif /* USE_PFUNC */ static char* -parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, SV* self) +parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; char *t = beg; @@ -1447,9 +1487,9 @@ if (*s == '>') { s++; if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_END, end_text, s, + report_event(p_state, E_END, end_text, s, utf8, &end_token, 1, self); p_state->literal_mode = 0; p_state->is_cdata = 0; @@ -1472,9 +1512,9 @@ s++; /* marked section end */ if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_NONE, end_text, s, 0, 0, self); + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); marked_section_update(p_state); @@ -1499,9 +1539,9 @@ s++; if (*s == '>') { s++; - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_NONE, end_text, s, + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); @@ -1515,7 +1555,7 @@ } if (s != t) { if (*s == '<') { - report_event(p_state, E_TEXT, t, s, 0, 0, self); + report_event(p_state, 
E_TEXT, t, s, utf8, 0, 0, self); t = s; } else { @@ -1534,7 +1574,7 @@ } s++; if (s != t) - report_event(p_state, E_TEXT, t, s, 0, 0, self); + report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); break; } } @@ -1546,16 +1586,16 @@ s++; #ifdef USE_PFUNC - new_pos = parsefunc[(unsigned char)*s](p_state, t, end, self); + new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self); #else if (isHNAME_FIRST(*s)) - new_pos = parse_start(p_state, t, end, self); + new_pos = parse_start(p_state, t, end, utf8, self); else if (*s == '/') - new_pos = parse_end(p_state, t, end, self); + new_pos = parse_end(p_state, t, end, utf8, self); else if (*s == '!') - new_pos = parse_decl(p_state, t, end, self); + new_pos = parse_decl(p_state, t, end, utf8, self); else if (*s == '?') - new_pos = parse_process(p_state, t, end, self); + new_pos = parse_process(p_state, t, end, utf8, self); else new_pos = 0; #endif /* USE_PFUNC */ @@ -1587,6 +1627,7 @@ SV* self) { char *s, *beg, *end; + U32 utf8; STRLEN len; if (!chunk) { @@ -1601,14 +1642,14 @@ if (*s == '<') { /* try to parse with comments terminated with a plain '>' first */ p_state->no_dash_dash_comment_end = 1; - s = parse_buf(aTHX_ p_state, s, end, self); + s = parse_buf(aTHX_ p_state, s, end, SvUTF8(p_state->buf), self); } if (*s == '<') { /* some kind of unterminated markup. 
Report rest as as comment */ token_pos_t token; token.beg = s + 1; token.end = end; - report_event(p_state, E_COMMENT, s, end, &token, 1, self); + report_event(p_state, E_COMMENT, s, end, SvUTF8(p_state->buf), &token, 1, self); /* flag is read from buf: the local 'utf8' is never assigned on this eof path */ SvREFCNT_dec(p_state->buf); p_state->buf = 0; } @@ -1619,7 +1660,7 @@ else { /* report rest as text */ REST_IS_TEXT: - report_event(p_state, E_TEXT, s, end, 0, 0, self); + report_event(p_state, E_TEXT, s, end, SvUTF8(p_state->buf), 0, 0, self); /* same: 'utf8' is unset here, use buf's flag */ SvREFCNT_dec(p_state->buf); p_state->buf = 0; } @@ -1632,7 +1673,7 @@ SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } - report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, self); + report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); /* reset state */ p_state->offset = 0; @@ -1647,18 +1688,20 @@ if (p_state->buf && SvOK(p_state->buf)) { sv_catsv(p_state->buf, chunk); beg = SvPV(p_state->buf, len); + utf8 = SvUTF8(p_state->buf); } else { beg = SvPV(chunk, len); + utf8 = SvUTF8(chunk); if (p_state->offset == 0) - report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, self); + report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, 0, self); } if (!len) return; /* nothing to do */ end = beg + len; - s = parse_buf(aTHX_ p_state, beg, end, self); + s = parse_buf(aTHX_ p_state, beg, end, utf8, self); if (s == end || p_state->eof) { if (p_state->buf) { @@ -1669,13 +1712,21 @@ /* need to keep rest in buffer */ if (p_state->buf) { /* chop off some chars at the beginning */ - if (SvOK(p_state->buf)) + if (SvOK(p_state->buf)) { sv_chop(p_state->buf, s); - else + } + else { sv_setpvn(p_state->buf, s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); + else + SvUTF8_off(p_state->buf); + } } else { p_state->buf = newSVpv(s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); } } return; Only in HTML-Parser-3.36-work/: hparser.c~ Common subdirectories: HTML-Parser-3.36/lib and HTML-Parser-3.36-work/lib diff -u HTML-Parser-3.36/mkpfunc HTML-Parser-3.36-work/mkpfunc --- HTML-Parser-3.36/mkpfunc 2001-03-29 
23:11:00.000000000 -0800 +++ HTML-Parser-3.36-work/mkpfunc 2004-07-19 14:29:52.000000000 -0700 @@ -4,7 +4,7 @@ print "/* This file is autogenerated by $progname */\n"; -print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, SV* self);\n"; +print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n"; print "static PFUNC parsefunc[] = {\n"; for my $c (0..255) { Only in HTML-Parser-3.36-work/: mkpfunc~ Binary files HTML-Parser-3.36/Parser.o and HTML-Parser-3.36-work/Parser.o differ diff -u HTML-Parser-3.36/pfunc.h HTML-Parser-3.36-work/pfunc.h --- HTML-Parser-3.36/pfunc.h 2004-07-19 15:10:11.000000000 -0700 +++ HTML-Parser-3.36-work/pfunc.h 2004-07-19 14:51:33.000000000 -0700 @@ -1,5 +1,5 @@ /* This file is autogenerated by mkpfunc */ -typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, SV* self); +typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self); static PFUNC parsefunc[] = { parse_null, /* 0 */ parse_null, /* 1 */ Only in HTML-Parser-3.36-work/: pfunc.h~ Common subdirectories: HTML-Parser-3.36/t and HTML-Parser-3.36-work/t diff -u HTML-Parser-3.36/util.c HTML-Parser-3.36-work/util.c --- HTML-Parser-3.36/util.c 2003-08-15 07:38:37.000000000 -0700 +++ HTML-Parser-3.36-work/util.c 2004-07-19 16:15:40.000000000 -0700 @@ -185,6 +185,16 @@ t = SvPVX(sv) + len; } Safefree(ustr); + /* upgrade the part we have yet to process */ + old_len = len = end - s; + ustr = bytes_to_utf8(s, &len); + grow = len - old_len; + if (grow) { + grow_gap(aTHX_ sv, grow, &t, &s, &end); + s -= grow; + Copy(ustr, s, len, char); + } + Safefree(ustr); } SvUTF8_on(sv); } Only in HTML-Parser-3.36-work/: util.c~
Date: Wed, 21 Jul 2004 17:53:52 -0700
From: John Gardiner Myers <jgmyers@proofpoint.com>
To: bug-HTML-Parser@rt.cpan.org
Subject: Re: [cpan #7014] AutoReply: multiple bugs handling non-ASCII characters
RT-Send-Cc:
With the previous patch applied, one can remove one of the documented bugs. diff -ru HTML-Parser-3.36/Parser.pm HTML-Parser-3.36-work/Parser.pm --- HTML-Parser-3.36/Parser.pm 2004-04-01 04:05:52.000000000 -0800 +++ HTML-Parser-3.36-work/Parser.pm 2004-07-21 15:32:57.000000000 -0700 @@ -996,10 +996,6 @@ =head1 BUGS -Unicode strings are not parsed correctly. A workaround is to encode -them as UTF-8 before passing them to the HTML::Parser. The C<Encode> -module can do that. - The <style> and <script> sections do not end with the first "</", but need the complete corresponding end tag. MSIE avoids terminating a <script> section if the </script> occurs inside quotes. HTML::Parser
From: Tom Insam
The original patch modified an auto-generated file (pfunc.h). I've removed that part from the patch and folded in the documentation change from the previous comment. The result applies cleanly and passes tests for me on Darwin (Mac OS X 10.3).
Common subdirectories: HTML-Parser-3.36/blib and HTML-Parser-3.36-work/blib Common subdirectories: HTML-Parser-3.36/eg and HTML-Parser-3.36-work/eg Common subdirectories: HTML-Parser-3.36/hints and HTML-Parser-3.36-work/hints diff -u HTML-Parser-3.36/hparser.c HTML-Parser-3.36-work/hparser.c --- HTML-Parser-3.36/hparser.c 2004-04-01 03:56:37.000000000 -0800 +++ HTML-Parser-3.36-work/hparser.c 2004-07-19 14:54:50.000000000 -0700 @@ -111,7 +111,7 @@ static void report_event(PSTATE* p_state, event_id_t event, - char *beg, char *end, + char *beg, char *end, U32 utf8, token_pos_t *tokens, int num_tokens, SV* self ) @@ -196,10 +196,13 @@ if (event == E_START || event == E_END) { SV* tagname = p_state->tmp; - U32 hash; assert(num_tokens >= 1); sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); + if (utf8) + SvUTF8_on(tagname); + else + SvUTF8_off(tagname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ tagname); @@ -215,10 +218,8 @@ goto IGNORE_EVENT; } - PERL_HASH(hash, SvPVX(tagname), SvCUR(tagname)); - if (p_state->ignore_elements && - hv_fetch_ent(p_state->ignore_elements, tagname, 0, hash)) + hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) { p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; @@ -226,12 +227,12 @@ } if (p_state->ignore_tags && - hv_fetch_ent(p_state->ignore_tags, tagname, 0, hash)) + hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } if (p_state->report_tags && - !hv_fetch_ent(p_state->report_tags, tagname, 0, hash)) + !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } @@ -272,8 +273,18 @@ p_state->pend_text_column = column; p_state->pend_text_is_cdata = p_state->is_cdata; sv_setpvn(p_state->pend_text, "", 0); + if (!utf8) + SvUTF8_off(p_state->pend_text); + } + if (utf8 && !SvUTF8(p_state->pend_text)) + sv_utf8_upgrade(p_state->pend_text); + if (utf8 || !SvUTF8(p_state->pend_text)) { + sv_catpvn(p_state->pend_text, beg, end - beg); + } + else { + SV *tmp = NULL; 
+ sv_catpvn_utf8_upgrade(p_state->pend_text, beg, end - beg, tmp); } - sv_catpvn(p_state->pend_text, beg, end - beg); return; } else if (p_state->pend_text && SvOK(p_state->pend_text)) { @@ -327,6 +338,8 @@ for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(prev_token); av_push(av, prev_token); } else { /* boolean */ @@ -366,6 +379,8 @@ if (num_tokens >= 1) { arg = sv_2mortal(newSVpvn(tokens[0].beg, tokens[0].end - tokens[0].beg)); + if (utf8) + SvUTF8_on(arg); if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) sv_lower(aTHX_ arg); if (argcode == ARG_TAG && event != E_START) { @@ -392,6 +407,8 @@ tokens[i].end-tokens[i].beg); SV* attrval; + if (utf8) + SvUTF8_on(attrname); if (tokens[i+1].beg) { char *beg = tokens[i+1].beg; STRLEN len = tokens[i+1].end - beg; @@ -400,6 +417,8 @@ beg++; len -= 2; } attrval = newSVpvn(beg, len); + if (utf8) + SvUTF8_on(attrval); if (!p_state->attr_encoded) decode_entities(aTHX_ attrval, p_state->entity2char); } @@ -414,10 +433,8 @@ sv_lower(aTHX_ attrname); if (argcode == ARG_ATTR) { - U32 hash; - PERL_HASH(hash, SvPVX(attrname), SvCUR(attrname)); - if (hv_exists_ent(hv, attrname, hash) || - !hv_store_ent(hv, attrname, attrval, hash)) { + if (hv_exists_ent(hv, attrname, 0) || + !hv_store_ent(hv, attrname, attrval, 0)) { SvREFCNT_dec(attrval); } SvREFCNT_dec(attrname); @@ -446,6 +463,8 @@ for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(attrname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); av_push(av, attrname); @@ -456,11 +475,15 @@ case ARG_TEXT: arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); break; case ARG_DTEXT: if (event == E_TEXT) { arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); if (!p_state->is_cdata) decode_entities(aTHX_ arg, p_state->entity2char); } @@ -475,6 +498,7 @@ case 
ARG_SKIPPED_TEXT: arg = sv_2mortal(p_state->skipped_text); p_state->skipped_text = newSVpvn("", 0); + SvUTF8_off(p_state->skipped_text); break; case ARG_OFFSET: @@ -506,6 +530,8 @@ { int len = (unsigned char)s[1]; arg = sv_2mortal(newSVpvn(s+2, len)); + if (SvUTF8(h->argspec)) + SvUTF8_on(arg); s += len + 1; } break; @@ -565,7 +591,15 @@ if (p_state->skipped_text) { if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); - sv_catpvn(p_state->skipped_text, beg, end - beg); + if (utf8 && !SvUTF8(p_state->skipped_text)) + sv_utf8_upgrade(p_state->skipped_text); + if (utf8 || !SvUTF8(p_state->skipped_text)) { + sv_catpvn(p_state->skipped_text, beg, end - beg); + } + else { + SV *tmp = NULL; + sv_catpvn_utf8_upgrade(p_state->skipped_text, beg, end - beg, tmp); + } } return; } @@ -580,6 +614,9 @@ char *s = SvPV(src, len); char *end = s + len; + if (SvUTF8(src)) + SvUTF8_on(argspec); + while (isHSPACE(*s)) s++; @@ -708,8 +745,8 @@ p_state->column = p_state->pend_text_column; report_event(p_state, E_TEXT, - SvPVX(old_pend_text), SvEND(old_pend_text), - 0, 0, self); + SvPVX(old_pend_text), SvEND(old_pend_text), + SvUTF8(old_pend_text), 0, 0, self); SvOK_off(old_pend_text); p_state->unbroken_text = old_unbroken_text; @@ -744,7 +781,7 @@ } static char* -parse_comment(PSTATE* p_state, char *beg, char *end, SV* self) +parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; @@ -771,7 +808,7 @@ /* we are done recognizing all comments, make callbacks */ report_event(p_state, E_COMMENT, - beg - 4, s, + beg - 4, s, utf8, tokens, num_tokens, self); FREE_TOKENS; @@ -808,7 +845,7 @@ token.end = s; if (s < end) { s++; - report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } else { @@ -832,7 +869,7 @@ if (*s == '>') { s++; /* yup */ - report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self); + report_event(p_state, 
E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } } @@ -900,7 +937,7 @@ static char* -parse_marked_section(PSTATE* p_state, char *beg, char *end, SV* self) +parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { dTHX; char *s = beg; @@ -915,6 +952,7 @@ while (isHNAME_FIRST(*s)) { char *name_start = s; char *name_end; + SV *name; s++; while (isHNAME_CHAR(*s)) s++; @@ -926,8 +964,10 @@ if (!tokens) tokens = newAV(); - av_push(tokens, sv_lower(aTHX_ newSVpvn(name_start, - name_end - name_start))); + name = newSVpvn(name_start, name_end - name_start); + if (utf8) + SvUTF8_on(name); + av_push(tokens, sv_lower(aTHX_ name)); } if (*s == '-') { s++; @@ -965,7 +1005,7 @@ p_state->ms_stack = newAV(); av_push(p_state->ms_stack, newRV_noinc((SV*)tokens)); marked_section_update(p_state); - report_event(p_state, E_NONE, beg, s, 0, 0, self); + report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self); return s; } @@ -981,7 +1021,7 @@ static char* -parse_decl(PSTATE* p_state, char *beg, char *end, SV* self) +parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; @@ -999,7 +1039,7 @@ /* yes, two dashes seen */ s++; - tmp = parse_comment(p_state, s, end, self); + tmp = parse_comment(p_state, s, end, utf8, self); return (tmp == s) ? beg : tmp; } @@ -1008,7 +1048,7 @@ /* marked section */ char *tmp; s++; - tmp = parse_marked_section(p_state, s, end, self); + tmp = parse_marked_section(p_state, s, end, utf8, self); if (!tmp) goto DECL_FAIL; return (tmp == s) ? 
beg : tmp; @@ -1021,7 +1061,7 @@ token.beg = s; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } @@ -1112,7 +1152,7 @@ goto PREMATURE; if (*s == '>') { s++; - report_event(p_state, E_DECLARATION, beg, s, tokens, num_tokens, self); + report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self); FREE_TOKENS; return s; } @@ -1138,7 +1178,7 @@ token.beg = beg + 2; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { @@ -1148,7 +1188,7 @@ static char* -parse_start(PSTATE* p_state, char *beg, char *end, SV* self) +parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; int empty_tag = 0; /* XML feature */ @@ -1249,9 +1289,9 @@ if (*s == '>') { s++; /* done */ - report_event(p_state, E_START, beg, s, tokens, num_tokens, self); + report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self); if (empty_tag) - report_event(p_state, E_END, s, s, tokens, 1, self); + report_event(p_state, E_END, s, s, utf8, tokens, 1, self); if (!p_state->xml_mode) { /* find out if this start tag should put us into literal_mode @@ -1298,7 +1338,7 @@ static char* -parse_end(PSTATE* p_state, char *beg, char *end, SV* self) +parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg+2; hctype_t name_first, name_char; @@ -1330,7 +1370,7 @@ if (*s == '>') { s++; /* a complete end tag has been recognized */ - report_event(p_state, E_END, beg, s, &tagname, 1, self); + report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self); return s; } } @@ -1345,7 +1385,7 @@ token.beg = beg + 2; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { @@ -1357,7 +1397,7 @@ static char* 
-parse_process(PSTATE* p_state, char *beg, char *end, SV* self) +parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; /* skip '<?' */ /* processing instruction */ @@ -1377,7 +1417,7 @@ } /* a complete processing instruction seen */ - report_event(p_state, E_PROCESS, beg, s, + report_event(p_state, E_PROCESS, beg, s, utf8, &token_pos, 1, self); return s; } @@ -1389,7 +1429,7 @@ #ifdef USE_PFUNC static char* -parse_null(PSTATE* p_state, char *beg, char *end, SV* self) +parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { return 0; } @@ -1400,7 +1440,7 @@ #endif /* USE_PFUNC */ static char* -parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, SV* self) +parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; char *t = beg; @@ -1447,9 +1487,9 @@ if (*s == '>') { s++; if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_END, end_text, s, + report_event(p_state, E_END, end_text, s, utf8, &end_token, 1, self); p_state->literal_mode = 0; p_state->is_cdata = 0; @@ -1472,9 +1512,9 @@ s++; /* marked section end */ if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_NONE, end_text, s, 0, 0, self); + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); marked_section_update(p_state); @@ -1499,9 +1539,9 @@ s++; if (*s == '>') { s++; - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_NONE, end_text, s, + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); @@ -1515,7 +1555,7 @@ } if (s != t) { if (*s == '<') { - report_event(p_state, E_TEXT, t, s, 0, 0, self); + report_event(p_state, 
E_TEXT, t, s, utf8, 0, 0, self); t = s; } else { @@ -1534,7 +1574,7 @@ } s++; if (s != t) - report_event(p_state, E_TEXT, t, s, 0, 0, self); + report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); break; } } @@ -1546,16 +1586,16 @@ s++; #ifdef USE_PFUNC - new_pos = parsefunc[(unsigned char)*s](p_state, t, end, self); + new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self); #else if (isHNAME_FIRST(*s)) - new_pos = parse_start(p_state, t, end, self); + new_pos = parse_start(p_state, t, end, utf8, self); else if (*s == '/') - new_pos = parse_end(p_state, t, end, self); + new_pos = parse_end(p_state, t, end, utf8, self); else if (*s == '!') - new_pos = parse_decl(p_state, t, end, self); + new_pos = parse_decl(p_state, t, end, utf8, self); else if (*s == '?') - new_pos = parse_process(p_state, t, end, self); + new_pos = parse_process(p_state, t, end, utf8, self); else new_pos = 0; #endif /* USE_PFUNC */ @@ -1587,6 +1627,7 @@ SV* self) { char *s, *beg, *end; + U32 utf8; STRLEN len; if (!chunk) { @@ -1601,14 +1642,14 @@ if (*s == '<') { /* try to parse with comments terminated with a plain '>' first */ p_state->no_dash_dash_comment_end = 1; - s = parse_buf(aTHX_ p_state, s, end, self); + s = parse_buf(aTHX_ p_state, s, end, SvUTF8(p_state->buf), self); } if (*s == '<') { /* some kind of unterminated markup. 
Report rest as as comment */ token_pos_t token; token.beg = s + 1; token.end = end; - report_event(p_state, E_COMMENT, s, end, &token, 1, self); + report_event(p_state, E_COMMENT, s, end, SvUTF8(p_state->buf), &token, 1, self); /* flag is read from buf: the local 'utf8' is never assigned on this eof path */ SvREFCNT_dec(p_state->buf); p_state->buf = 0; } @@ -1619,7 +1660,7 @@ else { /* report rest as text */ REST_IS_TEXT: - report_event(p_state, E_TEXT, s, end, 0, 0, self); + report_event(p_state, E_TEXT, s, end, SvUTF8(p_state->buf), 0, 0, self); /* same: 'utf8' is unset here, use buf's flag */ SvREFCNT_dec(p_state->buf); p_state->buf = 0; } @@ -1632,7 +1673,7 @@ SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } - report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, self); + report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); /* reset state */ p_state->offset = 0; @@ -1647,18 +1688,20 @@ if (p_state->buf && SvOK(p_state->buf)) { sv_catsv(p_state->buf, chunk); beg = SvPV(p_state->buf, len); + utf8 = SvUTF8(p_state->buf); } else { beg = SvPV(chunk, len); + utf8 = SvUTF8(chunk); if (p_state->offset == 0) - report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, self); + report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, 0, self); } if (!len) return; /* nothing to do */ end = beg + len; - s = parse_buf(aTHX_ p_state, beg, end, self); + s = parse_buf(aTHX_ p_state, beg, end, utf8, self); if (s == end || p_state->eof) { if (p_state->buf) { @@ -1669,13 +1712,21 @@ /* need to keep rest in buffer */ if (p_state->buf) { /* chop off some chars at the beginning */ - if (SvOK(p_state->buf)) + if (SvOK(p_state->buf)) { sv_chop(p_state->buf, s); - else + } + else { sv_setpvn(p_state->buf, s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); + else + SvUTF8_off(p_state->buf); + } } else { p_state->buf = newSVpv(s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); } } return; Only in HTML-Parser-3.36-work/: hparser.c~ Common subdirectories: HTML-Parser-3.36/lib and HTML-Parser-3.36-work/lib diff -u HTML-Parser-3.36/mkpfunc HTML-Parser-3.36-work/mkpfunc --- HTML-Parser-3.36/mkpfunc 2001-03-29 
23:11:00.000000000 -0800 +++ HTML-Parser-3.36-work/mkpfunc 2004-07-19 14:29:52.000000000 -0700 @@ -4,7 +4,7 @@ print "/* This file is autogenerated by $progname */\n"; -print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, SV* self);\n"; +print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n"; print "static PFUNC parsefunc[] = {\n"; for my $c (0..255) { Only in HTML-Parser-3.36-work/: mkpfunc~ Binary files HTML-Parser-3.36/Parser.o and HTML-Parser-3.36-work/Parser.o differ diff -u HTML-Parser-3.36/util.c HTML-Parser-3.36-work/util.c --- HTML-Parser-3.36/util.c 2003-08-15 07:38:37.000000000 -0700 +++ HTML-Parser-3.36-work/util.c 2004-07-19 16:15:40.000000000 -0700 @@ -185,6 +185,16 @@ t = SvPVX(sv) + len; } Safefree(ustr); + /* upgrade the part we have yet to process */ + old_len = len = end - s; + ustr = bytes_to_utf8(s, &len); + grow = len - old_len; + if (grow) { + grow_gap(aTHX_ sv, grow, &t, &s, &end); + s -= grow; + Copy(ustr, s, len, char); + } + Safefree(ustr); } SvUTF8_on(sv); } Only in HTML-Parser-3.36-work/: util.c~ diff -ru HTML-Parser-3.36/Parser.pm HTML-Parser-3.36-work/Parser.pm --- HTML-Parser-3.36/Parser.pm 2004-04-01 04:05:52.000000000 -0800 +++ HTML-Parser-3.36-work/Parser.pm 2004-07-21 15:32:57.000000000 -0700 @@ -996,10 +996,6 @@ =head1 BUGS -Unicode strings are not parsed correctly. A workaround is to encode -them as UTF-8 before passing them to the HTML::Parser. The C<Encode> -module can do that. - The <style> and <script> sections do not end with the first "</", but need the complete corresponding end tag. MSIE avoids terminating a <script> section if the </script> occurs inside quotes. HTML::Parser
From: Tom Insam
Also, I have a test case.
# Regression test: text handed to the "text" handler must keep the UTF8 flag
# when the document passed to HTML::Parser is itself a UTF8-flagged string.

BEGIN {
    # This script loads Encode, which is bundled with perl 5.8 and later;
    # on older perls skip the whole test instead of dying at compile time.
    if ($] < 5.008) {
        print "1..0 # skipped: This perl does not support Unicode\n";
        exit;
    }
}

use warnings;
use strict;

use Encode qw( is_utf8 decode );
use HTML::Parser;

print "1..2\n";

# Decode the two UTF-8 bytes of e-acute into a character string, then wrap
# it in markup; the resulting document string is UTF8-flagged.
my $utf8_string = decode('utf8', "\x{c3}\x{a9}");    # e-acute
$utf8_string = "<title>$utf8_string</title>";

# Test 1: the input string is UTF8 at the moment.
print "not " unless is_utf8($utf8_string);
print "ok 1\n";

# Collect every chunk reported by the text handler so that exactly one
# "ok 2" line is printed no matter how many text events are fired.
my @text_chunks;

my $parser = HTML::Parser->new;
$parser->handler(
    text => sub {
        # The text is the second argument; the other arguments are unused.
        my (undef, $text, undef) = @_;
        push @text_chunks, $text;
    }
);

$parser->parse($utf8_string);
$parser->eof;    # signal end of document so any buffered text is reported

# Test 2: we expect the text parsed out of the HTML to still be UTF8.
# Seeing no text event at all is a failure as well.
print "not " if !@text_chunks || grep { !is_utf8($_) } @text_chunks;
print "ok 2\n";
Subject: Revised fix
From: jgmyers@proofpoint.com
The previous patch had an uninitialized variable which would in some situations cause the result to be gratuitously upgraded to utf8.
Only in HTML-Parser-3.36: blib Only in HTML-Parser-3.36: hctype.h diff -ur HTML-Parser-3.36/hparser.c HTML-Parser-3.36-utf8/hparser.c --- HTML-Parser-3.36/hparser.c 2004-04-01 03:56:37.000000000 -0800 +++ HTML-Parser-3.36-utf8/hparser.c 2004-11-02 10:46:04.000000000 -0800 @@ -111,7 +111,7 @@ static void report_event(PSTATE* p_state, event_id_t event, - char *beg, char *end, + char *beg, char *end, U32 utf8, token_pos_t *tokens, int num_tokens, SV* self ) @@ -196,10 +196,13 @@ if (event == E_START || event == E_END) { SV* tagname = p_state->tmp; - U32 hash; assert(num_tokens >= 1); sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); + if (utf8) + SvUTF8_on(tagname); + else + SvUTF8_off(tagname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ tagname); @@ -215,10 +218,8 @@ goto IGNORE_EVENT; } - PERL_HASH(hash, SvPVX(tagname), SvCUR(tagname)); - if (p_state->ignore_elements && - hv_fetch_ent(p_state->ignore_elements, tagname, 0, hash)) + hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) { p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; @@ -226,12 +227,12 @@ } if (p_state->ignore_tags && - hv_fetch_ent(p_state->ignore_tags, tagname, 0, hash)) + hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } if (p_state->report_tags && - !hv_fetch_ent(p_state->report_tags, tagname, 0, hash)) + !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } @@ -272,8 +273,18 @@ p_state->pend_text_column = column; p_state->pend_text_is_cdata = p_state->is_cdata; sv_setpvn(p_state->pend_text, "", 0); + if (!utf8) + SvUTF8_off(p_state->pend_text); + } + if (utf8 && !SvUTF8(p_state->pend_text)) + sv_utf8_upgrade(p_state->pend_text); + if (utf8 || !SvUTF8(p_state->pend_text)) { + sv_catpvn(p_state->pend_text, beg, end - beg); + } + else { + SV *tmp = NULL; + sv_catpvn_utf8_upgrade(p_state->pend_text, beg, end - beg, tmp); } - sv_catpvn(p_state->pend_text, beg, end - beg); return; } else if (p_state->pend_text && 
SvOK(p_state->pend_text)) { @@ -327,6 +338,8 @@ for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(prev_token); av_push(av, prev_token); } else { /* boolean */ @@ -366,6 +379,8 @@ if (num_tokens >= 1) { arg = sv_2mortal(newSVpvn(tokens[0].beg, tokens[0].end - tokens[0].beg)); + if (utf8) + SvUTF8_on(arg); if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) sv_lower(aTHX_ arg); if (argcode == ARG_TAG && event != E_START) { @@ -392,6 +407,8 @@ tokens[i].end-tokens[i].beg); SV* attrval; + if (utf8) + SvUTF8_on(attrname); if (tokens[i+1].beg) { char *beg = tokens[i+1].beg; STRLEN len = tokens[i+1].end - beg; @@ -400,6 +417,8 @@ beg++; len -= 2; } attrval = newSVpvn(beg, len); + if (utf8) + SvUTF8_on(attrval); if (!p_state->attr_encoded) decode_entities(aTHX_ attrval, p_state->entity2char); } @@ -414,10 +433,8 @@ sv_lower(aTHX_ attrname); if (argcode == ARG_ATTR) { - U32 hash; - PERL_HASH(hash, SvPVX(attrname), SvCUR(attrname)); - if (hv_exists_ent(hv, attrname, hash) || - !hv_store_ent(hv, attrname, attrval, hash)) { + if (hv_exists_ent(hv, attrname, 0) || + !hv_store_ent(hv, attrname, attrval, 0)) { SvREFCNT_dec(attrval); } SvREFCNT_dec(attrname); @@ -446,6 +463,8 @@ for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(attrname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); av_push(av, attrname); @@ -456,11 +475,15 @@ case ARG_TEXT: arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); break; case ARG_DTEXT: if (event == E_TEXT) { arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); if (!p_state->is_cdata) decode_entities(aTHX_ arg, p_state->entity2char); } @@ -475,6 +498,7 @@ case ARG_SKIPPED_TEXT: arg = sv_2mortal(p_state->skipped_text); p_state->skipped_text = newSVpvn("", 0); + SvUTF8_off(p_state->skipped_text); break; case ARG_OFFSET: @@ 
-506,6 +530,8 @@ { int len = (unsigned char)s[1]; arg = sv_2mortal(newSVpvn(s+2, len)); + if (SvUTF8(h->argspec)) + SvUTF8_on(arg); s += len + 1; } break; @@ -565,7 +591,15 @@ if (p_state->skipped_text) { if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); - sv_catpvn(p_state->skipped_text, beg, end - beg); + if (utf8 && !SvUTF8(p_state->skipped_text)) + sv_utf8_upgrade(p_state->skipped_text); + if (utf8 || !SvUTF8(p_state->skipped_text)) { + sv_catpvn(p_state->skipped_text, beg, end - beg); + } + else { + SV *tmp = NULL; + sv_catpvn_utf8_upgrade(p_state->skipped_text, beg, end - beg, tmp); + } } return; } @@ -580,6 +614,9 @@ char *s = SvPV(src, len); char *end = s + len; + if (SvUTF8(src)) + SvUTF8_on(argspec); + while (isHSPACE(*s)) s++; @@ -708,8 +745,8 @@ p_state->column = p_state->pend_text_column; report_event(p_state, E_TEXT, - SvPVX(old_pend_text), SvEND(old_pend_text), - 0, 0, self); + SvPVX(old_pend_text), SvEND(old_pend_text), + SvUTF8(old_pend_text), 0, 0, self); SvOK_off(old_pend_text); p_state->unbroken_text = old_unbroken_text; @@ -744,7 +781,7 @@ } static char* -parse_comment(PSTATE* p_state, char *beg, char *end, SV* self) +parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; @@ -771,7 +808,7 @@ /* we are done recognizing all comments, make callbacks */ report_event(p_state, E_COMMENT, - beg - 4, s, + beg - 4, s, utf8, tokens, num_tokens, self); FREE_TOKENS; @@ -808,7 +845,7 @@ token.end = s; if (s < end) { s++; - report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } else { @@ -832,7 +869,7 @@ if (*s == '>') { s++; /* yup */ - report_event(p_state, E_COMMENT, beg-4, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } } @@ -900,7 +937,7 @@ static char* -parse_marked_section(PSTATE* p_state, char *beg, char *end, SV* self) 
+parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { dTHX; char *s = beg; @@ -915,6 +952,7 @@ while (isHNAME_FIRST(*s)) { char *name_start = s; char *name_end; + SV *name; s++; while (isHNAME_CHAR(*s)) s++; @@ -926,8 +964,10 @@ if (!tokens) tokens = newAV(); - av_push(tokens, sv_lower(aTHX_ newSVpvn(name_start, - name_end - name_start))); + name = newSVpvn(name_start, name_end - name_start); + if (utf8) + SvUTF8_on(name); + av_push(tokens, sv_lower(aTHX_ name)); } if (*s == '-') { s++; @@ -965,7 +1005,7 @@ p_state->ms_stack = newAV(); av_push(p_state->ms_stack, newRV_noinc((SV*)tokens)); marked_section_update(p_state); - report_event(p_state, E_NONE, beg, s, 0, 0, self); + report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self); return s; } @@ -981,7 +1021,7 @@ static char* -parse_decl(PSTATE* p_state, char *beg, char *end, SV* self) +parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; @@ -999,7 +1039,7 @@ /* yes, two dashes seen */ s++; - tmp = parse_comment(p_state, s, end, self); + tmp = parse_comment(p_state, s, end, utf8, self); return (tmp == s) ? beg : tmp; } @@ -1008,7 +1048,7 @@ /* marked section */ char *tmp; s++; - tmp = parse_marked_section(p_state, s, end, self); + tmp = parse_marked_section(p_state, s, end, utf8, self); if (!tmp) goto DECL_FAIL; return (tmp == s) ? 
beg : tmp; @@ -1021,7 +1061,7 @@ token.beg = s; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } @@ -1112,7 +1152,7 @@ goto PREMATURE; if (*s == '>') { s++; - report_event(p_state, E_DECLARATION, beg, s, tokens, num_tokens, self); + report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self); FREE_TOKENS; return s; } @@ -1138,7 +1178,7 @@ token.beg = beg + 2; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { @@ -1148,7 +1188,7 @@ static char* -parse_start(PSTATE* p_state, char *beg, char *end, SV* self) +parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; int empty_tag = 0; /* XML feature */ @@ -1249,9 +1289,9 @@ if (*s == '>') { s++; /* done */ - report_event(p_state, E_START, beg, s, tokens, num_tokens, self); + report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self); if (empty_tag) - report_event(p_state, E_END, s, s, tokens, 1, self); + report_event(p_state, E_END, s, s, utf8, tokens, 1, self); if (!p_state->xml_mode) { /* find out if this start tag should put us into literal_mode @@ -1298,7 +1338,7 @@ static char* -parse_end(PSTATE* p_state, char *beg, char *end, SV* self) +parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg+2; hctype_t name_first, name_char; @@ -1330,7 +1370,7 @@ if (*s == '>') { s++; /* a complete end tag has been recognized */ - report_event(p_state, E_END, beg, s, &tagname, 1, self); + report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self); return s; } } @@ -1345,7 +1385,7 @@ token.beg = beg + 2; token.end = s; s++; - report_event(p_state, E_COMMENT, beg, s, &token, 1, self); + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { @@ -1357,7 +1397,7 @@ static char* 
-parse_process(PSTATE* p_state, char *beg, char *end, SV* self) +parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; /* skip '<?' */ /* processing instruction */ @@ -1377,7 +1417,7 @@ } /* a complete processing instruction seen */ - report_event(p_state, E_PROCESS, beg, s, + report_event(p_state, E_PROCESS, beg, s, utf8, &token_pos, 1, self); return s; } @@ -1389,7 +1429,7 @@ #ifdef USE_PFUNC static char* -parse_null(PSTATE* p_state, char *beg, char *end, SV* self) +parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { return 0; } @@ -1400,7 +1440,7 @@ #endif /* USE_PFUNC */ static char* -parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, SV* self) +parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; char *t = beg; @@ -1447,9 +1487,9 @@ if (*s == '>') { s++; if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_END, end_text, s, + report_event(p_state, E_END, end_text, s, utf8, &end_token, 1, self); p_state->literal_mode = 0; p_state->is_cdata = 0; @@ -1472,9 +1512,9 @@ s++; /* marked section end */ if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_NONE, end_text, s, 0, 0, self); + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); marked_section_update(p_state); @@ -1499,9 +1539,9 @@ s++; if (*s == '>') { s++; - report_event(p_state, E_TEXT, t, end_text, + report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); - report_event(p_state, E_NONE, end_text, s, + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); @@ -1515,7 +1555,7 @@ } if (s != t) { if (*s == '<') { - report_event(p_state, E_TEXT, t, s, 0, 0, self); + report_event(p_state, 
E_TEXT, t, s, utf8, 0, 0, self); t = s; } else { @@ -1534,7 +1574,7 @@ } s++; if (s != t) - report_event(p_state, E_TEXT, t, s, 0, 0, self); + report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); break; } } @@ -1546,16 +1586,16 @@ s++; #ifdef USE_PFUNC - new_pos = parsefunc[(unsigned char)*s](p_state, t, end, self); + new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self); #else if (isHNAME_FIRST(*s)) - new_pos = parse_start(p_state, t, end, self); + new_pos = parse_start(p_state, t, end, utf8, self); else if (*s == '/') - new_pos = parse_end(p_state, t, end, self); + new_pos = parse_end(p_state, t, end, utf8, self); else if (*s == '!') - new_pos = parse_decl(p_state, t, end, self); + new_pos = parse_decl(p_state, t, end, utf8, self); else if (*s == '?') - new_pos = parse_process(p_state, t, end, self); + new_pos = parse_process(p_state, t, end, utf8, self); else new_pos = 0; #endif /* USE_PFUNC */ @@ -1587,6 +1627,7 @@ SV* self) { char *s, *beg, *end; + U32 utf8 = 0; STRLEN len; if (!chunk) { @@ -1601,14 +1642,14 @@ if (*s == '<') { /* try to parse with comments terminated with a plain '>' first */ p_state->no_dash_dash_comment_end = 1; - s = parse_buf(aTHX_ p_state, s, end, self); + s = parse_buf(aTHX_ p_state, s, end, SvUTF8(p_state->buf), self); } if (*s == '<') { /* some kind of unterminated markup. 
Report rest as as comment */ token_pos_t token; token.beg = s + 1; token.end = end; - report_event(p_state, E_COMMENT, s, end, &token, 1, self); + report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self); SvREFCNT_dec(p_state->buf); p_state->buf = 0; } @@ -1619,7 +1660,7 @@ else { /* report rest as text */ REST_IS_TEXT: - report_event(p_state, E_TEXT, s, end, 0, 0, self); + report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self); SvREFCNT_dec(p_state->buf); p_state->buf = 0; } @@ -1632,7 +1673,7 @@ SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } - report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, self); + report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); /* reset state */ p_state->offset = 0; @@ -1647,18 +1688,20 @@ if (p_state->buf && SvOK(p_state->buf)) { sv_catsv(p_state->buf, chunk); beg = SvPV(p_state->buf, len); + utf8 = SvUTF8(p_state->buf); } else { beg = SvPV(chunk, len); + utf8 = SvUTF8(chunk); if (p_state->offset == 0) - report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, self); + report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, 0, self); } if (!len) return; /* nothing to do */ end = beg + len; - s = parse_buf(aTHX_ p_state, beg, end, self); + s = parse_buf(aTHX_ p_state, beg, end, utf8, self); if (s == end || p_state->eof) { if (p_state->buf) { @@ -1669,13 +1712,21 @@ /* need to keep rest in buffer */ if (p_state->buf) { /* chop off some chars at the beginning */ - if (SvOK(p_state->buf)) + if (SvOK(p_state->buf)) { sv_chop(p_state->buf, s); - else + } + else { sv_setpvn(p_state->buf, s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); + else + SvUTF8_off(p_state->buf); + } } else { p_state->buf = newSVpv(s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); } } return; Only in HTML-Parser-3.36-utf8/: hparser.c~ Only in HTML-Parser-3.36: Makefile Only in HTML-Parser-3.36-utf8/: Makefile~ Only in HTML-Parser-3.36-utf8/: Makefile.old diff -ur HTML-Parser-3.36/mkpfunc 
HTML-Parser-3.36-utf8/mkpfunc --- HTML-Parser-3.36/mkpfunc 2001-03-29 23:11:00.000000000 -0800 +++ HTML-Parser-3.36-utf8/mkpfunc 2004-09-27 19:01:40.000000000 -0700 @@ -4,7 +4,7 @@ print "/* This file is autogenerated by $progname */\n"; -print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, SV* self);\n"; +print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n"; print "static PFUNC parsefunc[] = {\n"; for my $c (0..255) { Only in HTML-Parser-3.36: Parser.bs Only in HTML-Parser-3.36: Parser.c Only in HTML-Parser-3.36: Parser.o diff -ur HTML-Parser-3.36/Parser.pm HTML-Parser-3.36-utf8/Parser.pm --- HTML-Parser-3.36/Parser.pm 2004-04-01 04:05:52.000000000 -0800 +++ HTML-Parser-3.36-utf8/Parser.pm 2004-11-01 12:17:40.000000000 -0800 @@ -996,10 +996,6 @@ =head1 BUGS -Unicode strings are not parsed correctly. A workaround is to encode -them as UTF-8 before passing them to the HTML::Parser. The C<Encode> -module can do that. - The <style> and <script> sections do not end with the first "</", but need the complete corresponding end tag. MSIE avoids terminating a <script> section if the </script> occurs inside quotes. HTML::Parser Only in HTML-Parser-3.36: pfunc.h Only in HTML-Parser-3.36: pm_to_blib Only in HTML-Parser-3.36-utf8/t: parser.t~ diff -ur HTML-Parser-3.36/util.c HTML-Parser-3.36-utf8/util.c --- HTML-Parser-3.36/util.c 2003-08-15 07:38:37.000000000 -0700 +++ HTML-Parser-3.36-utf8/util.c 2004-09-27 19:01:40.000000000 -0700 @@ -185,6 +185,16 @@ t = SvPVX(sv) + len; } Safefree(ustr); + /* upgrade the part we have yet to process */ + old_len = len = end - s; + ustr = bytes_to_utf8(s, &len); + grow = len - old_len; + if (grow) { + grow_gap(aTHX_ sv, grow, &t, &s, &end); + s -= grow; + Copy(ustr, s, len, char); + } + Safefree(ustr); } SvUTF8_on(sv); }
I have now uploaded HTML-Parser-3.39_90 with the proposed patch in it. Please give it a spin.
From: jgmyers@proofpoint.com
Remove completed TODO item.
diff -ru HTML-Parser-3.3990-orig/TODO HTML-Parser-3.3990/TODO --- HTML-Parser-3.3990-orig/TODO 2003-08-15 09:47:03.000000000 -0700 +++ HTML-Parser-3.3990/TODO 2004-11-17 11:03:45.000000000 -0800 @@ -3,8 +3,6 @@ - limit the length of markup elements that never end. Perhaps by configurable limits on the length that markup can have and still be recongnized. Report stuff as 'text' when this happens? - - unicode support (when parsing Unicode strings the strings reported - in callbacks should also be Unicode strings). - remove 255 char limit on literal argspec strings - implement backslash escapes in literal argspec string - <![%app1;[...]]> (parameter entities) Only in HTML-Parser-3.3990: TODO~


This service runs on Request Tracker, is sponsored by The Perl Foundation, and is maintained by Best Practical Solutions.

Please report any issues with rt.cpan.org to rt-cpan-admin@bestpractical.com.