# Terminator of a processing instruction.
piclose = re.compile('>')
# Terminator of a comment; whitespace between "--" and ">" is tolerated.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatetagend too;
#  2) if you change tagfind/attrfind and/or locatetagend the parser will
#     explode, so don't do it.
# see the HTML5 specs section "13.2.5.6 Tag open state",
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
#
# The explicit class [\t\n\r\f ] is used instead of \s because, for str
# patterns, \s also matches Unicode whitespace that is not a separator in
# the HTML tokenizer.

# Group 1 is the tag name; separators following the name (whitespace, or a
# "/" that is not part of "/>") are consumed as well.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')

# Matches one attribute.  Group 1 is the name, group 2 the "=" together with
# the value (None when the attribute has no value), group 3 the value itself,
# still including its quotes if it was quoted.  Whitespace on either side of
# "=" is skipped, as the tokenizer does in its "After attribute name" and
# "Before attribute value" states.
attrfind_tolerant = re.compile(r"""
  (
    (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
   )
  ([\t\n\r\f ]*=[\t\n\r\f ]*        # value indicator
    ('[^']*'                        # LITA-enclosed value
    |"[^"]*"                        # LIT-enclosed value
    |(?!['"])[^>\t\n\r\f ]*         # bare value
    )
   )?
  (?:[\t\n\r\f ]|/(?!>))*           # possibly followed by a space
""", re.VERBOSE)

# Matches from the first letter of a tag name up to and including the ">"
# that closes the tag.  The final ">" is optional so that the pattern also
# matches a tag that is not terminated yet: callers tell the two cases apart
# by looking at the last matched character.  A ">" inside a quoted attribute
# value does not end the tag.
locatetagend = re.compile(r"""
  [a-zA-Z][^\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                     # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:[\t\n\r\f ]*=[\t\n\r\f ]*    # value indicator
      (?:'[^']*'                    # LITA-enclosed value
        |"[^"]*"                    # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*     # bare value
      )
    )?
    [\t\n\r\f /]*                   # possibly followed by a space
  )*
  >?
""", re.VERBOSE)
69+ # The following variables are not used, but are temporarily left for
70+ # backward compatibility.
4371locatestarttagend_tolerant = re .compile (r"""
4472 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4573 (?:[\s/]* # optional whitespace before attribute name
5684 \s* # trailing whitespace
5785""" , re .VERBOSE )
5886endendtag = re .compile ('>' )
59- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
60- # </ and the tag name, so maybe this should be fixed
6187endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
6288
6389# Character reference processing logic specific to attribute values
@@ -141,7 +167,8 @@ def get_starttag_text(self):
141167
def set_cdata_mode(self, elem):
    """Switch the parser into CDATA mode for element *elem*.

    Stores the lower-cased element name in ``self.cdata_elem`` and replaces
    ``self.interesting`` with a pattern that only matches the start of the
    corresponding end tag: ``</`` + name, followed by whitespace, ``/`` or
    ``>`` (checked with a lookahead, so that character is not consumed).
    """
    self.cdata_elem = elem.lower()
    # Escape the name so that characters such as "." or "+" in an element
    # name are matched literally instead of being treated as regex syntax.
    # re.ASCII keeps IGNORECASE from matching non-ASCII letters against
    # ASCII ones.
    self.interesting = re.compile(
        r'</%s(?=[\t\n\r\f />])' % re.escape(self.cdata_elem),
        re.IGNORECASE | re.ASCII)
145172
146173 def clear_cdata_mode (self ):
147174 self .interesting = interesting_normal
@@ -166,7 +193,7 @@ def goahead(self, end):
166193 # & near the end and see if it's followed by a space or ;.
167194 amppos = rawdata .rfind ('&' , max (i , n - 34 ))
168195 if (amppos >= 0 and
169- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
196+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
170197 break # wait till we get all the text
171198 j = n
172199 else :
@@ -310,7 +337,7 @@ def parse_html_declaration(self, i):
310337 return self .parse_bogus_comment (i )
311338
312339 # Internal -- parse bogus comment, return length or -1 if not terminated
313- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
314341 def parse_bogus_comment (self , i , report = 1 ):
315342 rawdata = self .rawdata
316343 assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -336,6 +363,8 @@ def parse_pi(self, i):
336363
337364 # Internal -- handle starttag, return end or -1 if not terminated
338365 def parse_starttag (self , i ):
366+ # See the HTML5 specs section "13.2.5.8 Tag name state"
367+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
339368 self .__starttag_text = None
340369 endpos = self .check_for_whole_start_tag (i )
341370 if endpos < 0 :
@@ -381,76 +410,42 @@ def parse_starttag(self, i):
381410 # or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Return the end of the start tag that begins at ``rawdata[i]``.

    The tag is scanned from ``i + 1`` (the first character of the tag name)
    with ``locatetagend``.  Returns the index just past the closing ``>``,
    or -1 when the buffered data does not contain the end of the tag yet.
    """
    rawdata = self.rawdata
    match = locatetagend.match(rawdata, i+1)
    # The pattern only requires a letter at i + 1; the caller is expected
    # to have checked that before calling.
    assert match
    j = match.end()
    # The trailing ">" is optional in the pattern, so a match that does not
    # end with ">" means the tag is still incomplete.
    if rawdata[j-1] != ">":
        return -1
    return j
414419
415420 # Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
    """Handle the end tag starting at ``rawdata[i]`` (which must be ``</``).

    Returns the index just past the construct that was consumed, or -1 if
    the buffered data does not contain a complete end tag yet.

    * ``</>`` is skipped without calling any handler.
    * ``</`` not followed by a letter is passed to ``parse_bogus_comment``.
    * Otherwise ``handle_endtag`` is called with the lower-cased tag name and
      CDATA mode is cleared.
    """
    # See the HTML5 specs section "13.2.5.7 End tag open state"
    # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
    rawdata = self.rawdata
    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    if rawdata.find('>', i+2) < 0:  # fast check
        return -1
    if not endtagopen.match(rawdata, i):  # </ + letter
        if rawdata[i+2:i+3] == '>':  # </> is ignored
            # "missing-end-tag-name" parser error
            return i+3
        else:
            return self.parse_bogus_comment(i)

    # Locate the real end of the tag; a ">" inside a quoted attribute value
    # does not terminate it.
    match = locatetagend.match(rawdata, i+2)
    assert match
    j = match.end()
    if rawdata[j-1] != ">":
        return -1

    # find the name: "13.2.5.8 Tag name state"
    # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
    match = tagfind_tolerant.match(rawdata, i+2)
    assert match
    tag = match.group(1).lower()
    self.handle_endtag(tag)
    self.clear_cdata_mode()
    return j
454449
455450 # Overridable -- finish processing of start+end tag: <tag.../>
456451 def handle_startendtag (self , tag , attrs ):
0 commit comments