@@ -381,13 +381,13 @@ def class_weight(self, e):
381381 def score_node (self , elem ):
382382 content_score = self .class_weight (elem )
383383 name = elem .tag .lower ()
384- if name == "div" :
384+ if name in [ "div" , "article" ] :
385385 content_score += 5
386386 elif name in ["pre" , "td" , "blockquote" ]:
387387 content_score += 3
388- elif name in ["address" , "ol" , "ul" , "dl" , "dd" , "dt" , "li" , "form" ]:
388+ elif name in ["address" , "ol" , "ul" , "dl" , "dd" , "dt" , "li" , "form" , "aside" ]:
389389 content_score -= 3
390- elif name in ["h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "th" ]:
390+ elif name in ["h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "th" , "header" , "footer" , "nav" ]:
391391 content_score -= 5
392392 return {
393393 'content_score' : content_score ,
@@ -463,7 +463,7 @@ def sanitize(self, node, candidates):
463463
464464 allowed = {}
465465 # Conditionally clean <table>s, <ul>s, and <div>s
466- for el in self .reverse_tags (node , "table" , "ul" , "div" ):
466+ for el in self .reverse_tags (node , "table" , "ul" , "div" , "aside" , "header" , "footer" , "section" ):
467467 if el in allowed :
468468 continue
469469 weight = self .class_weight (el )
@@ -577,7 +577,7 @@ def sanitize(self, node, candidates):
577577 if siblings and sum (siblings ) > 1000 :
578578 to_remove = False
579579 log .debug ("Allowing %s" % describe (el ))
580- for desnode in self .tags (el , "table" , "ul" , "div" ):
580+ for desnode in self .tags (el , "table" , "ul" , "div" , "section" ):
581581 allowed [desnode ] = True
582582
583583 if to_remove :
0 commit comments