@@ -182,10 +182,10 @@ def summary(self, html_partial=False):
182
182
if ruthless :
183
183
self .remove_unlikely_candidates ()
184
184
self .transform_misused_divs_into_paragraphs ()
185
+
185
186
candidates = self .score_paragraphs ()
186
187
187
188
best_candidate = self .select_best_candidate (candidates )
188
-
189
189
if best_candidate :
190
190
article = self .get_article (candidates , best_candidate ,
191
191
html_partial = html_partial )
@@ -381,13 +381,13 @@ def class_weight(self, e):
381
381
def score_node (self , elem ):
382
382
content_score = self .class_weight (elem )
383
383
name = elem .tag .lower ()
384
- if name == "div" :
384
+ if name in [ "div" , "article" ] :
385
385
content_score += 5
386
386
elif name in ["pre" , "td" , "blockquote" ]:
387
387
content_score += 3
388
- elif name in ["address" , "ol" , "ul" , "dl" , "dd" , "dt" , "li" , "form" ]:
388
+ elif name in ["address" , "ol" , "ul" , "dl" , "dd" , "dt" , "li" , "form" , "aside" ]:
389
389
content_score -= 3
390
- elif name in ["h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "th" ]:
390
+ elif name in ["h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "th" , "header" , "footer" , "nav" ]:
391
391
content_score -= 5
392
392
return {
393
393
'content_score' : content_score ,
@@ -400,8 +400,10 @@ def remove_unlikely_candidates(self):
400
400
if len (s ) < 2 :
401
401
continue
402
402
if REGEXES ['unlikelyCandidatesRe' ].search (s ) and (not REGEXES ['okMaybeItsACandidateRe' ].search (s )) and elem .tag not in ['html' , 'body' ]:
403
+ #print("Removing", describe(elem))
403
404
log .debug ("Removing unlikely candidate - %s" % describe (elem ))
404
405
elem .drop_tree ()
406
+ #print("After removal: {}".format(tostring(self.html)))
405
407
406
408
def transform_misused_divs_into_paragraphs (self ):
407
409
for elem in self .tags (self .html , 'div' ):
@@ -463,7 +465,7 @@ def sanitize(self, node, candidates):
463
465
464
466
allowed = {}
465
467
# Conditionally clean <table>s, <ul>s, and <div>s
466
- for el in self .reverse_tags (node , "table" , "ul" , "div" ):
468
+ for el in self .reverse_tags (node , "table" , "ul" , "div" , "aside" , "header" , "footer" , "section" ):
467
469
if el in allowed :
468
470
continue
469
471
weight = self .class_weight (el )
@@ -577,7 +579,7 @@ def sanitize(self, node, candidates):
577
579
if siblings and sum (siblings ) > 1000 :
578
580
to_remove = False
579
581
log .debug ("Allowing %s" % describe (el ))
580
- for desnode in self .tags (el , "table" , "ul" , "div" ):
582
+ for desnode in self .tags (el , "table" , "ul" , "div" , "section" ):
581
583
allowed [desnode ] = True
582
584
583
585
if to_remove :
0 commit comments