Skip to content

Commit

Permalink
Some language tidy-ups (#406)
Browse files Browse the repository at this point in the history
* fix spacing, tidy imports

isort to sort and tidy imports
remove unnecessary trailing whitespace
prefix strings containing backslash escapes with an r (raw string literal)

* update format strings to f-strings

Use flynt to process them
  • Loading branch information
marksmayo authored Aug 18, 2023
1 parent c6f4559 commit 46fb398
Show file tree
Hide file tree
Showing 35 changed files with 116 additions and 138 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@

import os
import sys

sys.path.insert(0, os.path.abspath('.'))

import trafilatura


# -- Project information -----------------------------------------------------

project = 'trafilatura'
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import re
from pathlib import Path

from setuptools import setup


Expand Down
9 changes: 1 addition & 8 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
# -*- coding: utf-8 -*-

#import os
import pytest # unittest?
import pytest # unittest?

#import trafilatura







3 changes: 0 additions & 3 deletions tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,17 @@
import re
import subprocess
import sys

from contextlib import redirect_stdout
from datetime import datetime
from unittest.mock import patch

import pytest

from courlan import UrlStore

from trafilatura import cli, cli_utils, settings, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.filters import LANGID_FLAG


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')

Expand Down
44 changes: 22 additions & 22 deletions tests/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time

from lxml import html # etree

#from lxml.html.clean import Cleaner
#HTML_CLEANER = Cleaner()

Expand All @@ -19,7 +20,6 @@
import html2text
import html_text
import justext

from boilerpy3 import extractors
from bs4 import BeautifulSoup
#from dragnet import extract_content #, extract_content_and_comments
Expand All @@ -31,20 +31,20 @@
from newsplease import NewsPlease
from readabilipy import simple_json_from_html_string
from readability import Document

from resiliparse.parse.encoding import detect_encoding, bytes_to_str
from resiliparse.parse.html import HTMLTree
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.encoding import bytes_to_str, detect_encoding
from resiliparse.parse.html import HTMLTree

from trafilatura import extract

try:
from trafilatura.core import baseline
except ImportError:
baseline = None
from trafilatura.utils import sanitize

from evaldata import EVAL_PAGES

from trafilatura.utils import sanitize

## TODO: time, best of 3

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
Expand Down Expand Up @@ -648,32 +648,32 @@ def calculate_scores(mydict):
print('html2text')
print(html2text_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(html2text_result)))
print("time diff.: %.2f" % (html2text_result['time'] / baseline_result['time']))
print(f"time diff.: {html2text_result['time'] / baseline_result['time']:.2f}")

print('html_text')
print(html_text_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(html_text_result)))
print("time diff.: %.2f" % (html_text_result['time'] / baseline_result['time']))
print(f"time diff.: {html_text_result['time'] / baseline_result['time']:.2f}")

print('inscriptis')
print(inscriptis_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(inscriptis_result)))
print("time diff.: %.2f" % (inscriptis_result['time'] / baseline_result['time']))
print(f"time diff.: {inscriptis_result['time'] / baseline_result['time']:.2f}")

print('justext')
print(justext_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(justext_result)))
print("time diff.: %.2f" % (justext_result['time'] / baseline_result['time']))
print(f"time diff.: {justext_result['time'] / baseline_result['time']:.2f}")

print('goose')
print(goose_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(goose_result)))
print("time diff.: %.2f" % (goose_result['time'] / baseline_result['time']))
print(f"time diff.: {goose_result['time'] / baseline_result['time']:.2f}")

print('newspaper')
print(newspaper_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(newspaper_result)))
print("time diff.: %.2f" % (newspaper_result['time'] / baseline_result['time']))
print(f"time diff.: {newspaper_result['time'] / baseline_result['time']:.2f}")

#print('dragnet')
#print(dragnet_result)
Expand All @@ -683,7 +683,7 @@ def calculate_scores(mydict):
print('boilerpipe')
print(boilerpipe_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(boilerpipe_result)))
print("time diff.: %.2f" % (boilerpipe_result['time'] / baseline_result['time']))
print(f"time diff.: {boilerpipe_result['time'] / baseline_result['time']:.2f}")

#print('jparser')
#print(jparser_result)
Expand All @@ -693,44 +693,44 @@ def calculate_scores(mydict):
print('newsplease')
print(newsplease_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(newsplease_result)))
print("time diff.: %.2f" % (newsplease_result['time'] / baseline_result['time']))
print(f"time diff.: {newsplease_result['time'] / baseline_result['time']:.2f}")

print('readability')
print(readability_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readability_result)))
print("time diff.: %.2f" % (readability_result['time'] / baseline_result['time']))
print(f"time diff.: {readability_result['time'] / baseline_result['time']:.2f}")

print('readabilipy')
print(readabilipy_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readabilipy_result)))
print("time diff.: %.2f" % (readabilipy_result['time'] / baseline_result['time']))
print(f"time diff.: {readabilipy_result['time'] / baseline_result['time']:.2f}")

print('resiliparse')
print(resiliparse_result)
print("time diff.: %.2f" % (resiliparse_result['time'] / baseline_result['time']))
print(f"time diff.: {resiliparse_result['time'] / baseline_result['time']:.2f}")
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(resiliparse_result)))

print('bs4')
print(bs4_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(bs4_result)))
print("time diff.: %.2f" % (bs4_result['time'] / baseline_result['time']))
print(f"time diff.: {bs4_result['time'] / baseline_result['time']:.2f}")

print('trafilatura')
print(trafilatura_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(trafilatura_result)))
print("time diff.: %.2f" % (trafilatura_result['time'] / baseline_result['time']))
print(f"time diff.: {trafilatura_result['time'] / baseline_result['time']:.2f}")

print('trafilatura + X')
print(trafilatura_fallback_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(trafilatura_fallback_result)))
print("time diff.: %.2f" % (trafilatura_fallback_result['time'] / baseline_result['time']))
print(f"time diff.: {trafilatura_fallback_result['time'] / baseline_result['time']:.2f}")

print('trafilatura precision')
print(trafilatura_precision)
print("time diff.: %.2f" % (trafilatura_precision['time'] / baseline_result['time']))
print(f"time diff.: {trafilatura_precision['time'] / baseline_result['time']:.2f}")
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(trafilatura_precision)))

print('trafilatura recall')
print(trafilatura_recall)
print("time diff.: %.2f" % (trafilatura_recall['time'] / baseline_result['time']))
print(f"time diff.: {trafilatura_recall['time'] / baseline_result['time']:.2f}")
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(trafilatura_recall)))
13 changes: 9 additions & 4 deletions tests/downloads_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,25 @@
brotli = None

import gzip

from time import sleep
from unittest.mock import Mock, patch

from courlan import UrlStore

from trafilatura.cli import parse_args
from trafilatura.cli_utils import download_queue_processing, url_processing_pipeline
from trafilatura.cli_utils import (download_queue_processing,
url_processing_pipeline)
from trafilatura.core import extract
from trafilatura.downloads import DEFAULT_HEADERS, USER_AGENT, add_to_compressed_dict, fetch_url, is_live_page, load_download_buffer, _determine_headers, _handle_response, _parse_config, _pycurl_is_live_page, _send_request, _send_pycurl_request, _urllib3_is_live_page
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
_determine_headers, _handle_response,
_parse_config, _pycurl_is_live_page,
_send_pycurl_request, _send_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

ZERO_CONFIG = DEFAULT_CONFIG
Expand Down
6 changes: 3 additions & 3 deletions tests/eval_authors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import time

from evaldata import EVAL_PAGES

from trafilatura import bare_extraction

TEST_DIR = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -51,7 +52,6 @@ def run_trafilatura(htmlstring):
if 'author' in result and result['author'] == author_gold:
correct += 1

print('exec. time:', '%.2f' % (time.time() - start))
print('exec. time:', f'{time.time() - start:.2f}')
print('total, correct, percentage:')
print(i, correct, '%.2f' % ((correct/i)*100))

print(i, correct, f'{correct / i * 100:.2f}')
2 changes: 0 additions & 2 deletions tests/feeds_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
import logging
import os
import sys

from unittest.mock import patch

from trafilatura import cli, feeds


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

TEST_DIR = os.path.abspath(os.path.dirname(__file__))
Expand Down
8 changes: 3 additions & 5 deletions tests/filters_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,15 @@

from lxml import etree, html

from trafilatura import extract

import trafilatura.filters

from trafilatura import extract
from trafilatura.core import Extractor
from trafilatura.filters import check_html_lang, duplicate_test, language_filter
from trafilatura.filters import (check_html_lang, duplicate_test,
language_filter)
from trafilatura.lru import LRUCache
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG


ZERO_CONFIG = DEFAULT_CONFIG
ZERO_CONFIG['DEFAULT']['MIN_OUTPUT_SIZE'] = '0'
ZERO_CONFIG['DEFAULT']['MIN_EXTRACTED_SIZE'] = '0'
Expand Down
4 changes: 2 additions & 2 deletions tests/hashing_tests.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@


from trafilatura.hashing import Simhash, content_fingerprint, generate_hash_filename

from trafilatura.hashing import (Simhash, content_fingerprint,
generate_hash_filename)


def test_hashes():
Expand Down
8 changes: 4 additions & 4 deletions tests/json_metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import sys

from lxml import html
from trafilatura.metadata import extract_metadata, extract_meta_json, Document

from trafilatura.metadata import Document, extract_meta_json, extract_metadata

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

Expand Down Expand Up @@ -697,7 +697,7 @@ def test_json_extraction():
assert metadata is not None and metadata.title == 'Apple Spring Forward Event Live Blog' and metadata.pagetype == 'liveblogposting'

metadata = Document()
metadata = extract_meta_json(html.fromstring('''
metadata = extract_meta_json(html.fromstring(r'''
<html><body>
<script type="application/ld+json">
{
Expand Down Expand Up @@ -749,7 +749,7 @@ def test_json_extraction():

metadata = Document()
metadata.sitename = "https://bbcnews.com"
metadata = extract_meta_json(html.fromstring('''
metadata = extract_meta_json(html.fromstring(r'''
<html><body>
<script type="application/ld+json">
{
Expand Down Expand Up @@ -900,7 +900,7 @@ def test_json_extraction():

metadata = Document()
metadata = extract_meta_json(html.fromstring(
"""
r"""
<html>
<body>
<script type="application/ld+json">
Expand Down
5 changes: 3 additions & 2 deletions tests/metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from lxml import html

from trafilatura.json_metadata import normalize_json
from trafilatura.metadata import check_authors, extract_metadata, extract_meta_json, extract_url
from trafilatura.metadata import (check_authors, extract_meta_json,
extract_metadata, extract_url)
from trafilatura.utils import normalize_authors

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
Expand All @@ -25,7 +26,7 @@ def test_titles():
# too short/empty
metadata = extract_metadata('<html><body><h3 class="title">T</h3><h3 id="title"></h3></body></html>')
assert metadata.title is None

metadata = extract_metadata('<html><head><title>Test Title</title></head><body></body></html>')
assert metadata.title == 'Test Title'
metadata = extract_metadata('<html><body><h1>First</h1><h1>Second</h1></body></html>')
Expand Down
6 changes: 3 additions & 3 deletions tests/realworld_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import sys

import pytest

# https://docs.pytest.org/en/latest/


Expand All @@ -21,7 +22,6 @@
from trafilatura import extract
from trafilatura.metadata import extract_metadata


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


Expand Down Expand Up @@ -394,7 +394,7 @@ def test_extract(xmloutput):
assert 'Brazil went dark.' in result and 'the highest number of deforestation warnings.”' in result and 'Tagged:' not in result and 'to the VICE newsletter.' not in result and 'Watch this next' not in result

result = load_mock_page('https://www.heise.de/newsticker/meldung/Lithium-aus-dem-Schredder-4451133.html', xmloutput)
assert 'Die Ökobilanz von Elektroautos' in result and 'Nur die Folie bleibt zurück' in result and 'Forum zum Thema:' not in result # and 'Highlights aus dem Heft:' not in result and 'TR 7/2019' not in result
assert 'Die Ökobilanz von Elektroautos' in result and 'Nur die Folie bleibt zurück' in result and 'Forum zum Thema:' not in result # and 'Highlights aus dem Heft:' not in result and 'TR 7/2019' not in result

result = load_mock_page('https://www.theverge.com/2019/7/3/20680681/ios-13-beta-3-facetime-attention-correction-eye-contact', xmloutput)
assert 'Normally, video calls tend to' in result and 'across both the eyes and nose.' in result and 'Added ARKit explanation and tweet.' in result and 'Singapore’s public health program' not in result and 'Command Line delivers daily updates' not in result
Expand Down Expand Up @@ -708,7 +708,7 @@ def test_pages():
url = "https://www.mercurynews.com/2023/01/16/letters-1119/"
metadata = extract_metadata(load_mock_page(url, xml_flag=True))


if __name__ == '__main__':
test_extract(False)
test_extract(True)
Expand Down
1 change: 1 addition & 0 deletions tests/sitemaps_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import sys

from courlan import get_hostinfo

from trafilatura import sitemaps
from trafilatura.utils import decode_response, is_similar_domain

Expand Down
Loading

0 comments on commit 46fb398

Please sign in to comment.