From a70a9ce0ae7385cebd07a99f6cb0adca0ba874f1 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 1 Sep 2023 13:53:09 +0200 Subject: [PATCH 1/6] prepare v1.6.2 --- .github/workflows/tests.yml | 2 +- HISTORY.md | 23 +++++++++++++++++++++++ setup.py | 9 +++++---- trafilatura/__init__.py | 2 +- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cb2bb177..a4578e37 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: [3.8, "3.11"] # "3.12-dev" + python-version: [3.8, "3.11", "3.12-dev"] env: [{ MINIMAL: "true" }, { MINIMAL: "false" }] include: # custom python versions diff --git a/HISTORY.md b/HISTORY.md index de5512c4..d8b3df21 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,29 @@ ## History / Changelog +### 1.6.2 + +Extraction: +- more lenient HTML parsing (#370) +- improved code block support by @idoshamun (#372) +- convertion of relative links to absolute by @feltcat (#377) +- remove use of signal from core functions (#384) + +Metadata: +- JSON-LD fix for sitenames by @felipehertzer (#383) + +Command-line interface: +- more robust batch processing (#381) +- added `--probe` option to CLI to check for extractable content (#378, #392) + +Maintenance: +- simplified code (#408) +- support for Python 3.12 +- pinned LXML version for MacOS (#393) +- updated dependencies and parameters +- code cleaning by @marksmayo in #406 + + ### 1.6.1 Extraction: diff --git a/setup.py b/setup.py index 63e84866..38cf215f 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_long_description(): "brotli", "cchardet >= 2.1.7; python_version < '3.11'", # build issue "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build - "htmldate[speed] >= 1.4.3", + "htmldate[speed] >= 1.5.0", "py3langid >= 0.2.2", "pycurl >= 7.45.2", ], @@ -66,6 +66,7 @@ def get_long_description(): "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Internet :: WWW/HTTP", "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Security", @@ -109,9 +110,9 @@ def get_long_description(): install_requires=[ "certifi", "charset_normalizer >= 3.0.1; python_version < '3.7'", - "charset_normalizer >= 3.1.0; python_version >= '3.7'", - "courlan @ git+https://github.com/adbar/courlan", - "htmldate >= 1.4.3", + "charset_normalizer >= 3.2.0; python_version >= '3.7'", + "courlan @ git+https://github.com/adbar/courlan", # TODO: >= 0.9.4 + "htmldate >= 1.5.0", "justext >= 3.0.0", "lxml >= 4.9.3 ; platform_system != 'Darwin'", "lxml == 4.9.2 ; platform_system == 'Darwin'", diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index 69ed4f97..8ad21d90 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Adrien Barbaresi and contributors' __license__ = 'GNU GPL v3+' __copyright__ = 'Copyright 2019-2023, Adrien Barbaresi' -__version__ = '1.6.1' +__version__ = '1.6.2' import logging From 89eae324551b35dcdf0a1d9eb04f35c69e326eb8 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 1 Sep 2023 14:06:47 +0200 Subject: [PATCH 2/6] setup: adjust test versions --- .github/workflows/tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a4578e37..b9261923 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,12 +17,14 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: [3.8, "3.11", "3.12-dev"] + python-version: ["3.10", "3.11"] env: [{ MINIMAL: "true" }, { MINIMAL: "false" }] include: # custom python versions - os: ubuntu-20.04 python-version: 3.6 + - os: ubuntu-20.04 + python-version: 3.7 - os: macos-latest python-version: 3.8 - os: windows-latest @@ -30,7 +32,7 @@ jobs: - os: ubuntu-latest python-version: 3.9 - os: ubuntu-latest - python-version: "3.10" + python-version: "3.12-dev" steps: # Python and pip setup - name: Set up Python ${{ matrix.python-version }} From e7d52e6aa0c658bc69961312dbfcab69872a3734 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 5 Sep 2023 13:50:46 +0200 Subject: [PATCH 3/6] fix tests --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b9261923..f15efab9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.10", "3.11"] + python-version: ["3.9", "3.11"] env: [{ MINIMAL: "true" }, { MINIMAL: "false" }] include: # custom python versions @@ -30,7 +30,7 @@ jobs: - os: windows-latest python-version: 3.8 - os: ubuntu-latest - python-version: 3.9 + python-version: 3.10 - os: ubuntu-latest python-version: "3.12-dev" steps: From 26e1ca2fd8036d41868997aee7ec5f2013ce3ccd Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 5 Sep 2023 14:07:44 +0200 Subject: [PATCH 4/6] fix tests --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f15efab9..bddf5ee3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,7 +30,7 @@ jobs: - os: windows-latest python-version: 3.8 - os: ubuntu-latest - python-version: 3.10 + python-version: "3.10" - os: ubuntu-latest python-version: "3.12-dev" steps: From f91146c876549f673afabaa2c28f6328021623c2 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 5 Sep 2023 16:41:07 +0200 Subject: [PATCH 5/6] update changes --- HISTORY.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d8b3df21..d696a144 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,7 +5,7 @@ Extraction: - more lenient HTML parsing (#370) -- improved code block support by @idoshamun (#372) +- improved code block support with @idoshamun (#372, #401) - convertion of relative links to absolute by @feltcat (#377) - remove use of signal from core functions (#384) @@ -20,8 +20,8 @@ Maintenance: - simplified code (#408) - support for Python 3.12 - pinned LXML version for MacOS (#393) -- updated dependencies and parameters -- code cleaning by @marksmayo in #406 +- updated dependencies and parameters (notably `htmldate` and `courlan`) +- code cleaning by @marksmayo (#406) ### 1.6.1 diff --git a/setup.py b/setup.py index 38cf215f..986f81df 100644 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ def get_long_description(): "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.2.0; python_version >= '3.7'", "courlan @ git+https://github.com/adbar/courlan", # TODO: >= 0.9.4 - "htmldate >= 1.5.0", + "htmldate >= 1.5.1", "justext >= 3.0.0", "lxml >= 4.9.3 ; platform_system != 'Darwin'", "lxml == 4.9.2 ; platform_system == 'Darwin'", From 4e565969421b55534931dafad0d5d95ea88dcbd4 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 6 Sep 2023 17:20:31 +0200 Subject: [PATCH 6/6] setup: update dependencies --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 986f81df..b7746bda 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_long_description(): "brotli", "cchardet >= 2.1.7; python_version < '3.11'", # build issue "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build - "htmldate[speed] >= 1.5.0", + "htmldate[speed] >= 1.5.1", "py3langid >= 0.2.2", "pycurl >= 7.45.2", ], @@ -111,7 +111,7 @@ def get_long_description(): "certifi", "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.2.0; python_version >= '3.7'", - "courlan @ git+https://github.com/adbar/courlan", # TODO: >= 0.9.4 + "courlan >= 0.9.4", "htmldate >= 1.5.1", "justext >= 3.0.0", "lxml >= 4.9.3 ; platform_system != 'Darwin'",