Skip to content

Commit

Permalink
syntax: add extract_with_metadata method (#765)
Browse files Browse the repository at this point in the history
* add extract_with_metadata method in core and corresponding unit tests

* regroup code

* add method comment

---------

Co-authored-by: CodyInnowhere <[email protected]>
  • Loading branch information
unsleepy22 and CodyInnowhere authored Dec 11, 2024
1 parent ad30d66 commit b010779
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 17 deletions.
56 changes: 55 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from charset_normalizer import detect

import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura import bare_extraction, extract, extract_with_metadata, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
Expand Down Expand Up @@ -443,6 +443,59 @@ def test_formatting():
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result


def test_extract_with_metadata():
    '''Test the extract_with_metadata entry point'''
    url = 'http://aa.bb/cc.html'

    # bare document: neither a title nor a date can be found
    tree = html.fromstring("""<html>
<head></head>
<body>
<article>
<p>AAA, <p>BBB</p>, CCC.</p>
</article>
</body>
</html>
""")
    doc = extract_with_metadata(tree, output_format='txt', include_formatting=True, fast=True, url=url)
    for snippet in ('AAA', 'BBB', 'CCC'):
        assert snippet in doc.text
    assert doc.url == url
    assert doc.date is None
    assert doc.title is None

    # document carrying a title and a date in the body
    tree = html.fromstring("""<html>
<head><title>title</title></head>
<body>
<article>
<div>May 24, 2021</div>
<p>AAA, <p>BBB</p>, CCC.</p>
</article>
</body>
</html>
""")
    doc = extract_with_metadata(tree, output_format='txt', include_formatting=True, fast=True, url=url)
    for snippet in ('AAA', 'BBB', 'CCC'):
        assert snippet in doc.text
    assert doc.url == url
    assert doc.date == '2021-05-24'
    assert doc.title == 'title'

    # XML output also exposes the raw text and the fingerprint
    doc = extract_with_metadata(tree, output_format='xml')
    assert doc.raw_text == 'AAA, BBB , CCC.'
    assert doc.fingerprint == 'ee7d2fb6fcf2837d'
    for snippet in ('AAA', 'BBB', 'CCC'):
        assert snippet in doc.text

    # language filter discards a document in the wrong language
    tree = html.fromstring("""<html>
<head><meta http-equiv="content-language" content="es"></head>
<body>
<article>
<p>AAA, <p>BBB</p>, CCC.</p>
</article>
</body>
</html>
""")
    assert extract_with_metadata(tree, target_language='en', fast=True) is None

    # unsupported output format triggers an error
    with pytest.raises(ValueError):
        extract_with_metadata(tree, output_format="python")


def test_external():
'''Test external components'''
options = DEFAULT_OPTIONS
Expand Down Expand Up @@ -1644,6 +1697,7 @@ def test_deprecations():
test_trim()
test_input()
test_formatting()
test_extract_with_metadata()
test_exotic_tags()
test_images()
test_links()
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import logging

from .baseline import baseline, html2txt
from .core import bare_extraction, extract
from .core import bare_extraction, extract, extract_with_metadata
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
Expand All @@ -25,6 +25,7 @@
"baseline",
"extract",
"extract_metadata",
"extract_with_metadata",
"fetch_response",
"fetch_url",
"html2txt",
Expand Down
177 changes: 162 additions & 15 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,19 +193,7 @@ def bare_extraction(
"""

# deprecations
if no_fallback:
fast = no_fallback
warnings.warn(
'"no_fallback" will be deprecated in a future version, use "fast" instead',
PendingDeprecationWarning
)
if as_dict:
warnings.warn(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
_check_deprecation(no_fallback=no_fallback, as_dict=as_dict, max_tree_size=max_tree_size)

# regroup extraction options
if not options or not isinstance(options, Extractor):
Expand Down Expand Up @@ -424,16 +412,174 @@ def extract(
A string in the desired format or None.
"""
document = _internal_extraction(
filecontent=filecontent,
url=url,
record_id=record_id,
fast=fast,
no_fallback=no_fallback,
favor_precision=favor_precision,
favor_recall=favor_recall,
include_comments=include_comments,
output_format=output_format,
tei_validation=tei_validation,
target_language=target_language,
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
max_tree_size=max_tree_size,
url_blacklist=url_blacklist,
author_blacklist=author_blacklist,
settingsfile=settingsfile,
prune_xpath=prune_xpath,
config=config,
options=options)
return document.text if document is not None else None


def extract_with_metadata(
    filecontent: Any,
    url: Optional[str] = None,
    record_id: Optional[str] = None,
    fast: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: Optional[str] = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: Optional[Dict[str, Any]] = None,
    url_blacklist: Optional[Set[str]] = None,
    author_blacklist: Optional[Set[str]] = None,
    settingsfile: Optional[str] = None,
    prune_xpath: Optional[Any] = None,
    config: Any = DEFAULT_CONFIG,
    options: Optional[Extractor] = None,
) -> Optional[Document]:
    """Main function exposed by the package:
    Wrapper for text extraction and conversion to chosen output format.
    This method also returns document metadata.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A Document object holding the metadata along with the content string
        in the desired format, or None.
    """
    # metadata extraction is always enabled for this entry point,
    # but a missing date/title does not discard the document
    return _internal_extraction(
        filecontent=filecontent,
        url=url,
        record_id=record_id,
        fast=fast,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        tei_validation=tei_validation,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_formatting=include_formatting,
        include_links=include_links,
        deduplicate=deduplicate,
        date_extraction_params=date_extraction_params,
        with_metadata=True,
        only_with_metadata=False,
        url_blacklist=url_blacklist,
        author_blacklist=author_blacklist,
        settingsfile=settingsfile,
        prune_xpath=prune_xpath,
        config=config,
        options=options)


def _check_deprecation(
no_fallback: bool = False,
as_dict: bool = False,
max_tree_size: Optional[int] = None,
)-> None:
'''Check deprecated or to-be-deprecated params'''
if no_fallback:
fast = no_fallback
warnings.warn(
'"no_fallback" will be deprecated in a future version, use "fast" instead',
PendingDeprecationWarning
)

if as_dict:
warnings.warn(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")


def _internal_extraction(
filecontent: Any,
url: Optional[str] = None,
record_id: Optional[str] = None,
fast: bool = False,
no_fallback: bool = False,
favor_precision: bool = False,
favor_recall: bool = False,
include_comments: bool = True,
output_format: str = "txt",
tei_validation: bool = False,
target_language: Optional[str] = None,
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
with_metadata: bool = False,
only_with_metadata: bool = False,
max_tree_size: Optional[int] = None,
url_blacklist: Optional[Set[str]] = None,
author_blacklist: Optional[Set[str]] = None,
settingsfile: Optional[str] = None,
prune_xpath: Optional[Any] = None,
config: Any = DEFAULT_CONFIG,
options: Optional[Extractor] = None,
) -> Optional[Document]:
'''Internal method to do the extraction'''
_check_deprecation(no_fallback=no_fallback, as_dict=False, max_tree_size=max_tree_size)

# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
Expand Down Expand Up @@ -485,4 +631,5 @@ def extract(
)

# return
return determine_returnstring(document, options)
document.text = determine_returnstring(document, options)
return document

0 comments on commit b010779

Please sign in to comment.