Skip to content

Commit

Permalink
syntax: add extract_with_metadata method (#765)
Browse files Browse the repository at this point in the history
* add extract_with_metadata method in core and corresponding unit tests

* regroup code

* add method comment

---------

Co-authored-by: CodyInnowhere <[email protected]>
  • Loading branch information
unsleepy22 and CodyInnowhere authored Dec 11, 2024
1 parent ad30d66 commit b010779
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 17 deletions.
56 changes: 55 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from charset_normalizer import detect

import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura import bare_extraction, extract, extract_with_metadata, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
Expand Down Expand Up @@ -443,6 +443,59 @@ def test_formatting():
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result


def test_extract_with_metadata():
    '''Test the extract_with_metadata entry point'''
    url = 'http://aa.bb/cc.html'

    # bare document: neither a title nor a date can be found
    tree = html.fromstring("""<html>
<head></head>
<body>
<article>
<p>AAA, <p>BBB</p>, CCC.</p>
</article>
</body>
</html>
""")
    doc = extract_with_metadata(tree, output_format='txt', include_formatting=True, fast=True, url=url)
    for snippet in ('AAA', 'BBB', 'CCC'):
        assert snippet in doc.text
    assert doc.url == url
    assert doc.date is None
    assert doc.title is None

    # document carrying a title and a date in the body
    tree = html.fromstring("""<html>
<head><title>title</title></head>
<body>
<article>
<div>May 24, 2021</div>
<p>AAA, <p>BBB</p>, CCC.</p>
</article>
</body>
</html>
""")
    doc = extract_with_metadata(tree, output_format='txt', include_formatting=True, fast=True, url=url)
    for snippet in ('AAA', 'BBB', 'CCC'):
        assert snippet in doc.text
    assert doc.url == url
    assert doc.date == '2021-05-24'
    assert doc.title == 'title'

    # XML output also exposes the raw text and the fingerprint
    doc = extract_with_metadata(tree, output_format='xml')
    assert doc.raw_text == 'AAA, BBB , CCC.'
    assert doc.fingerprint == 'ee7d2fb6fcf2837d'
    for snippet in ('AAA', 'BBB', 'CCC'):
        assert snippet in doc.text

    # language filter discards a document in the wrong language
    tree = html.fromstring("""<html>
<head><meta http-equiv="content-language" content="es"></head>
<body>
<article>
<p>AAA, <p>BBB</p>, CCC.</p>
</article>
</body>
</html>
""")
    assert extract_with_metadata(tree, target_language='en', fast=True) is None

    # unsupported output format triggers an error
    with pytest.raises(ValueError):
        extract_with_metadata(tree, output_format="python")


def test_external():
'''Test external components'''
options = DEFAULT_OPTIONS
Expand Down Expand Up @@ -1644,6 +1697,7 @@ def test_deprecations():
test_trim()
test_input()
test_formatting()
test_extract_with_metadata()
test_exotic_tags()
test_images()
test_links()
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import logging

from .baseline import baseline, html2txt
from .core import bare_extraction, extract
from .core import bare_extraction, extract, extract_with_metadata
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
Expand All @@ -25,6 +25,7 @@
"baseline",
"extract",
"extract_metadata",
"extract_with_metadata",
"fetch_response",
"fetch_url",
"html2txt",
Expand Down
177 changes: 162 additions & 15 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,19 +193,7 @@ def bare_extraction(
"""

# deprecations
if no_fallback:
fast = no_fallback
warnings.warn(
'"no_fallback" will be deprecated in a future version, use "fast" instead',
PendingDeprecationWarning
)
if as_dict:
warnings.warn(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
_check_deprecation(no_fallback=no_fallback, as_dict=as_dict, max_tree_size=max_tree_size)

# regroup extraction options
if not options or not isinstance(options, Extractor):
Expand Down Expand Up @@ -424,16 +412,174 @@ def extract(
A string in the desired format or None.
"""
document = _internal_extraction(
filecontent=filecontent,
url=url,
record_id=record_id,
fast=fast,
no_fallback=no_fallback,
favor_precision=favor_precision,
favor_recall=favor_recall,
include_comments=include_comments,
output_format=output_format,
tei_validation=tei_validation,
target_language=target_language,
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
max_tree_size=max_tree_size,
url_blacklist=url_blacklist,
author_blacklist=author_blacklist,
settingsfile=settingsfile,
prune_xpath=prune_xpath,
config=config,
options=options)
return document.text if document is not None else None


def extract_with_metadata(
    filecontent: Any,
    url: Optional[str] = None,
    record_id: Optional[str] = None,
    fast: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: Optional[str] = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: Optional[Dict[str, Any]] = None,
    url_blacklist: Optional[Set[str]] = None,
    author_blacklist: Optional[Set[str]] = None,
    settingsfile: Optional[str] = None,
    prune_xpath: Optional[Any] = None,
    config: Any = DEFAULT_CONFIG,
    options: Optional[Extractor] = None,
) -> Optional[Document]:
    """Main function exposed by the package:
    Wrapper for text extraction and conversion to chosen output format.
    This method also returns document metadata.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A Document object holding the metadata along with the content string
        in the desired format, or None.
    """
    # metadata extraction is always enabled for this entry point,
    # but a missing date/title does not discard the document
    return _internal_extraction(
        filecontent=filecontent,
        url=url,
        record_id=record_id,
        fast=fast,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        tei_validation=tei_validation,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_formatting=include_formatting,
        include_links=include_links,
        deduplicate=deduplicate,
        date_extraction_params=date_extraction_params,
        with_metadata=True,
        only_with_metadata=False,
        url_blacklist=url_blacklist,
        author_blacklist=author_blacklist,
        settingsfile=settingsfile,
        prune_xpath=prune_xpath,
        config=config,
        options=options)


def _check_deprecation(
no_fallback: bool = False,
as_dict: bool = False,
max_tree_size: Optional[int] = None,
)-> None:
'''Check deprecated or to-be-deprecated params'''
if no_fallback:
fast = no_fallback
warnings.warn(
'"no_fallback" will be deprecated in a future version, use "fast" instead',
PendingDeprecationWarning
)

if as_dict:
warnings.warn(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")


def _internal_extraction(
filecontent: Any,
url: Optional[str] = None,
record_id: Optional[str] = None,
fast: bool = False,
no_fallback: bool = False,
favor_precision: bool = False,
favor_recall: bool = False,
include_comments: bool = True,
output_format: str = "txt",
tei_validation: bool = False,
target_language: Optional[str] = None,
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
with_metadata: bool = False,
only_with_metadata: bool = False,
max_tree_size: Optional[int] = None,
url_blacklist: Optional[Set[str]] = None,
author_blacklist: Optional[Set[str]] = None,
settingsfile: Optional[str] = None,
prune_xpath: Optional[Any] = None,
config: Any = DEFAULT_CONFIG,
options: Optional[Extractor] = None,
) -> Optional[Document]:
'''Internal method to do the extraction'''
_check_deprecation(no_fallback=no_fallback, as_dict=False, max_tree_size=max_tree_size)

# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
Expand Down Expand Up @@ -485,4 +631,5 @@ def extract(
)

# return
return determine_returnstring(document, options)
document.text = determine_returnstring(document, options)
return document

0 comments on commit b010779

Please sign in to comment.