From 82ac6413a353e4ffec9e2d59a4b1fc6316d6f446 Mon Sep 17 00:00:00 2001 From: Sungeun An Date: Tue, 19 Nov 2024 00:17:27 -0800 Subject: [PATCH 1/3] updated README file and added a sample notebook Signed-off-by: Sungeun An --- .../html2parquet/notebooks/html2parquet.ipynb | 220 ++++++++++++++++++ .../language/html2parquet/python/README.md | 103 +++++++- 2 files changed, 315 insertions(+), 8 deletions(-) create mode 100644 transforms/language/html2parquet/notebooks/html2parquet.ipynb diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb new file mode 100644 index 000000000..c2713899d --- /dev/null +++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install data-prep-toolkit==0.2.2.dev2\n", + "!pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n", + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "20663a67-5aa1-4b61-b989-94201613e41f", + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "\n", + "from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e75f6922-eb0f-4164-a536-f96393e04604", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# create parameters\n", + "local_conf = {\n", + " \"input_folder\": \"input\",\n", + " \"output_folder\": \"output\",\n", + "}\n", + "\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.html']\"),\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4d2354db-1bb3-4a71-98df-f0f148af3a02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "17:09:40 INFO - html2parquet parameters are : {'output_format': , 'favor_precision': , 'favor_recall': }\n", + "17:09:40 INFO - pipeline id pipeline_id\n", + "17:09:40 INFO - code location None\n", + "17:09:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", + "17:09:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:09:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n", + "17:09:40 INFO - orchestrator html2parquet started at 2024-11-13 17:09:40\n", + "17:09:40 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n", + "17:09:47 INFO - Completed 1 files (100.0%) in 0.111 min\n", + "17:09:47 INFO - Done processing 1 files, waiting for flush() completion.\n", + "17:09:47 INFO - done flushing in 0.0 sec\n", + "17:09:47 INFO - Completed execution in 0.111 min, execution result 0\n" + ] + } + ], + "source": [ + "\n", + "import sys\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params))\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e2bee8da-c566-4e45-bca1-354dfd04b0df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titledocumentcontentsdocument_idsizedate_acquired
0ai-alliance-index.htmlai-alliance-index.html![](https://images.prismic.io/ai-alliance/Ztf3...f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...3942024-11-13T17:09:40.947095
\n", + "
" + ], + "text/plain": [ + " title document \\\n", + "0 ai-alliance-index.html ai-alliance-index.html \n", + "\n", + " contents \\\n", + "0 ![](https://images.prismic.io/ai-alliance/Ztf3... \n", + "\n", + " document_id size \\\n", + "0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n", + "\n", + " date_acquired \n", + "0 2024-11-13T17:09:40.947095 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "table = pq.read_table('output/ai-alliance-index.parquet')\n", + "table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cde6e37d-c437-490f-8e01-f4f51a123484", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas()['contents'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md index 0d25553e1..6b12bffea 100644 --- a/transforms/language/html2parquet/python/README.md +++ b/transforms/language/html2parquet/python/README.md @@ -1,25 +1,55 @@ -# html2parquet Transform +# HTML to Parquet Transform -This tranforms iterate through zip of HTML files or single HTML files and generates parquet files containing the converted document in string. +--- -The HTML conversion is using the [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html). +## Description -## Output format +This transform iterates through zipped collections of HTML files or single HTML files and generates Parquet files containing the extracted content, leveraging the [Trafilatura library](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for extraction of text, tables, images, and other components. -The output format will contain the following colums +--- + +## Contributors + +- Sungeun An (sungeun.an@ibm.com) +- Syed Zawad (szawad@ibm.com) + +--- + +## Date + +**Last updated:** 10/16/24 +- **Update details:** + - Added Trafilatura parameters (`favor_precision` and `favor_recall`) for enhanced control over content extraction. + - Enhanced table and image extraction features. + - See [Pull Request #707](https://github.com/IBM/data-prep-kit/pull/707) for more details. + +--- + +## Input and Output + +### Input +- Accepted Formats: Single HTML files or zipped collections of HTML files. 
+- Sample Input Files: [sample html files](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/python/test-data/input) + +### Output +- Format: Parquet files with the following structure: ```jsonc { - "title": "string", // the member filename - "document": "string", // the base of the source archive - "contents": "string", // the content of the HTML + "title": "string", // the member filename + "document": "string", // the base of the source archive + "contents": "string", // the content of the HTML "document_id": "string", // the document id, a hash of `contents` "size": "string", // the size of `contents` "date_acquired": "date", // the date when the transform was executing } ``` + + ## Parameters +### User-Configurable Parameters + The table below provides the parameters that users can adjust to control the behavior of the extraction: | Parameter | Default | Description | @@ -28,6 +58,8 @@ The table below provides the parameters that users can adjust to control the beh | `favor_precision` | `True` | Prefers less content but more accurate extraction. Options: `True`, `False`. | | `favor_recall` | `True` | Extracts more content when uncertain. Options: `True`, `False`. | +### Default Parameters + The table below provides the parameters that are enabled by default to ensure a comprehensive extraction process: | Parameter | Default | Description | @@ -43,6 +75,7 @@ The table below provides the parameters that are enabled by default to ensure a - To prioritize extracting more content over accuracy, set `favor_recall=True` and `favor_precision=False`. - When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. For example: `--html2parquet_output_format='markdown'`. 
+ ## Example ### Sample HTML @@ -155,3 +188,57 @@ Chicago | ## Contact Us ``` +## Usage + +### Command-Line Interface (CLI) + +Run the transform with the following command: + +``` +python ../html2parquet/python/src/html2parquet_transform_python.py \ + --data_local_config "{'input_folder': '../html2parquet/python/test-data/input', 'output_folder': '../html2parquet/python/test-data/expected'}" \ + --data_files_to_use '[".html", ".zip"]' +``` + +- When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. For example: `--html2parquet_output_format='markdown'`. + +### Python Code + +To run the transform programmatically: + +``` +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + +from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration +import ast +import sys + +# create parameters +local_conf = { + "input_folder": "input", + "output_folder": "output", +} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "data_files_to_use": ast.literal_eval("['.html']"), +} + +sys.argv = ParamsUtils.dict_to_req(d=(params)) +# create launcher +launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration()) +# launch +return_code = launcher.launch() + +``` + +### Sample Notebook + +See the [sample notebook](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/notebooks/html2parquet.ipynb) for an example. + + +## Further Resources + +- [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html). 
From 76e067dfa3795bf1001570465e77822026ab1f4e Mon Sep 17 00:00:00 2001 From: Sungeun An Date: Tue, 19 Nov 2024 09:36:37 -0800 Subject: [PATCH 2/3] removed python code in README and minor changes in the notebook Signed-off-by: Sungeun An --- .../html2parquet/notebooks/html2parquet.ipynb | 8 ++--- .../language/html2parquet/python/README.md | 31 ------------------- 2 files changed, 4 insertions(+), 35 deletions(-) diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb index c2713899d..230805144 100644 --- a/transforms/language/html2parquet/notebooks/html2parquet.ipynb +++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb @@ -37,14 +37,14 @@ "\n", "# create parameters\n", "local_conf = {\n", - " \"input_folder\": \"input\",\n", - " \"output_folder\": \"output\",\n", + " \"input_folder\": \"/path/to/your/input/folder\",\n", + " \"output_folder\": \"/path/to/your/output/folder\",\n", "}\n", "\n", "params = {\n", " # Data access. Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " \"data_files_to_use\": ast.literal_eval(\"['.html']\"),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.zip', '.html']\"),\n", "}\n" ] }, @@ -154,7 +154,7 @@ "source": [ "import pyarrow.parquet as pq\n", "import pandas as pd\n", - "table = pq.read_table('output/ai-alliance-index.parquet')\n", + "table = pq.read_table('/path/to/your/output/folder/sample.parquet')\n", "table.to_pandas()" ] }, diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md index 6b12bffea..eadd082fb 100644 --- a/transforms/language/html2parquet/python/README.md +++ b/transforms/language/html2parquet/python/README.md @@ -202,37 +202,6 @@ python ../html2parquet/python/src/html2parquet_transform_python.py \ - When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. 
For example: `--html2parquet_output_format='markdown'`. -### Python Code - -To run the transform programmatically: - -``` -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils - -from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration -import ast -import sys - -# create parameters -local_conf = { - "input_folder": "input", - "output_folder": "output", -} - -params = { - # Data access. Only required parameters are specified - "data_local_config": ParamsUtils.convert_to_ast(local_conf), - "data_files_to_use": ast.literal_eval("['.html']"), -} - -sys.argv = ParamsUtils.dict_to_req(d=(params)) -# create launcher -launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration()) -# launch -return_code = launcher.launch() - -``` ### Sample Notebook From b018b22d2af483eb4b5fcba60ac20bd953064dbf Mon Sep 17 00:00:00 2001 From: Sungeun An Date: Thu, 21 Nov 2024 12:52:23 -0800 Subject: [PATCH 3/3] updated with relative path and added markdown for notebook Signed-off-by: Sungeun An --- .../html2parquet/notebooks/html2parquet.ipynb | 57 ++++++++++++------- .../language/html2parquet/python/README.md | 10 ++-- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb index 230805144..669a4d30d 100644 --- a/transforms/language/html2parquet/notebooks/html2parquet.ipynb +++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb @@ -1,9 +1,17 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "8435e1f7-0c2e-49f4-a77a-b525ee6c532b", + "metadata": {}, + "source": [ + "# Html2Parquet Transform Sample Notebook" + ] + }, { "cell_type": "code", - "execution_count": 1, - "id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715", + "execution_count": null, + "id": "d9420989-ec8a-4fde-9a93-dc25096389f1", "metadata": {}, "outputs": [], "source": [ @@ -26,6 
+34,14 @@ "from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n" ] }, + { + "cell_type": "markdown", + "id": "6d85491b-0093-46e7-8653-ca8052ea59f0", + "metadata": {}, + "source": [ + "## Specify input/output folders and parameters" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -37,7 +53,7 @@ "\n", "# create parameters\n", "local_conf = {\n", - " \"input_folder\": \"/path/to/your/input/folder\",\n", + " \"input_folder\": \"/path/to/your/input/folder\", # For the sample input files, refer to the 'python/test-data/input' folder\n", " \"output_folder\": \"/path/to/your/output/folder\",\n", "}\n", "\n", @@ -48,6 +64,14 @@ "}\n" ] }, + { + "cell_type": "markdown", + "id": "0dcd1249-1eb8-4b33-9827-626f90c840b4", + "metadata": {}, + "source": [ + "## Invoke the html2parquet transformation" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -74,7 +98,6 @@ } ], "source": [ - "\n", "import sys\n", "sys.argv = ParamsUtils.dict_to_req(d=(params))\n", "# create launcher\n", @@ -83,6 +106,14 @@ "return_code = launcher.launch()\n" ] }, + { + "cell_type": "markdown", + "id": "3c66468d-703f-427f-a1dd-a758edd334de", + "metadata": {}, + "source": [ + "## Checking the output Parquet file" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -178,22 +209,6 @@ "source": [ "table.to_pandas()['contents'][0]" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -212,7 +227,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md index eadd082fb..35e781007 
100644 --- a/transforms/language/html2parquet/python/README.md +++ b/transforms/language/html2parquet/python/README.md @@ -18,10 +18,7 @@ This transform iterates through zipped collections of HTML files or single HTML ## Date **Last updated:** 10/16/24 -- **Update details:** - - Added Trafilatura parameters (`favor_precision` and `favor_recall`) for enhanced control over content extraction. - - Enhanced table and image extraction features. - - See [Pull Request #707](https://github.com/IBM/data-prep-kit/pull/707) for more details. +**Update details:** Enhanced table and image extraction features by adding the corresponding Trafilatura parameters. --- @@ -29,7 +26,7 @@ This transform iterates through zipped collections of HTML files or single HTML ### Input - Accepted Formats: Single HTML files or zipped collections of HTML files. -- Sample Input Files: [sample html files](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/python/test-data/input) +- Sample Input Files: [sample html files](test-data/input) ### Output - Format: Parquet files with the following structure: @@ -205,7 +202,8 @@ python ../html2parquet/python/src/html2parquet_transform_python.py \ ### Sample Notebook -See the [sample notebook](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/notebooks/html2parquet.ipynb) for an example. +See the [sample notebook](../notebooks/html2parquet.ipynb) +for an example. ## Further Resources