From 82ac6413a353e4ffec9e2d59a4b1fc6316d6f446 Mon Sep 17 00:00:00 2001
From: Sungeun An <sungeun.an@ibm.com>
Date: Tue, 19 Nov 2024 00:17:27 -0800
Subject: [PATCH 1/3] updated README file and added a sample notebook

Signed-off-by: Sungeun An <sungeun.an@ibm.com>
---
 .../html2parquet/notebooks/html2parquet.ipynb | 220 ++++++++++++++++++
 .../language/html2parquet/python/README.md    | 103 +++++++-
 2 files changed, 315 insertions(+), 8 deletions(-)
 create mode 100644 transforms/language/html2parquet/notebooks/html2parquet.ipynb

diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb
new file mode 100644
index 000000000..c2713899d
--- /dev/null
+++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb
@@ -0,0 +1,220 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "!pip install data-prep-toolkit==0.2.2.dev2\n",
+    "!pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n",
+    "!pip install pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "20663a67-5aa1-4b61-b989-94201613e41f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from data_processing.runtime.pure_python import PythonTransformLauncher\n",
+    "from data_processing.utils import ParamsUtils\n",
+    "\n",
+    "from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e75f6922-eb0f-4164-a536-f96393e04604",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "\n",
+    "# create parameters\n",
+    "local_conf = {\n",
+    "    \"input_folder\": \"input\",\n",
+    "    \"output_folder\": \"output\",\n",
+    "}\n",
+    "\n",
+    "params = {\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    \"data_files_to_use\": ast.literal_eval(\"['.html']\"),\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4d2354db-1bb3-4a71-98df-f0f148af3a02",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "17:09:40 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n",
+      "17:09:40 INFO - pipeline id pipeline_id\n",
+      "17:09:40 INFO - code location None\n",
+      "17:09:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n",
+      "17:09:40 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "17:09:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n",
+      "17:09:40 INFO - orchestrator html2parquet started at 2024-11-13 17:09:40\n",
+      "17:09:40 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n",
+      "17:09:47 INFO - Completed 1 files (100.0%) in 0.111 min\n",
+      "17:09:47 INFO - Done processing 1 files, waiting for flush() completion.\n",
+      "17:09:47 INFO - done flushing in 0.0 sec\n",
+      "17:09:47 INFO - Completed execution in 0.111 min, execution result 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "import sys\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=(params))\n",
+    "# create launcher\n",
+    "launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n",
+    "# launch\n",
+    "return_code = launcher.launch()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e2bee8da-c566-4e45-bca1-354dfd04b0df",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>document</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>document_id</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ai-alliance-index.html</td>\n",
+       "      <td>ai-alliance-index.html</td>\n",
+       "      <td>![](https://images.prismic.io/ai-alliance/Ztf3...</td>\n",
+       "      <td>f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...</td>\n",
+       "      <td>394</td>\n",
+       "      <td>2024-11-13T17:09:40.947095</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    title                document  \\\n",
+       "0  ai-alliance-index.html  ai-alliance-index.html   \n",
+       "\n",
+       "                                            contents  \\\n",
+       "0  ![](https://images.prismic.io/ai-alliance/Ztf3...   \n",
+       "\n",
+       "                                         document_id  size  \\\n",
+       "0  f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...   394   \n",
+       "\n",
+       "                date_acquired  \n",
+       "0  2024-11-13T17:09:40.947095  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pyarrow.parquet as pq\n",
+    "import pandas as pd\n",
+    "table = pq.read_table('output/ai-alliance-index.parquet')\n",
+    "table.to_pandas()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "cde6e37d-c437-490f-8e01-f4f51a123484",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table.to_pandas()['contents'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md
index 0d25553e1..6b12bffea 100644
--- a/transforms/language/html2parquet/python/README.md
+++ b/transforms/language/html2parquet/python/README.md
@@ -1,25 +1,55 @@
-# html2parquet Transform 
+# HTML to Parquet Transform
 
-This tranforms iterate through zip of HTML files or single HTML files and generates parquet files containing the converted document in string.
+---
 
-The HTML conversion is using the [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html).
+## Description
 
-## Output format
+This transform iterates through zipped collections of HTML files or single HTML files and generates Parquet files containing the extracted content, leveraging the [Trafilatura library](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for extraction of text, tables, images, and other components.
 
-The output format will contain the following colums
+---
+
+## Contributors
+
+- Sungeun An (sungeun.an@ibm.com)
+- Syed Zawad (szawad@ibm.com)
+
+---
+
+## Date
+
+**Last updated:** 10/16/24  
+- **Update details:**
+  - Added Trafilatura parameters (`favor_precision` and `favor_recall`) for enhanced control over content extraction.
+  - Enhanced table and image extraction features.  
+  - See [Pull Request #707](https://github.com/IBM/data-prep-kit/pull/707) for more details.
+
+---
+
+## Input and Output
+
+### Input
+- Accepted Formats: Single HTML files or zipped collections of HTML files.  
+- Sample Input Files: [sample html files](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/python/test-data/input)  
+
+### Output
+- Format: Parquet files with the following structure:
 
 ```jsonc
 {
-	"title": "string",             // the member filename
-	"document": "string",          // the base of the source archive
-	"contents": "string",          // the content of the HTML
+    "title": "string",             // the member filename
+    "document": "string",          // the base of the source archive
+    "contents": "string",          // the content of the HTML
     "document_id": "string",      // the document id, a hash of `contents`
     "size": "string",             // the size of `contents`
     "date_acquired": "date",      // the date when the transform was executing
 }
 ```
+
+
 ## Parameters
 
+### User-Configurable Parameters
+
 The table below provides the parameters that users can adjust to control the behavior of the extraction:
 
 | Parameter         | Default    | Description                                                                 |
@@ -28,6 +58,8 @@ The table below provides the parameters that users can adjust to control the beh
 | `favor_precision`  | `True`     | Prefers less content but more accurate extraction. Options: `True`, `False`. |
 | `favor_recall`     | `True`     | Extracts more content when uncertain. Options: `True`, `False`.              |
 
+### Default Parameters
+
 The table below provides the parameters that are enabled by default to ensure a comprehensive extraction process:
 
 | Parameter           | Default   | Description                                                                 |
@@ -43,6 +75,7 @@ The table below provides the parameters that are enabled by default to ensure a
 - To prioritize extracting more content over accuracy, set `favor_recall=True` and `favor_precision=False`.
 - When invoking the CLI, use the following syntax for these parameters: `--html2parquet_<parameter_name>`. For example: `--html2parquet_output_format='markdown'`.
 
+
 ## Example
 
 ### Sample HTML 
@@ -155,3 +188,57 @@ Chicago |
 ## Contact Us
 ```
 
+## Usage
+
+### Command-Line Interface (CLI)
+
+Run the transform with the following command:
+
+```
+python ../html2parquet/python/src/html2parquet_transform_python.py \
+  --data_local_config "{'input_folder': '../html2parquet/python/test-data/input', 'output_folder': '../html2parquet/python/test-data/expected'}" \
+  --data_files_to_use '[".html", ".zip"]'
+```
+
+- When invoking the CLI, use the following syntax for these parameters: `--html2parquet_<parameter_name>`. For example: `--html2parquet_output_format='markdown'`.
+
+### Python Code
+
+To run the transform programmatically:
+
+```
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+
+from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration
+import ast
+import sys
+
+# create parameters
+local_conf = {
+    "input_folder": "input",
+    "output_folder": "output",
+}
+
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "data_files_to_use": ast.literal_eval("['.html']"),
+}
+
+sys.argv = ParamsUtils.dict_to_req(d=(params))
+# create launcher
+launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
+# launch
+return_code = launcher.launch()
+
+```
+
+### Sample Notebook
+
+See the [sample notebook](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/notebooks/html2parquet.ipynb) for an example.
+
+
+## Further Resources
+
+- [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html).

From 76e067dfa3795bf1001570465e77822026ab1f4e Mon Sep 17 00:00:00 2001
From: Sungeun An <sungeun.an@ibm.com>
Date: Tue, 19 Nov 2024 09:36:37 -0800
Subject: [PATCH 2/3] removed python code in README and minor changes in the
 notebook

Signed-off-by: Sungeun An <sungeun.an@ibm.com>
---
 .../html2parquet/notebooks/html2parquet.ipynb |  8 ++---
 .../language/html2parquet/python/README.md    | 31 -------------------
 2 files changed, 4 insertions(+), 35 deletions(-)

diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb
index c2713899d..230805144 100644
--- a/transforms/language/html2parquet/notebooks/html2parquet.ipynb
+++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb
@@ -37,14 +37,14 @@
     "\n",
     "# create parameters\n",
     "local_conf = {\n",
-    "    \"input_folder\": \"input\",\n",
-    "    \"output_folder\": \"output\",\n",
+    "    \"input_folder\": \"/path/to/your/input/folder\",\n",
+    "    \"output_folder\": \"/path/to/your/output/folder\",\n",
     "}\n",
     "\n",
     "params = {\n",
     "    # Data access. Only required parameters are specified\n",
     "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-    "    \"data_files_to_use\": ast.literal_eval(\"['.html']\"),\n",
+    "    \"data_files_to_use\": ast.literal_eval(\"['.zip', '.html']\"),\n",
     "}\n"
    ]
   },
@@ -154,7 +154,7 @@
    "source": [
     "import pyarrow.parquet as pq\n",
     "import pandas as pd\n",
-    "table = pq.read_table('output/ai-alliance-index.parquet')\n",
+    "table = pq.read_table('/path/to/your/output/folder/sample.parquet')\n",
     "table.to_pandas()"
    ]
   },
diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md
index 6b12bffea..eadd082fb 100644
--- a/transforms/language/html2parquet/python/README.md
+++ b/transforms/language/html2parquet/python/README.md
@@ -202,37 +202,6 @@ python ../html2parquet/python/src/html2parquet_transform_python.py \
 
 - When invoking the CLI, use the following syntax for these parameters: `--html2parquet_<parameter_name>`. For example: `--html2parquet_output_format='markdown'`.
 
-### Python Code
-
-To run the transform programmatically:
-
-```
-from data_processing.runtime.pure_python import PythonTransformLauncher
-from data_processing.utils import ParamsUtils
-
-from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration
-import ast
-import sys
-
-# create parameters
-local_conf = {
-    "input_folder": "input",
-    "output_folder": "output",
-}
-
-params = {
-    # Data access. Only required parameters are specified
-    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
-    "data_files_to_use": ast.literal_eval("['.html']"),
-}
-
-sys.argv = ParamsUtils.dict_to_req(d=(params))
-# create launcher
-launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
-# launch
-return_code = launcher.launch()
-
-```
 
 ### Sample Notebook
 

From b018b22d2af483eb4b5fcba60ac20bd953064dbf Mon Sep 17 00:00:00 2001
From: Sungeun An <sungeun.an@ibm.com>
Date: Thu, 21 Nov 2024 12:52:23 -0800
Subject: [PATCH 3/3] updated with relative path and added markdown for
 notebook

Signed-off-by: Sungeun An <sungeun.an@ibm.com>
---
 .../html2parquet/notebooks/html2parquet.ipynb | 57 ++++++++++++-------
 .../language/html2parquet/python/README.md    | 10 ++--
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb
index 230805144..669a4d30d 100644
--- a/transforms/language/html2parquet/notebooks/html2parquet.ipynb
+++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb
@@ -1,9 +1,17 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8435e1f7-0c2e-49f4-a77a-b525ee6c532b",
+   "metadata": {},
+   "source": [
+    "# Html2Parquet Transform Sample Notebook"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715",
+   "execution_count": null,
+   "id": "d9420989-ec8a-4fde-9a93-dc25096389f1",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -26,6 +34,14 @@
     "from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6d85491b-0093-46e7-8653-ca8052ea59f0",
+   "metadata": {},
+   "source": [
+    "## Specify input/output folders and parameters"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -37,7 +53,7 @@
     "\n",
     "# create parameters\n",
     "local_conf = {\n",
-    "    \"input_folder\": \"/path/to/your/input/folder\",\n",
+    "    \"input_folder\": \"/path/to/your/input/folder\", # For the sample input files, refer to the 'python/test-data/input' folder\n",
     "    \"output_folder\": \"/path/to/your/output/folder\",\n",
     "}\n",
     "\n",
@@ -48,6 +64,14 @@
     "}\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0dcd1249-1eb8-4b33-9827-626f90c840b4",
+   "metadata": {},
+   "source": [
+    "## Invoke the html2parquet transformation"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -74,7 +98,6 @@
     }
    ],
    "source": [
-    "\n",
     "import sys\n",
     "sys.argv = ParamsUtils.dict_to_req(d=(params))\n",
     "# create launcher\n",
@@ -83,6 +106,14 @@
     "return_code = launcher.launch()\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3c66468d-703f-427f-a1dd-a758edd334de",
+   "metadata": {},
+   "source": [
+    "## Checking the output Parquet file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -178,22 +209,6 @@
    "source": [
     "table.to_pandas()['contents'][0]"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -212,7 +227,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md
index eadd082fb..35e781007 100644
--- a/transforms/language/html2parquet/python/README.md
+++ b/transforms/language/html2parquet/python/README.md
@@ -18,10 +18,7 @@ This transform iterates through zipped collections of HTML files or single HTML
 ## Date
 
 **Last updated:** 10/16/24  
-- **Update details:**
-  - Added Trafilatura parameters (`favor_precision` and `favor_recall`) for enhanced control over content extraction.
-  - Enhanced table and image extraction features.  
-  - See [Pull Request #707](https://github.com/IBM/data-prep-kit/pull/707) for more details.
+**Update details:** Enhanced table and image extraction features by adding the corresponding Trafilatura parameters.
 
 ---
 
@@ -29,7 +26,7 @@ This transform iterates through zipped collections of HTML files or single HTML
 
 ### Input
 - Accepted Formats: Single HTML files or zipped collections of HTML files.  
-- Sample Input Files: [sample html files](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/python/test-data/input)  
+- Sample Input Files: [sample HTML files](test-data/input)
 
 ### Output
 - Format: Parquet files with the following structure:
@@ -205,7 +202,8 @@ python ../html2parquet/python/src/html2parquet_transform_python.py \
 
 ### Sample Notebook
 
-See the [sample notebook](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/html2parquet/notebooks/html2parquet.ipynb) for an example.
+See the [sample notebook](../notebooks/html2parquet.ipynb)
+for an example.
 
 
 ## Further Resources