added notebook and fix makefile
Signed-off-by: Maroun Touma <[email protected]>
touma-I committed Dec 10, 2024
1 parent d1ad598 commit 0a0e785
Showing 3 changed files with 212 additions and 3 deletions.
5 changes: 3 additions & 2 deletions transforms/language/lang_id/Makefile
@@ -18,15 +18,16 @@ TRANSFORM_NAME=$(shell basename `pwd`)
run-cli-sample:
make venv
$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
-	--data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
+	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
--lang_id_model_credential "ANY CREDENTIAL" \
--lang_id_model_kind "fasttext" \
--lang_id_model_url "facebook/fasttext-language-identification" \
--lang_id_content_column_name "text"

run-cli-ray-sample:
make venv
$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
-	--run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
+	--run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
--lang_id_model_credential "ANY CREDENTIAL" \
--lang_id_model_kind "fasttext" \
--lang_id_model_url "facebook/fasttext-language-identification" \
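For readers following along, the updated run-cli-sample target is equivalent to invoking the pure-Python launcher directly. A minimal sketch, assuming the venv is active and the working directory is the transform root (the credential value is a placeholder, exactly as in the Makefile):

```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from dpk_lang_id.transform_python import LangIdentificationPythonTransformConfiguration

# Mirror the CLI flags from the make target above.
sys.argv = [
    "transform_python",
    "--data_local_config", "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}",
    "--lang_id_model_credential", "ANY CREDENTIAL",  # placeholder, as in the Makefile
    "--lang_id_model_kind", "fasttext",
    "--lang_id_model_url", "facebook/fasttext-language-identification",
    "--lang_id_content_column_name", "text",
]
launcher = PythonTransformLauncher(LangIdentificationPythonTransformConfiguration())
launcher.launch()
```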
26 changes: 25 additions & 1 deletion transforms/language/lang_id/dpk_lang_id/transform_python.py
@@ -9,12 +9,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.runtime.pure_python.runtime_configuration import (
PythonTransformRuntimeConfiguration,
)
-from data_processing.utils import get_logger
+from data_processing.utils import ParamsUtils, get_logger
from dpk_lang_id.transform import LangIdentificationTransformConfiguration


@@ -36,6 +37,29 @@ def __init__(self):
super().__init__(transform_config=LangIdentificationTransformConfiguration())


class LangIdRuntime:
def __init__(self, **kwargs):
self.params = {}
for key in kwargs:
self.params[key] = kwargs[key]
# if input_folder and output_folder are specified, assume they represent data_local_config
try:
local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
del self.params["input_folder"]
del self.params["output_folder"]
except KeyError:
# one or both folder keys are missing; leave params unchanged
pass

def transform(self):
sys.argv = ParamsUtils.dict_to_req(d=(self.params))
# create launcher
launcher = PythonTransformLauncher(LangIdentificationPythonTransformConfiguration())
# launch
return_code = launcher.launch()
return return_code


if __name__ == "__main__":
launcher = PythonTransformLauncher(LangIdentificationPythonTransformConfiguration())
logger.info("Launching lang_id transform")
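The new LangIdRuntime wrapper added above is what the notebook in this commit uses: it folds input_folder/output_folder into data_local_config and launches the transform. Its invocation, taken from the notebook below (the credential value is a placeholder):

```python
from dpk_lang_id.transform_python import LangIdRuntime

return_code = LangIdRuntime(
    input_folder="test-data/input",
    output_folder="output",
    lang_id_model_credential="PUT YOUR OWN HUGGINGFACE CREDENTIAL",  # placeholder
    lang_id_model_kind="fasttext",
    lang_id_model_url="facebook/fasttext-language-identification",
    lang_id_content_column_name="text",
).transform()  # returns 0 on success
```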
184 changes: 184 additions & 0 deletions transforms/language/lang_id/lang_id.ipynb
@@ -0,0 +1,184 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "afd55886-5f5b-4794-838e-ef8179fb0394",
"metadata": {},
"source": [
"##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
"```\n",
"make venv \n",
"source venv/bin/activate \n",
"pip install jupyterlab\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"## This is here as a reference only\n",
"# Users and application developers must use the right tag for the latest from pypi\n",
"%pip install data-prep-toolkit\n",
"%pip install data-prep-toolkit-transforms"
]
},
{
"cell_type": "markdown",
"id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n",
"| Key name | Default | Description |\n",
"|------------|----------|--------------|\n",
"| _model_credential_ | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |\n",
"| _model_kind_ | _unset_ | specifies what kind of model you want to use for language identification. Currently, only `fasttext` is available. |\n",
"| _model_url_ | _unset_ | specifies url that model locates. For fasttext, this will be repo nme of the model, like `facebook/fasttext-language-identification` |\n",
"| _content_column_name_ | `contents` | specifies name of the column containing documents |\n",
"| _output_lang_column_name_ | `lang` | specifies name of the output column to hold predicted language code |\n",
"| _output_score_column_name_ | `score` | specifies name of the output column to hold score of prediction |"
]
},
{
"cell_type": "markdown",
"id": "ebf1f782-0e61-485c-8670-81066beb734c",
"metadata": {},
"source": [
"##### ***** Import required classes and modules"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
"metadata": {},
"outputs": [],
"source": [
"from dpk_lang_id.transform_python import LangIdRuntime"
]
},
{
"cell_type": "markdown",
"id": "7234563c-2924-4150-8a31-4aec98c1bf33",
"metadata": {},
"source": [
"##### ***** Setup runtime parameters for this transform"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"00:06:41 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"00:06:41 INFO - pipeline id pipeline_id\n",
"00:06:41 INFO - code location None\n",
"00:06:41 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"00:06:41 INFO - data factory data_ max_files -1, n_sample -1\n",
"00:06:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"00:06:41 INFO - orchestrator lang_id started at 2024-12-11 00:06:41\n",
"00:06:41 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
"00:06:47 INFO - Completed 1 files (33.33%) in 0.074 min\n",
"00:06:47 INFO - Completed 2 files (66.67%) in 0.076 min\n",
"00:06:48 INFO - Completed 3 files (100.0%) in 0.081 min\n",
"00:06:48 INFO - Done processing 3 files, waiting for flush() completion.\n",
"00:06:48 INFO - done flushing in 0.0 sec\n",
"00:06:48 INFO - Completed execution in 0.111 min, execution result 0\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"LangIdRuntime(input_folder= \"test-data/input\",\n",
" output_folder= \"output\",\n",
" lang_id_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n",
" lang_id_model_kind= \"fasttext\",\n",
" lang_id_model_url= \"facebook/fasttext-language-identification\",\n",
" lang_id_content_column_name= \"text\").transform()"
]
},
{
"cell_type": "markdown",
"id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
"metadata": {},
"source": [
"##### **** The specified folder will include the transformed parquet files."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7276fe84-6512-4605-ab65-747351e13a7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output/test_03.parquet',\n",
" 'output/test_02.parquet',\n",
" 'output/metadata.json',\n",
" 'output/test_01.parquet']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import glob\n",
"glob.glob(\"output/*\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
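To sanity-check the notebook's results, the output parquet files can be inspected directly. A small sketch, assuming pyarrow is available in the venv (the lang and score column names are the defaults documented in the parameter table above):

```python
import pyarrow.parquet as pq

# Read one of the transformed files and confirm the added columns are present.
table = pq.read_table("output/test_01.parquet")
print(table.schema)  # should now include the 'lang' and 'score' columns
print(table.select(["lang", "score"]).slice(0, 5))  # first few predictions
```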
