added notebook and fix makefile
Signed-off-by: Maroun Touma <[email protected]>
touma-I committed Dec 10, 2024
1 parent d1ad598 commit 0a0e785
Showing 3 changed files with 212 additions and 3 deletions.
5 changes: 3 additions & 2 deletions transforms/language/lang_id/Makefile
@@ -18,15 +18,16 @@ TRANSFORM_NAME=$(shell basename `pwd`)
run-cli-sample:
make venv
$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
-	--data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
+	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
--lang_id_model_credential "ANY CREDENTIAL" \
--lang_id_model_kind "fasttext" \
--lang_id_model_url "facebook/fasttext-language-identification" \
--lang_id_content_column_name "text"

run-cli-ray-sample:
make venv
$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
-	--run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
+	--run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
--lang_id_model_credential "ANY CREDENTIAL" \
--lang_id_model_kind "fasttext" \
--lang_id_model_url "facebook/fasttext-language-identification" \
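For readers following along, the updated run-cli-sample target is equivalent to invoking the pure-Python launcher directly. A minimal sketch, assuming the venv is active and the working directory is the transform root (the credential value is a placeholder, exactly as in the Makefile):

```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from dpk_lang_id.transform_python import LangIdentificationPythonTransformConfiguration

# Mirror the CLI flags from the make target above.
sys.argv = [
    "transform_python",
    "--data_local_config", "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}",
    "--lang_id_model_credential", "ANY CREDENTIAL",  # placeholder, as in the Makefile
    "--lang_id_model_kind", "fasttext",
    "--lang_id_model_url", "facebook/fasttext-language-identification",
    "--lang_id_content_column_name", "text",
]
launcher = PythonTransformLauncher(LangIdentificationPythonTransformConfiguration())
launcher.launch()
```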
26 changes: 25 additions & 1 deletion transforms/language/lang_id/dpk_lang_id/transform_python.py
@@ -9,12 +9,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.runtime.pure_python.runtime_configuration import (
PythonTransformRuntimeConfiguration,
)
-from data_processing.utils import get_logger
+from data_processing.utils import ParamsUtils, get_logger
from dpk_lang_id.transform import LangIdentificationTransformConfiguration


@@ -36,6 +37,29 @@ def __init__(self):
super().__init__(transform_config=LangIdentificationTransformConfiguration())


class LangIdRuntime:
def __init__(self, **kwargs):
self.params = {}
for key in kwargs:
self.params[key] = kwargs[key]
# if input_folder and output_folder are specified, assume they represent data_local_config
try:
local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
del self.params["input_folder"]
del self.params["output_folder"]
except KeyError:
# one or both folder keys are missing; leave params unchanged
pass

def transform(self):
sys.argv = ParamsUtils.dict_to_req(d=(self.params))
# create launcher
launcher = PythonTransformLauncher(LangIdentificationPythonTransformConfiguration())
# launch
return_code = launcher.launch()
return return_code


if __name__ == "__main__":
launcher = PythonTransformLauncher(LangIdentificationPythonTransformConfiguration())
logger.info("Launching lang_id transform")
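The new LangIdRuntime wrapper added above is what the notebook in this commit uses: it folds input_folder/output_folder into data_local_config and launches the transform. Its invocation, taken from the notebook below (the credential value is a placeholder):

```python
from dpk_lang_id.transform_python import LangIdRuntime

return_code = LangIdRuntime(
    input_folder="test-data/input",
    output_folder="output",
    lang_id_model_credential="PUT YOUR OWN HUGGINGFACE CREDENTIAL",  # placeholder
    lang_id_model_kind="fasttext",
    lang_id_model_url="facebook/fasttext-language-identification",
    lang_id_content_column_name="text",
).transform()  # returns 0 on success
```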
184 changes: 184 additions & 0 deletions transforms/language/lang_id/lang_id.ipynb
@@ -0,0 +1,184 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "afd55886-5f5b-4794-838e-ef8179fb0394",
"metadata": {},
"source": [
"##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
"```\n",
"make venv \n",
"source venv/bin/activate \n",
"pip install jupyterlab\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"## This is here as a reference only\n",
"# Users and application developers must use the right tag for the latest from pypi\n",
"%pip install data-prep-toolkit\n",
"%pip install data-prep-toolkit-transforms"
]
},
{
"cell_type": "markdown",
"id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n",
"| Key name | Default | Description |\n",
"|------------|----------|--------------|\n",
"| _model_credential_ | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |\n",
"| _model_kind_ | _unset_ | specifies what kind of model you want to use for language identification. Currently, only `fasttext` is available. |\n",
"| _model_url_ | _unset_ | specifies url that model locates. For fasttext, this will be repo nme of the model, like `facebook/fasttext-language-identification` |\n",
"| _content_column_name_ | `contents` | specifies name of the column containing documents |\n",
"| _output_lang_column_name_ | `lang` | specifies name of the output column to hold predicted language code |\n",
"| _output_score_column_name_ | `score` | specifies name of the output column to hold score of prediction |"
]
},
{
"cell_type": "markdown",
"id": "ebf1f782-0e61-485c-8670-81066beb734c",
"metadata": {},
"source": [
"##### ***** Import required classes and modules"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
"metadata": {},
"outputs": [],
"source": [
"from dpk_lang_id.transform_python import LangIdRuntime"
]
},
{
"cell_type": "markdown",
"id": "7234563c-2924-4150-8a31-4aec98c1bf33",
"metadata": {},
"source": [
"##### ***** Setup runtime parameters for this transform"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"00:06:41 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"00:06:41 INFO - pipeline id pipeline_id\n",
"00:06:41 INFO - code location None\n",
"00:06:41 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"00:06:41 INFO - data factory data_ max_files -1, n_sample -1\n",
"00:06:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"00:06:41 INFO - orchestrator lang_id started at 2024-12-11 00:06:41\n",
"00:06:41 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
"00:06:47 INFO - Completed 1 files (33.33%) in 0.074 min\n",
"00:06:47 INFO - Completed 2 files (66.67%) in 0.076 min\n",
"00:06:48 INFO - Completed 3 files (100.0%) in 0.081 min\n",
"00:06:48 INFO - Done processing 3 files, waiting for flush() completion.\n",
"00:06:48 INFO - done flushing in 0.0 sec\n",
"00:06:48 INFO - Completed execution in 0.111 min, execution result 0\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"LangIdRuntime(input_folder= \"test-data/input\",\n",
" output_folder= \"output\",\n",
" lang_id_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n",
" lang_id_model_kind= \"fasttext\",\n",
" lang_id_model_url= \"facebook/fasttext-language-identification\",\n",
" lang_id_content_column_name= \"text\").transform()"
]
},
{
"cell_type": "markdown",
"id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
"metadata": {},
"source": [
"##### **** The specified folder will include the transformed parquet files."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7276fe84-6512-4605-ab65-747351e13a7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output/test_03.parquet',\n",
" 'output/test_02.parquet',\n",
" 'output/metadata.json',\n",
" 'output/test_01.parquet']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import glob\n",
"glob.glob(\"output/*\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
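To sanity-check the notebook's results, the output parquet files can be inspected directly. A small sketch, assuming pyarrow is available in the venv (the lang and score column names are the defaults documented in the parameter table above):

```python
import pyarrow.parquet as pq

# Read one of the transformed files and confirm the added columns are present.
table = pq.read_table("output/test_01.parquet")
print(table.schema)  # should now include the 'lang' and 'score' columns
print(table.select(["lang", "score"]).slice(0, 5))  # first few predictions
```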
