IBM · touma-I · Dec 17, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 10, 2024
diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
@@ -51,63 +51,78 @@ publish:
 
 test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean
 
+# Convenience targets: each re-invokes make on the test-image target with
+# BUILD_SPECIFIC_RUNTIME preset, so that only the named runtime's branch of
+# test-image is taken.  They name commands, not files, hence .PHONY.
+.PHONY: test-image-python test-image-ray test-image-spark
+test-image-python:
+	$(MAKE) BUILD_SPECIFIC_RUNTIME=python test-image
+
+test-image-ray:
+	$(MAKE) BUILD_SPECIFIC_RUNTIME=ray test-image
+
+test-image-spark:
+	$(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image
+
 test-image:: .default.build-lib-wheel
-	@if [ -e Dockerfile.python ]; then \
-		$(MAKE) DOCKER_FILE=Dockerfile.python \
-				TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
-				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
-				test-image-sequence ; \
+	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
+		if [ -e Dockerfile.python ]; then \
+			$(MAKE) DOCKER_FILE=Dockerfile.python \
+					TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
+					DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
+					test-image-sequence ; \
+		fi ;\
 	fi
-	@if [ -e Dockerfile.ray ]; then \
-		$(MAKE) DOCKER_FILE=Dockerfile.ray \
-				TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
-				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
-				BASE_IMAGE=$(RAY_BASE_IMAGE)  \
-				test-image-sequence ; \
+	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
+		if [ -e Dockerfile.ray ]; then \
+			$(MAKE) DOCKER_FILE=Dockerfile.ray \
+					TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
+					DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
+					BASE_IMAGE=$(RAY_BASE_IMAGE)  \
+					test-image-sequence ; \
+		fi ;\
 	fi
-	@if [ -e Dockerfile.spark ]; then \
-		$(MAKE) DOCKER_FILE=Dockerfile.spark \
-				TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
-				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-				BASE_IMAGE=$(SPARK_BASE_IMAGE)  \
-				test-image-sequence ; \
+	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
+		if [ -e Dockerfile.spark ]; then \
+			$(MAKE) DOCKER_FILE=Dockerfile.spark \
+					TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
+					DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
+					BASE_IMAGE=$(SPARK_BASE_IMAGE)  \
+					test-image-sequence ; \
+		fi ;\
 	fi
 	-rm -rf data-processing-dist
 
 
 image-python:
-	@if [ -e Dockerfile.python ]; then \
-		$(MAKE) DOCKER_FILE=Dockerfile.python \
-				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
-				.defaults.lib-whl-image ; \
-	fi
+	$(MAKE) BUILD_SPECIFIC_RUNTIME=python image
 
 image-ray:
-	@if [ -e Dockerfile.ray ]; then \
-		$(MAKE) DOCKER_FILE=Dockerfile.ray \
-				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
-				BASE_IMAGE=$(RAY_BASE_IMAGE)  \
-				.defaults.lib-whl-image ; \
-	fi
+	$(MAKE) BUILD_SPECIFIC_RUNTIME=ray image
 
 image-spark:
-	@if [ -e Dockerfile.spark ]; then \
-		$(MAKE) DOCKER_FILE=Dockerfile.spark \
-				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-				BASE_IMAGE=$(SPARK_BASE_IMAGE)  \
-				.defaults.lib-whl-image ; \
-	fi
+	$(MAKE) BUILD_SPECIFIC_RUNTIME=spark image
 
 image:: .default.build-lib-wheel
 	## Build all possible images unless a specific runtime is specified
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
-		$(MAKE) image-python ; \
+		if [ -e Dockerfile.python ]; then \
+			$(MAKE) DOCKER_FILE=Dockerfile.python \
+					DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
+					.defaults.lib-whl-image ; \
+		fi ; \
 	fi
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
-		$(MAKE) image-ray ; \
+		if [ -e Dockerfile.ray ]; then \
+			$(MAKE) DOCKER_FILE=Dockerfile.ray \
+					DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
+					BASE_IMAGE=$(RAY_BASE_IMAGE)  \
+					.defaults.lib-whl-image ; \
+		fi ; \
 	fi
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
-		$(MAKE) image-spark ; \
+		if [ -e Dockerfile.spark ]; then \
+			$(MAKE) DOCKER_FILE=Dockerfile.spark \
+					DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
+					BASE_IMAGE=$(SPARK_BASE_IMAGE)  \
+					.defaults.lib-whl-image ; \
+		fi ; \
 	fi
 	-rm -rf data-processing-dist
 

diff --git a/...sforms/language/lang_id/python/Dockerfile → ...sforms/language/lang_id/Dockerfile.python b/...sforms/language/lang_id/python/Dockerfile → ...sforms/language/lang_id/Dockerfile.python
@@ -24,10 +24,9 @@ RUN apt update && apt install gcc g++ -y
 RUN mkdir -p /home/dpk/.cache/huggingface/hub && chmod -R 777 /home/dpk/.cache/huggingface/hub
 USER dpk
 
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml 
+COPY --chown=dpk:root dpk_lang_id/ dpk_lang_id/
 COPY --chown=dpk:root requirements.txt requirements.txt
-RUN pip install --no-cache-dir -e .
+RUN pip install --no-cache-dir -r requirements.txt
 
 # clean up apt
 USER root 
@@ -36,18 +35,6 @@ RUN apt-get remove gcc g++ -y \
     && rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/*
 USER dpk
 
-#COPY requirements.txt requirements.txt
-#RUN pip install --no-cache-dir -r  requirements.txt
-
-# copy the main() entry point to the image 
-COPY ./src/lang_id_transform_python.py .
-
-# copy some of the samples in
-COPY ./src/lang_id_local.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/dpk

diff --git a/transforms/language/lang_id/ray/Dockerfile → transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/ray/Dockerfile → transforms/language/lang_id/Dockerfile.ray
@@ -19,13 +19,10 @@ USER ray
 COPY --chown=ray:users data-processing-dist data-processing-dist
 RUN  pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-## Copy the python version of the tansform
-COPY --chown=ray:users python-transform/  python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml 
-RUN pip install --no-cache-dir -e .
+COPY --chown=ray:users dpk_lang_id/ dpk_lang_id/
+COPY --chown=ray:users requirements.txt requirements.txt 
+RUN pip install --no-cache-dir -r requirements.txt
 
 # clean up apt
 USER root 
@@ -34,16 +31,6 @@ RUN sudo apt remove gcc g++ -y \
     && sudo rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/*
 USER ray
 
-# copy the main() entry point to the image 
-COPY ./src/lang_id_transform_ray.py .
-
-# copy some of the samples in
-COPY ./src/lang_id_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
-
 # Set environment
 ENV PYTHONPATH /home/ray
 

diff --git a/transforms/language/lang_id/Makefile b/transforms/language/lang_id/Makefile
@@ -1,84 +1,37 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
-
-setup::
-	@# Help: Recursively make $@ all subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-clean::
-	@# Help: Recursively make $@ all subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-build::
-	@# Help: Recursively make $@ in subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-setup::
-	@# Help: Recursively make $@ in subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-venv::
-	@# Help: Recursively make $@ in subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-image:: 
-	@# Help: Recursively make $@ in all subdirs 
-	@$(MAKE) RULE=$@ .recurse
-
-set-versions::  
-	@# Help: Recursively $@ in all subdirs 
-	@$(MAKE) RULE=$@ .recurse
-
-publish:: 
-	@# Help: Recursively make $@ in all subdirs 
-	@$(MAKE) RULE=$@ .recurse
-
-test-image:: 
-	@# Help: Recursively make $@ in all subdirs 
-	@$(MAKE) RULE=$@ .recurse
-
-test:: 
-	@# Help: Recursively make $@ in all subdirs 
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs 
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then                 \
-	    $(MAKE) -C kfp_ray workflow-venv;   \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then                 \
-	    $(MAKE) -C kfp_ray workflow-test;   \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then                 \
-	    $(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then                 \
-	    $(MAKE) -C  kfp_ray workflow-build; \
-	fi
+include $(REPOROOT)/transforms/.make.cicd.targets
+
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree,  so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=$(shell basename `pwd`)
+
+################################################################################
+
+
+
+# Build the venv, then run the dpk_$(TRANSFORM_NAME).transform_python module
+# with the sample lang_id arguments, reading test-data/input and writing to
+# ./output.  The sub-make is invoked via $(MAKE) rather than a literal `make`
+# so that command-line flags (-n, -j, ...) propagate, and the venv is
+# activated with the POSIX `.` builtin rather than the bash-only `source`.
+.PHONY: run-cli-sample
+run-cli-sample:
+	$(MAKE) venv
+	. venv/bin/activate && \
+	$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
+                --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}"  \
+                --lang_id_model_credential "ANY CREDENTIAL"	\
+				--lang_id_model_kind "fasttext"	\
+				--lang_id_model_url "facebook/fasttext-language-identification" \
+				--lang_id_content_column_name "text"
+
+# Build the venv, then run the dpk_$(TRANSFORM_NAME).ray.transform module
+# with --run_locally True and the sample lang_id arguments, reading
+# test-data/input and writing to ./output.  The sub-make is invoked via
+# $(MAKE) rather than a literal `make` so that command-line flags propagate,
+# and the venv is activated with the POSIX `.` builtin rather than the
+# bash-only `source`.
+.PHONY: run-cli-ray-sample
+run-cli-ray-sample:
+	$(MAKE) venv
+	. venv/bin/activate && \
+	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
+                --run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}"  \
+                --lang_id_model_credential "ANY CREDENTIAL"	\
+				--lang_id_model_kind "fasttext"	\
+				--lang_id_model_url "facebook/fasttext-language-identification" \
+				--lang_id_content_column_name "text"
 
diff --git a/transforms/language/lang_id/README.md b/transforms/language/lang_id/README.md
@@ -1,12 +1,75 @@
 # Language Identification Transform 
 The Language Identification transforms serves as a simple exemplar to demonstrate the development
-of a simple 1:1 transform.  Per the set of 
+of a simple 1:1 transform.  
+Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up.
+
+## Summary 
+This transform identifies the language of each text, together with a confidence score, using the fasttext language identification model. [ref](https://huggingface.co/facebook/fasttext-language-identification)
+
+## Configuration and command line Options
+
+The dictionary keys holding the [LangIdentificationTransform](dpk_lang_id/transform.py) 
+configuration values are as follows:
+
+| Key name  | Default  | Description |
+|------------|----------|--------------|
+| _model_credential_ | _unset_ | specifies the credential you use to get the model. This will be a huggingface token. [Guide to get a huggingface token](https://huggingface.co/docs/hub/security-tokens) |
+| _model_kind_ | _unset_ | specifies what kind of model you want to use for language identification. Currently, only `fasttext` is available. |
+| _model_url_ | _unset_ |  specifies the url where the model is located. For fasttext, this will be the repo name of the model, like `facebook/fasttext-language-identification` |
+| _content_column_name_ | `contents` | specifies name of the column containing documents |
+| _output_lang_column_name_ | `lang` | specifies name of the output column to hold predicted language code |
+| _output_score_column_name_ | `score` | specifies name of the output column to hold score of prediction |
+
+## Running
+
+### Launched Command Line Options 
+The following command line arguments are available in addition to 
+the options provided by 
+the [python launcher options](../../../data-processing-lib/doc/python-launcher-options.md).
+```
+  --lang_id_model_credential LANG_ID_MODEL_CREDENTIAL   the credential you use to get model. This will be huggingface token.
+  --lang_id_model_kind LANG_ID_MODEL_KIND   what kind of model you want to use for language identification. Currently, only `fasttext` is available.
+  --lang_id_model_url LANG_ID_MODEL_URL   url that model locates. For fasttext, this will be repo name of the model, like `facebook/fasttext-language-identification`
+  --lang_id_content_column_name LANG_ID_CONTENT_COLUMN_NAME   A name of the column containing documents
+  --lang_id_output_lang_column_name LANG_ID_OUTPUT_LANG_COLUMN_NAME   Column name to store identified language
+  --lang_id_output_score_column_name LANG_ID_OUTPUT_SCORE_COLUMN_NAME   Column name to store the score of language identification
+```
+These correspond to the configuration keys described above.
+
+### Code example
+Here is a sample [notebook](lang_id.ipynb)
+
+## Troubleshooting guide
+
+For M1 Mac users: if you see the following error while running a make command, `error: command '/usr/bin/clang' failed with exit code 1`, follow [these steps](https://freeman.vc/notes/installing-fasttext-on-an-m1-mac).
+
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the 
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+# Language Identification Ray Transform 
+Please see the set of
 [transform project conventions](../../README.md#transform-project-conventions)
-the following runtimes are available:
-
-* [python](python/README.md) - provides the base python-based transformation 
-implementation.
-* [ray](ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-* [kfp](kfp_ray/README.md) - enables running the ray docker image 
-in a kubernetes cluster using a generated `yaml` file.
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary 
+This project wraps the language identification transform with a Ray runtime.
+
+## Configuration and command line Options
+
+Language Identification configuration and command line options are the same as for the base python transform. 
+
+### Launched Command Line Options 
+In addition to those available to the transform as defined here,
+the set of 
+[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) are available.
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the 
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
diff --git a/...anguage/lang_id/python/src/lang_models.py → ...nguage/lang_id/dpk_lang_id/lang_models.py b/...anguage/lang_id/python/src/lang_models.py → ...nguage/lang_id/dpk_lang_id/lang_models.py
diff --git a/...guage/lang_id/python/src/lang_id_local.py → ...rms/language/lang_id/dpk_lang_id/local.py b/...guage/lang_id/python/src/lang_id_local.py → ...rms/language/lang_id/dpk_lang_id/local.py
@@ -13,7 +13,7 @@
 import os
 
 from data_processing.data_access import DataAccessLocal
-from lang_id_transform import (
+from dpk_lang_id.transform import (
     LangIdentificationTransform,
     content_column_name_key,
     model_credential_key,