diff --git a/transforms/universal/hap/python/hap_python.ipynb b/transforms/universal/hap/python/hap_python.ipynb index deb147341..ad0c42a02 100644 --- a/transforms/universal/hap/python/hap_python.ipynb +++ b/transforms/universal/hap/python/hap_python.ipynb @@ -1,8 +1,54 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "cefa9cf6-e043-4b75-b416-a0b26c8cb3ad", + "metadata": {}, + "source": [ + "**** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + " make venv \n", + " source venv/bin/activate \n", + " pip install jupyterlab" + ] + }, { "cell_type": "code", "execution_count": 1, + "id": "4a84e965-feeb-424d-9263-9f127e53a1aa", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit\n", + "%pip install data-prep-toolkit-transforms==0.2.2.dev3" + ] + }, + { + "cell_type": "markdown", + "id": "1d695832-16bc-48d3-a9c3-6ce650ae4a5c", + "metadata": {}, + "source": [ + "**** Configure the transform parameters. The set of dictionary keys holding DocQualityTransform configuration for values are as follows:\n", + " - model_name_or_path - specify the HAP model, which should be compatible with HuggingFace's AutoModelForSequenceClassification. Defaults to IBM's open-source toxicity classifier ibm-granite/granite-guardian-hap-38m.\n", + " - annotation_column - the column name containing hap (toxicity) score in the output .parquet file. Defaults to hap_score.\n", + " - doc_text_column - the column name containing the document text in the input .parquet file. Defaults to contents.\n", + " - batch_size - modify it based on the infrastructure capacity. Defaults to 128.\n", + " - max_length - the maximum length for the tokenizer. Defaults to 512." + ] + }, + { + "cell_type": "markdown", + "id": "3f9dbf94-2db4-492d-bbcb-53ac3948c256", + "metadata": {}, + "source": [ + "***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "38aebf49-9460-4951-bb04-7045dec28690", "metadata": {}, "outputs": [ @@ -16,7 +62,6 @@ } ], "source": [ - "# import necessary packages\n", "import ast\n", "import os\n", "import sys\n", @@ -26,9 +71,17 @@ "from hap_transform_python import HAPPythonTransformConfiguration" ] }, + { + "cell_type": "markdown", + "id": "f443108f-40e4-40e5-a052-e8a7f4fbccdf", + "metadata": {}, + "source": [ + "***** Setup runtime parameters for this transform" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "6a8ec5e4-1f52-4c61-9c9e-4618f9034b80", "metadata": {}, "outputs": [], @@ -61,9 +114,17 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "d70abda8-3d66-4328-99ce-4075646a7756", + "metadata": {}, + "source": [ + "***** Use python runtime to invoke the transform" + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "94e908e2-1891-4dc7-9f85-85bbf8d44c5e", "metadata": {}, "outputs": [ @@ -71,40 +132,36 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:40:12 INFO - hap params are {'model_name_or_path': 'ibm-granite/granite-guardian-hap-38m', 'annotation_column': 'hap_score', 'doc_text_column': 'contents', 'inference_engine': 'CPU', 'max_length': 512, 'batch_size': 128} \n", - "22:40:12 INFO - pipeline id pipeline_id\n", - "22:40:12 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "22:40:12 INFO - data factory data_ is using local data access: input_folder - /Users/ian/Desktop/data-prep-kit/transforms/universal/hap/test-data/input output_folder - /Users/ian/Desktop/data-prep-kit/transforms/universal/hap/output\n", - "22:40:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:40:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:40:12 INFO - orchestrator hap started at 2024-12-02 22:40:12\n", - "22:40:12 ERROR - No input files to process - exiting\n", - "22:40:12 INFO - Completed execution in 0.0 min, execution result 0\n" + "11:29:11 INFO - hap params are {'model_name_or_path': 'ibm-granite/granite-guardian-hap-38m', 'annotation_column': 'hap_score', 'doc_text_column': 'contents', 'inference_engine': 'CPU', 'max_length': 512, 'batch_size': 128} \n", + "11:29:11 INFO - pipeline id pipeline_id\n", + "11:29:11 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "11:29:11 INFO - data factory data_ is using local data access: input_folder - /Users/ian/Desktop/data-prep-kit/transforms/universal/hap/test-data/input output_folder - /Users/ian/Desktop/data-prep-kit/transforms/universal/hap/output\n", + "11:29:11 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:29:11 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:29:11 INFO - orchestrator hap started at 2024-12-03 11:29:11\n", + "11:29:11 ERROR - No input files to process - exiting\n", + "11:29:11 INFO - Completed execution in 0.0 min, execution result 0\n" ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "# Set the simulated command line args\n", + "%%capture\n", "sys.argv = ParamsUtils.dict_to_req(d=params | hap_params)\n", - "# create launcher\n", "launcher = PythonTransformLauncher(runtime_config=HAPPythonTransformConfiguration())\n", - "# Launch to process the input\n", "launcher.launch()" ] }, + { + "cell_type": "markdown", + "id": "0bd4ad5c-a1d9-4ea2-abb7-e43571095392", + "metadata": {}, + "source": [ + "**** The specified folder will include the transformed parquet files." + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "f21d5d9b-562d-4530-8cea-2de5b63eb1dc", "metadata": {}, "outputs": [ @@ -114,7 +171,7 @@ "['../output/metadata.json', '../output/test1.parquet']" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" }