From edd5841bb199c974489a8f612968c587bdeebab3 Mon Sep 17 00:00:00 2001
From: Constantin M Adam <cmadam@us.ibm.com>
Date: Mon, 25 Nov 2024 17:08:43 -0500
Subject: [PATCH] Add jupyter notebooks for python, ray and spark fuzzy dedup

Signed-off-by: Constantin M Adam <cmadam@us.ibm.com>
---
 transforms/universal/fdedup/fdedup.ipynb | 203 -----------------------
 1 file changed, 203 deletions(-)
 delete mode 100644 transforms/universal/fdedup/fdedup.ipynb

diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb
deleted file mode 100644
index 88bcd87aa..000000000
--- a/transforms/universal/fdedup/fdedup.ipynb
+++ /dev/null
@@ -1,203 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
-   "metadata": {},
-   "source": [
-    "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
-    "```\n",
-    "make venv\n",
-    "source venv/bin/activate && pip install jupyterlab\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%capture\n",
-    "## This is here as a reference only\n",
-    "# Users and application developers must use the right tag for the latest from pypi\n",
-    "#!pip install data-prep-toolkit\n",
-    "#!pip install data-prep-toolkit-transforms\n",
-    "#!pip install data-prep-connector"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
-   "metadata": {},
-   "source": [
-    "##### ***** Import required Classes and modules"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c2a12abc-9460-4e45-8961-873b48a9ab19",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import ast\n",
-    "import os\n",
-    "import sys\n",
-    "\n",
-    "from data_processing.utils import ParamsUtils\n",
-    "from fdedup_transform_python import parse_args\n",
-    "from fdedup_transform_ray import RayServiceOrchestrator"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
-   "metadata": {},
-   "source": [
-    "##### ***** Setup runtime parameters for this transform"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e90a853e-412f-45d7-af3d-959e755aeebb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# create parameters\n",
-    "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n",
-    "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n",
-    "params = {\n",
-    "    # transform configuration parameters\n",
-    "    \"input_folder\": input_folder,\n",
-    "    \"output_folder\": output_folder,\n",
-    "    \"contents_column\": \"contents\",\n",
-    "    \"document_id_column\": \"int_id_column\",\n",
-    "    \"num_permutations\": 112,\n",
-    "    \"num_bands\": 14,\n",
-    "    \"num_minhashes_per_band\": 8,\n",
-    "    \"num_segments\": 1,\n",
-    "    \"operation_mode\": \"filter_duplicates\",\n",
-    "    # ray configuration parameters\n",
-    "    \"run_locally\": True,\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
-   "metadata": {},
-   "source": [
-    "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0775e400-7469-49a6-8998-bd4772931459",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
-    "args = parse_args()\n",
-    "# Initialize the orchestrator\n",
-    "orchestrator = RayServiceOrchestrator(global_params=args)\n",
-    "# Launch ray fuzzy dedup execution\n",
-    "orchestrator.orchestrate()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
-   "metadata": {},
-   "source": [
-    "##### **** The specified folder will include the transformed parquet files."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7276fe84-6512-4605-ab65-747351e13a7c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import glob\n",
-    "glob.glob(\"ray/output/cleaned/*\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d30489d9-fc98-423e-90a8-e8f372787e88",
-   "metadata": {},
-   "source": [
-    "***** print the input data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import polars as pl\n",
-    "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n",
-    "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n",
-    "    print(input_df)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e",
-   "metadata": {},
-   "source": [
-    "***** print the output result"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import polars as pl\n",
-    "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n",
-    "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n",
-    "    print(output_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d60e391d-cf58-47ae-9991-04c05d114edc",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "fdedup_ray",
-   "language": "python",
-   "name": "fdedup_ray"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}