From edd5841bb199c974489a8f612968c587bdeebab3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 17:08:43 -0500 Subject: [PATCH] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 203 ----------------------- 1 file changed, 203 deletions(-) delete mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb deleted file mode 100644 index 88bcd87aa..000000000 --- a/transforms/universal/fdedup/fdedup.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "afd55886-5f5b-4794-838e-ef8179fb0394", - "metadata": {}, - "source": [ - "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", - "```\n", - "make venv\n", - "source venv/bin/activate && pip install jupyterlab\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "## This is here as a reference only\n", - "# Users and application developers must use the right tag for the latest from pypi\n", - "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" - ] - }, - { - "cell_type": "markdown", - "id": "ebf1f782-0e61-485c-8670-81066beb734c", - "metadata": {}, - "source": [ - "##### ***** Import required Classes and modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_ray import RayServiceOrchestrator" - ] - }, - { - "cell_type": "markdown", - "id": "7234563c-2924-4150-8a31-4aec98c1bf33", - "metadata": {}, - "source": [ - "##### ***** Setup runtime parameters for this transform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"num_segments\": 1,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - " # ray configuration parameters\n", - " \"run_locally\": True,\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = RayServiceOrchestrator(global_params=args)\n", - "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()" - ] - }, - { - "cell_type": "markdown", - "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", - "metadata": {}, - "source": [ - "##### **** The specified folder will include the transformed parquet files." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7276fe84-6512-4605-ab65-747351e13a7c", - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" - ] - }, - { - "cell_type": "markdown", - "id": "d30489d9-fc98-423e-90a8-e8f372787e88", - "metadata": {}, - "source": [ - "***** print the input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(input_df)" - ] - }, - { - "cell_type": "markdown", - "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", - "metadata": {}, - "source": [ - "***** print the output result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(output_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d60e391d-cf58-47ae-9991-04c05d114edc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fdedup_ray", - "language": "python", - "name": "fdedup_ray" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}