From 8972ff9013712ad4107c96ba6b391f0cd2899fe3 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 1 Feb 2024 19:21:10 +0100 Subject: [PATCH] update indexing notebook --- README.md | 4 +- requirements.txt | 2 +- .../aggregate_eval_results/requirements.txt | 2 +- src/components/text_cleaning/Dockerfile | 13 - .../text_cleaning/fondant_component.yaml | 11 - src/components/text_cleaning/requirements.txt | 1 - src/components/text_cleaning/src/main.py | 18 -- src/{pipeline.ipynb => indexing.ipynb} | 230 +++++++++--------- src/parameter_search.ipynb | 4 +- 9 files changed, 124 insertions(+), 161 deletions(-) delete mode 100644 src/components/text_cleaning/Dockerfile delete mode 100644 src/components/text_cleaning/fondant_component.yaml delete mode 100644 src/components/text_cleaning/requirements.txt delete mode 100644 src/components/text_cleaning/src/main.py rename src/{pipeline.ipynb => indexing.ipynb} (76%) diff --git a/README.md b/README.md index 1b00e9b..0f9b5ba 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Check out the Fondant [website](https://fondant.ai/) if you want to learn more a ### A simple RAG indexing pipeline -A [**notebook**](./src/pipeline.ipynb) with a simple Fondant pipeline to index your data into a +A [**notebook**](./src/indexing.ipynb) with a simple Fondant pipeline to index your data into a RAG system. 
### Iterative tuning of a RAG indexing pipeline @@ -84,4 +84,4 @@ fondant --help There are two options to run the pipeline: - [**Via python files and the Fondant CLI:**](https://fondant.ai/en/latest/pipeline/#running-a-pipeline) how you should run Fondant in production -- [**Via a Jupyter notebook**](./src/pipeline.ipynb): ideal to learn about Fondant +- [**Via a Jupyter notebook**](./src/indexing.ipynb): ideal to learn about Fondant diff --git a/requirements.txt b/requirements.txt index 609e239..374d700 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -fondant==0.9.0 +fondant==0.10.0 notebook==7.0.6 weaviate-client==3.25.3 diff --git a/src/components/aggregate_eval_results/requirements.txt b/src/components/aggregate_eval_results/requirements.txt index 53e5d83..894a8b5 100644 --- a/src/components/aggregate_eval_results/requirements.txt +++ b/src/components/aggregate_eval_results/requirements.txt @@ -1 +1 @@ -fondant[component]==0.9.0 \ No newline at end of file +fondant[component]==0.10.0 \ No newline at end of file diff --git a/src/components/text_cleaning/Dockerfile b/src/components/text_cleaning/Dockerfile deleted file mode 100644 index d32a608..0000000 --- a/src/components/text_cleaning/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Set the working directory to the component folder -WORKDIR /component/src - -# Copy over src-files -COPY src/ . 
- -ENTRYPOINT ["fondant", "execute", "main"] diff --git a/src/components/text_cleaning/fondant_component.yaml b/src/components/text_cleaning/fondant_component.yaml deleted file mode 100644 index ee2d561..0000000 --- a/src/components/text_cleaning/fondant_component.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: Text cleaning component -description: Clean text passages -image: ghcr.io/ml6team/text_cleaning:dev - -consumes: - text: - type: string - -produces: - text: - type: string diff --git a/src/components/text_cleaning/requirements.txt b/src/components/text_cleaning/requirements.txt deleted file mode 100644 index f2c5454..0000000 --- a/src/components/text_cleaning/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -fondant[component]==0.9.0 diff --git a/src/components/text_cleaning/src/main.py b/src/components/text_cleaning/src/main.py deleted file mode 100644 index 687abdb..0000000 --- a/src/components/text_cleaning/src/main.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from fondant.component import PandasTransformComponent - - -class TextCleaningComponent(PandasTransformComponent): - def __init__(self, **kwargs): - """Initialize your component.""" - - def remove_empty_lines(self, text): - lines = text.split("\n") - non_empty_lines = [line.strip() for line in lines if line.strip()] - return "\n".join(non_empty_lines) - - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - dataframe["text"] = dataframe["text"].apply( - self.remove_empty_lines, - ) - return dataframe diff --git a/src/pipeline.ipynb b/src/indexing.ipynb similarity index 76% rename from src/pipeline.ipynb rename to src/indexing.ipynb index 5735e42..dc2db5a 100644 --- a/src/pipeline.ipynb +++ b/src/indexing.ipynb @@ -109,7 +109,9 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -r ../requirements.txt" + "!pip install -r ../requirements.txt\n", + "# TODO: remove after component inspection PR is merged \n", + "!pip3 install 
\"fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@f0326c09c1c681a5d275605fe57eeb65918ce6c7\"" ] }, { @@ -156,7 +158,7 @@ "import pyarrow as pa\n", "\n", "text = pipeline.read(\n", - " \"load_from_hf_hub\",\n", + " \"components/load_from_hf_hub\",\n", " arguments={\n", " # Add arguments\n", " \"dataset_name\": \"wikitext@~parquet\",\n", @@ -168,6 +170,90 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import typing as t \n", + "\n", + "from fondant.component import PandasTransformComponent\n", + "from fondant.pipeline import lightweight_component\n", + "import logging\n", + "import typing as t \n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "\n", + "#TODO: Move all imports defined within functions under the class definition after https://github.com/ml6team/fondant/pull/835 is merged \n", + "@lightweight_component(\n", + " consumes={\"text\":pa.string()},\n", + " produces={\"text\":pa.string(), \"original_document_id\":pa.string()},\n", + " extra_requires=[\"langchain==0.0.329\"]\n", + ")\n", + "class ChunkTextComponent(PandasTransformComponent):\n", + " \"\"\"Component that chunks text into smaller segments.\n", + " More information about the different chunking strategies can be here:\n", + " - https://python.langchain.com/docs/modules/data_connection/document_transformers/\n", + " - https://www.pinecone.io/learn/chunking-strategies/.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " *,\n", + " chunk_size: int,\n", + " chunk_overlap: int,\n", + " ):\n", + " \"\"\"\n", + " Args:\n", + " chunk_size: the chunk size \n", + " chunk_overlap: the overlap between chunks\n", + " \"\"\"\n", + " import logging\n", + " import typing as t \n", + " from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + " self.logger = logging.getLogger(__name__)\n", + " self.chunker = 
RecursiveCharacterTextSplitter(\n", + " chunk_size=chunk_size,\n", + " chunk_overlap=chunk_overlap\n", + " )\n", + "\n", + " def chunk_text(self, row) -> t.List[t.Tuple]:\n", + " # Multi-index df has id under the name attribute\n", + " doc_id = row.name\n", + " text_data = row[\"text\"]\n", + " docs = self.chunker.create_documents([text_data])\n", + "\n", + " return [\n", + " (doc_id, f\"{doc_id}_{chunk_id}\", chunk.page_content)\n", + " for chunk_id, chunk in enumerate(docs)\n", + " ]\n", + "\n", + " def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:\n", + " import itertools\n", + " \n", + " self.logger.info(f\"Chunking {len(dataframe)} documents...\")\n", + "\n", + " results = dataframe.apply(\n", + " self.chunk_text,\n", + " axis=1,\n", + " ).to_list()\n", + "\n", + " # Flatten results\n", + " results = list(itertools.chain.from_iterable(results))\n", + "\n", + " # Turn into dataframes\n", + " results_df = pd.DataFrame(\n", + " results,\n", + " columns=[\"original_document_id\", \"id\", \"text\"],\n", + " )\n", + " results_df = results_df.set_index(\"id\")\n", + "\n", + " return results_df\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -183,16 +269,18 @@ "source": [ "import utils\n", "\n", + "# TODO: remove /components after using a stable release \n", "\n", "chunks = text.apply(\n", - " \"chunk_text\",\n", + " ChunkTextComponent,\n", " arguments={\n", - " \"chunk_args\": {\"chunk_size\": 512, \"chunk_overlap\": 32}\n", + " \"chunk_size\": 512, \"chunk_overlap\": 32\n", " }\n", ")\n", "\n", + "\n", "embeddings = chunks.apply(\n", - " \"embed_text\",\n", + " \"components/embed_text\",\n", " arguments={\n", " \"model_provider\": \"huggingface\",\n", " \"model\": \"all-MiniLM-L6-v2\"\n", @@ -205,7 +293,7 @@ ")\n", "\n", "embeddings.write(\n", - " \"index_weaviate\",\n", + " \"components/index_weaviate\",\n", " arguments={\n", " \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n", " \"class_name\": \"index\",\n", @@ -328,19 +416,7 @@ 
"\n", "Certainly, you can create your own custom components and use them in the pipeline. Let's consider building a component that cleans our text articles. For demo purpose we will implement a component thats removes all empty lines.\n", "\n", - "To implement a custom component, a couple of files need to be defined:\n", - "\n", - "- Fondant component specification\n", - "- main.py script in a src folder\n", - "- Dockerfile\n", - "- requirements.txt\n", - "\n", - "If you want to learn more about the creating custom components checkout [our documentation](https://fondant.ai/en/latest/components/custom_component/).\n", - "\n", - "\n", - "### Component specification\n", - "\n", - "The component specification is represented by a single `fondant_component.yaml` file. There you can define which fields your component consumes and produces. " + "We will implement this component as a lightweight component. Check out our [guide](https://fondant.ai/en/latest/components/lightweight_components/) on lightweight components for more info. " ] }, { @@ -349,48 +425,28 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile components/text_cleaning/fondant_component.yaml\n", - "name: Text cleaning component\n", - "description: Clean text passages\n", - "image: ghcr.io/ml6team/text_cleaning:dev\n", - "\n", - "consumes:\n", - " text:\n", - " type: string\n", - "\n", - "produces:\n", - " text:\n", - " type: string" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Main.py script\n", - "\n", - "The core logic of the component should be implemented in a `main.py` script in a folder called `src`. We can implement the text cleaning logic as a class. We will inherit from the base class `PandasTransformComponent`. The `PandasTransformComponent` operates on pandas dataframes. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile components/text_cleaning/src/main.py\n", - "import pandas as pd\n", - "from fondant.component import PandasTransformComponent\n", - "\n", - "\n", + "@lightweight_component(\n", + " consumes={\"text\":pa.string()},\n", + " produces={\"text\":pa.string()},\n", + ")\n", "class TextCleaningComponent(PandasTransformComponent):\n", - " def __init__(self, **kwargs):\n", - " \"\"\"Initialize your component\"\"\"\n", + " def __init__(\n", + " self\n", + " ):\n", + " \"\"\"\n", + " Set up the component logger.\n", + " The component takes no arguments;\n", + " it only strips empty lines from the `text` column.\n", + " \"\"\"\n", + " import logging\n", + " import typing as t \n", + "\n", + " self.logger = logging.getLogger(__name__)\n", "\n", " def remove_empty_lines(self, text):\n", " lines = text.split(\"\\n\")\n", " non_empty_lines = [line.strip() for line in lines if line.strip()]\n", - " return \"\\n\".join(non_empty_lines)\n", + " return \"\\n\".join(non_empty_lines)\n", "\n", " def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:\n", " dataframe[\"text\"] = dataframe[\"text\"].apply(\n", @@ -399,62 +455,13 @@ " return dataframe" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dockerfile \n", - "The Dockerfile defines how to build the component into a Docker image. 
You can use the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile components/text_cleaning/Dockerfile\n", - "FROM --platform=linux/amd64 python:3.8-slim\n", - "\n", - "# Install requirements\n", - "COPY requirements.txt /\n", - "RUN pip3 install --no-cache-dir -r requirements.txt\n", - "\n", - "# Set the working directory to the component folder\n", - "WORKDIR /component/src\n", - "\n", - "# Copy over src-files\n", - "COPY src/ .\n", - "\n", - "ENTRYPOINT [\"fondant\", \"execute\", \"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Requirements.txt\n", - "\n", - "In the requirements.txt we define all dependencies of the component." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile components/text_cleaning/requirements.txt\n", - "fondant[component]==0.9.0" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add the new component to the pipeline\n", "\n", - "Now we can add the new component to the pipeline with the `Dataset.apply` function. We just specify the path to the directory containing the custom component instead of the name of the reusable component." + "Now we can add the new component to the pipeline with the `Dataset.apply` function. We just specify the reference to the component class containing the custom component instead of the name of the reusable component." 
] }, { @@ -467,6 +474,7 @@ "from fondant.pipeline import Pipeline\n", "\n", "\n", + "# TODO: remove /components after using a stable release \n", "pipeline = Pipeline(\n", " name=\"ingestion-pipeline\",\n", " description=\"Pipeline to prepare and process data for building a RAG solution\",\n", @@ -474,7 +482,7 @@ ")\n", "\n", "text = pipeline.read(\n", - " \"load_from_hf_hub\",\n", + " \"components/load_from_hf_hub\",\n", " arguments={\n", " \"dataset_name\": \"wikitext@~parquet\",\n", " \"n_rows_to_load\": 1000,\n", @@ -485,11 +493,11 @@ ")\n", "\n", "cleaned_text = text.apply(\n", - " \"components/text_cleaning\", # Path to custom component\n", + " TextCleaningComponent\n", ")\n", "\n", "chunks = cleaned_text.apply(\n", - " \"chunk_text\",\n", + " \"components/chunk_text\",\n", " arguments={\n", " \"chunk_size\": 512,\n", " \"chunk_overlap\": 32,\n", @@ -497,7 +505,7 @@ ")\n", "\n", "embeddings = chunks.apply(\n", - " \"embed_text\",\n", + " \"components/embed_text\",\n", " arguments={\n", " \"model_provider\": \"huggingface\",\n", " \"model\": \"all-MiniLM-L6-v2\",\n", @@ -505,7 +513,7 @@ ")\n", "\n", "embeddings.write(\n", - " \"index_weaviate\",\n", + " \"components/index_weaviate\",\n", " arguments={\n", " \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n", " \"class_name\": \"index\",\n", diff --git a/src/parameter_search.ipynb b/src/parameter_search.ipynb index 6cdb36b..8615693 100644 --- a/src/parameter_search.ipynb +++ b/src/parameter_search.ipynb @@ -400,9 +400,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "from utils import ParameterSearch\n",