From b09a974a7d2cb7734d2c168fb24bfee3f370087c Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 14 Nov 2023 14:40:06 +0100 Subject: [PATCH] Add docker build CICD (#5) Adding fondant build for custom component to github actions. --- .github/workflows/build.yaml | 36 +++++++++++ .github/{ => workflows}/pipeline.yaml | 0 scripts/build_components.sh | 63 +++++++++++++++++++ .../text_cleaning/fondant_component.yaml | 2 +- src/components/text_cleaning/src/main.py | 6 +- src/pipeline.py | 4 +- 6 files changed, 106 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/build.yaml rename .github/{ => workflows}/pipeline.yaml (100%) create mode 100755 scripts/build_components.sh diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..72498b9 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,36 @@ +name: Build dev images + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Set buildx alias + run: docker buildx install + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Install fondant + run: | + pip install --upgrade pip + pip install fondant[docker]@git+https://github.com/ml6team/fondant + + + - name: Build components + run: ./scripts/build_components.sh -r ghcr.io -n ml6team -t $GITHUB_SHA --label org.opencontainers.image.source=https://github.com/ml6team/fondant-usecase-RAG diff --git a/.github/pipeline.yaml b/.github/workflows/pipeline.yaml similarity index 100% rename from .github/pipeline.yaml rename to .github/workflows/pipeline.yaml diff --git a/scripts/build_components.sh b/scripts/build_components.sh new file mode 100755 index 0000000..3362802 --- /dev/null +++ 
b/scripts/build_components.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -e + +function usage { + echo "Usage: $0 [options]" + echo "Options:" + echo " -t, --tag Tag to add to image + The tag is set in the component specifications" + echo " -d, --components-dir Directory containing components to build as subdirectories. + The path should be relative to the root directory (default:src/components)" + echo " -r, --registry The container registry prefix to use e.g. ghcr(default: null (DockerHub))" + echo " -n, --namespace The registry namespace for the built images (default: ml6team)" + echo " -co, --component Specific component to build. Pass the component subdirectory name(s) to build + certain component(s) or 'all' to build all components in the components + directory (default: all)" + echo " --repo Set the repo (default: ml6team/fondant-usecase-RAG)" + echo " -l, --label Set a container label, repeatable + (e.g. org.opencontainers.image.source=https://github.com/ml6team/fondant-usecase-RAG)" + echo " -h, --help Display this help message" +} + +# Parse the arguments (-r is reserved for --registry; --repo has no short flag) +while [[ "$#" -gt 0 ]]; do case $1 in + -r |--registry) registry="$2"; shift;; + -n |--namespace) namespace="$2"; shift;; + -d |--components-dir ) components_dir="$2"; shift;; + --repo) repo="$2"; shift;; + -t |--tag) tag="$2"; shift;; + -co|--component) components+=("$2"); shift;; + -h |--help) usage; exit;; + -l |--label) labels+=("$2"); shift;; + *) echo "Unknown parameter passed: $1"; exit 1;; +esac; shift; done + +# Set default values for optional arguments if not passed +components_dir="${components_dir:-src/components}" +namespace="${namespace:-ml6team}" + +# Get the component directory +scripts_dir=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +root_dir=$(dirname "$scripts_dir") +components_dir=$root_dir/$components_dir + +# Determine the components to build +# Only directories that contain a Dockerfile will be considered for the component build +for dir in "$components_dir"/*/; do + # 
Check if a Dockerfile exists in the current subdirectory + if [ -f "$dir/Dockerfile" ]; then + components_to_build+=("$dir") + fi +done + +# Build every collected component; the registry prefix is omitted when no registry is set (DockerHub) +for dir in "${components_to_build[@]}"; do + pushd "$dir" + BASENAME=${dir%/} + BASENAME=${BASENAME##*/} + + full_image_name=${registry:+${registry}/}${namespace}/${BASENAME}:${tag} + echo "Tagging image as $full_image_name" + fondant build "$dir" -t "$full_image_name" --nocache ${labels[@]/#/--label } + popd +done diff --git a/src/components/text_cleaning/fondant_component.yaml b/src/components/text_cleaning/fondant_component.yaml index b1a1544..717c250 100644 --- a/src/components/text_cleaning/fondant_component.yaml +++ b/src/components/text_cleaning/fondant_component.yaml @@ -1,6 +1,6 @@ name: Text cleaning component description: Clean text passages -image: text-cleaning-component:latest +image: ghcr.io/ml6team/text_cleaning:dev consumes: text: diff --git a/src/components/text_cleaning/src/main.py b/src/components/text_cleaning/src/main.py index 4a0cc44..d6464bb 100644 --- a/src/components/text_cleaning/src/main.py +++ b/src/components/text_cleaning/src/main.py @@ -6,7 +6,7 @@ logger = logging.getLogger(__name__) -class TextCleaningComponent(PandasTransformComponent): +class TextCleaningComponent(PandasTransformComponent): def __init__(self, *_): """Initialize your component""" @@ -16,5 +16,7 @@ def remove_empty_lines(self, text): return "\n".join(non_empty_lines) def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: self.remove_empty_lines) + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + self.remove_empty_lines + ) return dataframe diff --git a/src/pipeline.py b/src/pipeline.py index 79b84c4..663f309 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -6,7 +6,7 @@ pipeline = Pipeline( - pipeline_name="ingestion-pipeline", + pipeline_name="ingestion-pipeline", pipeline_description="Pipeline to 
prepare and process \ data for building a RAG solution", base_path="./data-dir", # The demo pipelines uses a local \ @@ -43,7 +43,7 @@ name="index_weaviate", arguments={ "weaviate_url": "http://host.docker.internal:8080", - "class_name": "index" + "class_name": "index", }, )