From 6f109d7d048a3bca4e0cdea33a8a8f4aec9e8835 Mon Sep 17 00:00:00 2001
From: Fabrice Jammes
Date: Wed, 6 Nov 2024 16:39:26 +0100
Subject: [PATCH 1/5] Replace minio with HDFS

- Install stackable operators with argoCD
- Improve ignoreDifferences field in Argo
  * add /spec/names/categories and shortNames
- Install HDFS with argoCD
- Reduce memory for spark pods to 1Gi
- Bump spark-operator to v2.1.0
- Add hdfs init for user 185 inside chart
---
 .ciux                                        |  2 +-
 .github/workflows/e2e-common.yml             |  8 +--
 TODO.913-replace-minio-s3-with-hdfs.org      | 39 +++++++++++
 chart/templates/_helpers.tpl                 |  4 ++
 chart/templates/job-hdfs-init.yaml           | 25 +++++++
 chart/templates/spark-fink-distribution.yaml |  6 ++
 chart/templates/spark-fink-raw2science.yaml  |  6 ++
 chart/templates/spark-fink-stream2raw.yaml   |  6 ++
 chart/values-ci-noscience.yaml               |  4 --
 chart/values.yaml                            |  5 +-
 doc/troubleshoot.md                          | 29 +++++++-
 e2e/argocd.sh                                |  3 +-
 e2e/run.sh                                   |  4 +-
 fink_broker/spark_utils.py                   | 12 +++-
 hdfs-tmp/README.md                           | 11 +++
 hdfs-tmp/install.sh                          | 11 +++
 hdfs-tmp/mkdir.sh                            | 31 ++++++++
 hdfs-tmp/test-hdfs.sh                        | 74 ++++++++++++++++++++
 18 files changed, 265 insertions(+), 15 deletions(-)
 create mode 100644 TODO.913-replace-minio-s3-with-hdfs.org
 create mode 100644 chart/templates/job-hdfs-init.yaml
 create mode 100644 hdfs-tmp/README.md
 create mode 100755 hdfs-tmp/install.sh
 create mode 100755 hdfs-tmp/mkdir.sh
 create mode 100755 hdfs-tmp/test-hdfs.sh

diff --git a/.ciux b/.ciux
index 1e7af2ce..ddb29554 100644
--- a/.ciux
+++ b/.ciux
@@ -31,7 +31,7 @@ dependencies:
   - image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/spark-py:k8s-3.4.1
     labels:
       build: "true"
-  - package: github.com/k8s-school/ktbx@v1.1.4-rc1
+  - package: github.com/k8s-school/ktbx@v1.1.4-rc3
     labels:
       itest: "optional"
   - package: github.com/astrolabsoftware/finkctl/v3@v3.1.3-rc1
diff --git a/.github/workflows/e2e-common.yml b/.github/workflows/e2e-common.yml
index 8c9f5395..7e962f93 100644
--- a/.github/workflows/e2e-common.yml
+++ b/.github/workflows/e2e-common.yml
@@ -24,7 +24,7 @@ on:
       private_registry_token:
         required: true
 env:
-  CIUX_VERSION: v0.0.4-rc9
+  CIUX_VERSION: v0.0.4-rc10
   GHA_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   SUFFIX: ${{ inputs.suffix }}
   CI_REPO: ${{ inputs.ci_repo }}
@@ -134,13 +134,13 @@ jobs:
         else
           echo "Using pre-existing image from registry (See "Ciux project ignition" section)"
         fi
-      - name: Run argoCD
-        run: |
-          ./e2e/argocd.sh
       # - name: Setup tmate session
       #   uses: mxschmitt/action-tmate@v3
       #   with:
       #     detached: true
+      - name: Run argoCD
+        run: |
+          ./e2e/argocd.sh
       - name: Check results
         run: |
           ./e2e/check-results.sh
diff --git a/TODO.913-replace-minio-s3-with-hdfs.org b/TODO.913-replace-minio-s3-with-hdfs.org
new file mode 100644
index 00000000..919703d2
--- /dev/null
+++ b/TODO.913-replace-minio-s3-with-hdfs.org
@@ -0,0 +1,39 @@
+* DONE Reduce mem limit for pod journalnode !!!!
+* Upgrade stackable operator to v2.1.0
+
+* TODO hdfs operator
+
+** limit and request for memory: see https://github.com/stackabletech/hdfs-operator/issues/625
+** TODO: open issue: zkfc on datanode is not compliant with memory setting
+
+In the example below the memory limit is 256Mi for the nameNode in the hdfscluster CR, but it becomes 768Mi in each related pod because the `zkfc` container is not impacted by the CR configuration.
+This should be fixed because it prevents running the setup on CI platforms with low memory, like GitHub Actions for instance.
+
+kubectl get -n hdfs hdfscluster simple-hdfs -o yaml -o jsonpath -o=jsonpath='{.spec.nameNodes.config.resources}'
+{"cpu":{"min":"0"},"memory":{"limit":"256Mi"}}
+
+kubectl describe nodes | grep namenode
+  hdfs    simple-hdfs-namenode-default-0    100m (0%)    1400m (1%)    768Mi (0%)    768Mi (0%)    34m
+  hdfs    simple-hdfs-namenode-default-1    100m (0%)    1400m (1%)    768Mi (0%)    768Mi (0%)    31m
+
+kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].name}'
+zkfc
+
+kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].resources}' | jq
+{
+  "limits": {
+    "cpu": "400m",
+    "memory": "512Mi"
+  },
+  "requests": {
+    "cpu": "100m",
+    "memory": "512Mi"
+  }
+}
+
+
+** management of argoCD default values (jqpath expression): https://github.com/stackabletech/hdfs-operator/issues/626
+** TODO: open issue: be able to run only one dataNode on CI
+
+* Add helm option on HDFS cpu.min (also for operators!)
+* Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.0
diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl
index 0cc22e35..87d3a69b 100644
--- a/chart/templates/_helpers.tpl
+++ b/chart/templates/_helpers.tpl
@@ -85,7 +85,11 @@ restartPolicy:
 - '-log_level'
 - '{{ .Values.log_level }}'
 - '-online_data_prefix'
+{{- if .Values.online_data_prefix }}
+- '{{ .Values.online_data_prefix }}'
+{{- else }}
 - 's3a://{{ tpl .Values.s3.bucket . }}'
+{{- end }}
 - '-producer'
 - '{{ .Values.producer }}'
 - '-tinterval'
diff --git a/chart/templates/job-hdfs-init.yaml b/chart/templates/job-hdfs-init.yaml
new file mode 100644
index 00000000..0ab3941b
--- /dev/null
+++ b/chart/templates/job-hdfs-init.yaml
@@ -0,0 +1,25 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: hdfs-init
+  namespace: hdfs
+  annotations:
+    "helm.sh/hook": "pre-install"
+spec:
+  template:
+    spec:
+      containers:
+      - name: hdfs-client
+        image: apache/hadoop:3.4.0
+        command: ["sh", "-c"]
+        args:
+          - |
+            hdfs dfs -fs $HDFS_URL -mkdir -p /user/185 && \
+            hdfs dfs -fs $HDFS_URL -chown 185:hdfs /user/185 && \
+            hdfs dfs -fs $HDFS_URL -chmod 700 /user/185
+        env:
+        - name: HDFS_URL
+          value: hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020
+        - name: HADOOP_USER_NAME
+          value: stackable
+      restartPolicy: OnFailure
diff --git a/chart/templates/spark-fink-distribution.yaml b/chart/templates/spark-fink-distribution.yaml
index 3fb5232e..df6aba2d 100644
--- a/chart/templates/spark-fink-distribution.yaml
+++ b/chart/templates/spark-fink-distribution.yaml
@@ -23,6 +23,9 @@ spec:
   driver:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
+    env:
+    - name: SPARK_USER
+      value: "{{ .Values.hadoop_user_name }}"
     memory: "{{ tpl .Values.distribution.memory . }}"
     javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
     labels:
@@ -31,6 +34,9 @@ spec:
   executor:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
+    env:
+    - name: SPARK_USER
+      value: "{{ .Values.hadoop_user_name }}"
     memory: "{{ tpl .Values.distribution.memory . }}"
     instances: {{ tpl .Values.distribution.instances . }}
     javaOptions: "-Djava.security.auth.login.config=/etc/fink-broker/kafka-jaas.conf -Dcom.amazonaws.sdk.disableCertChecking=true"
diff --git a/chart/templates/spark-fink-raw2science.yaml b/chart/templates/spark-fink-raw2science.yaml
index bdc56953..04936078 100644
--- a/chart/templates/spark-fink-raw2science.yaml
+++ b/chart/templates/spark-fink-raw2science.yaml
@@ -12,6 +12,9 @@ spec:
   driver:
     cores: {{ tpl .Values.raw2science.cores . }}
     coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
+    env:
+    - name: SPARK_USER
+      value: "{{ .Values.hadoop_user_name }}"
     memory: "{{ tpl .Values.raw2science.memory . }}"
     javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
     labels:
@@ -20,6 +23,9 @@ spec:
   executor:
     cores: {{ tpl .Values.raw2science.cores . }}
     coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
+    env:
+    - name: SPARK_USER
+      value: "{{ .Values.hadoop_user_name }}"
     memory: "{{ tpl .Values.raw2science.memory . }}"
     javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
     instances: {{ tpl .Values.raw2science.instances . }}
diff --git a/chart/templates/spark-fink-stream2raw.yaml b/chart/templates/spark-fink-stream2raw.yaml
index c54a68a6..75b04064 100644
--- a/chart/templates/spark-fink-stream2raw.yaml
+++ b/chart/templates/spark-fink-stream2raw.yaml
@@ -21,6 +21,9 @@ spec:
   driver:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
+    env:
+    - name: SPARK_USER
+      value: "{{ .Values.hadoop_user_name }}"
     memory: "{{ tpl .Values.stream2raw.memory . }}"
     labels:
       version: 3.4.1
@@ -29,6 +32,9 @@ spec:
   executor:
     cores: {{ tpl .Values.distribution.cores . }}
    coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
+    env:
+    - name: SPARK_USER
+      value: "{{ .Values.hadoop_user_name }}"
     memory: "{{ tpl .Values.stream2raw.memory . }}"
     instances: {{ tpl .Values.distribution.instances . }}
     javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
diff --git a/chart/values-ci-noscience.yaml b/chart/values-ci-noscience.yaml
index 3a7c0dc4..1d2ce651 100644
--- a/chart/values-ci-noscience.yaml
+++ b/chart/values-ci-noscience.yaml
@@ -9,10 +9,6 @@ instances: 1
 
 fink_trigger_update: "2"
 
-# Can be overridden using --image option
-
-# Default to s3a://
-# online_data_prefix: s3a://fink-broker-online
 producer: sims
 
 log_level: INFO
diff --git a/chart/values.yaml b/chart/values.yaml
index 81e17cf8..41312f44 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -3,6 +3,7 @@
 
 night: "20240101"
 
+hadoop_user_name: "185"
 image:
   pullPolicy: IfNotPresent
   repository: gitlab-registry.in2p3.fr/astrolabsoftware/fink
@@ -13,7 +14,7 @@ image:
 cores: 1
 coreRequest: 0
 instances: 1
-memory: 1500m
+memory: "1000m"
 
 # instances: 1
 fink_trigger_update: "2"
@@ -21,7 +22,7 @@ fink_trigger_update: "2"
 # Can be overridden using --image option
 
 # Default to s3a://
-# online_data_prefix: s3a://fink-broker-online
+online_data_prefix: hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020///user/185
 producer: sims
 
 log_level: INFO
diff --git a/doc/troubleshoot.md b/doc/troubleshoot.md
index dfaf0e78..704d10da 100644
--- a/doc/troubleshoot.md
+++ b/doc/troubleshoot.md
@@ -1,6 +1,5 @@
 # Troubleshooting guide
 
-
 ## Run s5cmd (s3 client)
 
 From inside the k8s cluster:
@@ -22,4 +21,32 @@ kubectl run -it --rm s5cmd --image=peakcom/s5cmd --env AWS_ACCESS_KEY_ID=minio -
 ## Use --all if needed
 kubectl delete -n spark sparkapplication fink-broker-distribution
 argocd app sync fink-broker
+```
+
+## Debug fink-broker helm chart
+
+```shell
+cd fink-broker
+helm install --debug fink ./chart -f ./chart/values-ci-noscience.yaml --dry-run
+```
+
+## ArgoCD
+
+### Access argoCD web UI
+
+```bash
+kubectl port-forward -n argocd $(kubectl get pods --selector=app.kubernetes.io/name=argocd-server -n argocd --output=jsonpath="{.items..metadata.name}") 8080
+# Login is "admin", password is set to "password"; fix this in production
+kubectl -n argocd patch secret argocd-secret -p '{"stringData": {"admin.password": "$2a$10$rRyBsGSHK6.uc8fntPwVIuLVHgsAhAX7TcdrqW/RADU0uh7CaChLa", "admin.passwordMtime": "'$(date +%FT%T%Z)'" }}'
+```
+
+### Fine-tune "ignoreDifferences" field of an ArgoCD Application
+
+```bash
+# Install yq
+sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && sudo chmod +x /usr/bin/yq
+# Retrieve failedsyncmanifest.yaml file in ArgoCD web UI
+yq failedsyncmanifest.yaml -o json > failedsyncmanifest.json
+# Fine-tune 'jqPathExpressions'
+cat failedsyncmanifest.json | jq '.spec.versions[].additionalPrinterColumns | select(. == [])'
+```
\ No newline at end of file
diff --git a/e2e/argocd.sh b/e2e/argocd.sh
index 2e2bd3df..6f163418 100755
--- a/e2e/argocd.sh
+++ b/e2e/argocd.sh
@@ -76,7 +76,8 @@ argocd app wait -l app.kubernetes.io/part-of=fink,app.kubernetes.io/component=st
 # Sync fink-broker
 argocd app sync -l app.kubernetes.io/instance=fink
 
-if [ $e2e_enabled == "true" ]; then
+if [ $e2e_enabled == "true" ]
+then
   echo "Retrieve kafka secrets for e2e tests"
   while ! kubectl get secret fink-producer --namespace kafka
   do
diff --git a/e2e/run.sh b/e2e/run.sh
index ab7c7cda..711727a8 100755
--- a/e2e/run.sh
+++ b/e2e/run.sh
@@ -21,7 +21,7 @@ usage () {
 
 SUFFIX="noscience"
 
-ciux_version=v0.0.4-rc8
+ciux_version=v0.0.4-rc10
 export CIUXCONFIG=$HOME/.ciux/ciux.sh
 
 src_dir=$DIR/..
@@ -30,6 +30,7 @@ build=false
 e2e=false
 monitoring=false
 push=false
+CIUX_IMAGE_URL="undefined"
 
 token="${TOKEN:-}"
 
@@ -120,6 +121,7 @@ then
 fi
 
 $DIR/prereq-install.sh $monitoring_opt
+
 . $CIUXCONFIG
 if [ $CIUX_BUILD = true ]; then
     kind load docker-image $CIUX_IMAGE_URL --name "$cluster"
diff --git a/fink_broker/spark_utils.py b/fink_broker/spark_utils.py
index 0df4470b..ebe2d6b3 100644
--- a/fink_broker/spark_utils.py
+++ b/fink_broker/spark_utils.py
@@ -336,7 +336,17 @@ def connect_to_raw_database(basepath: str, path: str, latestfirst: bool) -> Data
         wait_sec = increase_wait_time(wait_sec)
 
     # Create a DF from the database
-    userschema = spark.read.parquet(basepath).schema
+    # We need to wait for the schema to be available
+    while True:
+        try:
+            userschema = spark.read.parquet(basepath).schema
+        except Exception as e:
+            _LOG.error("Error while reading %s, %s", basepath, e)
+            time.sleep(wait_sec)
+            wait_sec = increase_wait_time(wait_sec)
+            continue
+        else:
+            break
 
     df = (
         spark.readStream.format("parquet")
diff --git a/hdfs-tmp/README.md b/hdfs-tmp/README.md
new file mode 100644
index 00000000..c21051b4
--- /dev/null
+++ b/hdfs-tmp/README.md
@@ -0,0 +1,11 @@
+# Minio should be disabled in fink-cd!
+
+kubectl run --image apache/hadoop:3.4.0 hdfs-client -- sleep infinity
+
+kubectl exec -it hdfs-client -- bash
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -df
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -ls
+  export HADOOP_USER_NAME=stackable
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -touch /toto
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -ls /
+
diff --git a/hdfs-tmp/install.sh b/hdfs-tmp/install.sh
new file mode 100755
index 00000000..8e076bd8
--- /dev/null
+++ b/hdfs-tmp/install.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+DIR=$(cd "$(dirname "$0")"; pwd -P)
+
+STCK_BIN="/tmp/stackable"
+
+# TODO move to a job inside argoCD
+echo "Create hdfs directory"
+$DIR/mkdir.sh
diff --git a/hdfs-tmp/mkdir.sh b/hdfs-tmp/mkdir.sh
new file mode 100755
index 00000000..b7cddb55
--- /dev/null
+++ b/hdfs-tmp/mkdir.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# This script creates a directory in HDFS and sets the owner to user 185
+
+set -euxo pipefail
+
+DIR=$(cd "$(dirname "$0")"; pwd -P)
+
+NS=hdfs
+
+timeout=300s
+
+# Wait for HDFS statefulset to be available
+# TODO improve this
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=hdfs --timeout=$timeout -n $NS
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=zookeeper --timeout=$timeout -n $NS
+sleep 60
+
+hdfs_url="hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.$NS:8020"
+
+# Check if pod hdfs-client exists
+if ! kubectl get -n "$NS" pod hdfs-client &> /dev/null; then
+    kubectl run -n "$NS" --image apache/hadoop:3.4.0 hdfs-client -- sleep infinity
+fi
+
+kubectl wait -n "$NS" --for=condition=ready pod/hdfs-client --timeout=$timeout
+
+kubectl exec -n "$NS" -it hdfs-client -- sh -c "export HADOOP_USER_NAME=stackable && \
+    hdfs dfs -fs $hdfs_url -mkdir -p /user/185 && \
+    hdfs dfs -fs $hdfs_url -chown 185:hdfs /user/185 && \
+    hdfs dfs -fs $hdfs_url -chmod 700 /user/185"
diff --git a/hdfs-tmp/test-hdfs.sh b/hdfs-tmp/test-hdfs.sh
new file mode 100755
index 00000000..3339692f
--- /dev/null
+++ b/hdfs-tmp/test-hdfs.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+set -euo pipefail
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Step 1: Create the webhdfs.yaml file
+cat <<EOF > $DIR/webhdfs.yaml
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: webhdfs
+  labels:
+    app: webhdfs
+spec:
+  replicas: 1
+  serviceName: webhdfs-svc
+  selector:
+    matchLabels:
+      app: webhdfs
+  template:
+    metadata:
+      labels:
+        app: webhdfs
+    spec:
+      containers:
+      - name: webhdfs
+        image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable0.0.0-dev
+        stdin: true
+        tty: true
+EOF
+
+# Step 2: Apply the StatefulSet and monitor progress
+echo "Applying webhdfs StatefulSet..."
+kubectl apply -f $DIR/webhdfs.yaml
+echo "Waiting for webhdfs pod to be ready..."
+kubectl rollout status --watch --timeout=5m statefulset/webhdfs
+
+# Step 3: Check the root directory status in HDFS (should be empty)
+echo "Checking root directory in HDFS..."
+kubectl exec -n default webhdfs-0 -- curl -s -XGET "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/?op=LISTSTATUS"
+
+# Step 4: Create a sample file for uploading
+echo "Creating sample file testdata.txt..."
+echo "This is a test file for HDFS upload." > $DIR/testdata.txt
+
+# Step 5: Copy the file to the helper pod
+echo "Copying testdata.txt to webhdfs pod..."
+kubectl cp -n default $DIR/testdata.txt webhdfs-0:/tmp
+
+# Step 6: Initiate a two-step PUT request to create the file in HDFS
+echo "Initiating file creation in HDFS (first step)..."
+create_response=$(kubectl exec -n default webhdfs-0 -- \
+curl -s -XPUT -T /tmp/testdata.txt "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=CREATE&noredirect=true")
+
+# Extract the location for the second PUT request
+location=$(echo "$create_response" | grep -o 'http://[^"]*')
+echo "Location for second PUT request: $location"
+
+# Step 7: Complete the file upload
+echo "Completing file creation in HDFS (second step)..."
+kubectl exec -n default webhdfs-0 -- curl -s -XPUT -T /tmp/testdata.txt "$location"
+
+# Step 8: Verify that the file has been created in HDFS
+echo "Re-checking root directory in HDFS to verify file creation..."
+kubectl exec -n default webhdfs-0 -- curl -s -XGET "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/?op=LISTSTATUS"
+
+# Step 9: Delete the file from HDFS to clean up
+echo "Deleting testdata.txt from HDFS..."
+kubectl exec -n default webhdfs-0 -- curl -s -XDELETE "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=DELETE"
+
+# Clean up local files
+rm $DIR/webhdfs.yaml $DIR/testdata.txt
+echo "Cleanup completed. HDFS testing script finished."
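
Note on the two-step upload performed by test-hdfs.sh above: with `noredirect=true`, the namenode answers the CREATE request with a JSON body containing the datanode `Location` instead of an HTTP 307 redirect, and the file content is then PUT to that location. The Python sketch below illustrates the same handshake; it assumes the same in-cluster namenode address and `user.name=stackable` as the script, and is illustrative only, not part of this patch series.

```python
# Illustrative sketch (not part of the patch series): the two-step WebHDFS
# CREATE handshake used by test-hdfs.sh, driven from Python with requests.
# The namenode URL and user below are assumptions matching the script.
import requests

NAMENODE = (
    "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default"
    ".default.svc.cluster.local:9870"
)


def webhdfs_put(local_path: str, hdfs_path: str, user: str = "stackable") -> None:
    # Step 1: ask the namenode where to write; with noredirect=true the
    # datanode URL is returned in the JSON body instead of a 307 redirect.
    url = f"{NAMENODE}/webhdfs/v1{hdfs_path}?user.name={user}&op=CREATE&noredirect=true"
    location = requests.put(url, timeout=30).json()["Location"]
    # Step 2: upload the file content to the datanode location.
    with open(local_path, "rb") as f:
        resp = requests.put(location, data=f, timeout=30)
    resp.raise_for_status()


if __name__ == "__main__":
    webhdfs_put("testdata.txt", "/testdata.txt")
```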
From fb010a11b8ca913cb9d8b3fd6d73d64122960485 Mon Sep 17 00:00:00 2001
From: Fabrice Jammes
Date: Fri, 20 Dec 2024 15:35:51 +0100
Subject: [PATCH 2/5] Cleanup and merge todo files

---
 Dockerfile                              |  1 +
 TODO.913-replace-minio-s3-with-hdfs.org | 39 --------------
 TODO.argocd                             |  3 --
 TODO.org                                | 68 ++++++++++++++-----------
 4 files changed, 38 insertions(+), 73 deletions(-)
 delete mode 100644 TODO.913-replace-minio-s3-with-hdfs.org
 delete mode 100644 TODO.argocd

diff --git a/Dockerfile b/Dockerfile
index d5ba75bc..7e1eaa48 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,6 +27,7 @@ RUN apt-get update && \
     apt install -y --no-install-recommends wget git apt-transport-https ca-certificates gnupg-agent apt-utils build-essential && \
     rm -rf /var/cache/apt/*
 
+# Download and install Spark dependencies listed in jars-urls.txt
 ADD deps/jars-urls.txt $FINK_HOME/
 RUN xargs -n 1 curl --fail --output-dir /opt/spark/jars -O < $FINK_HOME/jars-urls.txt
 
diff --git a/TODO.913-replace-minio-s3-with-hdfs.org b/TODO.913-replace-minio-s3-with-hdfs.org
deleted file mode 100644
index 919703d2..00000000
--- a/TODO.913-replace-minio-s3-with-hdfs.org
+++ /dev/null
@@ -1,39 +0,0 @@
-* DONE Reduce mem limit for pod journalnode !!!!
-* Upgrade stackable operator to v2.1.0
-
-* TODO hdfs operator
-
-** limit and request for memory: see https://github.com/stackabletech/hdfs-operator/issues/625
-** TODO: open issue: zkfc on datanode is not compliant with memory setting
-
-In the example below the memory limit is 256Mi for the nameNode in the hdfscluster CR, but it becomes 768Mi in each related pod because the `zkfc` container is not impacted by the CR configuration.
-This should be fixed because it prevents running the setup on CI platforms with low memory, like GitHub Actions for instance.
-
-kubectl get -n hdfs hdfscluster simple-hdfs -o yaml -o jsonpath -o=jsonpath='{.spec.nameNodes.config.resources}'
-{"cpu":{"min":"0"},"memory":{"limit":"256Mi"}}
-
-kubectl describe nodes | grep namenode
-  hdfs    simple-hdfs-namenode-default-0    100m (0%)    1400m (1%)    768Mi (0%)    768Mi (0%)    34m
-  hdfs    simple-hdfs-namenode-default-1    100m (0%)    1400m (1%)    768Mi (0%)    768Mi (0%)    31m
-
-kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].name}'
-zkfc
-
-kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].resources}' | jq
-{
-  "limits": {
-    "cpu": "400m",
-    "memory": "512Mi"
-  },
-  "requests": {
-    "cpu": "100m",
-    "memory": "512Mi"
-  }
-}
-
-
-** management of argoCD default values (jqpath expression): https://github.com/stackabletech/hdfs-operator/issues/626
-** TODO: open issue: be able to run only one dataNode on CI
-
-* Add helm option on HDFS cpu.min (also for operators!)
-* Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.0
diff --git a/TODO.argocd b/TODO.argocd
deleted file mode 100644
index a35dfb35..00000000
--- a/TODO.argocd
+++ /dev/null
@@ -1,3 +0,0 @@
-- check https://stackoverflow.com/questions/78922618/how-to-enforce-sync-order-in-argocd-app-of-apps-pattern and argocd issue
-- WIP4 upgrade spark-operator chart to 2.0.0-rc0 and remove prereq from fink-cd
-- check https://github.com/argoproj/argocd-example-apps/tree/master/helm-dependency to install spark-operator, and other operator
diff --git a/TODO.org b/TODO.org
index eac8e4f5..cb9efa6c 100644
--- a/TODO.org
+++ b/TODO.org
@@ -1,9 +1,3 @@
-* DONE use gitlab@virtualdata as a CI repo
-* DONE check fink-alert-simulator error message in CI:
-  ⚠ fink-alert-simulator-cjxv2  main  fink-alert-simulator-cjxv2  5m  Error (exit code 1): pods "fink-alert-simulator-cjxv2" is forbidden: User "system:serviceaccount:argocd:default" cannot patch resource "pods" in API group "" in the namespace "argocd"
-* DONE trigger ci for OOMkill
-* 729
-** DONE use "kubectl get kafkatopics.kafka.strimzi.io -n kafka" to check success of integration tests, maybe in finkctl?
 ** TODO DELAYED BECAUSE IT DOES NOT BLOCK BUT ONLY WARNS: create topic in distribute before sending alerts in order to avoid the error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
 So it works with a user account; however, I did not enable authorizations in Kafka because fink-alert-simulator would no longer have been able to write to the topic without authentication.
 12 h 28
@@ -15,24 +9,9 @@ It is actually because the topic does not exist; it works if the distribute job
 
 Do you think we could pre-create the topics to avoid this problem
 @JulienPeloton
 ?
-** DONE add user authentication in kafka https://stackoverflow.com/questions/65729535/how-to-do-i-connect-kafka-python-to-accept-username-and-password-for-jaas-like-i
 * TODO Enable authZ in kafka (require authN setup in fink-alert-simulator)
-* TODO [#B] distribute should wait for data to appear instead of crashing in connect_to_raw_database()
 * TODO move nodeport to internal for svc kafka-cluster-kafka-external-bootstrap
-* DONE improve final test in CI (check Kafka with fink-client https://github.com/astrolabsoftware/fink-client)
 * TODO run code-check.sh in CI
-* DONE add unit test for schema_converter
-* TODO https://stackoverflow.com/questions/30385981/how-to-access-s3a-files-from-apache-spark
-Document +add SO post?:
-Download hadoop binary release: https://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-3.2.4/hadoop-3.2.4.tar.gz
-extract and copy jar:
-  fjammes@clrinfopo18  ~/Downloads/hadoop-3.2.4  cp ./share/hadoop/tools/lib/hadoop-aws-3.2.4.jar ~/src/k8s-spark-py/custom/jars
-  fjammes@clrinfopo18  ~/Downloads/hadoop-3.2.4  cp ./share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.901.jar ~/src/k8s-spark-py/custom/jars
-  // WARNING package are not deployed in spark-executor
-  // see https://stackoverflow.com/a/67299668/2784039
-* TODO document hack to retrieve Maven URLs
-kubectl logs stream2raw-py-f529af864f8dee60-driver | grep downlo | cut -d' ' -f2 > jars-urls.txt
-OR add mvn copy:dependencies when building the image?
 * TODO manage dependencies
 What to do with:
 1. hbase-spark-hbase2.4_spark3_scala2.12_hadoop3.2.jar
 hbase-spark-protocol-shaded-hbase2.4_spark3_scala2.12_hadoop3.2.jar
 which are both in k8s-spark-py/custom and fink-broker/libs (cf. FINK_JARS)
 cf. Julien are they required?
 2. custom/jars/commons-pool2-2.6.2.jar which was in k8s-spark-py/custom
-* DONE document minio install and bucket creation:
-   5  curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o $HOME/minio-binaries/mc
-   6  chmod +x $HOME/minio-binaries/mc
-  15  export PATH=$PATH:$HOME/minio-binaries/
-  17  mc alias set s3 http://minio.minio:9000 minioadmin minioadmin
-  19  mc ls s3
-  27  mc mb s3/fink-broker-online
-  mc ls f1 --recursive fink-broker-online/
-* TODO test removal of options below
+* TODO test removal of options below when using hdfs
++ --conf spark.driver.extraJavaOptions="-Divy.cache.dir=/tmp -Divy.home=/tmp" \
  --conf spark.hadoop.fs.s3a.path.style.access=true \
++ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
-* DONE INSTALL MINIO https://min.io/docs/minio/kubernetes/upstream/index.html?
+* TODO hdfs operator management
+
+** limit and request for memory: see https://github.com/stackabletech/hdfs-operator/issues/625
+** TODO: open issue: zkfc on datanode is not compliant with memory setting
+
+In the example below the memory limit is 256Mi for the nameNode in the hdfscluster CR, but it becomes 768Mi in each related pod because the `zkfc` container is not impacted by the CR configuration.
+This should be fixed because it prevents running the setup on CI platforms with low memory, like GitHub Actions for instance.
+
+kubectl get -n hdfs hdfscluster simple-hdfs -o yaml -o jsonpath -o=jsonpath='{.spec.nameNodes.config.resources}'
+{"cpu":{"min":"0"},"memory":{"limit":"256Mi"}}
+
+kubectl describe nodes | grep namenode
+  hdfs    simple-hdfs-namenode-default-0    100m (0%)    1400m (1%)    768Mi (0%)    768Mi (0%)    34m
+  hdfs    simple-hdfs-namenode-default-1    100m (0%)    1400m (1%)    768Mi (0%)    768Mi (0%)    31m
+
+kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].name}'
+zkfc
+
+kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].resources}' | jq
+{
+  "limits": {
+    "cpu": "400m",
+    "memory": "512Mi"
+  },
+  "requests": {
+    "cpu": "100m",
+    "memory": "512Mi"
+  }
+}
+
+
+** management of argoCD default values (jqpath expression): https://github.com/stackabletech/hdfs-operator/issues/626
+** TODO: open issue: be able to run only one dataNode on CI
+
+* Add helm option on HDFS cpu.min (also for operators!)
+* Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.0
\ No newline at end of file

From 2605ba70dff1edffaf620238fe26bbe74d2dd4ce Mon Sep 17 00:00:00 2001
From: Fabrice Jammes
Date: Mon, 23 Dec 2024 12:07:05 +0100
Subject: [PATCH 3/5] Split hdfs/s3 configuration

---
 .ciux                                        |  2 +-
 TODO.org                                     | 66 ++++++++++----------
 chart/templates/_helpers.tpl                 |  9 +++
 chart/templates/spark-fink-distribution.yaml |  8 +--
 chart/templates/spark-fink-raw2science.yaml  |  8 +--
 chart/templates/spark-fink-stream2raw.yaml   |  8 +--
 chart/values.yaml                            |  7 ++-
 7 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/.ciux b/.ciux
index ddb29554..6f05e384 100644
--- a/.ciux
+++ b/.ciux
@@ -31,7 +31,7 @@ dependencies:
   - image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/spark-py:k8s-3.4.1
     labels:
       build: "true"
-  - package: github.com/k8s-school/ktbx@v1.1.4-rc3
+  - package: github.com/k8s-school/ktbx@v1.1.4-rc4
     labels:
       itest: "optional"
   - package: github.com/astrolabsoftware/finkctl/v3@v3.1.3-rc1
diff --git a/TODO.org b/TODO.org
index cb9efa6c..2da5df81 100644
--- a/TODO.org
+++ b/TODO.org
@@ -1,33 +1,8 @@
-** TODO DELAYED BECAUSE IT DOES NOT BLOCK BUT ONLY WARNS: create topic in distribute before sending alerts in order to avoid the error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
-So it works with a user account; however, I did not enable authorizations in Kafka because fink-alert-simulator would no longer have been able to write to the topic without authentication.
-12 h 28
-I now get this error message:
-23/08/14 10:26:52 WARN NetworkClient: [Producer clientId=producer-1] Error while fetching metadata with correlation id 29 : {fink_simbad_grav_candidates_ztf=LEADER_NOT_AVAILABLE}
-12 h 32
-It is actually because the topic does not exist; it works if the distribute job is relaunched...
-12 h 33
-Do you think we could pre-create the topics to avoid this problem
-@JulienPeloton
-?
-* TODO Enable authZ in kafka (require authN setup in fink-alert-simulator)
-* TODO move nodeport to internal for svc kafka-cluster-kafka-external-bootstrap
-* TODO run code-check.sh in CI
-* TODO manage dependencies
-What to do with:
-1. hbase-spark-hbase2.4_spark3_scala2.12_hadoop3.2.jar
-hbase-spark-protocol-shaded-hbase2.4_spark3_scala2.12_hadoop3.2.jar
-which are both in k8s-spark-py/custom and fink-broker/libs (cf. FINK_JARS)
-cf. Julien are they required?
-2. custom/jars/commons-pool2-2.6.2.jar which was in k8s-spark-py/custom
-* TODO test removal of options below when using hdfs
-+ --conf spark.driver.extraJavaOptions="-Divy.cache.dir=/tmp -Divy.home=/tmp" \
- --conf spark.hadoop.fs.s3a.path.style.access=true \
-+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
+#+TITLE: current
 
 * TODO hdfs operator management
 
-** limit and request for memory: see https://github.com/stackabletech/hdfs-operator/issues/625
-** TODO: open issue: zkfc on datanode is not compliant with memory setting
-
+** TODO limit and request for memory: monitor issue https://github.com/stackabletech/hdfs-operator/issues/625
+** TODO open issue: zkfc on datanode is not compliant with memory setting
 In the example below the memory limit is 256Mi for the nameNode in the hdfscluster CR, but it becomes 768Mi in each related pod because the `zkfc` container is not impacted by the CR configuration.
 This should be fixed because it prevents running the setup on CI platforms with low memory, like GitHub Actions for instance.
@@ -54,8 +29,35 @@ kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath
 }
 
-** management of argoCD default values (jqpath expression): https://github.com/stackabletech/hdfs-operator/issues/626
-** TODO: open issue: be able to run only one dataNode on CI
+** TODO management of argoCD default values (jqpath expression): monitor issue https://github.com/stackabletech/hdfs-operator/issues/626
+** TODO open issue: be able to run only one dataNode on CI
 
-* Add helm option on HDFS cpu.min (also for operators!)
-* Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.0
\ No newline at end of file
+** TODO Add helm option on HDFS cpu.min (also for operators!)
+** TODO Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.0
+
+#+TITLE: previous
+* TODO DELAYED BECAUSE IT DOES NOT BLOCK BUT ONLY WARNS: create topic in distribute before sending alerts in order to avoid the error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
+So it works with a user account; however, I did not enable authorizations in Kafka because fink-alert-simulator would no longer have been able to write to the topic without authentication.
+12 h 28
+I now get this error message:
+23/08/14 10:26:52 WARN NetworkClient: [Producer clientId=producer-1] Error while fetching metadata with correlation id 29 : {fink_simbad_grav_candidates_ztf=LEADER_NOT_AVAILABLE}
+12 h 32
+It is actually because the topic does not exist; it works if the distribute job is relaunched...
+12 h 33
+Do you think we could pre-create the topics to avoid this problem
+@JulienPeloton
+?
+* TODO Enable authZ in kafka (require authN setup in fink-alert-simulator)
+* TODO move nodeport to internal for svc kafka-cluster-kafka-external-bootstrap
+* TODO run code-check.sh in CI
+* TODO manage dependencies
+What to do with:
+1. hbase-spark-hbase2.4_spark3_scala2.12_hadoop3.2.jar
+hbase-spark-protocol-shaded-hbase2.4_spark3_scala2.12_hadoop3.2.jar
+which are both in k8s-spark-py/custom and fink-broker/libs (cf. FINK_JARS)
+cf. Julien are they required?
+2. custom/jars/commons-pool2-2.6.2.jar which was in k8s-spark-py/custom
+* TODO test removal of options below when using hdfs
++ --conf spark.driver.extraJavaOptions="-Divy.cache.dir=/tmp -Divy.home=/tmp" \
+ --conf spark.hadoop.fs.s3a.path.style.access=true \
++ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl
index 87d3a69b..f37207e5 100644
--- a/chart/templates/_helpers.tpl
+++ b/chart/templates/_helpers.tpl
@@ -52,6 +52,7 @@ app.kubernetes.io/instance: {{ .Release.Name }}
 
 {{/* Generate s3 configuration */}}
 {{- define "fink.s3config" -}}
+{{ if eq .Values.storage "s3" -}}
 spark.hadoop.fs.s3a.endpoint: {{ .Values.s3.endpoint }}
 spark.hadoop.fs.s3a.access.key: {{ .Values.s3.access_key }}
 spark.hadoop.fs.s3a.secret.key: {{ .Values.s3.secret_key }}
@@ -62,7 +63,15 @@ spark.hadoop.fs.s3a.path.style.access: "true"
 spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
 spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem"
 {{- end }}
+{{- end }}
 
+{{/* Generate hdfs configuration */}}
+{{- define "fink.hdfsconfig" -}}
+{{ if eq .Values.storage "hdfs" -}}
+- name: SPARK_USER
+  value: "{{ .Values.hdfs.hadoop_user_name }}"
+{{- end }}
+{{- end }}
 
 {{/* Generate common configuration */}}
 {{- define "fink.common" -}}
diff --git a/chart/templates/spark-fink-distribution.yaml b/chart/templates/spark-fink-distribution.yaml
index df6aba2d..3ac81084 100644
--- a/chart/templates/spark-fink-distribution.yaml
+++ b/chart/templates/spark-fink-distribution.yaml
@@ -23,9 +23,7 @@ spec:
   driver:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
-    env:
-    - name: SPARK_USER
-      value: "{{ .Values.hadoop_user_name }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.distribution.memory . }}"
     javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
     labels:
@@ -34,9 +32,7 @@ spec:
   executor:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
-    env:
-    - name: SPARK_USER
-      value: "{{ .Values.hadoop_user_name }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.distribution.memory . }}"
     instances: {{ tpl .Values.distribution.instances . }}
     javaOptions: "-Djava.security.auth.login.config=/etc/fink-broker/kafka-jaas.conf -Dcom.amazonaws.sdk.disableCertChecking=true"
diff --git a/chart/templates/spark-fink-raw2science.yaml b/chart/templates/spark-fink-raw2science.yaml
index 04936078..b08920da 100644
--- a/chart/templates/spark-fink-raw2science.yaml
+++ b/chart/templates/spark-fink-raw2science.yaml
@@ -12,9 +12,7 @@ spec:
   driver:
     cores: {{ tpl .Values.raw2science.cores . }}
     coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
-    env:
-    - name: SPARK_USER
-      value: "{{ .Values.hadoop_user_name }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.raw2science.memory . }}"
     javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
     labels:
@@ -23,9 +21,7 @@ spec:
   executor:
     cores: {{ tpl .Values.raw2science.cores . }}
     coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
-    env:
-    - name: SPARK_USER
-      value: "{{ .Values.hadoop_user_name }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.raw2science.memory . }}"
     javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
     instances: {{ tpl .Values.raw2science.instances . }}
diff --git a/chart/templates/spark-fink-stream2raw.yaml b/chart/templates/spark-fink-stream2raw.yaml
index 75b04064..783bdb83 100644
--- a/chart/templates/spark-fink-stream2raw.yaml
+++ b/chart/templates/spark-fink-stream2raw.yaml
@@ -21,9 +21,7 @@ spec:
   driver:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
-    env:
-    - name: SPARK_USER
-      value: "{{ .Values.hadoop_user_name }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.stream2raw.memory . }}"
     labels:
       version: 3.4.1
@@ -32,9 +30,7 @@ spec:
   executor:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
-    env:
-    - name: SPARK_USER
-      value: "{{ .Values.hadoop_user_name }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.stream2raw.memory . }}"
     instances: {{ tpl .Values.distribution.instances . }}
     javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
diff --git a/chart/values.yaml b/chart/values.yaml
index 41312f44..115e00d5 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -3,7 +3,6 @@
 
 night: "20240101"
 
-hadoop_user_name: "185"
 image:
   pullPolicy: IfNotPresent
   repository: gitlab-registry.in2p3.fr/astrolabsoftware/fink
@@ -63,6 +62,9 @@ distribution:
   schema: "/home/fink/fink-alert-schemas/ztf/distribution_schema_0p2.avsc"
   substream_prefix: "fink_"
 
+
+storage: hdfs
+
 #
 # Parameters used to access the S3 bucket
 #
@@ -74,6 +76,9 @@ s3:
   access_key: "minio"
   secret_key: "minio123"
 
+hdfs:
+  hadoop_user_name: "185"
+
 serviceAccount:
   # Specifies whether a service account should be created
   create: true

From cc842db0cbdb7112eb58348b06f05fa5e395c2a4 Mon Sep 17 00:00:00 2001
From: Fabrice Jammes
Date: Fri, 27 Dec 2024 11:53:58 +0100
Subject: [PATCH 4/5] Bump fink hdfs image to v24.11.0

---
 .ciux          | 3 +++
 doc/release.md | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.ciux b/.ciux
index 6f05e384..469da244 100644
--- a/.ciux
+++ b/.ciux
@@ -28,6 +28,9 @@ dependencies:
       dev: "true"
       itest: "true"
       release: "true"
+  - image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/stackable-hadoop:v24.11.0
+    labels:
+      itest: "true"
   - image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/spark-py:k8s-3.4.1
     labels:
       build: "true"
diff --git a/doc/release.md b/doc/release.md
index ecb6ad97..5708c430 100644
--- a/doc/release.md
+++ b/doc/release.md
@@ -17,7 +17,7 @@ Url for the CI is: https://github.com/astrolabsoftware/fink-broker/actions
 ciux get deps ./fink-broker -l release
 ```
 
-  Clone all the necessary repositories and ensure you are using their `main` branch.
+  Clone all the necessary repositories and ensure you are using their `master/main` branch.
 ## Get Release Tag

From 676bff65e641ad2410b0346949bb5209736c2a2c Mon Sep 17 00:00:00 2001
From: Fabrice Jammes
Date: Fri, 27 Dec 2024 12:38:57 +0100
Subject: [PATCH 5/5] Test both hdfs and s3 in GHA

---
 .github/workflows/e2e-common.yml   | 18 ++++++++------
 .github/workflows/e2e-gha.yml      |  4 ---
 chart/templates/job-hdfs-init.yaml |  2 ++
 e2e/argocd.sh                      | 40 +++++++++++++++++++++++++++++-
 e2e/run.sh                         | 19 ++++++++------
 fink_broker/spark_utils.py         |  2 +-
 6 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/e2e-common.yml b/.github/workflows/e2e-common.yml
index 7e962f93..98e40a50 100644
--- a/.github/workflows/e2e-common.yml
+++ b/.github/workflows/e2e-common.yml
@@ -6,28 +6,29 @@ on:
         required: true
         type: string
       ci_repo:
-        required: true
+        description: 'Intermediate registry to use'
+        required: false
         type: string
+        default: ""
       runner:
         required: true
         type: string
       kind_version:
-        required: true
+        description: 'Kind version to use'
+        required: false
         type: string
+        default: "v0.20.0"
     secrets:
       registry_username:
         required: true
       registry_token:
         required: true
-      private_registry_username:
-        required: true
-      private_registry_token:
-        required: true
 env:
   CIUX_VERSION: v0.0.4-rc10
   GHA_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   SUFFIX: ${{ inputs.suffix }}
   CI_REPO: ${{ inputs.ci_repo }}
+  STORAGE: ${{ inputs.storage }}
   # Override the self-hosted runner value
   POD_NAMESPACE: default
 jobs:
@@ -78,6 +79,9 @@ jobs:
         name: docker-artifact
         path: artifacts
   integration-tests:
+    strategy:
+      matrix:
+        storage: [hdfs, s3]
     name: Run integration tests
     runs-on: ${{ fromJSON(inputs.runner) }}
     outputs:
@@ -140,7 +144,7 @@ jobs:
       #     detached: true
       - name: Run argoCD
         run: |
-          ./e2e/argocd.sh
+          ./e2e/argocd.sh -S "${{ matrix.storage }}"
       - name: Check results
         run: |
           ./e2e/check-results.sh
diff --git a/.github/workflows/e2e-gha.yml b/.github/workflows/e2e-gha.yml
index 52874541..26dc2803 100644
--- a/.github/workflows/e2e-gha.yml
+++ b/.github/workflows/e2e-gha.yml
@@ -12,11 +12,7 @@ jobs:
     uses: ./.github/workflows/e2e-common.yml
     with:
       suffix: "noscience"
-      ci_repo: ""
       runner: "['ubuntu-22.04']"
-      kind_version: "v0.20.0"
     secrets:
       registry_username: ${{ secrets.REGISTRY_USERNAME }}
       registry_token: ${{ secrets.REGISTRY_TOKEN }}
-      private_registry_username: ${{ secrets.PRIVATE_REGISTRY_USERNAME }}
-      private_registry_token: ${{ secrets.PRIVATE_REGISTRY_TOKEN }}
diff --git a/chart/templates/job-hdfs-init.yaml b/chart/templates/job-hdfs-init.yaml
index 0ab3941b..4f966be6 100644
--- a/chart/templates/job-hdfs-init.yaml
+++ b/chart/templates/job-hdfs-init.yaml
@@ -1,3 +1,4 @@
+{{ if eq .Values.storage "hdfs" -}}
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -23,3 +24,4 @@ spec:
         - name: HADOOP_USER_NAME
           value: stackable
       restartPolicy: OnFailure
+{{- end }}
diff --git a/e2e/argocd.sh b/e2e/argocd.sh
index 6f163418..27f2a968 100755
--- a/e2e/argocd.sh
+++ b/e2e/argocd.sh
@@ -9,6 +9,27 @@ set -euxo pipefail
 
 DIR=$(cd "$(dirname "$0")"; pwd -P)
 
+storage="hdfs"
+
+usage() {
+    cat << EOD
+Usage: $(basename "$0") [options]
+Available options:
+  -h          This message
+  -S          Storage to use (hdfs or s3)
+EOD
+}
+
+# Get the options
+while getopts hS: c ; do
+    case $c in
+        h) usage ; exit 0 ;;
+        S) storage="$OPTARG" ;;
+        \?) usage ; exit 2 ;;
+    esac
+done
+shift "$((OPTIND-1))"
+
 CIUXCONFIG=${CIUXCONFIG:-"$HOME/.ciux/ciux.sh"}
 . $CIUXCONFIG
@@ -36,11 +57,26 @@ e2e_enabled="true"
 argocd login --core
 kubectl config set-context --current --namespace="$NS"
 
+if [ $storage == "s3" ]
+then
+    hdfs_enabled="false"
+    s3_enabled="true"
+    online_data_prefix=""
+elif [ $storage == "hdfs" ]
+then
+    hdfs_enabled="true"
+    s3_enabled="false"
+    online_data_prefix="hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020///user/185"
+fi
+
+
 # Create fink app
 argocd app create fink --dest-server https://kubernetes.default.svc \
     --dest-namespace "$NS" \
     --repo https://github.com/astrolabsoftware/fink-cd.git \
     --path apps --revision "$FINK_CD_WORKBRANCH" \
+    -p s3.enabled="$s3_enabled" \
+    -p hdfs.enabled="$hdfs_enabled" \
     -p spec.source.targetRevision.default="$FINK_CD_WORKBRANCH" \
     -p spec.source.targetRevision.finkbroker="$FINK_BROKER_WORKBRANCH" \
     -p spec.source.targetRevision.finkalertsimulator="$FINK_ALERT_SIMULATOR_WORKBRANCH"
@@ -61,7 +97,9 @@ argocd app set fink-broker -p image.repository="$CIUX_IMAGE_REGISTRY" \
     -p e2e.enabled="$e2e_enabled" \
     -p image.tag="$CIUX_IMAGE_TAG" \
     -p log_level="DEBUG" \
-    -p night="20200101"
+    -p night="20200101" \
+    -p online_data_prefix="$online_data_prefix" \
+    -p storage="$storage"
 
 argocd app set fink-alert-simulator -p image.tag="$FINK_ALERT_SIMULATOR_VERSION"
 
diff --git a/e2e/run.sh b/e2e/run.sh
index 711727a8..fc58b8aa 100755
--- a/e2e/run.sh
+++ b/e2e/run.sh
@@ -30,26 +30,29 @@ build=false
 e2e=false
 monitoring=false
 push=false
+storage="hdfs"
 CIUX_IMAGE_URL="undefined"
 
 token="${TOKEN:-}"
 
 # Get options for suffix
-while getopts hcms opt; do
+while getopts hcmsS: opt; do
   case ${opt} in
-    s )
-      SUFFIX=""
-      ;;
+
     c )
      cleanup=true
      ;;
-    m )
-      monitoring=true
-      ;;
    h )
      usage
      exit 0
      ;;
+    m )
+      monitoring=true
+      ;;
+    s )
+      SUFFIX=""
+      ;;
+    S) storage="$OPTARG" ;;
    \? )
      usage
      exit 1
@@ -128,7 +131,7 @@ if [ $CIUX_BUILD = true ]; then
 fi
 
 echo "Run ArgoCD to install the whole fink e2e tests stack"
-$DIR/argocd.sh
+$DIR/argocd.sh -S "$storage"
 
 echo "Check the results of the tests."
 $DIR/check-results.sh
diff --git a/fink_broker/spark_utils.py b/fink_broker/spark_utils.py
index ebe2d6b3..088e477b 100644
--- a/fink_broker/spark_utils.py
+++ b/fink_broker/spark_utils.py
@@ -340,7 +340,7 @@ def connect_to_raw_database(basepath: str, path: str, latestfirst: bool) -> Data
     while True:
         try:
             userschema = spark.read.parquet(basepath).schema
-        except Exception as e:
+        except Exception as e:  # noqa: PERF203
             _LOG.error("Error while reading %s, %s", basepath, e)
             time.sleep(wait_sec)
             wait_sec = increase_wait_time(wait_sec)
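
Note on the retry loop in `connect_to_raw_database()` touched by the last hunk: `increase_wait_time()` is not shown in this series. A minimal sketch of the capped exponential backoff it is assumed to implement is given below; the doubling factor and the 60-second cap are assumptions, not values taken from fink-broker.

```python
# Illustrative sketch only: a capped exponential backoff matching the retry
# loop in connect_to_raw_database(). The doubling factor and 60 s cap are
# assumptions; fink-broker's actual increase_wait_time() may differ.
import time


def increase_wait_time(wait_sec: float, factor: float = 2.0, cap: float = 60.0) -> float:
    """Return the next delay, growing geometrically up to a cap."""
    return min(wait_sec * factor, cap)


def wait_for(action, wait_sec: float = 1.0):
    """Retry action() until it succeeds, backing off between attempts."""
    while True:
        try:
            return action()
        except Exception:
            time.sleep(wait_sec)
            wait_sec = increase_wait_time(wait_sec)
```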