diff --git a/.gitattributes b/.gitattributes
index 8f4aec0b7..8e5c1acf0 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,7 +1,7 @@
###############################
# Git Line Endings #
# Set default behaviour to automatically normalize line endings.
-* text eolf=lf
+* text eol=lf
# Force batch scripts to always use CRLF line endings so that if a repo is accessed
# in Windows via a file share from Linux, the scripts will work.
*.{cmd,[cC][mM][dD]} text eol=crlf
diff --git a/.github/workflows/_docker-template.yml b/.github/workflows/_docker-template.yml
index 7d186bddc..c41244868 100644
--- a/.github/workflows/_docker-template.yml
+++ b/.github/workflows/_docker-template.yml
@@ -114,7 +114,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
- name: Docker metadata (again)
- id: meta
+ id: meta_manifest
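+        # distinct step id: avoids clobbering the outputs of the earlier "meta" step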
uses: docker/metadata-action@v5
with:
images: ${{ inputs.image }}
@@ -141,4 +141,4 @@ jobs:
--tag "${img}:${tag}" \
"${img}:${tag}-amd64" \
"${img}:${tag}-arm64"
- done < <(printf "%s" "${{ steps.meta.outputs.tags }}")
+ done < <(printf "%s" "${{ steps.meta_manifest.outputs.tags }}")
diff --git a/.github/workflows/docker-nifi.yml b/.github/workflows/docker-nifi.yml
index a99e66f9a..576df0c27 100644
--- a/.github/workflows/docker-nifi.yml
+++ b/.github/workflows/docker-nifi.yml
@@ -17,7 +17,7 @@ jobs:
uses: ./.github/workflows/_docker-template.yml
with:
image: cogstacksystems/cogstack-nifi
- context: nifi
+ context: .
dockerfile: nifi/Dockerfile
cache_scope: nifi
secrets: inherit
diff --git a/.gitignore b/.gitignore
index 564452318..bae75c1c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,9 +5,18 @@
.vscode
.venv
.ruff_cache
+.mypy_cache
venv
**__pycache__
**/venv
+*.pyc
+.pyc
+build
+
+# Ignore setuptools metadata
+*.egg-info/
+*.egg-info
+**/*.egg-info/
# keys and certificates
*.pem
@@ -37,6 +46,7 @@ security/templates/**
docs/build/*
# Ignore all .env files at any level
+.env
*.env
**/*.env
!*.env.template
diff --git a/deploy/database.env b/deploy/database.env
index d4ad0d7fc..478e13c66 100644
--- a/deploy/database.env
+++ b/deploy/database.env
@@ -1,9 +1,13 @@
-# production db name
-POSTGRES_DATABANK_DB=cogstack
-
POSTGRES_DB_MAX_CONNECTIONS=100
# Prefix of file names to load the DB schema for in /services/cogstack-db/(pgsql/mssql)/schemas/ folder
POSTGRES_DB_SCHEMA_PREFIX="cogstack_db"
-POSTGRES_SHM_SIZE="1g"
+# production db name
+DATABASE_DB_NAME=cogstack
+
+DATABASE_DOCKER_SHM_SIZE=1g
+
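+# These *_DOCKER_CPU_* / *_DOCKER_RAM values feed the deploy.resources
+# limits/reservations blocks in services.yml.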
+DATABASE_DOCKER_CPU_MIN=1
+DATABASE_DOCKER_CPU_MAX=1
+DATABASE_DOCKER_RAM=1g
diff --git a/deploy/elasticsearch.env b/deploy/elasticsearch.env
index b084dba38..9b3616f55 100644
--- a/deploy/elasticsearch.env
+++ b/deploy/elasticsearch.env
@@ -9,10 +9,10 @@ ELASTICSEARCH_VERSION=opensearch
# possible values :
# - elasticsearch : docker.elastic.co/elasticsearch/elasticsearch:8.18.2
# - elasticsearch (custom cogstack image) : cogstacksystems/cogstack-elasticsearch:latest
-# - opensearch : opensearchproject/opensearch:3.2.0
+# - opensearch : opensearchproject/opensearch:3.4.0
# the custom cogstack image is always based on the last image of ES native
-ELASTICSEARCH_DOCKER_IMAGE=opensearchproject/opensearch:3.2.0
+ELASTICSEARCH_DOCKER_IMAGE=opensearchproject/opensearch:3.4.0
ELASTICSEARCH_LOG_LEVEL=INFO
@@ -88,9 +88,14 @@ ELASTICSEARCH_BACKUP_PARTITION_CONFIG=../data/es_snapshot_backups/config_backup
ELASTICSEARCH_SECURITY_DIR=../security/certificates/elastic/
# MEMORY CONFIG
-ELASTICSEARCH_JAVA_OPTS="-Xms2048m -Xmx2048m -Des.failure_store_feature_flag_enabled=true"
+ELASTICSEARCH_JAVA_OPTS="-Xms512m -Xmx512m -Des.failure_store_feature_flag_enabled=true"
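+# Rule of thumb: keep the JVM heap (-Xms/-Xmx) at or below ~50% of
+# ELASTICSEARCH_DOCKER_RAM so the remainder stays available for the OS page cache.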
+
+ELASTICSEARCH_DOCKER_CPU_MIN=1
+ELASTICSEARCH_DOCKER_CPU_MAX=1
+ELASTICSEARCH_DOCKER_RAM=1g
+
+ELASTICSEARCH_DOCKER_SHM_SIZE=512m
-ELASTICSEARCH_SHM_SIZE="1g"
ELASTICSEARCH_DOCKER_LOG_SIZE_PER_FILE="1000m"
ELASTICSEARCH_DOCKER_LOG_NUM_FILES=10
@@ -140,9 +145,6 @@ ELASTICSEARCH_HOSTS='["https://elasticsearch-1:9200","https://elasticsearch-2:92
KIBANA_HOST="https://kibana:5601"
-KIBANA_SERVER_NAME="cogstack-kibana"
-
-
########################################################################## KIBANA Env vars ###########################################################################
# NOTE: some variables from the Elasticsearch section are used
# - ${ELASTICSEARCH_VERSION} is used for certificate paths, as well as kibana.yml config path.
@@ -158,15 +160,15 @@ KIBANA_VERSION=opensearch-dashboards
# - kibana
# - opensearch_dashboards # make note of the underscore...
-KIBANA_CONFIG_FILE_VERSION=opensearch_dashboards
+KIBANA_CONFIG_FILE_VERSION=opensearch_dashboards
# possible values:
# - elasticsearch : docker.elastic.co/kibana/kibana:8.18.2
# - elasticsearch (custom cogstack image) : cogstacksystems/cogstack-kibana:latest
-# - opensearch : opensearchproject/opensearch-dashboards:3.2.0
+# - opensearch : opensearchproject/opensearch-dashboards:3.4.0
# the custom cogstack image is always based on the last image of ES native
-ELASTICSEARCH_KIBANA_DOCKER_IMAGE=opensearchproject/opensearch-dashboards:3.2.0
+ELASTICSEARCH_KIBANA_DOCKER_IMAGE=opensearchproject/opensearch-dashboards:3.4.0
KIBANA_SERVER_NAME="cogstack-kibana"
KIBANA_PUBLIC_BASE_URL="https://elasticsearch-1:5601"
@@ -174,7 +176,11 @@ KIBANA_PUBLIC_BASE_URL="https://elasticsearch-1:5601"
KIBANA_SERVER_HOST="0.0.0.0"
KIBANA_SERVER_OUTPUT_PORT=5601
-KIBANA_SHM_SIZE="1g"
+KIBANA_DOCKER_SHM_SIZE=512m
+KIBANA_DOCKER_CPU_MIN=1
+KIBANA_DOCKER_CPU_MAX=1
+KIBANA_DOCKER_RAM=1g
+
# this is used in Kibana
# it needs to be generated via the API
@@ -201,6 +207,10 @@ ELASTICSEARCH_XPACK_SECURITY_REPORTING_ENCRYPTION_KEY="e0Y1gTxHWOopIWMTtpjQsDS6K
METRICBEAT_IMAGE="docker.elastic.co/beats/metricbeat:8.18.2"
+METRICBEAT_DOCKER_SHM=512m
+METRICBEAT_DOCKER_CPU_MIN=1
+METRICBEAT_DOCKER_CPU_MAX=1
+METRICBEAT_DOCKER_RAM=1g
########################################################################## FILEBEAT Env vars ###########################################################################
@@ -213,3 +223,9 @@ FILEBEAT_STARTUP_COMMAND="-e --strict.perms=false"
FILEBEAT_HOST="https://elasticsearch-1:9200"
FILEBEAT_IMAGE="docker.elastic.co/beats/filebeat:8.18.2"
+
+
+FILEBEAT_DOCKER_SHM=512m
+FILEBEAT_DOCKER_CPU_MIN=1
+FILEBEAT_DOCKER_CPU_MAX=1
+FILEBEAT_DOCKER_RAM=1g
diff --git a/deploy/export_env_vars.sh b/deploy/export_env_vars.sh
index ea8266095..2ee8a95cf 100755
--- a/deploy/export_env_vars.sh
+++ b/deploy/export_env_vars.sh
@@ -3,12 +3,15 @@
# Enable strict mode (without -e to avoid exit-on-error)
set -uo pipefail
+# Support being sourced in shells where BASH_SOURCE is unset (e.g. zsh)
+SCRIPT_SOURCE="${BASH_SOURCE[0]-$0}"
+SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_SOURCE")" && pwd)"
+SCRIPT_NAME="$(basename "$SCRIPT_SOURCE")"
-echo "🔧 Running $(basename "${BASH_SOURCE[0]}")..."
+echo "🔧 Running $SCRIPT_NAME..."
set -a
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEPLOY_DIR="$SCRIPT_DIR"
SECURITY_DIR="$SCRIPT_DIR/../security/env"
SERVICES_DIR="$SCRIPT_DIR/../services"
@@ -38,6 +41,22 @@ env_files=(
"$SERVICES_DIR/cogstack-nlp/medcat-service/env/medcat.env"
)
+LINT_SCRIPT="$SCRIPT_DIR/../nifi/user_scripts/utils/lint_env.py"
+
+if [ -e "$LINT_SCRIPT" ]; then
+    chmod +x "$LINT_SCRIPT"
+fi
+
+if [ -x "$LINT_SCRIPT" ]; then
+ echo "🔍 Validating env files..."
+ if ! python3 "$LINT_SCRIPT" "${env_files[@]}"; then
+ echo "❌ Env validation failed. Fix the errors above before continuing."
+ exit 1
+ fi
+else
+ echo "⚠️ Skipping env validation; $LINT_SCRIPT not found or not executable."
+fi
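+
+# Assumed usage: source this script rather than executing it, so the exported
+# variables persist in the current shell, e.g.:
+#   source deploy/export_env_vars.sh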
+
for env_file in "${env_files[@]}"; do
if [ -f "$env_file" ]; then
echo "✅ Sourcing $env_file"
@@ -56,4 +75,4 @@ set +a
# Restore safe defaults for interactive/dev shell
set +u
-set +o pipefail
\ No newline at end of file
+set +o pipefail
diff --git a/deploy/gitea.env b/deploy/gitea.env
index 0009d5759..e2ef85779 100644
--- a/deploy/gitea.env
+++ b/deploy/gitea.env
@@ -23,3 +23,8 @@ GITEA_LOCAL_PUB_KEY_PATH="$GITEA_LOCAL_KEY_PATH.pub"
GITEA_LOCAL_KEY_TITLE="gitea-cogstack-$(hostname)-$(date +%s)"
GITEA_DEFAULT_MAIN_REMOTE_NAME="cogstack-gitea"
+
+GITEA_DOCKER_SHM_SIZE=512m
+GITEA_DOCKER_CPU_MIN=1
+GITEA_DOCKER_CPU_MAX=1
+GITEA_DOCKER_RAM=1g
diff --git a/deploy/nginx.env b/deploy/nginx.env
index aae2c825d..a08762ca2 100644
--- a/deploy/nginx.env
+++ b/deploy/nginx.env
@@ -1,3 +1,9 @@
NGINX_KIBANA_HOST=kibana
NGINX_KIBANA_PROXY_PORT=5601
NGINX_ES_NODE_SOURCE_INSTANCE_NAME="elasticsearch-1"
+
+
+NGINX_SHM_SIZE=1g
+NGINX_DOCKER_CPU_MIN=1
+NGINX_DOCKER_CPU_MAX=1
+NGINX_DOCKER_RAM=1g
diff --git a/deploy/nifi.env b/deploy/nifi.env
index dcc710c06..c9d007d7a 100644
--- a/deploy/nifi.env
+++ b/deploy/nifi.env
@@ -1,16 +1,46 @@
-# NiFi
+
+
+##############################################################################################################################
+# IMPORTANT: RESOURCE-SCOPED SETTINGS FOR DEPLOYMENTS
+##############################################################################################################################
+NIFI_JVM_OPTS="-XX:+UseG1GC -XX:MaxGCPauseMillis=250 -XX:+ParallelRefProcEnabled -Djava.security.egd=file:/dev/./urandom"
+NIFI_JVM_HEAP_INIT=768m
+NIFI_JVM_HEAP_MAX=1g
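+# NOTE: keep NIFI_JVM_HEAP_MAX below NIFI_DOCKER_RAM so the JVM retains
+# headroom for metaspace, threads and direct buffers inside the container.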
+
+
+NIFI_DOCKER_SHM_SIZE=1g
+NIFI_DOCKER_REGISTRY_SHM_SIZE=1g
+
+NIFI_DOCKER_CPU_MIN=1
+NIFI_DOCKER_CPU_MAX=1
+NIFI_DOCKER_RAM=1g
+
+NIFI_REGISTRY_DOCKER_CPU_MIN=1
+NIFI_REGISTRY_DOCKER_CPU_MAX=1
+NIFI_REGISTRY_DOCKER_RAM=1g
+
+NIFI_DOCKER_LOG_SIZE_PER_FILE="250m"
+NIFI_DOCKER_LOG_NUM_FILES=10
+
+##############################################################################################################################
+
+NIFI_VERSION="2.7.2"
+NIFI_REGISTRY_VERSION=$NIFI_VERSION
+
+# NiFi/NiFi Registry Docker image
+NIFI_DOCKER_IMAGE="cogstacksystems/cogstack-nifi:latest"
+NIFI_REGISTRY_DOCKER_IMAGE="apache/nifi-registry:${NIFI_REGISTRY_VERSION:-2.7.2}"
+
+
+##############################################################################################################################
+# NIFI SECTION
+##############################################################################################################################
NIFI_ENV_FILE="./nifi.env"
NIFI_SECURITY_DIR="../security/certificates/nifi/"
NIFI_DATA_PATH="../data/"
-NIFI_VERSION="2.7.2"
NIFI_TOOLKIT_VERSION=$NIFI_VERSION
-NIFI_SHM_SIZE="1g"
-NIFI_REGISTRY_SHM_SIZE="1g"
-NIFI_DOCKER_LOG_SIZE_PER_FILE="250m"
-NIFI_DOCKER_LOG_NUM_FILES=10
-
#### Port and network settings
NIFI_WEB_PROXY_CONTEXT_PATH="/nifi"
@@ -26,8 +56,6 @@ NIFI_INTERNAL_PORT=8443
# this is for the nginx service
NIFI_EXTERNAL_PORT_NGINX=8443
NIFI_INTERNAL_PORT_NGINX=8443
-NIFI_REGISTRY_EXTERNAL_PORT_NGINX=18443
-NIFI_REGISTRY_INTERNAL_PORT_NGINX=18443
NIFI_OUTPUT_PORT=8082
NIFI_INPUT_SOCKET_PORT=10000
@@ -50,6 +78,7 @@ NIFI_PYTHON_EXTENSIONS_SOURCE_DIRECTORY_DEFAULT="/opt/nifi/nifi-current/python_e
# nifi.python.working.directory=/opt/nifi/user-scripts
NIFI_PYTHON_WORKING_DIRECTORY="/opt/nifi/user-scripts"
+NIFI_PYTHONPATH=$NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY
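+# Exposed to the container as PYTHONPATH (see services.yml), where it defaults
+# to /opt/nifi/nifi-current/python/framework.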
NIFI_LOG_LEVEL="ERROR"
@@ -58,9 +87,6 @@ NIFI_AUTH=tls
NIFI_KEYSTORE_PATH="/security/certificates/nifi/nifi-keystore.jks"
NIFI_TRUSTSTORE_PATH="/security/certificates/nifi/nifi-truststore.jks"
-NIFI_REGISTRY_KEYSTORE_PATH="/opt/nifi-registry/nifi-registry-current/conf/keystore.jks"
-NIFI_REGISTRY_TRUSTSTORE_PATH="/opt/nifi-registry/nifi-registry-current/conf/truststore.jks"
-
NIFI_KEYSTORE_TYPE=JKS
NIFI_TRUSTSTORE_TYPE=JKS
@@ -70,10 +96,16 @@ NIFI_INITIAL_ADMIN_IDENTITY="cogstack"
##############################################################################################################################
# NIFI REGISTRY FLOW SECTION
##############################################################################################################################
-NIFI_REGISTRY_VERSION=$NIFI_VERSION
NIFI_REGISTRY_DB_DIR=/opt/nifi-registry/nifi-registry-current/database
NIFI_REGISTRY_FLOW_PROVIDER=file
NIFI_REGISTRY_FLOW_STORAGE_DIR=/opt/nifi-registry/nifi-registry-current/flow_storage
NIFI_REGISTRY_FLOW_OUTPUT_PORT=8083
NIFI_REGISTRY_FLOW_INPUT_PORT=18443
+
+NIFI_REGISTRY_EXTERNAL_PORT_NGINX=18443
+NIFI_REGISTRY_INTERNAL_PORT_NGINX=18443
+
+NIFI_REGISTRY_KEYSTORE_PATH="/security/certificates/nifi/nifi-keystore.jks"
+NIFI_REGISTRY_TRUSTSTORE_PATH="/security/certificates/nifi/nifi-truststore.jks"
+
diff --git a/deploy/services-dev.yml b/deploy/services-dev.yml
deleted file mode 100644
index 593af6ec9..000000000
--- a/deploy/services-dev.yml
+++ /dev/null
@@ -1,259 +0,0 @@
-#---------------------------------------------------------------------------#
-# Used services #
-#---------------------------------------------------------------------------#
-services:
-
-#---------------------------------------------------------------------------#
-# NiFi webapp #
-#---------------------------------------------------------------------------#
- nifi:
- # image: cogstacksystems/cogstack-nifi:latest
- build:
- context: ../nifi/
- args:
- HTTP_PROXY: $HTTP_PROXY
- HTTPS_PROXY: $HTTPS_PROXY
- no_proxy: $no_proxy
- container_name: cogstack-nifi
- hostname: nifi
- restart: always
- env_file:
- - ./general.env
- - ./project.env
- - ./nifi.env
- - ./elasticsearch.env
- - ./network_settings.env
- - ../security/users_nifi.env
- - ../security/users_elasticsearch.env
- - ../security/certificates_general.env
- - ../security/certificates_elasticsearch.env
- - ../security/certificates_nifi.env
- shm_size: 1024mb
- environment:
- - USER_ID=${NIFI_UID:-1000}
- - GROUP_ID=${NIFI_GID:-1000}
- - NIFI_WEB_PROXY_HOST=${NIFI_WEB_PROXY_HOST:-"localhost:8443"}
- - NIFI_WEB_PROXY_CONTEXT_PATH=${NIFI_WEB_PROXY_CONTEXT_PATH:-"/nifi"}
- - NIFI_INTERNAL_PORT=${NIFI_INTERNAL_PORT:-8443}
- - NIFI_OUTPUT_PORT=${NIFI_OUTPUT_PORT:-8082}
- - NIFI_INPUT_SOCKET_PORT=${NIFI_INPUT_SOCKET_PORT:-10000}
- - NIFI_SECURITY_DIR=${NIFI_SECURITY_DIR:-../security/nifi_certificates/}
- - ELASTICSEARCH_SECURITY_DIR=${ELASTICSEARCH_SECURITY_DIR:-../security/es_certificates/}
- volumes:
- # INFO: drivers folder
- - ../nifi/drivers:/opt/nifi/drivers
-
- # INFO: if there are local changes, map these content from local host to container
- # (normally, these 3 directories below are bundled with our NiFi image)
- # N.B. The container user may not have the permission to read these directories/files.
- - ../nifi/user-templates:/opt/nifi/nifi-current/conf/templates:rw
- - ../nifi/user-scripts:/opt/nifi/user-scripts:rw
- - ../nifi/user-schemas:/opt/nifi/user-schemas:rw
-
- # this is a direct mapping to where we store the NiFi python processors as of NiFi 2.0.x
- - ../nifi/user-python-extensions:/opt/nifi/nifi-current/python_extensions:rw
-
- # INFO: uncomment below to map security certificates if need to secure NiFi endpoints
- - ./${NIFI_SECURITY_DIR:-../security/nifi_certificates/}:/opt/nifi/nifi-current/nifi_certificates:ro
- - ./${ELASTICSEARCH_SECURITY_DIR:-../security/es_certificates/}:/opt/nifi/nifi-current/es_certificates:ro
- - ./${NIFI_SECURITY_DIR:-../security/nifi_certificates/}nifi-keystore.jks:/opt/nifi/nifi-current/conf/keystore.jks
- - ./${NIFI_SECURITY_DIR:-../security/nifi_certificates/}nifi-truststore.jks:/opt/nifi/nifi-current/conf/truststore.jks
-
- # Security credentials scripts
- - ../security/nifi_create_single_user_auth.sh:/opt/nifi/nifi-current/security_scripts/nifi_create_single_user_auth.sh:ro
-
- # # Nifi properties file:
- - ../nifi/conf/:/opt/nifi/nifi-current/conf/:rw
-
- # this is where you should place data to be ingested, under the form of symbolic
- - ./${NIFI_DATA_PATH:-../data/}:/opt/data/:rw
-
- # DB-schemas, from the services folder
- - ../services/cogstack-db/:/opt/cogstack-db/:rw
-
- # medcat models
- - ./${RES_MEDCAT_SERVICE_MODEL_PRODUCTION_PATH:-../services/nlp-services/medcat-service/models/}:/opt/models:rw
-
- # rest of volumes to persist the state
- - nifi-vol-logs:/opt/nifi/nifi-current/logs
- - nifi-vol-provenance:/opt/nifi/nifi-current/provenance_repository
- - nifi-vol-database:/opt/nifi/nifi-current/database_repository
- - nifi-vol-flowfiles:/opt/nifi/nifi-current/flowfile_repository
- - nifi-vol-content:/opt/nifi/nifi-current/content_repository
- - nifi-vol-state:/opt/nifi/nifi-current/state
-
- # errors generated during data processing
- - nifi-vol-errors:/opt/nifi/pipeline/flowfile-errors
-
- extra_hosts:
- - ${ELASTICSEARCH_1_HOST_NAME:-test-1:0.0.0.0}
- - ${ELASTICSEARCH_2_HOST_NAME:-test-2:0.0.0.0}
- - ${ELASTICSEARCH_3_HOST_NAME:-test-3:0.0.0.0}
- - ${KIBANA_HOST_NAME:-test-4:0.0.0.0}
- - ${NIFI_HOST_NAME:-test-5:0.0.0.0}
- - ${NIFI_REGISTRY_HOST_NAME:-test-6:0.0.0.0}
-
- # user: "${NIFI_UID:-1000}:${NIFI_GID:-1000}"
- ulimits:
- memlock:
- soft: -1
- hard: -1
- nofile:
- soft: 65536
- hard: 262144
-
- # INFO : Uncomment the below line to generate your own USERNAME and PASSWORD,
- # a bit messy this way as you will need to copy the credentials back
- # to the "login-identity-providers.xml" section.
- # entrypoint: bash -c "/opt/nifi/nifi-current/bin/nifi.sh set-single-user-credentials admin admincogstacknifi"
-
- tty: true
- ports:
- - "${NIFI_OUTPUT_PORT:-8082}:${NIFI_INTERNAL_PORT:-8443}"
- - "${NIFI_INPUT_SOCKET_PORT:-10000}"
- networks:
- - cognet
-
- nifi-registry-flow:
- image: apache/nifi-registry:${NIFI_REGISTRY_VERSION:-2.7.1}
- hostname: nifi-registry
- container_name: cogstack-nifi-registry-flow
- restart: always
- user: root
- env_file:
- - ./general.env
- - ./network_settings.env
- - ./nifi.env
- - ./project.env
- - ../security/users_nifi.env
- - ../security/users_elasticsearch.env
- - ../security/certificates_general.env
- - ../security/certificates_elasticsearch.env
- - ../security/certificates_nifi.env
- environment:
- - http_proxy=$HTTP_PROXY
- - https_proxy=$HTTPS_PROXY
- - no_proxy=$no_proxy
- - USER_ID=${NIFI_UID:-1000}
- - GROUP_ID=${NIFI_GID:-1000}
- - KEYSTORE_PATH=${NIFI_REGISTRY_KEYSTORE_PATH:-./conf/keystore.jks}
- - KEYSTORE_TYPE=${NIFI_KEYSTORE_TYPE:-jks}
- - KEYSTORE_PASSWORD=${NIFI_KEYSTORE_PASSWORD:-"cogstackNifi"}
- - TRUSTSTORE_PASSWORD=${NIFI_TRUSTSTORE_PASSWORD:-"cogstackNifi"}
- - TRUSTSTORE_PATH=${NIFI_REGISTRY_TRUSTSTORE_PATH:-./conf/truststore.jks}
-
- - TRUSTSTORE_TYPE=${NIFI_TRUSTSTORE_TYPE:-jks}
- - INITIAL_ADMIN_IDENTITY=${NIFI_INITIAL_ADMIN_IDENTITY:-"CN=admin, OU=nifi"}
- - AUTH=${NIFI_AUTH:-"tls"}
- - NIFI_REGISTRY_DB_DIR=${NIFI_REGISTRY_DB_DIR:-/opt/nifi-registry/nifi-registry-current/database}
- #- NIFI_REGISTRY_FLOW_PROVIDER=${NIFI_REGISTRY_FLOW_PROVIDER:-file}
- - NIFI_REGISTRY_FLOW_STORAGE_DIR=${NIFI_REGISTRY_FLOW_STORAGE_DIR:-/opt/nifi-registry/nifi-registry-current/flow_storage}
- volumes:
- - ../nifi/nifi-registry/:/opt/nifi-registry/nifi-registry-current/conf/:rw
- - ./${NIFI_SECURITY_DIR:-../security/nifi_certificates/}nifi-keystore.jks:/opt/nifi-registry/nifi-registry-current/conf/keystore.jks:ro
- - ./${NIFI_SECURITY_DIR:-../security/nifi_certificates/}nifi-truststore.jks://opt/nifi-registry/nifi-registry-current/conf/truststore.jks:ro
- - nifi-registry-vol-database:/opt/nifi-registry/nifi-registry-current/database
- - nifi-registry-vol-flow-storage:/opt/nifi-registry/nifi-registry-current/flow_storage
- - nifi-registry-vol-work:/opt/nifi-registry/nifi-registry-current/work
- - nifi-registry-vol-logs:/opt/nifi-registry/nifi-registry-current/logs
- extra_hosts:
- - ${NIFI_HOST_NAME:-test-5:0.0.0.0}
- - ${NIFI_REGISTRY_HOST_NAME:-test-6:0.0.0.0}
-
- ulimits:
- memlock:
- soft: -1
- hard: -1
- nofile:
- soft: 65536
- hard: 262144
-
- tty: true
- ports:
- - "${NIFI_REGISTRY_FLOW_OUTPUT_PORT:-8083}:${NIFI_REGISTRY_FLOW_INPUT_PORT:-18443}"
-
- entrypoint: bash -c "chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/database && \
- chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/flow_storage && \
- chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/work && \
- chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/logs && \
- bash /opt/nifi-registry/scripts/start.sh"
-
- networks:
- - cognet
-
- nifi-nginx:
- # image: cogstacksystems/nifi-nginx:latest
- build:
- context: ../services/nginx/
- args:
- HTTP_PROXY: $HTTP_PROXY
- HTTPS_PROXY: $HTTPS_PROXY
- no_proxy: $no_proxy
- container_name: cogstack-nifi-nginx
- restart: always
- env_file:
- - ./network_settings.env
- - ./nginx.env
- - ./nifi.env
- - ./elasticsearch.env
- - ./project.env
- - ./nlp_service.env
- volumes:
- - ../services/nginx/sites-enabled:/etc/nginx/sites-enabled:ro
- - ../services/nginx/config/nginx.conf.template:/etc/nginx/config/nginx.conf.template:rw
- - ../services/nginx/config/nginx.conf:/etc/nginx/nginx.conf:rw
- - ../security/root_certificates:/etc/nginx/root_certificates:ro
- - ../security/nifi_certificates:/etc/nginx/nifi_certificates:ro
-
- - ../security/es_certificates/${ELASTICSEARCH_VERSION:-opensearch}/elastic-stack-ca.crt.pem:/etc/nginx/es_certificates/elastic-stack-ca.crt.pem:ro
- - ../security/es_certificates/${ELASTICSEARCH_VERSION:-opensearch}/elastic-stack-ca.key.pem:/etc/nginx/es_certificates/elastic-stack-ca.key.pem:ro
- # - ../security/es_certificates/:/etc/nginx/es_certificates/:ro
- ports:
- - "${NIFI_EXTERNAL_PORT_NGINX:-8443}:${NIFI_INTERNAL_PORT_NGINX:-8443}"
- - "${NIFI_REGISTRY_EXTERNAL_PORT_NGINX:-18443}:${NIFI_REGISTRY_INTERNAL_PORT_NGINX:-18443}"
- networks:
- - cognet
- command: /bin/bash -c "envsubst < /etc/nginx/config/nginx.conf.template > /etc/nginx/config/nginx.conf && nginx -g 'daemon off;'"
-
-#---------------------------------------------------------------------------#
-# Docker named volumes #
-#---------------------------------------------------------------------------#
-volumes:
- # NiFi related
- nifi-vol-logs:
- driver: local
-
- nifi-vol-provenance:
- driver: local
-
- nifi-vol-database:
- driver: local
-
- nifi-vol-flowfiles:
- driver: local
-
- nifi-vol-content:
- driver: local
-
- nifi-vol-state:
- driver: local
-
- nifi-vol-errors:
- driver: local
-
- nifi-registry-vol-database:
- driver: local
- nifi-registry-vol-flow-storage:
- driver: local
- nifi-registry-vol-work:
- driver: local
- nifi-registry-vol-logs:
- driver: local
-
-#---------------------------------------------------------------------------#
-# Docker networks. #
-#---------------------------------------------------------------------------#
-networks:
- cognet:
- driver: bridge
- name: cogstack-net
diff --git a/deploy/services.dev.yml b/deploy/services.dev.yml
new file mode 100644
index 000000000..fcbac398c
--- /dev/null
+++ b/deploy/services.dev.yml
@@ -0,0 +1,282 @@
+#---------------------------------------------------------------------------#
+# Common snippets / anchors #
+#---------------------------------------------------------------------------#
+
+x-nifi-logging-common: &nifi-logging-common
+ driver: "json-file"
+ options:
+ max-size: ${NIFI_DOCKER_LOG_SIZE_PER_FILE:-250m}
+ max-file: ${NIFI_DOCKER_LOG_NUM_FILES:-10}
+
+x-all-env: &all-env
+ - ./project.env
+ - ./general.env
+ - ./nifi.env
+ - ./gitea.env
+ - ./nginx.env
+ - ./database.env
+ - ./elasticsearch.env
+ - ./network_settings.env
+ - ../security/env/users_nifi.env
+ - ../security/env/users_database.env
+ - ../security/env/users_nginx.env
+ - ../security/env/users_elasticsearch.env
+ - ../security/env/certificates_general.env
+ - ../security/env/certificates_elasticsearch.env
+ - ../security/env/certificates_nifi.env
+
+x-es-env: &es-env
+ - ./network_settings.env
+ - ./elasticsearch.env
+ - ../security/env/users_elasticsearch.env
+ - ../security/env/certificates_elasticsearch.env
+
+x-db-env: &db-env
+ - ./database.env
+ - ../security/env/users_database.env
+
+x-common-hosts: &common-hosts
+ - ${ELASTICSEARCH_1_HOST_NAME:-test-1:0.0.0.0}
+ - ${ELASTICSEARCH_2_HOST_NAME:-test-2:0.0.0.0}
+ - ${ELASTICSEARCH_3_HOST_NAME:-test-3:0.0.0.0}
+ - ${KIBANA_HOST_NAME:-test-4:0.0.0.0}
+ - ${NIFI_HOST_NAME:-test-5:0.0.0.0}
+ - ${NIFI_REGISTRY_HOST_NAME:-test-6:0.0.0.0}
+
+x-common-ulimits: &common-ulimits
+ ulimits:
+ nofile:
+ soft: 65535
+ hard: 65535
+ nproc: 65535
+ memlock:
+ soft: -1
+ hard: -1
+
+x-nifi-common: &nifi-common
+ <<: *common-ulimits
+ restart: unless-stopped
+ env_file: *all-env
+ extra_hosts: *common-hosts
+ networks:
+ - cognet
+
+x-nifi-volumes: &nifi-volumes
+ # Drivers
+ - ../nifi/drivers:/opt/nifi/drivers
+
+ # User overrides bundled in the image
+ - ../nifi/user_scripts:/opt/nifi/user_scripts:rw
+ - ../nifi/user_schemas:/opt/nifi/user_schemas:rw
+
+ # Python processors (NiFi 2.x)
+ - ../nifi/user_python_extensions:/opt/nifi/nifi-current/python_extensions:rw
+
+ # Security certificates
+ - ../security:/security:ro
+
+ # Security helper scripts
+ - ../security/scripts/nifi_create_single_user_auth.sh:/opt/nifi/nifi-current/security_scripts/nifi_create_single_user_auth.sh:ro
+
+ # NiFi configuration
+ - ../nifi/conf/:/opt/nifi/nifi-current/conf/:rw
+
+ # Ingest data directory
+ - ./${NIFI_DATA_PATH:-../data/}:/data/:rw
+
+ # DB schemas
+ - ../services/cogstack-db/:/opt/cogstack-db/:rw
+
+ # MedCAT models
+ - ./${RES_MEDCAT_SERVICE_MODEL_PRODUCTION_PATH:-../services/cogstack-nlp/medcat-service/models/}:/opt/models:rw
+
+ # NiFi repositories/state
+ - nifi-vol-logs:/opt/nifi/nifi-current/logs
+ - nifi-vol-provenance:/opt/nifi/nifi-current/provenance_repository
+ - nifi-vol-database:/opt/nifi/nifi-current/database_repository
+ - nifi-vol-flowfiles:/opt/nifi/nifi-current/flowfile_repository
+ - nifi-vol-content:/opt/nifi/nifi-current/content_repository
+ - nifi-vol-state:/opt/nifi/nifi-current/state
+
+ # Flowfile error output
+ - nifi-vol-errors:/opt/nifi/pipeline/flowfile-errors
+
+x-nifi-registry-volumes: &nifi-registry-volumes
+ # Registry configuration
+ - ../nifi/nifi-registry/:/opt/nifi-registry/nifi-registry-current/conf/:rw
+
+ # Security certificates
+ - ../security:/security:ro
+
+ # Registry persistence
+ - nifi-registry-vol-database:/opt/nifi-registry/nifi-registry-current/database
+ - nifi-registry-vol-flow-storage:/opt/nifi-registry/nifi-registry-current/flow_storage
+ - nifi-registry-vol-work:/opt/nifi-registry/nifi-registry-current/work
+ - nifi-registry-vol-logs:/opt/nifi-registry/nifi-registry-current/logs
+
+#---------------------------------------------------------------------------#
+# Used services #
+#---------------------------------------------------------------------------#
+services:
+
+#---------------------------------------------------------------------------#
+# NiFi webapp #
+#---------------------------------------------------------------------------#
+ nifi:
+ <<: *nifi-common
+ build:
+      context: ..
+ dockerfile: nifi/Dockerfile
+ args:
+ HTTP_PROXY: $HTTP_PROXY
+ HTTPS_PROXY: $HTTPS_PROXY
+ no_proxy: $no_proxy
+ container_name: cogstack-nifi
+ hostname: nifi
+    shm_size: ${NIFI_DOCKER_SHM_SIZE:-1g}
+ environment:
+ - USER_ID=${NIFI_UID:-1000}
+ - GROUP_ID=${NIFI_GID:-1000}
+ - NIFI_WEB_PROXY_HOST=${NIFI_WEB_PROXY_HOST:-"localhost:8443"}
+ - NIFI_WEB_PROXY_CONTEXT_PATH=${NIFI_WEB_PROXY_CONTEXT_PATH:-"/nifi"}
+ - NIFI_INTERNAL_PORT=${NIFI_INTERNAL_PORT:-8443}
+ - NIFI_OUTPUT_PORT=${NIFI_OUTPUT_PORT:-8082}
+ - NIFI_INPUT_SOCKET_PORT=${NIFI_INPUT_SOCKET_PORT:-10000}
+ - PYTHONPATH=${NIFI_PYTHONPATH:-/opt/nifi/nifi-current/python/framework}
+ - JVM_OPTS="${NIFI_JVM_OPTS:--XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+ParallelRefProcEnabled -Djava.security.egd=file:/dev/./urandom}"
+ deploy:
+ resources:
+ limits:
+ cpus: "${NIFI_DOCKER_CPU_MAX}"
+ memory: "${NIFI_DOCKER_RAM}"
+ reservations:
+ cpus: "${NIFI_DOCKER_CPU_MIN}"
+ memory: "${NIFI_DOCKER_RAM}"
+ volumes: *nifi-volumes
+
+ # INFO : Uncomment the below line to generate your own USERNAME and PASSWORD,
+ # a bit messy this way as you will need to copy the credentials back
+ # to the "login-identity-providers.xml" section.
+ # entrypoint: bash -c "/opt/nifi/nifi-current/bin/nifi.sh set-single-user-credentials admin admincogstacknifi"
+ tty: true
+ ports:
+ - "${NIFI_OUTPUT_PORT:-8082}:${NIFI_INTERNAL_PORT:-8443}"
+ - "${NIFI_INPUT_SOCKET_PORT:-10000}"
+ logging: *nifi-logging-common
+
+ nifi-registry-flow:
+ <<: *nifi-common
+ image: ${NIFI_REGISTRY_DOCKER_IMAGE:-apache/nifi-registry:${NIFI_REGISTRY_VERSION:-latest}}
+ hostname: nifi-registry
+ container_name: cogstack-nifi-registry-flow
+ shm_size: ${NIFI_DOCKER_REGISTRY_SHM_SIZE:-1g}
+ user: root
+ environment:
+ - http_proxy=$HTTP_PROXY
+ - https_proxy=$HTTPS_PROXY
+ - no_proxy=$no_proxy
+ - USER_ID=${NIFI_UID:-1000}
+ - GROUP_ID=${NIFI_GID:-1000}
+ - KEYSTORE_PATH=${NIFI_REGISTRY_KEYSTORE_PATH:-/security/certificates/nifi/nifi-keystore.jks}
+ - KEYSTORE_TYPE=${NIFI_KEYSTORE_TYPE:-jks}
+ - KEYSTORE_PASSWORD=${NIFI_KEYSTORE_PASSWORD:-"cogstackNifi"}
+ - TRUSTSTORE_PASSWORD=${NIFI_TRUSTSTORE_PASSWORD:-"cogstackNifi"}
+ - TRUSTSTORE_PATH=${NIFI_REGISTRY_TRUSTSTORE_PATH:-/security/certificates/nifi/nifi-truststore.jks}
+
+ - TRUSTSTORE_TYPE=${NIFI_TRUSTSTORE_TYPE:-jks}
+ - INITIAL_ADMIN_IDENTITY=${NIFI_INITIAL_ADMIN_IDENTITY:-"cogstack"}
+ - AUTH=${NIFI_AUTH:-"tls"}
+ - NIFI_REGISTRY_DB_DIR=${NIFI_REGISTRY_DB_DIR:-/opt/nifi-registry/nifi-registry-current/database}
+ #- NIFI_REGISTRY_FLOW_PROVIDER=${NIFI_REGISTRY_FLOW_PROVIDER:-file}
+ - NIFI_REGISTRY_FLOW_STORAGE_DIR=${NIFI_REGISTRY_FLOW_STORAGE_DIR:-/opt/nifi-registry/nifi-registry-current/flow_storage}
+ deploy:
+ resources:
+ limits:
+ cpus: "${NIFI_REGISTRY_DOCKER_CPU_MAX}"
+ memory: "${NIFI_REGISTRY_DOCKER_RAM}"
+ reservations:
+ cpus: "${NIFI_REGISTRY_DOCKER_CPU_MIN}"
+ memory: "${NIFI_REGISTRY_DOCKER_RAM}"
+ volumes: *nifi-registry-volumes
+ extra_hosts: *common-hosts
+ tty: true
+ ports:
+ - "${NIFI_REGISTRY_FLOW_OUTPUT_PORT:-8083}:${NIFI_REGISTRY_FLOW_INPUT_PORT:-18443}"
+
+ entrypoint: bash -c "chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/database && \
+ chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/flow_storage && \
+ chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/work && \
+ chown -R nifi:nifi /opt/nifi-registry/nifi-registry-current/logs && \
+ bash /opt/nifi-registry/scripts/start.sh"
+ logging: *nifi-logging-common
+
+ nifi-nginx:
+ image: cogstacksystems/nifi-nginx:latest
+ container_name: cogstack-nifi-nginx
+ restart: always
+ shm_size: ${NGINX_SHM_SIZE:-1g}
+ env_file: *all-env
+ deploy:
+ resources:
+ limits:
+ cpus: "${NGINX_DOCKER_CPU_MAX}"
+ memory: "${NGINX_DOCKER_RAM}"
+ reservations:
+ cpus: "${NGINX_DOCKER_CPU_MIN}"
+ memory: "${NGINX_DOCKER_RAM}"
+ volumes:
+ - ../services/nginx/sites-enabled:/etc/nginx/sites-enabled:ro
+ - ../services/nginx/config/nginx.conf.template:/etc/nginx/config/nginx.conf.template:rw
+ - ../services/nginx/config/nginx.conf:/etc/nginx/nginx.conf:rw
+ - ../security/certificates:/certificates:ro
+ ports:
+ - "${NIFI_EXTERNAL_PORT_NGINX:-8443}:${NIFI_INTERNAL_PORT_NGINX:-8443}"
+ - "${NIFI_REGISTRY_EXTERNAL_PORT_NGINX:-18443}:${NIFI_REGISTRY_INTERNAL_PORT_NGINX:-18443}"
+ networks:
+ - cognet
+ command: /bin/bash -c "envsubst < /etc/nginx/config/nginx.conf.template > /etc/nginx/config/nginx.conf && nginx -g 'daemon off;'"
+ extra_hosts: *common-hosts
+ logging: *nifi-logging-common
+
+#---------------------------------------------------------------------------#
+# Docker named volumes #
+#---------------------------------------------------------------------------#
+volumes:
+ # NiFi related
+ nifi-vol-logs:
+ driver: local
+
+ nifi-vol-provenance:
+ driver: local
+
+ nifi-vol-database:
+ driver: local
+
+ nifi-vol-flowfiles:
+ driver: local
+
+ nifi-vol-content:
+ driver: local
+
+ nifi-vol-state:
+ driver: local
+
+ nifi-vol-errors:
+ driver: local
+
+ nifi-registry-vol-database:
+ driver: local
+ nifi-registry-vol-flow-storage:
+ driver: local
+ nifi-registry-vol-work:
+ driver: local
+ nifi-registry-vol-logs:
+ driver: local
+
+#---------------------------------------------------------------------------#
+# Docker networks. #
+#---------------------------------------------------------------------------#
+networks:
+ cognet:
+ driver: bridge
+ name: cogstack-net
diff --git a/deploy/services.yml b/deploy/services.yml
index 27c19bbfa..cd6b7ed5c 100644
--- a/deploy/services.yml
+++ b/deploy/services.yml
@@ -42,6 +42,10 @@ x-es-env: &es-env
- ../security/env/users_elasticsearch.env
- ../security/env/certificates_elasticsearch.env
+x-db-env: &db-env
+ - ./database.env
+ - ../security/env/users_database.env
+
x-common-hosts: &common-hosts
- ${ELASTICSEARCH_1_HOST_NAME:-test-1:0.0.0.0}
- ${ELASTICSEARCH_2_HOST_NAME:-test-2:0.0.0.0}
@@ -62,12 +66,79 @@ x-common-ulimits: &common-ulimits
x-nifi-common: &nifi-common
<<: *common-ulimits
- restart: always
+ restart: unless-stopped
env_file: *all-env
extra_hosts: *common-hosts
networks:
- cognet
+x-nifi-volumes: &nifi-volumes
+ # Drivers
+ - ../nifi/drivers:/opt/nifi/drivers
+
+ # User overrides bundled in the image
+ - ../nifi/user_scripts:/opt/nifi/user_scripts:rw
+ - ../nifi/user_schemas:/opt/nifi/user_schemas:rw
+
+ # Python processors (NiFi 2.x)
+ - ../nifi/user_python_extensions:/opt/nifi/nifi-current/python_extensions:rw
+
+ # Security certificates
+ - ../security:/security:ro
+
+ # Security helper scripts
+ - ../security/scripts/nifi_create_single_user_auth.sh:/opt/nifi/nifi-current/security_scripts/nifi_create_single_user_auth.sh:ro
+
+ # NiFi configuration
+ - ../nifi/conf/:/opt/nifi/nifi-current/conf/:rw
+
+ # Ingest data directory
+ - ./${NIFI_DATA_PATH:-../data/}:/data/:rw
+
+ # DB schemas
+ - ../services/cogstack-db/:/opt/cogstack-db/:rw
+
+ # MedCAT models
+ - ./${RES_MEDCAT_SERVICE_MODEL_PRODUCTION_PATH:-../services/cogstack-nlp/medcat-service/models/}:/opt/models:rw
+
+ # NiFi repositories/state
+ - nifi-vol-logs:/opt/nifi/nifi-current/logs
+ - nifi-vol-provenance:/opt/nifi/nifi-current/provenance_repository
+ - nifi-vol-database:/opt/nifi/nifi-current/database_repository
+ - nifi-vol-flowfiles:/opt/nifi/nifi-current/flowfile_repository
+ - nifi-vol-content:/opt/nifi/nifi-current/content_repository
+ - nifi-vol-state:/opt/nifi/nifi-current/state
+
+ # Flowfile error output
+ - nifi-vol-errors:/opt/nifi/pipeline/flowfile-errors
+
+x-nifi-registry-volumes: &nifi-registry-volumes
+ # Registry configuration
+ - ../nifi/nifi-registry/:/opt/nifi-registry/nifi-registry-current/conf/:rw
+
+ # Security certificates
+ - ../security:/security:ro
+
+ # Registry persistence
+ - nifi-registry-vol-database:/opt/nifi-registry/nifi-registry-current/database
+ - nifi-registry-vol-flow-storage:/opt/nifi-registry/nifi-registry-current/flow_storage
+ - nifi-registry-vol-work:/opt/nifi-registry/nifi-registry-current/work
+ - nifi-registry-vol-logs:/opt/nifi-registry/nifi-registry-current/logs
+
+x-db-common: &db-common
+ <<: *common-ulimits
+  shm_size: ${DATABASE_DOCKER_SHM_SIZE:-1g}
+ restart: unless-stopped
+ env_file: *db-env
+ deploy:
+ resources:
+ limits:
+ cpus: "${DATABASE_DOCKER_CPU_MAX}"
+ memory: "${DATABASE_DOCKER_RAM}"
+ reservations:
+ cpus: "${DATABASE_DOCKER_CPU_MIN}"
+ memory: "${DATABASE_DOCKER_RAM}"
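+
+# NOTE: deploy.resources limits/reservations follow the Compose Specification
+# (Docker Compose v2); legacy docker-compose v1 ignored the deploy key outside
+# Swarm unless run with --compatibility.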
+
x-es-common-volumes: &es-common-volumes
# Shared configs
- ../services/elasticsearch/config/${ELASTICSEARCH_VERSION:-opensearch}.yml:/usr/share/${ELASTICSEARCH_VERSION:-opensearch}/config/${ELASTICSEARCH_VERSION:-opensearch}.yml:ro
@@ -93,9 +164,9 @@ x-es-common-volumes: &es-common-volumes
x-es-common: &es-common
<<: *common-ulimits
- image: ${ELASTICSEARCH_DOCKER_IMAGE:-opensearchproject/opensearch:3.2.0}
- shm_size: ${ELASTICSEARCH_SHM_SIZE:-"1g"}
- restart: always
+ image: ${ELASTICSEARCH_DOCKER_IMAGE:-opensearchproject/opensearch:latest}
+ shm_size: ${ELASTICSEARCH_DOCKER_SHM_SIZE:-1g}
+ restart: unless-stopped
env_file: *es-env
networks:
- cognet
@@ -108,12 +179,21 @@ x-es-common: &es-common
OPENSEARCH_INITIAL_ADMIN_PASSWORD: ${OPENSEARCH_INITIAL_ADMIN_PASSWORD:-kibanaserver}
ELASTICSEARCH_VERSION: ${ELASTICSEARCH_VERSION:-opensearch}
logging: *es-logging-common
+ deploy:
+ resources:
+ limits:
+ cpus: "${ELASTICSEARCH_DOCKER_CPU_MAX}"
+ memory: "${ELASTICSEARCH_DOCKER_RAM}"
+ reservations:
+ cpus: "${ELASTICSEARCH_DOCKER_CPU_MIN}"
+ memory: "${ELASTICSEARCH_DOCKER_RAM}"
x-metricbeat-common: &metricbeat-common
<<: *common-ulimits
- image: ${METRICBEAT_IMAGE:-docker.elastic.co/beats/metricbeat:8.18.2}
+ image: ${METRICBEAT_IMAGE:-docker.elastic.co/beats/metricbeat:latest}
command: -e --strict.perms=false
restart: unless-stopped
+ shm_size: ${METRICBEAT_DOCKER_SHM:-1g}
env_file:
- ./elasticsearch.env
- ../security/env/users_elasticsearch.env
@@ -122,6 +202,14 @@ x-metricbeat-common: &metricbeat-common
- METRICBEAT_USER=${METRICBEAT_USER:-elastic}
- METRICBEAT_PASSWORD=${METRICBEAT_PASSWORD:-kibanaserver}
- KIBANA_HOST=${KIBANA_HOST:-"https://kibana:5601"}
+ deploy:
+ resources:
+ limits:
+ cpus: "${METRICBEAT_DOCKER_CPU_MAX}"
+ memory: "${METRICBEAT_DOCKER_RAM}"
+ reservations:
+ cpus: "${METRICBEAT_DOCKER_CPU_MIN}"
+ memory: "${METRICBEAT_DOCKER_RAM}"
volumes:
- ../services/metricbeat/metricbeat.yml:/usr/share/metricbeat/metricbeat.yml:ro
- ../security/certificates/elastic/elasticsearch/elastic-stack-ca.crt.pem:/usr/share/metricbeat/root-ca.crt:ro
@@ -133,9 +221,10 @@ x-metricbeat-common: &metricbeat-common
x-filebeat-common: &filebeat-common
<<: *common-ulimits
- image: ${FILEBEAT_IMAGE:-docker.elastic.co/beats/filebeat:8.18.2}
+ image: ${FILEBEAT_IMAGE:-docker.elastic.co/beats/filebeat:latest}
command: ${FILEBEAT_STARTUP_COMMAND:-'-e --strict.perms=false'}
restart: unless-stopped
+ shm_size: ${FILEBEAT_DOCKER_SHM:-1g}
env_file:
- ./elasticsearch.env
- ../security/env/users_elasticsearch.env
@@ -144,6 +233,14 @@ x-filebeat-common: &filebeat-common
- FILEBEAT_USER=${FILEBEAT_USER:-elastic}
- FILEBEAT_PASSWORD=${FILEBEAT_PASSWORD:-kibanaserver}
- KIBANA_HOST=${KIBANA_HOST:-"https://kibana:5601"}
+ deploy:
+ resources:
+ limits:
+ cpus: "${FILEBEAT_DOCKER_CPU_MAX}"
+ memory: "${FILEBEAT_DOCKER_RAM}"
+ reservations:
+ cpus: "${FILEBEAT_DOCKER_CPU_MIN}"
+ memory: "${FILEBEAT_DOCKER_RAM}"
volumes:
- ../services/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:rw
- ../security/certificates/elastic/elasticsearch/elastic-stack-ca.crt.pem:/etc/pki/root/root-ca.crt:ro
@@ -162,15 +259,10 @@ services:
# Postgres container with sample data #
#---------------------------------------------------------------------------#
samples-db:
- <<: *common-ulimits
+ <<: *db-common
image: postgres:17.5-alpine
container_name: cogstack-samples-db
- shm_size: ${POSTGRES_SHM_SIZE:-"1g"}
- restart: always
platform: linux/amd64
- env_file:
- - ./database.env
- - ../security/env/users_database.env
environment:
# PG env vars
- POSTGRES_USER=${POSTGRES_USER_SAMPLES:-test}
@@ -194,19 +286,14 @@ services:
# CogStack Databank / Cogstack-DB, production database #
#---------------------------------------------------------------------------#
cogstack-databank-db:
- <<: *common-ulimits
+ <<: *db-common
image: postgres:17.5-alpine
container_name: cogstack-production-databank-db
- shm_size: ${POSTGRES_SHM_SIZE:-"1g"}
- restart: always
platform: linux/amd64
- env_file:
- - ./database.env
- - ../security/env/users_database.env
environment:
- - POSTGRES_USER=${POSTGRES_USER:-admin}
- - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-admin}
- - POSTGRES_DATABANK_DB=${POSTGRES_DATABANK_DB:-cogstack}
+ - POSTGRES_USER=${DATABASE_USER:-admin}
+ - POSTGRES_PASSWORD=${DATABASE_PASSWORD:-admin}
+ - POSTGRES_DATABANK_DB=${DATABASE_DB_NAME:-cogstack}
volumes:
# mapping postgres data dump and initialization
- ../services/cogstack-db/pgsql/schemas:/data/:ro
@@ -222,14 +309,9 @@ services:
- cognet
cogstack-databank-db-mssql:
- <<: *common-ulimits
+ <<: *db-common
image: mcr.microsoft.com/mssql/server:2019-latest
container_name: cogstack-production-databank-db-mssql
- shm_size: ${POSTGRES_SHM_SIZE:-"1g"}
- restart: always
- env_file:
- - ./database.env
- - ../security/env/users_database.env
environment:
- ACCEPT_EULA=y
- MSSQL_SA_USER=${MSSQL_SA_USER:-sa}
@@ -251,8 +333,8 @@ services:
#---------------------------------------------------------------------------#
es_native_create_certs:
container_name: es_create_certs
- image: docker.elastic.co/elasticsearch/elasticsearch:8.18.2
- shm_size: ${ELASTICSEARCH_SHM_SIZE:-"1g"}
+ image: docker.elastic.co/elasticsearch/elasticsearch:latest
+ shm_size: ${ELASTICSEARCH_DOCKER_SHM_SIZE:-1g}
env_file: *es-env
restart: "no"
command: bash -c "bash /usr/share/elasticsearch/es_native_cert_generator.sh"
@@ -287,7 +369,7 @@ services:
ports:
- "${ELASTICSEARCH_NODE_1_OUTPUT_PORT:-9200}:9200"
- "${ELASTICSEARCH_NODE_1_COMM_OUTPUT_PORT:-9300}:9300"
- - "${ELASTICSEARCH_NODE_1_ANALYZER_OUTPUT_PORT:-9600}:9600" # required for Performance Analyzer
+ - "${ELASTICSEARCH_NODE_1_ANALYZER_OUTPUT_PORT:-9600}:9600"
elasticsearch-2:
extends:
@@ -306,7 +388,7 @@ services:
ports:
- "${ELASTICSEARCH_NODE_2_OUTPUT_PORT:-9201}:9200"
- "${ELASTICSEARCH_NODE_2_COMM_OUTPUT_PORT:-9301}:9300"
- - "${ELASTICSEARCH_NODE_2_ANALYZER_OUTPUT_PORT:-9601}:9600" # required for Performance Analyzer
+ - "${ELASTICSEARCH_NODE_2_ANALYZER_OUTPUT_PORT:-9601}:9600"
elasticsearch-3:
extends:
@@ -325,7 +407,7 @@ services:
ports:
- "${ELASTICSEARCH_NODE_3_OUTPUT_PORT:-9202}:9200"
- "${ELASTICSEARCH_NODE_3_COMM_OUTPUT_PORT:-9302}:9300"
- - "${ELASTICSEARCH_NODE_3_ANALYZER_OUTPUT_PORT:-9602}:9600" # required for Performance Analyzer
+ - "${ELASTICSEARCH_NODE_3_ANALYZER_OUTPUT_PORT:-9602}:9600"
metricbeat-1:
<<: *metricbeat-common
@@ -350,9 +432,9 @@ services:
container_name: cogstack-metricbeat-3
volumes:
- metricbeat-data-3:/usr/share/metricbeat/data
- - ../security/certificates/elastic/elasticsearch/elasticsearch/${ES_INSTANCE_NAME_3:-elasticsearch-3}/${ES_INSTANCE_NAME3:-elasticsearch-3}.p12:/usr/share/metricbeat/esnode.p12:ro
- - ../security/certificates/elastic/elasticsearch/elasticsearch/${ES_INSTANCE_NAME_3:-elasticsearch-3}/${ES_INSTANCE_NAME3:-elasticsearch-3}.crt:/usr/share/metricbeat/esnode.crt:ro
- - ../security/certificates/elastic/elasticsearch/elasticsearch/${ES_INSTANCE_NAME_3:-elasticsearch-3}/${ES_INSTANCE_NAME3:-elasticsearch-3}.key:/usr/share/metricbeat/esnode.key:ro
+ - ../security/certificates/elastic/elasticsearch/elasticsearch/${ES_INSTANCE_NAME_3:-elasticsearch-3}/${ES_INSTANCE_NAME_3:-elasticsearch-3}.p12:/usr/share/metricbeat/esnode.p12:ro
+ - ../security/certificates/elastic/elasticsearch/elasticsearch/${ES_INSTANCE_NAME_3:-elasticsearch-3}/${ES_INSTANCE_NAME_3:-elasticsearch-3}.crt:/usr/share/metricbeat/esnode.crt:ro
+ - ../security/certificates/elastic/elasticsearch/elasticsearch/${ES_INSTANCE_NAME_3:-elasticsearch-3}/${ES_INSTANCE_NAME_3:-elasticsearch-3}.key:/usr/share/metricbeat/esnode.key:ro
filebeat-1:
<<: *filebeat-common
@@ -389,9 +471,9 @@ services:
#---------------------------------------------------------------------------#
kibana:
<<: *common-ulimits
- image: ${ELASTICSEARCH_KIBANA_DOCKER_IMAGE:-opensearchproject/opensearch-dashboards:3.2.0}
+ image: ${ELASTICSEARCH_KIBANA_DOCKER_IMAGE:-opensearchproject/opensearch-dashboards:latest}
container_name: cogstack-kibana
- shm_size: ${KIBANA_SHM_SIZE:-"1g"}
+ shm_size: ${KIBANA_DOCKER_SHM_SIZE:-1g}
restart: always
env_file: *es-env
environment:
@@ -401,7 +483,14 @@ services:
# INFO: uncomment below to enable SSL keys
SERVER_SSL_ENABLED: ${ELASTICSEARCH_SSL_ENABLED:-"true"}
OPENSEARCH_INITIAL_ADMIN_PASSWORD: ${OPENSEARCH_INITIAL_ADMIN_PASSWORD:-kibanaserver}
-
+ deploy:
+ resources:
+ limits:
+ cpus: "${KIBANA_DOCKER_CPU_MAX}"
+ memory: "${KIBANA_DOCKER_RAM}"
+ reservations:
+ cpus: "${KIBANA_DOCKER_CPU_MIN}"
+ memory: "${KIBANA_DOCKER_RAM}"
volumes:
# INFO: Kibana configuration mapped via volume (make sure to comment this and uncomment the next line if you are using NATIVE kibana deployment)
- ../services/kibana/config/${ELASTICSEARCH_VERSION:-opensearch}.yml:/usr/share/${KIBANA_VERSION:-opensearch-dashboards}/config/${KIBANA_CONFIG_FILE_VERSION:-opensearch_dashboards}.yml:ro
@@ -431,10 +520,10 @@ services:
#---------------------------------------------------------------------------#
nifi:
<<: *nifi-common
- image: cogstacksystems/cogstack-nifi:latest
+ image: ${NIFI_DOCKER_IMAGE:-cogstacksystems/cogstack-nifi:latest}
container_name: cogstack-nifi
hostname: nifi
- shm_size: ${NIFI_SHM_SIZE:-"1g"}
+    shm_size: ${NIFI_DOCKER_SHM_SIZE:-1g}
environment:
- USER_ID=${NIFI_UID:-1000}
- GROUP_ID=${NIFI_GID:-1000}
@@ -443,48 +532,17 @@ services:
- NIFI_INTERNAL_PORT=${NIFI_INTERNAL_PORT:-8443}
- NIFI_OUTPUT_PORT=${NIFI_OUTPUT_PORT:-8082}
- NIFI_INPUT_SOCKET_PORT=${NIFI_INPUT_SOCKET_PORT:-10000}
- volumes:
- # INFO: drivers folder
- - ../nifi/drivers:/opt/nifi/drivers
-
- # INFO: if there are local changes, map these content from local host to container
- # (normally, these 3 directories below are bundled with our NiFi image)
- # N.B. The container user may not have the permission to read these directories/files.
- - ../nifi/user-templates:/opt/nifi/nifi-current/conf/templates:rw
- - ../nifi/user-scripts:/opt/nifi/user-scripts:rw
- - ../nifi/user-schemas:/opt/nifi/user-schemas:rw
-
- # this is a direct mapping to where we store the NiFi python processors as of NiFi 2.x.x
- - ../nifi/user-python-extensions:/opt/nifi/nifi-current/python_extensions:rw
-
- # INFO: uncomment below to map security certificates if need to secure NiFi endpoints
- - ../security:/security:ro
-
- # Security credentials scripts
- - ../security/scripts/nifi_create_single_user_auth.sh:/opt/nifi/nifi-current/security_scripts/nifi_create_single_user_auth.sh:ro
-
- # # Nifi properties file:
- - ../nifi/conf/:/opt/nifi/nifi-current/conf/:rw
-
- # this is where you should place data to be ingested, under the form of symbolic
- - ./${NIFI_DATA_PATH:-../data/}:/opt/data/:rw
-
- # DB-schemas, from the services folder
- - ../services/cogstack-db/:/opt/cogstack-db/:rw
-
- # medcat models
- - ./${RES_MEDCAT_SERVICE_MODEL_PRODUCTION_PATH:-../services/cogstack-nlp/medcat-service/models/}:/opt/models:rw
-
- # rest of volumes to persist the state
- - nifi-vol-logs:/opt/nifi/nifi-current/logs
- - nifi-vol-provenance:/opt/nifi/nifi-current/provenance_repository
- - nifi-vol-database:/opt/nifi/nifi-current/database_repository
- - nifi-vol-flowfiles:/opt/nifi/nifi-current/flowfile_repository
- - nifi-vol-content:/opt/nifi/nifi-current/content_repository
- - nifi-vol-state:/opt/nifi/nifi-current/state
-
- # errors generated during data processing
- - nifi-vol-errors:/opt/nifi/pipeline/flowfile-errors
+ - PYTHONPATH=${NIFI_PYTHONPATH:-/opt/nifi/nifi-current/python/framework}
+ - JVM_OPTS="${NIFI_JVM_OPTS:--XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+ParallelRefProcEnabled -Djava.security.egd=file:/dev/./urandom}"
+ deploy:
+ resources:
+ limits:
+ cpus: "${NIFI_DOCKER_CPU_MAX}"
+ memory: "${NIFI_DOCKER_RAM}"
+ reservations:
+ cpus: "${NIFI_DOCKER_CPU_MIN}"
+ memory: "${NIFI_DOCKER_RAM}"
+ volumes: *nifi-volumes
# INFO : Uncomment the below line to generate your own USERNAME and PASSWORD,
# a bit messy this way as you will need to copy the credentials back
@@ -498,10 +556,10 @@ services:
nifi-registry-flow:
<<: *nifi-common
- image: apache/nifi-registry:${NIFI_REGISTRY_VERSION:-2.7.2}
+ image: ${NIFI_REGISTRY_DOCKER_IMAGE:-apache/nifi-registry:${NIFI_REGISTRY_VERSION:-latest}}
hostname: nifi-registry
container_name: cogstack-nifi-registry-flow
- shm_size: ${NIFI_REGISTRY_SHM_SIZE:-"1g"}
+ shm_size: ${NIFI_DOCKER_REGISTRY_SHM_SIZE:-1g}
user: root
environment:
- http_proxy=$HTTP_PROXY
@@ -509,11 +567,11 @@ services:
- no_proxy=$no_proxy
- USER_ID=${NIFI_UID:-1000}
- GROUP_ID=${NIFI_GID:-1000}
- - KEYSTORE_PATH=${NIFI_REGISTRY_KEYSTORE_PATH:-./conf/keystore.jks}
+ - KEYSTORE_PATH=${NIFI_REGISTRY_KEYSTORE_PATH:-/security/certificates/nifi/nifi-keystore.jks}
- KEYSTORE_TYPE=${NIFI_KEYSTORE_TYPE:-jks}
- KEYSTORE_PASSWORD=${NIFI_KEYSTORE_PASSWORD:-"cogstackNifi"}
- TRUSTSTORE_PASSWORD=${NIFI_TRUSTSTORE_PASSWORD:-"cogstackNifi"}
- - TRUSTSTORE_PATH=${NIFI_REGISTRY_TRUSTSTORE_PATH:-./conf/truststore.jks}
+ - TRUSTSTORE_PATH=${NIFI_REGISTRY_TRUSTSTORE_PATH:-/security/certificates/nifi/nifi-truststore.jks}
- TRUSTSTORE_TYPE=${NIFI_TRUSTSTORE_TYPE:-jks}
- INITIAL_ADMIN_IDENTITY=${NIFI_INITIAL_ADMIN_IDENTITY:-"cogstack"}
@@ -521,14 +579,15 @@ services:
- NIFI_REGISTRY_DB_DIR=${NIFI_REGISTRY_DB_DIR:-/opt/nifi-registry/nifi-registry-current/database}
#- NIFI_REGISTRY_FLOW_PROVIDER=${NIFI_REGISTRY_FLOW_PROVIDER:-file}
- NIFI_REGISTRY_FLOW_STORAGE_DIR=${NIFI_REGISTRY_FLOW_STORAGE_DIR:-/opt/nifi-registry/nifi-registry-current/flow_storage}
- volumes:
- - ../nifi/nifi-registry/:/opt/nifi-registry/nifi-registry-current/conf/:rw
- - ./${NIFI_SECURITY_DIR:-../security/certificates/nifi/}nifi-keystore.jks:/opt/nifi-registry/nifi-registry-current/conf/keystore.jks:ro
- - ./${NIFI_SECURITY_DIR:-../security/certificates/nifi/}nifi-truststore.jks://opt/nifi-registry/nifi-registry-current/conf/truststore.jks:ro
- - nifi-registry-vol-database:/opt/nifi-registry/nifi-registry-current/database
- - nifi-registry-vol-flow-storage:/opt/nifi-registry/nifi-registry-current/flow_storage
- - nifi-registry-vol-work:/opt/nifi-registry/nifi-registry-current/work
- - nifi-registry-vol-logs:/opt/nifi-registry/nifi-registry-current/logs
+ deploy:
+ resources:
+ limits:
+ cpus: "${NIFI_REGISTRY_DOCKER_CPU_MAX}"
+ memory: "${NIFI_REGISTRY_DOCKER_RAM}"
+ reservations:
+ cpus: "${NIFI_REGISTRY_DOCKER_CPU_MIN}"
+ memory: "${NIFI_REGISTRY_DOCKER_RAM}"
+ volumes: *nifi-registry-volumes
extra_hosts: *common-hosts
tty: true
ports:
@@ -545,8 +604,16 @@ services:
image: cogstacksystems/nifi-nginx:latest
container_name: cogstack-nifi-nginx
restart: always
- shm_size: 512mb
+ shm_size: ${NGINX_SHM_SIZE:-1g}
env_file: *all-env
+ deploy:
+ resources:
+ limits:
+ cpus: "${NGINX_DOCKER_CPU_MAX}"
+ memory: "${NGINX_DOCKER_RAM}"
+ reservations:
+ cpus: "${NGINX_DOCKER_CPU_MIN}"
+ memory: "${NGINX_DOCKER_RAM}"
volumes:
- ../services/nginx/sites-enabled:/etc/nginx/sites-enabled:ro
- ../services/nginx/config/nginx.conf.template:/etc/nginx/config/nginx.conf.template:rw
@@ -587,12 +654,20 @@ services:
<<: *common-ulimits
container_name: cogstack-gitea
image: gitea/gitea:1.23-rootless
- shm_size: ${DOCKER_SHM_SIZE:-"1g"}
+    shm_size: ${GITEA_DOCKER_SHM_SIZE:-1g}
restart: always
environment:
- http_proxy=$HTTP_PROXY
- https_proxy=$HTTPS_PROXY
- no_proxy=$no_proxy
+ deploy:
+ resources:
+ limits:
+ cpus: "${GITEA_DOCKER_CPU_MAX}"
+ memory: "${GITEA_DOCKER_RAM}"
+ reservations:
+ cpus: "${GITEA_DOCKER_CPU_MIN}"
+ memory: "${GITEA_DOCKER_RAM}"
volumes:
# app config
- ../services/gitea/app.ini:/etc/gitea/app.ini:rw
@@ -656,19 +731,14 @@ volumes:
nifi-vol-provenance:
driver: local
-
nifi-vol-database:
driver: local
-
nifi-vol-flowfiles:
driver: local
-
nifi-vol-content:
driver: local
-
nifi-vol-state:
driver: local
-
nifi-vol-errors:
driver: local
diff --git a/docs/deploy/troubleshooting.md b/docs/deploy/troubleshooting.md
index 15dc8c94f..b448a3dad 100644
--- a/docs/deploy/troubleshooting.md
+++ b/docs/deploy/troubleshooting.md
@@ -75,15 +75,15 @@ ERROR: [1] bootstrap checks failed
To solve this, one simply needs to execute:
- on Linux/Mac OS X :
- ```sysctl -w vm.max_map_count=262144``` in terminal.
- To make the same change systemwide plase add ```vm.max_map_count=262144``` to /etc/sysctl.conf and restart the dockerservice/machine.
+ `sysctl -w vm.max_map_count=262144` in terminal.
+   To make the same change system-wide please add `vm.max_map_count=262144` to /etc/sysctl.conf and restart the Docker service/machine.
An example of this can be found under /services/elasticsearch/sysctl.conf
- on Windows you need to enter the following commands in a powershell instance:
- ```wsl -d docker-desktop```
+ `wsl -d docker-desktop`
- ```sysctl -w vm.max_map_count=262144```
+ `sysctl -w vm.max_map_count=262144`
For more on this issue please read: https://www.elastic.co/guide/en/elasticsearch/reference/current/vm-max-map-count.html
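
A minimal example of persisting the setting on a Linux host (standard sysctl usage):

```bash
echo "vm.max_map_count=262144" | sudo tee -a /etc/sysctl.conf
sudo sysctl -p
```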
diff --git a/docs/deploy/workflows.md b/docs/deploy/workflows.md
index 0ad86ff0b..ddc7ee78d 100644
--- a/docs/deploy/workflows.md
+++ b/docs/deploy/workflows.md
@@ -260,7 +260,7 @@ Given a document content encoded as JSON, it will return payload containing the
There are several NiFi components involved in this process which stand out:
1. `ConvertAvroToJSON` - converts the AVRO records to JSON format using a generic format transcoder,
2. `ExecuteScript-ConvertRecordToMedCATinput` - prepares the JSON payload for MedCAT Service; this is a Jython script with several configurable process properties:
- - `document_id_field\ = `docid` , the exact name of the unique Id column for the DB/ES record
+ - `document_id_field` = `docid`, the exact name of the unique Id column for the DB/ES record
- `document_text_field` = `document`, field/column name containing free text
- `log_file_name` = `nlp_request_bulk_parse_medical_text.log`, creates a log file in the repo folder `/nifi/user-scripts/`
- `log_invalid_records_to_file` = `True`, enable/disable logging errors to logfile with the above mentioned file name
@@ -403,4 +403,3 @@ Prerequisites for this workflow:
4. datetime fields must have the same format.
The script used for this process is located here: `nifi/user-scripts/cogstack_cohort_generate_data.py`. Please read all the info provided in the NiFi template.
-
diff --git a/docs/main.md b/docs/main.md
index 9ab7119c7..ab98b59f8 100644
--- a/docs/main.md
+++ b/docs/main.md
@@ -1,2 +1,3 @@
```{include} ../README.md
+```
diff --git a/docs/nifi/main.md b/docs/nifi/main.md
index 42b1ae1fc..82d899b85 100644
--- a/docs/nifi/main.md
+++ b/docs/nifi/main.md
@@ -1,13 +1,15 @@
# 💧 NiFi
This directory contains files related to our custom Apache NiFi image and example deployment templates with associated services.
-Apache NiFi is used as a customizable data pipeline engine for controlling and executing data flow between used services.
+Apache NiFi is used as a customizable data pipeline engine for controlling and executing data flow between used services.
There are multiple workflow templates provided with custom user scripts to work with NiFi.
For more information about Apache NiFi please refer to [the official website](https://nifi.apache.org/) and the [guide](https://nifi.apache.org/docs/nifi-docs/html/administration-guide.html#how-to-install-and-start-nifi).
## Concepts you should understand in NiFi
+
Before going deeper into the NiFi setup and workflows, a few key concepts need to be understood:
+
- processor: the main component responsible for executing tasks; this can be anything: custom scripts, DB queries, HTTP queries, etc.
- flowfiles: these represent the common data structure used in a NiFi session to store data between two or more entities in the workflow; a flowfile holds one or multiple records of data.
- avro files: a common data serialisation system used to store record data; the flowfile records created by most of the built-in processors are stored in this format. If you aim to use your own custom scripts alongside the built-in processors, you will likely have to write your own code that converts your records to Avro.
Avro Schema: [official documentation](https://avro.apache.org/docs/1.11.1/)
## `NiFi directory layout : /nifi`
-```
-├── Dockerfile - contains the base definition of the NiFi image along with all the packages/addons installed
-├── conf - NiFi configuration files, this folder is mounted on the NiFi service container at runtime, it needs to have read & write permissions by the user
-├── devel - custom folder that is mounted on the NiFi container where you may place your own scripts, again, read & write permissions required
-├── drivers - drivers used for DB connections, currently PostgreSQL and MSSQL
-├── nifi-app.log - log file mounted directly from the container for easy log checking
-├── user-schemas - Avro schemas used within workflows, it can also contain other schemas used in specific custom processors
-├── user-scripts - custom scripts used in workflows, you can put them here
-└── user-templates - here we store the fully exported templates of the workflows within NiFi
-```
+ ```
+ ├── Dockerfile - contains the base definition of the NiFi image along with all the packages/addons installed
+ ├── conf - NiFi configuration files, this folder is mounted on the NiFi service container at runtime, it needs to have read & write permissions by the user
+ ├── devel - custom folder that is mounted on the NiFi container where you may place your own scripts, again, read & write permissions required
+ ├── drivers - drivers used for DB connections, currently PostgreSQL and MSSQL
+ ├── nifi-app.log - log file mounted directly from the container for easy log checking
+ ├── user_schemas - Avro schemas used within workflows, it can also contain other schemas used in specific custom processors
+ ├── user_scripts - custom scripts used in workflows, you can put them here
+ ├── user_python_extensions - Python FlowFileTransform processors exposed to NiFi's extension framework
+ └── user_templates - here we store the fully exported templates of the workflows within NiFi
+ ```
+
+For user script organization and usage guidelines, see [user scripts](user_scripts.md).
+For Python extension processors, see [Python extensions](user_python_extensions.md).
## Custom Docker image
+
For any deployment it is recommended to build and use the custom Docker image of Apache NiFi that will contain all the necessary configuration, drivers, custom user scripts and workflows.
The Docker image recipe is defined in the `Dockerfile`.
There are two images being built as part of the CI process:
+
- `cogstacksystems/cogstack-nifi:latest` - the latest image built from `master` branch,
- `cogstacksystems/cogstack-nifi:dev-latest` - the latest image built from `devel` branch.
There are also release images built for each release, for example:
+
- `cogstacksystems/cogstack-nifi:1.0.0` - release 1.0.0
## Apache NiFi configuration
+
The main configuration files for NiFi are provided in the [`conf`](https://github.com/CogStack/CogStack-Nifi/conf) directory.
This section provides only a brief description of the most useful properties that you may need to modify to fit your own setup.
@@ -61,101 +71,106 @@ This NiFi custom image will use less resources and storage size for data provena
The corresponding properties have been commented out in the file.
Important properties to look for:
-```
-nifi.flow.configuration.archive.enabled=true
-nifi.flow.configuration.archive.max.time=1 days
-nifi.flow.configuration.archive.max.storage=32 GB
-```
+
+ ```properties
+ nifi.flow.configuration.archive.enabled=true
+ nifi.flow.configuration.archive.max.time=1 days
+ nifi.flow.configuration.archive.max.storage=32 GB
+ ```
The above lines specify whether backups of the current flow-files should be kept. Keep in mind that the archive size can get quite big depending on the number of files you attempt to put through a workflow; it can easily exceed 32GB, so it is recommended that you adjust it depending on your workflows and the file sizes of the flow-files. This setting is directly affected by `nifi.queue.backpressure.count` and `nifi.queue.backpressure.size`.
-```
-nifi.queue.backpressure.count=10000
-nifi.queue.backpressure.size=1 GB
-```
+ ```properties
+ nifi.queue.backpressure.count=10000
+ nifi.queue.backpressure.size=1 GB
+ ```
+
These settings specify how large a queue can grow (any queue between two processes); it is recommended to keep this at 1GB. The count is the max number of flow-files a queue can hold, for which 10000 is a reasonable number. These values should be modified only if you are really certain flow-files are not being held in the queue for long, as queued flow-files are stored in RAM as well as on disk. A much safer way, if you wish to change the above settings, is to change them only for the queues that need it; this can be done at runtime, without touching the `nifi.properties` file.
-```
-nifi.bored.yield.duration=100 millis
-```
+ ```properties
+ nifi.bored.yield.duration=100 millis
+ ```
+
A timer that specifies how long NiFi should wait before checking for work; it is CPU dependent, as the lower the time the higher the CPU usage due to more frequent checks. The default is 10ms, but that seems excessive for most use cases and would also result in significant CPU usage if a large number of workflows are running in parallel, so it has been set to 100ms instead.
-```
-nifi.flow.configuration.archive.enabled=true
-nifi.flow.configuration.archive.max.time=1 days
-nifi.flow.configuration.archive.max.storage=12 GB
-```
+ ```properties
+ nifi.flow.configuration.archive.enabled=true
+ nifi.flow.configuration.archive.max.time=1 days
+ nifi.flow.configuration.archive.max.storage=12 GB
+ ```
-By default, the flowfiles thar are out of the processing queues will be archived for a set period of time. The ```nifi.flow.configuration.archive.max.time``` sets the max duration, max size configurable via ```nifi.flow.configuration.archive.max.storage```, take note of these properties, the storage limit can quickly be hit if you have a high flow-file throughput.
+By default, the flowfiles that are out of the processing queues will be archived for a set period of time. `nifi.flow.configuration.archive.max.time` sets the max duration, with the max size configurable via `nifi.flow.configuration.archive.max.storage`; take note of these properties, as the storage limit can quickly be hit if you have a high flow-file throughput.
Make sure to check the archive storage and flowfile storage settings as these will be the first to impact the space used for logging.
#### IMPORTANT NOTE about nifi properties
-:::{admonition} IMPORTANT NOTE about `nifi.properties
+:::{admonition} IMPORTANT NOTE about `nifi.properties`
:class: warning
For Linux users: this file will get modified at runtime, since once the container is up some of the properties within it get changed (`nifi.cluster.node.address`, for example). Permission errors might pop up if the UID and GID of the folder permissions differ from those of the user within the container, which uses UID=1000 and GID=1000, declared in the `Dockerfile` and in `deploy/services.yml` under the `nifi` service section. To avoid permission issues, on the host you will need to create a group with GID 1000 and assign the user running the docker command to the created group, and everything should work.
:::
-
+
Recommendation: if the account/group creation is not possible, you will need to build your own Docker image of NiFi, but before you do this, you need to get hold of the group id and user id of the account you are logged in with.
To find out your GID and UID, run the following commands in a terminal:
-```
-echo "user id (UID):"$(id -u $USER)
-echo "group id (GID):"$(id -g $USER)
-```
+
+ ```bash
+ echo "user id (UID):"$(id -u $USER)
+ echo "group id (GID):"$(id -g $USER)
+ ```
+
You'd need to export your ENV vars:
-```
-export NIFI_UID=$(id -u $USER)
-export NIFI_GID=$(id -g $USER)
-```
+ ```bash
+ export NIFI_UID=$(id -u $USER)
+ export NIFI_GID=$(id -g $USER)
+ ```
A better way is to also manually edit the `./deploy/nifi.env` file and change the default NIFI_UID and NIFI_GID variables there, after which you must execute the `export_env_vars.sh` script.
-```
-cd ./deploy/
-source export_env_vars.sh
-cd ../
-```
+ ```bash
+ cd ./deploy/
+ source export_env_vars.sh
+ cd ../
+ ```
You should check if the env vars have been set after running the script:
-```
-echo $NIFI_UID
-echo $NIFI_GID
-```
+ ```bash
+ echo $NIFI_UID
+ echo $NIFI_GID
+ ```
If the above command prints some numbers then it means that the `export_env_vars.sh` script worked. Otherwise, if you don't see anything, or just blank lines, then you need to execute the following:
-```
- set -o allexport
- source nifi.env
- set +o allexport
-```
+ ```bash
+ set -o allexport
+ source nifi.env
+ set +o allexport
+ ```
or, on Windows, via `git bash` terminal:
-```
- set -a
- source nifi.env
- set +a
-```
+ ```bash
+ set -a
+ source nifi.env
+ set +a
+ ```
Make sure to execute the above commands in the order they are mentioned.
Delete the older docker image from the nifi repo:
-```
-docker image rm cogstacksystems/cogstack-nifi:latest -f
-```
+ ```bash
+ docker image rm cogstacksystems/cogstack-nifi:latest -f
+ ```
Then execute the `recreate_nifi_docker_image.sh` script located in the `./nifi` folder.
-```
-cd ./nifi
-bash recreate_nifi_docker_image.sh
-```
+ ```bash
+ cd ./nifi
+ bash recreate_nifi_docker_image.sh
+ ```
Remember that the above export script and/or commands are only visible in the current shell, so every time you restart your shell you must execute `./deploy/export_env_vars.sh` so that the variables are visible to docker at runtime, because it uses the GID/UID in the `services.yml` file, specifying in the service definition `user: "${USER_ID:-${NIFI_UID:-1000}}:${GROUP_ID:-${NIFI_GID:-1000}}"`.
@@ -164,10 +179,10 @@ Remember that the above export script and/or command are only visible in the cur
This file allows users to configure how NiFi should be started; it deals with the location of the configuration folder and files, the JVM heap, and Java System Properties.
-```
-java.arg.2=-Xms8g
-java.arg.3=-Xmx16g
-```
+ ```text
+ java.arg.2=-Xms8g
+ java.arg.3=-Xmx16g
+ ```
These properties specify the maximum memory that can be allocated to the JVM (`-Xmx16g`) and the initial memory allocation (`-Xms8g`). Values of 8g and 16g are used by default, however you may need to change these to fully utilise the memory of the machines you are spinning the service up on.
@@ -185,29 +200,34 @@ Possible log level settings: `OFF`(inside `logback.xml`) or `NONE` (inside the N

-
The most useful log sections are:
-```
-
-```
+
+ ```
+
+ ```
+
- This refers to the overall NiFi log, useful for finding out what may cause startup issues.
-```
-
-```
+ ```
+
+ ```
+
- Security level logging; certificate or authorisation issues are handled here.
-```
-
-```
+ ```
+
+ ```
+
- This handles issues related to the startup parameters.
-```
-
-```
-- This is controls that is logged into the `./nifi/nifi-app.log`
+ ```
+
+ ```
+
+- This controls what is logged into `./nifi/nifi-app.log`
### `{zookeeper.conf}`
+
Apache Zookeeper is a highly consistent, scalable and reliable cluster co-ordination service.
When deploying Apache NiFi, an external Apache Zookeeper service can be used, or one embedded within the NiFi service (the default option).
@@ -220,9 +240,11 @@ In previous nifi versions by default there was no user assigned and authenticati
Please use the guide provided in the [SECURITY.md](../security.md#apache-nifi) section to set up accounts and certificates.
## Drivers
+
The drivers are provided in the [`drivers`](https://github.com/CogStack/CogStack-NiFi/tree/main/nifi/drivers) directory.
-The key used ones are:
+The key ones in use are:
+
- `mssql-jdbc-9.4.1.jre11.jar` \ `mssql-jdbc-9.4.1.jre8.jar` and `mssql-jdbc-11.2.0.jre11.jar` \ `mssql-jdbc-11.2.0.jre8.jar` - MS SQL Server JDBC drivers; older versions are kept for backwards compatibility across setups.
- `postgresql-42.6.0.jar` - PostgreSQL JDBC driver.
- `mysql-connector-j-8.1.0.jar` - MySQL JDBC driver.
@@ -230,20 +252,23 @@ The key used ones are:
These drivers come bundled for both `jre8` and `jre11`.
## User resources
+
With our custom image there are bundled resources to get the example workflows up and running.
Please see [WORKFLOWS.md](../deploy/workflows.md) in the `deploy` directory for more details on the workflows.
## Workflow templates
+
Workflow templates define example data workflows that can be tailored and executed by the user.
The templates are stored in the [user-templates](https://github.com/CogStack/CogStack-NiFi/tree/main/nifi/user-templates) directory.
## User scripts
+
Apache NiFi gives users the ability to execute custom scripts inside the data flow (supported languages: Python, Groovy, Clojure, Ruby, Lua, ECMAScript).
The [`user-scripts`](https://github.com/CogStack/CogStack-NiFi/tree/main/nifi/user-scripts) directory contains example scripts; these are mostly used when parsing the data from Flow Files.
## User schemas
-[`user-schemas`](https://github.com/CogStack/CogStack-NiFi/tree/main/nifi/user-schemas) directory contains example AVRO type schemas that can be used by data parsers and processor.
+The [`user_schemas`](https://github.com/CogStack/CogStack-NiFi/tree/main/nifi/user_schemas) directory contains example AVRO type schemas that can be used by data parsers and processors.
## Performance settings
@@ -263,10 +288,8 @@ If you have proceessors that are based on events then feel free to change the va
Recommendation: set the value of this parameter to anywhere from 2-4 times the CPU count you have on the machine.
-

-
### Process scheduling and task management
Users should be aware that although NiFi is quite efficient in terms of resource usage, it still needs manual configuration at the process level to maintain a high flow-file throughput. To do this, each process can be manually configured to ensure maximum efficiency.
@@ -295,11 +318,12 @@ You can then select the type of history you wish to check by clicking the drop-d
Certain methods can be executed via scripts, either Python or shell. Python has the `nipyapi` package for this, as sketched below. Check this [article](https://nifi.apache.org/docs/nifi-docs/rest-api/index.html) for more details on the methods available.
-
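+
+A rough sketch of driving NiFi from Python with `nipyapi` (pinned in `nifi/requirements.txt`); the endpoint and the single-user credentials below are illustrative defaults and may differ in your setup:
+
+ ```python
+ import nipyapi
+ from nipyapi import canvas, security
+
+ # assumed local endpoint; point the SSL settings at your own certificates instead
+ nipyapi.config.nifi_config.host = "https://localhost:8443/nifi-api"
+ nipyapi.config.nifi_config.verify_ssl = False
+
+ security.service_login(service="nifi", username="admin", password="cogstackNiFi")
+
+ root_pg_id = canvas.get_root_pg_id()             # id of the root process group
+ canvas.schedule_process_group(root_pg_id, True)  # start every processor under it
+ ```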
## Various data type issues
-This section covers dealing with data type issues depending on DB types and/or other data structures to Apache AVRO format.
+This section covers data type issues that arise when converting DB types and/or other data structures to the Apache AVRO format.
### MySQL
+
Issues have been found with MySQL:
-- allows zero dates in DateTime fields -> solution: can be overcome in the URL connection string using parameters
\ No newline at end of file
+
+- allows zero dates in DateTime fields -> solution: can be overcome with parameters in the JDBC connection string (for example, `zeroDateTimeBehavior=CONVERT_TO_NULL` on Connector/J 8.x)
diff --git a/docs/nifi/user_python_extensions.md b/docs/nifi/user_python_extensions.md
new file mode 100644
index 000000000..ac0689f20
--- /dev/null
+++ b/docs/nifi/user_python_extensions.md
@@ -0,0 +1,69 @@
+# NiFi Python extensions
+
+This page covers Python FlowFileTransform processors loaded by NiFi's Python extension framework.
+
+## Location and wiring
+
+- Source folder: `nifi/user_python_extensions/`
+- Container path: `/opt/nifi/nifi-current/python_extensions`
+- Config property: `nifi.python.extensions.source.directory.default`
+ (set in `nifi/conf/nifi.properties`)
+- Compose mounts: `deploy/services.yml` and `deploy/services-dev.yml`
+- Import path: `PYTHONPATH` should include `/opt/nifi/nifi-current/python/framework`
+ (set via `NIFI_PYTHONPATH` in `deploy/nifi.env`)
+
+## When to use
+
+Use Python extensions when you want custom processors to appear in the NiFi UI with properties and
+relationships, and when you need tighter integration than `ExecuteStreamCommand`.
+
+If you only need stdin/stdout scripts, use `nifi/user_scripts/processors/` instead.
+
+## Minimal processor example
+
+```python
+from nifiapi.flowfiletransform import FlowFileTransformResult
+from nifiapi.properties import ProcessContext
+from py4j.java_gateway import JavaObject, JVMView
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
+
+
+class ExampleProcessor(BaseNiFiProcessor):
+ class Java:
+ implements = ["org.apache.nifi.python.processor.FlowFileTransform"]
+
+ class ProcessorDetails:
+ version = "0.1.0"
+
+ def __init__(self, jvm: JVMView):
+ super().__init__(jvm)
+
+ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTransformResult:
+ return FlowFileTransformResult(
+ relationship="success",
+ attributes=flowFile.getAttributes(),
+ contents=flowFile.getContentsAsBytes(),
+ )
+```
+
+See `nifi/user_python_extensions/sample_processor.py` for a fuller example.
+
+## Imports and shared utilities
+
+If a processor uses helpers from `nifi/user_scripts/`, import via `nifi.user_scripts`
+and set `PYTHONPATH` to `/opt/nifi/nifi-current/python/framework` in the container.
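+
+For example, an extension can reuse one of the shared helpers like this (assuming the custom image, which installs the `cogstack_nifi` package into the framework directory):
+
+```python
+# resolves because the cogstack_nifi package is installed into the framework directory
+from nifi.user_scripts.utils.generic import download_file_from_url
+```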
+
+## Dependencies
+
+Python dependencies are installed into NiFi's Python framework from `nifi/requirements.txt`
+during the custom image build. Add new dependencies there if your extension needs them.
+
+## Development workflow
+
+- Edit files in `nifi/user_python_extensions/`.
+- Restart the container to pick up bind-mounted extension changes, and rebuild the image to refresh the
+ installed `nifi.user_scripts` package.
+
+## Related docs
+
+- `docs/nifi/user_scripts.md` for stdin/stdout processors and script layout.
diff --git a/docs/nifi/user_scripts.md b/docs/nifi/user_scripts.md
new file mode 100644
index 000000000..7357bf41a
--- /dev/null
+++ b/docs/nifi/user_scripts.md
@@ -0,0 +1,58 @@
+# NiFi user scripts
+
+This page describes how scripts under `nifi/user_scripts/` are organized and when to use each folder.
+
+## Layout
+
+- `processors/`: runnable scripts invoked by NiFi (reads stdin, writes stdout). These are Python scripts, not
+ native Java processors.
+- `utils/`: shared helper modules for processors or other user extensions. Keep side-effect free and avoid
+ stdin/stdout usage here.
+- `db/`: database helper modules used by scripts and processors.
+- `dto/`: simple data containers/config objects.
+- `tests/`: script-level tests and fixtures.
+- `legacy_scripts/`: historical scripts kept for reference; avoid new use unless needed.
+
+## Guidelines
+
+- If a script is executed directly by NiFi (uses stdin/stdout), place it in `processors/`.
+- If a module is imported by other scripts, place it in `utils/` or a subpackage.
+- Keep processor scripts small and delegate reusable logic to `utils/`.
+- Put static lookup data next to the utility that uses it (for example, `utils/data/`).
+
+## ExecuteStreamCommand example (processors)
+
+Use `ExecuteStreamCommand` when you want to run a script that reads from stdin and writes to stdout.
+The FlowFile content becomes the script input, and the script output becomes the new FlowFile content.
+
+Example configuration using `clean_doc.py`:
+
+```text
+Processor: ExecuteStreamCommand
+Command: python3
+Command Arguments: /opt/nifi/user_scripts/processors/clean_doc.py
+Script args: text_field_name=text
+Working Directory: /opt/nifi/user_scripts
+Environment Variables: PYTHONPATH=/opt/nifi/nifi-current/python/framework
+```
+
+Notes:
+- Ensure `PYTHONPATH` includes `/opt/nifi/nifi-current/python/framework` so `nifi.user_scripts` imports resolve.
+- Rebuild the NiFi image to pick up changes in `nifi/user_scripts` utilities used via package imports.
+- Ensure upstream processors output the expected format (most scripts here expect JSON).
+- Handle errors by writing useful messages to stderr and exiting non-zero so NiFi can route failures.
+
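+For orientation, a minimal `processors/`-style script might look like the sketch below (the `text` field is hypothetical; adapt it to your records):
+
+```python
+import json
+import sys
+
+
+def main() -> int:
+    try:
+        # FlowFile content arrives on stdin; most scripts here expect JSON
+        record = json.loads(sys.stdin.read())
+        record["text"] = record.get("text", "").strip()
+        # whatever is written to stdout becomes the new FlowFile content
+        sys.stdout.write(json.dumps(record))
+        return 0
+    except Exception as exc:
+        # a useful stderr message and a non-zero exit let NiFi route to failure
+        sys.stderr.write(f"processing failed: {exc}\n")
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+```
+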
+## NiFi Python extensions (user_python_extensions)
+
+The `nifi/user_python_extensions/` folder contains Python FlowFileTransform processors loaded by
+NiFi's Python extension framework. In the container, this is mounted at:
+`/opt/nifi/nifi-current/python_extensions` and wired via
+`nifi.python.extensions.source.directory.default`.
+
+Use this folder when you want processors to appear in the NiFi UI as native Python processors.
+See `nifi/user_python_extensions/sample_processor.py` for a reference implementation and
+`nifi/user_scripts/utils/nifi/base_nifi_processor.py` for shared utilities.
+
+If a Python extension needs shared helpers from `nifi/user_scripts/`, import via the
+`nifi.user_scripts` package and ensure `PYTHONPATH` includes
+`/opt/nifi/nifi-current/python/framework` in the container.
diff --git a/docs/security/nifi.md b/docs/security/nifi.md
index b5406bd7e..eb0aa63d5 100644
--- a/docs/security/nifi.md
+++ b/docs/security/nifi.md
@@ -41,7 +41,7 @@ Before starting the NIFI container it's important to take note of the following
- **(OPTIONAL, DO NOT USE FOR NIFI VERSION >= 2.0)** the `nifi_toolkit_security.sh` script is used to download the nifi toolkit and generate new certificates and keys that are used by the container; take note that inside the `localhost` folder another nifi.properties file is generated, and we must look at the following settings, which are generated randomly, and copy them to the `nifi/conf/nifi.properties` file.
- the trust/store keys generated for production will be in the `nifi_certificates/localhost` folder, and the `nifi-cert.pem` + `nifi-key.key` files in the base `nifi_certificates` folder.
-- as part of the security process the `nifi.sensitive.props.key` should be set to a random string or a password of minimum 12 characters. Once this is set do NOT modify it as all the other sensitive passwords will be hashed with this string. By default this is set to ```cogstackNiFipass```
+- as part of the security process the `nifi.sensitive.props.key` should be set to a random string or a password of minimum 12 characters. Once this is set do NOT modify it, as all the other sensitive passwords will be hashed with this string. By default this is set to `cogstackNiFipass`
Example (`nifi/conf/nifi.properties`):
```properties
@@ -54,7 +54,7 @@ Example (`nifi/conf/nifi.properties`):
### Setting up access via user account (SINGLE USER CREDENTIAL)
-This is entirely optional, if you have configered the security certs as described in ```security/README.md``` then you are good to go.
+This is entirely optional; if you have configured the security certs as described in `security/README.md` then you are good to go.
Default username :
diff --git a/nifi/.gitignore b/nifi/.gitignore
index e84903bf4..3eda65175 100644
--- a/nifi/.gitignore
+++ b/nifi/.gitignore
@@ -1,8 +1,8 @@
conf/*
-user-scripts/db/*
+user_scripts/db/*
**devel
devel
**flow
**login
nifi-registry/*
-user-scripts/extensions/*
\ No newline at end of file
+user_scripts/extensions/*
\ No newline at end of file
diff --git a/nifi/Dockerfile b/nifi/Dockerfile
index 746c57988..45bd4d468 100644
--- a/nifi/Dockerfile
+++ b/nifi/Dockerfile
@@ -2,6 +2,11 @@ ARG NIFI_VERSION=2.7.2
FROM apache/nifi:${NIFI_VERSION}
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+ARG NIFI_VERSION=2.7.2
+ARG GROOVY_VERSION=5.0.2
+
ARG HTTP_PROXY=""
ARG HTTPS_PROXY=""
ARG no_proxy=""
@@ -14,45 +19,77 @@ ARG GID=${NIFI_GID:-1000}
ARG TZ="Europe/London"
ARG NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY="/opt/nifi/nifi-current/python/framework"
ARG NIFI_PYTHON_EXTENSIONS_SOURCE_DIRECTORY_DEFAULT="/opt/nifi/nifi-current/python_extensions"
-ARG NIFI_PYTHON_WORKING_DIRECTORY="/opt/nifi/user-scripts"
+ARG NIFI_PYTHON_WORKING_DIRECTORY="/opt/nifi/user_scripts"
ENV TZ=${TZ}
ENV NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY=${NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY}
ENV NIFI_PYTHON_EXTENSIONS_SOURCE_DIRECTORY_DEFAULT=${NIFI_PYTHON_EXTENSIONS_SOURCE_DIRECTORY_DEFAULT}
ENV NIFI_PYTHON_WORKING_DIRECTORY=${NIFI_PYTHON_WORKING_DIRECTORY}
+ENV PYTHONPATH=${NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY}
# python stuff
ENV PIP_PREFER_BINARY=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
ENV PIP_NO_CACHE_DIR=1
+# Enables Python to generate .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=0
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
# default env vars to prevent NiFi from running on HTTP
ENV NIFI_WEB_HTTP_PORT=""
ENV NIFI_WEB_HTTP_HOST=""
-RUN echo "GID=${GID}"
-RUN echo "UID=${UID}"
-
USER root
-# run updates and install some base utility packages along with python support
-RUN apt-get update && apt-get upgrade -y --no-install-recommends && apt-get install -y --no-install-recommends iputils-ping libssl-dev openssl apt-transport-https apt-utils curl software-properties-common wget git build-essential make cmake ca-certificates zip unzip tzdata jq
-
-RUN echo "deb http://deb.debian.org/debian/ bookworm main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-RUN echo "deb http://deb.debian.org/debian/ bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-RUN echo "deb http://deb.debian.org/debian/ bookworm-backports main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-RUN echo "deb http://security.debian.org/debian-security/ bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-
-RUN echo "deb-src http://deb.debian.org/debian/ bookworm main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-RUN echo "deb-src http://deb.debian.org/debian/ bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-RUN echo "deb-src http://deb.debian.org/debian/ bookworm-backports main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-RUN echo "deb-src http://security.debian.org/debian-security/ bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list.d/debian.sources
-
-# Microsoft repos
-RUN wget -q -O- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /etc/apt/trusted.gpg.d/packages.microsoft.gpg
-RUN echo "deb [arch=amd64,armhf,arm64] https://packages.microsoft.com/ubuntu/22.04/prod jammy main" | tee -a /etc/apt/sources.list
-
-RUN apt-get update && apt-get install --no-install-recommends -y ssl-cert libsqlite3-dev python3-dev python3-pip python3.11 python3.11-dev python3-venv sqlite3 postgresql-server-dev-all
+# add repositories, install tooling, and clean up apt metadata in one layer
+RUN set -eux; \
+ apt-get update -y; \
+ apt-get install -y --no-install-recommends \
+ apt-transport-https \
+ ca-certificates \
+ curl \
+ gnupg \
+ wget; \
+ printf '%s\n' \
+ "deb http://deb.debian.org/debian/ bookworm main contrib non-free non-free-firmware" \
+ "deb http://deb.debian.org/debian/ bookworm-updates main contrib non-free non-free-firmware" \
+ "deb http://deb.debian.org/debian/ bookworm-backports main contrib non-free non-free-firmware" \
+ "deb http://security.debian.org/debian-security/ bookworm-security main contrib non-free non-free-firmware" \
+ "deb-src http://deb.debian.org/debian/ bookworm main contrib non-free non-free-firmware" \
+ "deb-src http://deb.debian.org/debian/ bookworm-updates main contrib non-free non-free-firmware" \
+ "deb-src http://deb.debian.org/debian/ bookworm-backports main contrib non-free non-free-firmware" \
+ "deb-src http://security.debian.org/debian-security/ bookworm-security main contrib non-free non-free-firmware" \
+ > /etc/apt/sources.list.d/debian.list; \
+ wget -q -O- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /etc/apt/trusted.gpg.d/packages.microsoft.gpg; \
+ echo "deb [arch=amd64,armhf,arm64] https://packages.microsoft.com/ubuntu/22.04/prod jammy main" > /etc/apt/sources.list.d/microsoft.list; \
+ apt-get update -y; \
+ apt-get upgrade -y --no-install-recommends; \
+ apt-get install -y --no-install-recommends \
+ apt-utils \
+ build-essential \
+ cmake \
+ git \
+ iputils-ping \
+ jq \
+ libsqlite3-dev \
+ libssl-dev \
+ make \
+ openssl \
+ postgresql-server-dev-all \
+ python3.11 \
+ python3.11-dev \
+ python3-dev \
+ python3-pip \
+ python3-venv \
+ software-properties-common \
+ sqlite3 \
+ ssl-cert \
+ tzdata \
+ unzip \
+ zip; \
+ apt-get clean; \
+ rm -rf /var/lib/apt/lists/*
# bust cache
ENV UV_VERSION=latest
@@ -60,47 +97,107 @@ ENV UV_VERSION=latest
# install rust, medcat requirement, install UV
ENV HOME=/root
ENV PATH="/root/.cargo/bin:${PATH}"
+ENV UV_INSTALL_DIR=/usr/local/bin
-RUN curl -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh \
- && chmod +x /tmp/rustup-init.sh \
- && /tmp/rustup-init.sh -y \
- && rm /tmp/rustup-init.sh
-
-RUN curl -Ls https://astral.sh/uv/install.sh -o /tmp/install_uv.sh \
- && bash /tmp/install_uv.sh
+RUN set -eux; \
+ curl -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh; \
+ chmod +x /tmp/rustup-init.sh; \
+ /tmp/rustup-init.sh -y; \
+ rm /tmp/rustup-init.sh
-RUN UV_PATH=$(find / -name uv -type f | head -n1) && \
- ln -s "$UV_PATH" /usr/local/bin/uv
-
-# clean up apt
-RUN apt-get clean autoclean && apt-get autoremove --purge -y
+RUN set -eux; \
+ curl -Ls https://astral.sh/uv/install.sh -o /tmp/install_uv.sh; \
+ bash /tmp/install_uv.sh; \
+ rm /tmp/install_uv.sh
######################################## Python / PIP SECTION ########################################
RUN uv pip install --no-cache-dir --break-system-packages --system --upgrade pip setuptools wheel
-# install util packages used in NiFi scripts (such as MedCAT, avro, nifyapi, etc.)
-COPY ./requirements.txt ./requirements.txt
-RUN uv pip install --no-cache-dir --break-system-packages --target=${NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY} -r "./requirements.txt"
+# install util packages used in NiFi scripts (such as avro, nipyapi, etc.)
+COPY nifi/requirements.txt ./requirements.txt
+RUN uv pip install --no-cache-dir --break-system-packages --target=${NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY} -r "./requirements.txt" --index-url https://pypi.org/simple
+
+# install the local cogstack_nifi package so nifi.user_scripts imports resolve
+COPY pyproject.toml /tmp/cogstack_nifi/pyproject.toml
+COPY nifi /tmp/cogstack_nifi/nifi
+RUN uv pip install --no-cache-dir --break-system-packages --target=${NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY} /tmp/cogstack_nifi
#######################################################################################################
# solve groovy grape proxy issues, grape ignores the current environment's proxy settings
-RUN export JAVA_OPTS="-Dhttp.proxyHost=$HTTP_PROXY -Dhttps.proxyHost=$HTTPS_PROXY -Dhttp.nonProxyHosts=$no_proxy"
+ENV JAVA_OPTS="-Dhttp.proxyHost=${HTTP_PROXY} -Dhttps.proxyHost=${HTTPS_PROXY} -Dhttp.nonProxyHosts=${no_proxy}"
-# INSTALL NAR extensions
-WORKDIR /opt/nifi/nifi-current/lib/
# Install Groovy
RUN mkdir -p /opt/nifi/groovy
WORKDIR /opt/nifi/groovy/
-RUN curl https://archive.apache.org/dist/groovy/5.0.2/distribution/apache-groovy-binary-5.0.2.zip --output apache-groovy-binary-5.0.2.zip --max-time 3600 && \
- unzip apache-groovy-binary-5.0.2.zip && \
- rm apache-groovy-binary-5.0.2.zip
+RUN set -eux; \
+ mkdir -p /opt/nifi/groovy; \
+ cd /opt/nifi/groovy; \
+ url1="https://archive.apache.org/dist/groovy/${GROOVY_VERSION}/distribution/apache-groovy-binary-${GROOVY_VERSION}.zip"; \
+ url2="https://repo1.maven.org/maven2/org/apache/groovy/apache-groovy-binary/${GROOVY_VERSION}/apache-groovy-binary-${GROOVY_VERSION}.zip"; \
+ curl -fsSL --retry 8 --retry-delay 3 --connect-timeout 20 -o groovy.zip "$url1" \
+ || curl -fsSL --retry 8 --retry-delay 3 --connect-timeout 20 -o groovy.zip "$url2"; \
+ unzip groovy.zip; rm groovy.zip
+
ENV GROOVY_BIN=/opt/nifi/groovy/groovy-5.0.2/bin
RUN $GROOVY_BIN/grape -V install org.apache.avro avro 1.12.0
+####################################### NAR INSTALL SECTION ##########################################
+# INSTALL NAR extensions
+WORKDIR /opt/nifi/nifi-current/lib/
+
+# --- Hotfix: NiFi 2.7.2 + Azure + Netty QUIC missing class (netty 4.2.9.Final) ---
+ARG NETTY_VERSION=4.2.9.Final
+RUN set -eux; \
+ cd /opt/nifi/nifi-current/lib; \
+ curl -fL --retry 5 --retry-delay 2 \
+ -o "netty-codec-classes-quic-${NETTY_VERSION}.jar" \
+ "https://repo1.maven.org/maven2/io/netty/netty-codec-classes-quic/${NETTY_VERSION}/netty-codec-classes-quic-${NETTY_VERSION}.jar"; \
+ # sanity check: ensure the Quic class exists in the jar
+ jar tf "netty-codec-classes-quic-${NETTY_VERSION}.jar" | grep -q '^io/netty/handler/codec/quic/Quic.class$'
+
+# Install additional NAR bundles (AWS, Azure, Snowflake, Parquet, Iceberg, etc.)
+RUN set -eux; \
+ base="https://repo.maven.apache.org/maven2/org/apache/nifi"; \
+ nars=( \
+ nifi-aws-nar \
+ nifi-aws-service-api-nar \
+ nifi-aws-kinesis-nar \
+ nifi-azure-nar \
+ nifi-azure-services-api-nar \
+ nifi-snowflake-processors-nar \
+ nifi-snowflake-services-nar \
+ nifi-snowflake-services-api-nar \
+ nifi-parquet-nar \
+ nifi-database-dialect-service-nar \
+ nifi-iceberg-shared-nar \
+ nifi-iceberg-service-api-nar \
+ nifi-iceberg-processors-nar \
+ nifi-iceberg-rest-catalog-nar \
+ nifi-iceberg-azure-nar \
+ nifi-hashicorp-vault-nar \
+ nifi-hashicorp-vault-client-service-api-nar \
+ nifi-hadoop-libraries-nar \
+ nifi-hadoop-nar \
+ nifi-github-nar \
+ nifi-gitlab-nar \
+ nifi-graph-client-service-api-nar \
+ nifi-couchbase-nar \
+ nifi-media-nar \
+ nifi-neo4j-cypher-service-nar \
+ ); \
+ for a in "${nars[@]}"; do \
+ curl -fsSL --retry 5 --retry-delay 2 \
+ -o "${a}-${NIFI_VERSION}.nar" \
+ "${base}/${a}/${NIFI_VERSION}/${a}-${NIFI_VERSION}.nar"; \
+ done; \
+ chown ${UID}:${GID} ./*.nar
+
+########################################################################################################
+
# copy configuration files
WORKDIR /opt/nifi/nifi-current/conf/
diff --git a/nifi/user-schemas/avro/.keep b/nifi/__init__.py
similarity index 100%
rename from nifi/user-schemas/avro/.keep
rename to nifi/__init__.py
diff --git a/nifi/conf/nifi.properties b/nifi/conf/nifi.properties
index 1363e7deb..6be2c3176 100644
--- a/nifi/conf/nifi.properties
+++ b/nifi/conf/nifi.properties
@@ -48,9 +48,9 @@ nifi.templates.enabled=true
nifi.python.command=python3.11
nifi.python.framework.source.directory=/opt/nifi/nifi-current/python/framework
nifi.python.extensions.source.directory.default=/opt/nifi/nifi-current/python_extensions
-nifi.python.working.directory=/opt/nifi/user-scripts
-nifi.python.logs.directory=./logs
-nifi.python.max.processes.per.extension.type=10
+nifi.python.working.directory=/opt/nifi/user_scripts
+nifi.python.logs.directory=./logs
+nifi.python.max.processes.per.extension.type=10
nifi.python.max.processes=100
####################
@@ -362,4 +362,3 @@ nifi.diagnostics.on.shutdown.max.filecount=10
# The diagnostics folder's maximum permitted size in bytes. If the limit is exceeded, the oldest files are deleted.
nifi.diagnostics.on.shutdown.max.directory.size=10 MB
-
diff --git a/nifi/nifi-registry/nifi-registry.properties b/nifi/nifi-registry/nifi-registry.properties
index 802c3a987..40bff0249 100644
--- a/nifi/nifi-registry/nifi-registry.properties
+++ b/nifi/nifi-registry/nifi-registry.properties
@@ -33,11 +33,11 @@ nifi.registry.web.proxy.context.path=/nifi-registry
nifi.registry.web.proxy.host=localhost:18443,nifi-registry:18443,nifi-registry-flow:18443,cogstack-nifi-registry-flow:18443,cogstack-nifi-registry:18443,nginx.local:18443
# security properties #
-nifi.registry.security.keystore=/opt/nifi-registry/nifi-registry-current/conf/keystore.jks
+nifi.registry.security.keystore=/security/certificates/nifi/nifi-keystore.jks
nifi.registry.security.keystoreType=JKS
nifi.registry.security.keystorePasswd=cogstackNifi
nifi.registry.security.keyPasswd=cogstackNifi
-nifi.registry.security.truststore=/opt/nifi-registry/nifi-registry-current/conf/truststore.jks
+nifi.registry.security.truststore=/security/certificates/nifi/nifi-truststore.jks
nifi.registry.security.truststoreType=JKS
nifi.registry.security.truststorePasswd=cogstackNifi
nifi.registry.security.needClientAuth=false
@@ -124,4 +124,4 @@ nifi.registry.security.user.authorizer=managed-authorizer
# revision management #
# This feature should remain disabled until a future NiFi release that supports the revision API changes
-nifi.registry.revisions.enabled=false
\ No newline at end of file
+nifi.registry.revisions.enabled=false
diff --git a/nifi/recreate_nifi_docker_image.sh b/nifi/recreate_nifi_docker_image.sh
index 9bfeb4043..640f11f71 100644
--- a/nifi/recreate_nifi_docker_image.sh
+++ b/nifi/recreate_nifi_docker_image.sh
@@ -13,4 +13,11 @@ if [[ $NIFI_GID == 1000 ]]; then
NIFI_GID=$(id -g)
fi
-docker build --build-arg GID=${NIFI_GID} --build-arg UID=${NIFI_UID} -t cogstacksystems/cogstack-nifi:latest -f Dockerfile .
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+docker build --build-arg GID=${NIFI_GID} \
+ --build-arg UID=${NIFI_UID} \
+ -t cogstacksystems/cogstack-nifi:latest \
+ -f "$SCRIPT_DIR/Dockerfile" \
+ "$REPO_ROOT"
diff --git a/nifi/requirements-dev.txt b/nifi/requirements-dev.txt
new file mode 100644
index 000000000..5547e70bf
--- /dev/null
+++ b/nifi/requirements-dev.txt
@@ -0,0 +1,7 @@
+ruff==0.12.12
+mypy==1.17.0
+mypy-extensions==1.1.0
+types-aiofiles==24.1.0.20250708
+types-PyYAML==6.0.12.20250516
+types-setuptools==80.9.0.20250529
+timeout-decorator==0.5.0
diff --git a/nifi/requirements.txt b/nifi/requirements.txt
index c9014368a..ba862b29e 100644
--- a/nifi/requirements.txt
+++ b/nifi/requirements.txt
@@ -1,13 +1,10 @@
+wheel==0.45.1
+
# data science pkgs
-seaborn==0.13.2
-matplotlib==3.10.6
-graphviz==0.21
-plotly==6.3.0
-keras==3.12.0
nltk==3.9.1
-numpy>=1.26.0,<2.0.0
-pandas==1.5.3
-dill>=0.3.6,<1.0.0
+numpy==2.3.5
+pandas==2.3.3
+dill>=0.4.0,<1.0.0
bokeh==3.8.2
psycopg[c,binary]==3.2.9
overrides==7.0.0
@@ -16,30 +13,23 @@ overrides==7.0.0
py4j==0.10.9.9
rancoord==0.0.6
geocoder==1.38.1
-avro==1.12.0
+avro==1.12.1
nipyapi==1.1.0
py7zr==1.0.0
-ipyparallel==9.0.1
-cython==3.1.3
-tqdm==4.67.1
jsonpickle==4.1.1
-certifi==2025.8.3
-xlsxwriter==3.2.5
-mysql-connector-python==9.4.0
-pymssql==2.3.7
+xlsxwriter==3.2.9
+mysql-connector-python==9.5.0
+pymssql==2.3.9
+requests==2.32.5
+PyYAML==6.0.3
+pydantic==2.12.5
+pyarrow==22.0.0
# other utils
xnat==0.7.2
# ElasticSearch/OpenSearch packages
-opensearch-py==3.0.0
elasticsearch9==9.1.0
+opensearch-py==3.0.0
neo4j==5.28.2
-
-# git utils
-dvc==3.62.0
-GitPython==3.1.45
-PyYAML==6.0.2
-
-# code utils
-ruff==0.12.12
diff --git a/nifi/user-scripts/bootstrap_external_lib_imports.py b/nifi/user-scripts/bootstrap_external_lib_imports.py
deleted file mode 100644
index 830743b06..000000000
--- a/nifi/user-scripts/bootstrap_external_lib_imports.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import os
-import sys
-
-
-def running_in_docker() -> bool:
- if os.path.exists("/.dockerenv"):
- return True
- try:
- with open("/proc/1/cgroup", "rt") as f:
- return any("docker" in line or "containerd" in line for line in f)
- except FileNotFoundError:
- return False
-
-
-# we need to add it to the sys imports
-if running_in_docker():
- sys.path.insert(0, "/opt/nifi/user-scripts")
-else:
- sys.path.insert(0, "./user-scripts")
diff --git a/nifi/user-scripts/dto/nifi_api_config.py b/nifi/user-scripts/dto/nifi_api_config.py
deleted file mode 100644
index 303bdd1b1..000000000
--- a/nifi/user-scripts/dto/nifi_api_config.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import os
-
-
-class NiFiAPIConfig:
- NIFI_URL_SCHEME: str = "https"
- NIFI_HOST: str = "localhost"
- NIFI_PORT: int = 8443
- NIFI_REGISTRY_PORT: int = 18443
-
- NIFI_USERNAME: str = os.environ.get("NIFI_SINGLE_USER_CREDENTIALS_USERNAME", "admin")
- NIFI_PASSWORD: str = os.environ.get("NIFI_SINGLE_USER_CREDENTIALS_PASSWORD", "cogstackNiFi")
-
- ROOT_CERT_CA_PATH: str = os.path.abspath("../../../../security/certificates/root/root-ca.pem")
- NIFI_CERT_PEM_PATH: str = os.path.abspath("../../../../security/certificates/nifi/nifi.pem")
- NIFI_CERT_KEY_PATH: str = os.path.abspath("../../../../security/certificates/nifi/nifi.key")
-
- VERIFY_SSL: bool = True
-
- @property
- def nifi_base_url(self) -> str:
- """Full NiFi base URL, e.g. https://localhost:8443"""
- return f"{self.NIFI_URL_SCHEME}://{self.NIFI_HOST}:{self.NIFI_PORT}"
-
- @property
- def nifi_api_url(self) -> str:
- """"NiFi REST API root, e.g. https://localhost:8443/nifi-api"""
- return f"{self.nifi_base_url}/nifi-api"
-
- @property
- def nifi_registry_base_url(self) -> str:
- """"NiFi Registry REST API root, e.g. https://localhost:18443/nifi-registry"""
- return f"{self.NIFI_URL_SCHEME}://{self.NIFI_HOST}:{self.NIFI_REGISTRY_PORT}/nifi-registry/"
-
- @property
- def nifi_registry_api_url(self) -> str:
- """"NiFi Registry REST API root, e.g. https://localhost:18443/nifi-registry/nifi-registry-api"""
- return f"{self.NIFI_URL_SCHEME}://{self.NIFI_HOST}:{self.NIFI_REGISTRY_PORT}/nifi-registry-api"
-
- def auth_credentials(self) -> tuple[str, str]:
- """Convenience for requests auth=(user, password)."""
- return (self.NIFI_USERNAME, self.NIFI_PASSWORD)
-
- def get_nifi_ssl_certs(self) -> tuple[str, str]:
- """Convenience for requests cert=(cert_path, key_path)."""
- return (self.NIFI_CERT_PEM_PATH, self.NIFI_CERT_KEY_PATH)
diff --git a/nifi/user-scripts/dto/pg_config.py b/nifi/user-scripts/dto/pg_config.py
deleted file mode 100644
index 19f15d029..000000000
--- a/nifi/user-scripts/dto/pg_config.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from pydantic import BaseModel, Field
-
-
-class PGConfig(BaseModel):
- host: str = Field(default="localhost")
- port: int = Field(default=5432)
- db: str = Field(default="samples_db")
- user: str = Field(default="test")
- password: str = Field(default="test")
- timeout: int = Field(default=50)
diff --git a/nifi/user-scripts/dto/service_health.py b/nifi/user-scripts/dto/service_health.py
deleted file mode 100644
index 5f6455dbb..000000000
--- a/nifi/user-scripts/dto/service_health.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from datetime import datetime
-from typing import Literal
-
-from pydantic import BaseModel, Field
-
-
-class ServiceHealth(BaseModel):
- """
- Base health check model shared by all services.
- """
-
- service: str = Field(..., description="Service name, e.g. NiFi, PostgreSQL, OpenSearch/ElasticSearch, etc.")
- status: Literal["healthy", "unhealthy", "degraded"] = Field(
- ..., description="Current service status"
- )
- message: str | None = Field(None, description="Optional status message")
- timestamp: datetime = Field(default_factory=datetime.utcnow)
- avg_processing_ms: float | None = Field(None)
- service_info: str | None = Field(None)
- connected: bool | None = Field(None)
-
- class Config:
- extra = "ignore"
-
-class MLServiceHealth(ServiceHealth):
- model_name: str | None = Field(None, description="Name of the ML model")
- model_version: str | None = Field(None, description="Version of the ML model")
- model_card: str | None = Field(None, description="URL or path to the model card")
-
-class NiFiHealth(ServiceHealth):
- active_threads: int | None = Field(None, description="Number of active threads")
- queued_bytes: int | None = Field(None, description="Total queued bytes")
- queued_count: int | None = Field(None, description="Number of queued flowfiles")
-
-class ElasticsearchHealth(ServiceHealth):
- cluster_status: str | None = Field(None, description="Cluster health status")
- node_count: int | None = Field(None)
- active_shards: int | None = Field(None)
-
-class PostgresHealth(ServiceHealth):
- version: str | None = Field(None)
- latency_ms: float | None = Field(None, description="Ping latency in milliseconds")
- db_name: str | None = Field(None, description="Database name")
-
-class MedCATTrainerHealth(ServiceHealth):
- """Health check model for MedCAT Trainer web service."""
- app_version: str | None = Field(None, description="MedCAT Trainer app version")
-
-class CogstackCohortHealth(ServiceHealth):
- """Health check model for CogStack Cohort service."""
- pass
diff --git a/nifi/user-scripts/logs/.gitignore b/nifi/user-scripts/logs/.gitignore
deleted file mode 100644
index f59ec20aa..000000000
--- a/nifi/user-scripts/logs/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*
\ No newline at end of file
diff --git a/nifi/user-scripts/utils/helpers/nifi_api_client.py b/nifi/user-scripts/utils/helpers/nifi_api_client.py
deleted file mode 100644
index 1c353d2c1..000000000
--- a/nifi/user-scripts/utils/helpers/nifi_api_client.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from logging import Logger
-
-from dto.nifi_api_config import NiFiAPIConfig
-from nipyapi import canvas, security
-from nipyapi.nifi import ApiClient, ProcessGroupsApi
-from nipyapi.nifi.configuration import Configuration as NiFiConfiguration
-from nipyapi.nifi.models.process_group_entity import ProcessGroupEntity
-from nipyapi.nifi.models.processor_entity import ProcessorEntity
-from nipyapi.registry import ApiClient as RegistryApiClient
-from nipyapi.registry import BucketsApi
-from nipyapi.registry.configuration import Configuration as RegistryConfiguration
-from utils.generic import get_logger
-
-
-class NiFiRegistryClient:
- def __init__(self, config: NiFiAPIConfig) -> None:
- self.config = config or NiFiAPIConfig()
- self.nipyapi_config = RegistryConfiguration()
- self.nipyapi_config.host = self.config.nifi_registry_api_url
- self.nipyapi_config.verify_ssl = self.config.VERIFY_SSL
- self.nipyapi_config.cert_file = self.config.NIFI_CERT_PEM_PATH # type: ignore
- self.nipyapi_config.key_file = self.config.NIFI_CERT_KEY_PATH # type: ignore
- self.nipyapi_config.ssl_ca_cert = self.config.ROOT_CERT_CA_PATH # type: ignore
-
- self.logger: Logger = get_logger(self.__class__.__name__)
-
- self.api_client = RegistryApiClient(self.nipyapi_config.host)
- self.buckets_api = BucketsApi(self.api_client)
-
- def list_buckets(self):
- buckets = self.buckets_api.get_buckets()
- for b in buckets:
- self.logger.info("Bucket: %s (%s)", b.name, b.identifier)
- return buckets
-
-
-class NiFiClient:
- def __init__(self, config: NiFiAPIConfig) -> None:
- self.config = config or NiFiAPIConfig()
- self.nipyapi_config = NiFiConfiguration()
- self.nipyapi_config.host = self.config.nifi_api_url
- self.nipyapi_config.verify_ssl = self.config.VERIFY_SSL
- self.nipyapi_config.cert_file = self.config.NIFI_CERT_PEM_PATH # type: ignore
- self.nipyapi_config.key_file = self.config.NIFI_CERT_KEY_PATH # type: ignore
- self.nipyapi_config.ssl_ca_cert = self.config.ROOT_CERT_CA_PATH # type: ignore
-
- self.logger: Logger = get_logger(self.__class__.__name__)
-
- self.api_client = ApiClient(self.nipyapi_config)
- self.process_group_api = ProcessGroupsApi(self.api_client)
-
- self._login()
-
- def _login(self) -> None:
- security.service_login(
- service='nifi',
- username=self.config.NIFI_USERNAME,
- password=self.config.NIFI_PASSWORD
- )
- self.logger.info("✅ Logged in to NiFi")
-
- def get_root_process_group_id(self) -> str:
- return canvas.get_root_pg_id()
-
- def get_process_group_by_name(self, process_group_name: str) -> None | list[object] | object:
- return canvas.get_process_group(process_group_name, identifier_type="nam")
-
- def get_process_group_by_id(self, process_group_id: str) -> ProcessGroupEntity:
- return canvas.get_process_group(process_group_id, identifier_type="id")
-
- def start_process_group(self, process_group_id: str) -> bool:
- return canvas.schedule_process_group(process_group_id, True)
-
- def stop_process_group(self, process_group_id: str) -> bool:
- return canvas.schedule_process_group(process_group_id, False)
-
- def get_child_process_groups_from_parent_id(self, parent_process_group_id: str) -> list[ProcessGroupEntity]:
- parent_pg = canvas.get_process_group(parent_process_group_id, identifier_type="id")
- return canvas.list_all_process_groups(parent_pg.id)
-
- def get_all_processors_in_process_group(self, process_group_id: str) -> list[ProcessorEntity]:
- return canvas.list_all_processors(process_group_id)
diff --git a/nifi/user-scripts/utils/helpers/service.py b/nifi/user-scripts/utils/helpers/service.py
deleted file mode 100644
index 9d4b28080..000000000
--- a/nifi/user-scripts/utils/helpers/service.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import sys
-import time
-
-import psycopg2
-from psycopg2 import sql
-
-sys.path.append("../../dto/")
-
-from dto.pg_config import PGConfig
-
-
-def check_postgres(cfg: PGConfig) -> tuple[bool, float | None, str | None]:
- """Return (is_healthy, latency_ms, error_detail)"""
- start = time.perf_counter()
- try:
- conn = psycopg2.connect(
- host=cfg.host,
- port=cfg.port,
- dbname=cfg.db,
- user=cfg.user,
- password=cfg.password,
- connect_timeout=cfg.timeout
- )
- with conn.cursor() as cur:
- cur.execute(sql.SQL("SELECT 1;"))
- result = cur.fetchone()
- conn.close()
- if result != (1,):
- return False, None, f"Unexpected result: {result}"
- latency = (time.perf_counter() - start) * 1000
- return True, latency, None
- except Exception as e:
- return False, None, str(e)
diff --git a/nifi/user-python-extensions/convert_avro_binary_field_to_base64.py b/nifi/user_python_extensions/convert_avro_binary_field_to_base64.py
similarity index 97%
rename from nifi/user-python-extensions/convert_avro_binary_field_to_base64.py
rename to nifi/user_python_extensions/convert_avro_binary_field_to_base64.py
index 5a990df94..17829a55e 100644
--- a/nifi/user-python-extensions/convert_avro_binary_field_to_base64.py
+++ b/nifi/user_python_extensions/convert_avro_binary_field_to_base64.py
@@ -1,7 +1,3 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import base64
import copy
import io
@@ -20,10 +16,11 @@
from nifiapi.relationship import Relationship
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
-class ConvertAvroBinaryRecordFieldToBase64(BaseNiFiProcessor):
+class CogStackConvertAvroBinaryRecordFieldToBase64(BaseNiFiProcessor):
"""NiFi Python processor to convert a binary field in Avro records to base64-encoded string.
Reads each FlowFile as Avro, locates the configured binary_field_name, and rewrites the Avro schema,
@@ -168,4 +165,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=output_byte_buffer.getvalue())
except Exception as exception:
self.logger.error("Exception during Avro processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-python-extensions/convert_json_record_schema.py b/nifi/user_python_extensions/convert_json_record_schema.py
similarity index 98%
rename from nifi/user-python-extensions/convert_json_record_schema.py
rename to nifi/user_python_extensions/convert_json_record_schema.py
index a9a20df4e..ac29e842e 100644
--- a/nifi/user-python-extensions/convert_json_record_schema.py
+++ b/nifi/user_python_extensions/convert_json_record_schema.py
@@ -1,7 +1,3 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import json
import traceback
from collections import defaultdict
@@ -12,7 +8,8 @@
from nifiapi.relationship import Relationship
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
class ConvertJsonRecordSchema(BaseNiFiProcessor):
@@ -48,7 +45,7 @@ class ProcessorDetails:
def __init__(self, jvm: JVMView):
super().__init__(jvm)
- self.json_mapper_schema_path: str = "/opt/nifi/user-schemas/json/cogstack_common_schema_mapping.json"
+ self.json_mapper_schema_path: str = "/opt/nifi/user_schemas/json/cogstack_common_schema_mapping.json"
self.preserve_non_mapped_fields: bool = True
self.composite_first_non_empty_field: list[str] = []
@@ -57,8 +54,8 @@ def __init__(self, jvm: JVMView):
PropertyDescriptor(name="json_mapper_schema_path",
description="The path to the json schema mapping file, " \
"the schema directory is mounted as a volume in" \
- " the nifi container in the /opt/nifi/user-schemas/ folder",
- default_value="/opt/nifi/user-schemas/json/cogstack_common_schema_mapping.json",
+ " the nifi container in the /opt/nifi/user_schemas/ folder",
+ default_value="/opt/nifi/user_schemas/json/cogstack_common_schema_mapping.json",
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR]),
PropertyDescriptor(name="preserve_non_mapped_fields",
@@ -356,4 +353,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=json.dumps(output_contents).encode('utf-8'))
except Exception as exception:
self.logger.error("Exception during flowfile processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user_python_extensions/convert_record_parquet_to_json.py b/nifi/user_python_extensions/convert_record_parquet_to_json.py
new file mode 100644
index 000000000..9c11175a9
--- /dev/null
+++ b/nifi/user_python_extensions/convert_record_parquet_to_json.py
@@ -0,0 +1,95 @@
+import io
+import json
+import traceback
+
+import pyarrow
+from nifiapi.flowfiletransform import FlowFileTransformResult
+from nifiapi.properties import (
+ ProcessContext,
+ PropertyDescriptor,
+ StandardValidators,
+)
+from nifiapi.relationship import Relationship
+from overrides import overrides
+from py4j.java_gateway import JavaObject, JVMView
+from pyarrow import parquet
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
+from nifi.user_scripts.utils.serialization.parquet_json_data_types_converter import parquet_json_data_type_convert
+
+
+class CogStackConvertParquetToJson(BaseNiFiProcessor):
+ """NiFi Python processor Read parquet file and output as JSON.
+ """
+
+ class Java:
+ implements = ['org.apache.nifi.python.processor.FlowFileTransform']
+
+ class ProcessorDetails:
+ version = '0.0.1'
+
+ def __init__(self, jvm: JVMView):
+ super().__init__(jvm)
+
+ # this is directly mirrored to the UI
+ self._properties = []
+
+ # self._relationships = [
+ # Relationship(
+ # name="success",
+ # description="All FlowFiles processed successfully."
+ # ),
+ # Relationship(
+ # name="failure",
+ # description="FlowFiles that failed processing."
+ # )
+ # ]
+
+ self.descriptors: list[PropertyDescriptor] = self._properties
+ #self.relationships: list[Relationship] = self._relationships
+
+ @overrides
+ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTransformResult:
+ """
+
+ """
+ try:
+ self.process_context = context
+ self.set_properties(context.getProperties())
+
+            # read the parquet input into an in-memory byte buffer
+ input_raw_bytes: bytes = flowFile.getContentsAsBytes()
+ input_byte_buffer: io.BytesIO = io.BytesIO(input_raw_bytes)
+
+ parquet_file = parquet.ParquetFile(input_byte_buffer)
+
+ output_buffer: io.BytesIO = io.BytesIO()
+ record_count: int = 0
+
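+            # stream batches of up to 10,000 rows so large files are never fully loaded into memory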
+ for batch in parquet_file.iter_batches(batch_size=10000):
+ records: list[dict] = batch.to_pylist()
+
+ for record in records:
+ json_record = json.dumps(
+ record,
+ ensure_ascii=False,
+ separators=(",", ":"),
+ default=parquet_json_data_type_convert,
+ )
+
+ output_buffer.write(json_record.encode("utf-8"))
+ output_buffer.write(b"\n")
+ record_count += len(records)
+
+ input_byte_buffer.close()
+
+ attributes: dict = {k: str(v) for k, v in flowFile.getAttributes().items()}
+ attributes["mime.type"] = "application/x-ndjson"
+ attributes["record.count"] = str(record_count)
+
+ return FlowFileTransformResult(relationship="success",
+ attributes=attributes,
+ contents=output_buffer.getvalue())
+ except Exception as exception:
+            self.logger.error("Exception during Parquet processing: " + traceback.format_exc())
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-python-extensions/parse_service_response.py b/nifi/user_python_extensions/parse_service_response.py
similarity index 97%
rename from nifi/user-python-extensions/parse_service_response.py
rename to nifi/user_python_extensions/parse_service_response.py
index 58be867a5..e4eddf8c3 100644
--- a/nifi/user-python-extensions/parse_service_response.py
+++ b/nifi/user_python_extensions/parse_service_response.py
@@ -1,7 +1,3 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import json
import traceback
@@ -14,10 +10,11 @@
from nifiapi.relationship import Relationship
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
-class ParseCogStackServiceResult(BaseNiFiProcessor):
+class CogStackParseCogStackServiceResult(BaseNiFiProcessor):
""" Normalises JSON responses from CogStack OCR or MedCAT services, reading each FlowFile,
coercing single objects to lists.
Exposes configurable properties for output text field name, service message type,
@@ -199,4 +196,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=json.dumps(output_contents).encode('utf-8'))
except Exception as exception:
self.logger.error("Exception during flowfile processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-python-extensions/prepare_record_for_nlp.py b/nifi/user_python_extensions/prepare_record_for_nlp.py
similarity index 95%
rename from nifi/user-python-extensions/prepare_record_for_nlp.py
rename to nifi/user_python_extensions/prepare_record_for_nlp.py
index 82c1668db..5bd1171d2 100644
--- a/nifi/user-python-extensions/prepare_record_for_nlp.py
+++ b/nifi/user_python_extensions/prepare_record_for_nlp.py
@@ -1,7 +1,3 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import io
import json
import traceback
@@ -17,10 +13,11 @@
)
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
-class PrepareRecordForNlp(BaseNiFiProcessor):
+class CogStackPrepareRecordForNlp(BaseNiFiProcessor):
class Java:
implements = ['org.apache.nifi.python.processor.FlowFileTransform']
@@ -121,4 +118,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=json.dumps({"content": output_contents}).encode("utf-8"))
except Exception as exception:
self.logger.error("Exception during flowfile processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-python-extensions/prepare_record_for_ocr.py b/nifi/user_python_extensions/prepare_record_for_ocr.py
similarity index 95%
rename from nifi/user-python-extensions/prepare_record_for_ocr.py
rename to nifi/user_python_extensions/prepare_record_for_ocr.py
index fe17576d2..1442d95fa 100644
--- a/nifi/user-python-extensions/prepare_record_for_ocr.py
+++ b/nifi/user_python_extensions/prepare_record_for_ocr.py
@@ -1,11 +1,6 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import base64
import io
import json
-import sys
import traceback
from typing import Any, Union
@@ -20,11 +15,12 @@
from nifiapi.relationship import Relationship
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.helpers.avro_json_encoder import AvroJSONEncoder
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
+from nifi.user_scripts.utils.serialization.avro_json_encoder import AvroJSONEncoder
-class PrepareRecordForOcr(BaseNiFiProcessor):
+class CogStackPrepareRecordForOcr(BaseNiFiProcessor):
class Java:
implements = ['org.apache.nifi.python.processor.FlowFileTransform']
@@ -142,4 +138,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=json.dumps(output_contents))
except Exception as exception:
self.logger.error("Exception during flowfile processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-python-extensions/record_add_geolocation.py b/nifi/user_python_extensions/record_add_geolocation.py
similarity index 96%
rename from nifi/user-python-extensions/record_add_geolocation.py
rename to nifi/user_python_extensions/record_add_geolocation.py
index e412197d4..2a027937f 100644
--- a/nifi/user-python-extensions/record_add_geolocation.py
+++ b/nifi/user_python_extensions/record_add_geolocation.py
@@ -1,7 +1,3 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import csv
import json
import os
@@ -17,11 +13,12 @@
)
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.generic import download_file_from_url, safe_delete_paths
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.generic import download_file_from_url, safe_delete_paths
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
-class JsonRecordAddGeolocation(BaseNiFiProcessor):
+class CogStackJsonRecordAddGeolocation(BaseNiFiProcessor):
"""NiFi Python processor to add geolocation data to JSON records based on postcode lookup.
We use https://www.getthedata.com/open-postcode-geo for geolocation.
The schema of the file used is available at: https://www.getthedata.com/open-postcode-geo
@@ -57,7 +54,7 @@ def __init__(self, jvm: JVMView):
super().__init__(jvm)
self.lookup_datafile_url: str = "https://download.getthedata.com/downloads/open_postcode_geo.csv.zip"
- self.lookup_datafile_path: str = "/opt/nifi/user-scripts/db/open_postcode_geo.csv"
+ self.lookup_datafile_path: str = "/opt/nifi/user_scripts/db/open_postcode_geo.csv"
self.postcode_field_name: str = "address_postcode"
self.geolocation_field_name: str = "address_geolocation"
@@ -75,7 +72,7 @@ def __init__(self, jvm: JVMView):
description="specify the local path for the geolocation lookup datafile csv",
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
- default_value="/opt/nifi/user-scripts/db/open_postcode_geo.csv"),
+ default_value="/opt/nifi/user_scripts/db/open_postcode_geo.csv"),
PropertyDescriptor(name="postcode_field_name",
description="postcode field name in the records",
required=True,
@@ -119,7 +116,7 @@ def _check_geolocation_lookup_datafile(self) -> bool:
bool: file exists or not
"""
- base_output_extract_dir_path: str = "/opt/nifi/user-scripts/db"
+ base_output_extract_dir_path: str = "/opt/nifi/user_scripts/db"
output_extract_dir_path: str = os.path.join(base_output_extract_dir_path, "open_postcode_geo")
output_download_path: str = os.path.join(base_output_extract_dir_path, "open_postcode_geo.zip")
datafile_csv_initial_path: str = os.path.join(output_extract_dir_path, "open_postcode_geo.csv")
@@ -224,4 +221,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
)
except Exception as exception:
self.logger.error("Exception during flowfile processing:\n" + traceback.format_exc())
- return self.build_failure_result(flowFile, exception)
+ return self.build_failure_result(flowFile, exception)
\ No newline at end of file
diff --git a/nifi/user-python-extensions/record_decompress_cerner_blob.py b/nifi/user_python_extensions/record_decompress_cerner_blob.py
similarity index 97%
rename from nifi/user-python-extensions/record_decompress_cerner_blob.py
rename to nifi/user_python_extensions/record_decompress_cerner_blob.py
index ac0eab134..dc0e31c62 100644
--- a/nifi/user-python-extensions/record_decompress_cerner_blob.py
+++ b/nifi/user_python_extensions/record_decompress_cerner_blob.py
@@ -1,10 +1,5 @@
-import sys
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import base64
import json
-import sys
import traceback
from nifiapi.flowfiletransform import FlowFileTransformResult
@@ -15,11 +10,12 @@
)
from overrides import overrides
from py4j.java_gateway import JavaObject, JVMView
-from utils.cerner_blob import DecompressLzwCernerBlob
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.codecs.cerner_blob import DecompressLzwCernerBlob
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
-class JsonRecordDecompressCernerBlob(BaseNiFiProcessor):
+class CogStackJsonRecordDecompressCernerBlob(BaseNiFiProcessor):
""" This script decompresses Cerner LZW compressed blobs from a JSON input stream.
It expects a JSON array of records, each containing a field with the binary data.
All RECORDS are expected to have the same fields, and presumably belonging to the same DOCUMENT.
@@ -200,4 +196,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=json.dumps(output_contents))
except Exception as exception:
self.logger.error("Exception during flowfile processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-python-extensions/sample_processor.py b/nifi/user_python_extensions/sample_processor.py
similarity index 97%
rename from nifi/user-python-extensions/sample_processor.py
rename to nifi/user_python_extensions/sample_processor.py
index dd950b40d..a92a05430 100644
--- a/nifi/user-python-extensions/sample_processor.py
+++ b/nifi/user_python_extensions/sample_processor.py
@@ -1,11 +1,7 @@
-import sys
-from typing import Any
-
-sys.path.insert(0, "/opt/nifi/user-scripts")
-
import io
import json
import traceback
+from typing import Any
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
@@ -18,10 +14,11 @@
)
from nifiapi.relationship import Relationship
from py4j.java_gateway import JavaObject, JVMView
-from utils.helpers.base_nifi_processor import BaseNiFiProcessor
+
+from nifi.user_scripts.utils.nifi.base_nifi_processor import BaseNiFiProcessor
-class SampleTestProcessor(BaseNiFiProcessor):
+class CogStackSampleTestProcessor(BaseNiFiProcessor):
class Java:
implements = ['org.apache.nifi.python.processor.FlowFileTransform']
@@ -162,4 +159,4 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
contents=json.dumps(output_contents))
except Exception as exception:
self.logger.error("Exception during Avro processing: " + traceback.format_exc())
- raise exception
+ raise exception
\ No newline at end of file
diff --git a/nifi/user-schemas/elasticsearch/indices/.keep b/nifi/user_schemas/avro/.keep
similarity index 100%
rename from nifi/user-schemas/elasticsearch/indices/.keep
rename to nifi/user_schemas/avro/.keep
diff --git a/nifi/user-schemas/elasticsearch/base_index_settings.json b/nifi/user_schemas/elasticsearch/base_index_settings.json
similarity index 100%
rename from nifi/user-schemas/elasticsearch/base_index_settings.json
rename to nifi/user_schemas/elasticsearch/base_index_settings.json
diff --git a/nifi/user-schemas/elasticsearch/templates/.keep b/nifi/user_schemas/elasticsearch/indices/.keep
similarity index 100%
rename from nifi/user-schemas/elasticsearch/templates/.keep
rename to nifi/user_schemas/elasticsearch/indices/.keep
diff --git a/nifi/user-schemas/json/.keep b/nifi/user_schemas/elasticsearch/templates/.keep
similarity index 100%
rename from nifi/user-schemas/json/.keep
rename to nifi/user_schemas/elasticsearch/templates/.keep
diff --git a/nifi/user-scripts/db/.gitignore b/nifi/user_schemas/json/.keep
similarity index 100%
rename from nifi/user-scripts/db/.gitignore
rename to nifi/user_schemas/json/.keep
diff --git a/nifi/user-schemas/legacy/annotation-medcat.avsc b/nifi/user_schemas/legacy/annotation-medcat.avsc
similarity index 100%
rename from nifi/user-schemas/legacy/annotation-medcat.avsc
rename to nifi/user_schemas/legacy/annotation-medcat.avsc
diff --git a/nifi/user-schemas/legacy/annotation_elasticsearch_index_mapping.json b/nifi/user_schemas/legacy/annotation_elasticsearch_index_mapping.json
similarity index 100%
rename from nifi/user-schemas/legacy/annotation_elasticsearch_index_mapping.json
rename to nifi/user_schemas/legacy/annotation_elasticsearch_index_mapping.json
diff --git a/nifi/user-schemas/legacy/cogstack_common_schema.avsc b/nifi/user_schemas/legacy/cogstack_common_schema.avsc
similarity index 100%
rename from nifi/user-schemas/legacy/cogstack_common_schema.avsc
rename to nifi/user_schemas/legacy/cogstack_common_schema.avsc
diff --git a/nifi/user-schemas/legacy/cogstack_common_schema_elasticsearch_index_mapping_template.json b/nifi/user_schemas/legacy/cogstack_common_schema_elasticsearch_index_mapping_template.json
similarity index 100%
rename from nifi/user-schemas/legacy/cogstack_common_schema_elasticsearch_index_mapping_template.json
rename to nifi/user_schemas/legacy/cogstack_common_schema_elasticsearch_index_mapping_template.json
diff --git a/nifi/user-schemas/legacy/cogstack_common_schema_full.avsc b/nifi/user_schemas/legacy/cogstack_common_schema_full.avsc
similarity index 100%
rename from nifi/user-schemas/legacy/cogstack_common_schema_full.avsc
rename to nifi/user_schemas/legacy/cogstack_common_schema_full.avsc
diff --git a/nifi/user-schemas/legacy/cogstack_common_schema_mapping.json b/nifi/user_schemas/legacy/cogstack_common_schema_mapping.json
similarity index 100%
rename from nifi/user-schemas/legacy/cogstack_common_schema_mapping.json
rename to nifi/user_schemas/legacy/cogstack_common_schema_mapping.json
diff --git a/nifi/user-schemas/legacy/document.avsc b/nifi/user_schemas/legacy/document.avsc
similarity index 100%
rename from nifi/user-schemas/legacy/document.avsc
rename to nifi/user_schemas/legacy/document.avsc
diff --git a/nifi/user-schemas/legacy/document_all_fields.avsc b/nifi/user_schemas/legacy/document_all_fields.avsc
similarity index 100%
rename from nifi/user-schemas/legacy/document_all_fields.avsc
rename to nifi/user_schemas/legacy/document_all_fields.avsc
diff --git a/nifi/user-scripts/logs/parse_json/.gitkeep b/nifi/user_scripts/__init__.py
similarity index 100%
rename from nifi/user-scripts/logs/parse_json/.gitkeep
rename to nifi/user_scripts/__init__.py
diff --git a/nifi/user-scripts/tmp/.gitignore b/nifi/user_scripts/db/.gitignore
similarity index 100%
rename from nifi/user-scripts/tmp/.gitignore
rename to nifi/user_scripts/db/.gitignore
diff --git a/nifi/user_scripts/dto/__init__.py b/nifi/user_scripts/dto/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user_scripts/dto/database_config.py b/nifi/user_scripts/dto/database_config.py
new file mode 100644
index 000000000..0999ebb88
--- /dev/null
+++ b/nifi/user_scripts/dto/database_config.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+from typing import Any
+
+from pydantic import AliasChoices, Field, PositiveInt, SecretStr
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class DatabaseConfig(BaseSettings):
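+    """Database connection settings, resolved from deploy/database.env and
+    security/env/users_database.env (DATABASE_ prefix or the listed POSTGRES_* aliases)."""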
+ model_config = SettingsConfigDict(
+ env_prefix="DATABASE_",
+ env_file=[Path(__file__).resolve().parents[3] / "deploy" / "database.env",
+ Path(__file__).resolve().parents[3] / "security" / "env" / "users_database.env",
+ ],
+ extra="ignore",
+ env_ignore_empty=True,
+ populate_by_name=True
+ )
+
+ host: str = Field(default="localhost", validation_alias=AliasChoices("POSTGRES_HOST"))
+    port: int = Field(default=5432, validation_alias=AliasChoices("POSTGRES_PORT"), ge=1, le=65535)
+
+    database_name: str = Field(default="db_samples", validation_alias=AliasChoices("DB", "DB_NAME"))
+ username: str = Field(default="test", validation_alias=AliasChoices("POSTGRES_USER_SAMPLES", "POSTGRES_USER"))
+ password: SecretStr = Field(default_factory=lambda: SecretStr("test"),
+ validation_alias=AliasChoices("POSTGRES_PASSWORD_SAMPLES",
+ "password",
+ "POSTGRES_PASSWORD"))
+ timeout: PositiveInt = Field(default=60, validation_alias=AliasChoices("TIMEOUT"))
+
+ def get_field_values_kwargs(self) -> dict[str, Any]:
+ return self.model_dump()
diff --git a/nifi/user_scripts/dto/elastic_config.py b/nifi/user_scripts/dto/elastic_config.py
new file mode 100644
index 000000000..3ce08f0e2
--- /dev/null
+++ b/nifi/user_scripts/dto/elastic_config.py
@@ -0,0 +1,78 @@
+import json
+from pathlib import Path
+from typing import ClassVar
+
+from pydantic import AliasChoices, Field, SecretStr, field_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class ElasticConfig(BaseSettings):
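+    """Elasticsearch/OpenSearch and Kibana connection settings, resolved from
+    deploy/elasticsearch.env and security/env/users_elasticsearch.env."""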
+
+ ROOT_DIR: ClassVar = Path(__file__).resolve().parents[3]
+ CERT_ROOT_DIR: ClassVar = ROOT_DIR / "security" / "certificates" / "elastic"
+
+ model_config = SettingsConfigDict(
+ env_prefix="ELASTICSEARCH_",
+ env_file=[ROOT_DIR / "deploy" / "elasticsearch.env",
+ ROOT_DIR / "security" / "env" / "users_elasticsearch.env",
+ ],
+ extra="ignore",
+ env_ignore_empty=True,
+ populate_by_name=True
+ )
+
+ elasticsearch_version: str = Field(default="opensearch", validation_alias=AliasChoices("VERSION"))
+ kibana_version: str = Field(default="opensearch-dashboards", validation_alias=AliasChoices("KIBANA_VERSION"))
+
+ es_port_1: int = Field(default=9200,
+ validation_alias=AliasChoices("ELASTICSEARCH_NODE_1_OUTPUT_PORT"), ge=1, le=65535)
+ es_port_2: int = Field(default=9201,
+ validation_alias=AliasChoices("ELASTICSEARCH_NODE_2_OUTPUT_PORT"), ge=1, le=65535)
+ es_port_3: int = Field(default=9202,
+ validation_alias=AliasChoices("ELASTICSEARCH_NODE_3_OUTPUT_PORT"), ge=1, le=65535)
+
+ kibana_host: str = Field(default="https://localhost:5601",
+ validation_alias=AliasChoices("KIBANA_HOST", "kibana_host"))
+
+ kibana_port: int = Field(default=5601,
+ validation_alias=AliasChoices("KIBANA_SERVER_OUTPUT_PORT"), ge=1, le=65535)
+
+ hosts: list[str] = Field(default_factory=list)
+ timeout: int = Field(default=60)
+ verify_ssl: bool = Field(default=False, validation_alias=AliasChoices("SSL_ENABLED", "ELASTICSEARCH_SSL_ENABLED"))
+ user: str = Field(default="admin", validation_alias=AliasChoices("ELASTIC_USER"))
+ password: SecretStr = Field(default_factory=lambda: SecretStr("admin"),
+ validation_alias=AliasChoices("ELASTIC_PASSWORD",
+ "password",
+ "ELASTICSEARCH_PASSWORD",
+ "OPENSEARCH_INITIAL_ADMIN_PASSWORD"))
+
+ elastic_root_cert_ca_path: ClassVar = (CERT_ROOT_DIR / "opensearch" / "elastic-stack-ca.crt.pem").as_posix()
+ elastic_node_cert_key_path: ClassVar = (CERT_ROOT_DIR / "opensearch" /
+ "elasticsearch/elasticsearch-1/elasticsearch-1.key").as_posix()
+ elastic_node_cert_pem_path: ClassVar = (CERT_ROOT_DIR / "opensearch" /
+ "elasticsearch/elasticsearch-1/elasticsearch-1.crt").as_posix()
+
+ kibana_client_cert_key_path: ClassVar = (CERT_ROOT_DIR / "opensearch" / "es_kibana_client.key").as_posix()
+ kibana_client_cert_pem_path: ClassVar = (CERT_ROOT_DIR / "opensearch" / "es_kibana_client.pem").as_posix()
+
+ @field_validator("hosts", mode="before")
+ def parse_list(cls, v):
+ if isinstance(v, str):
+ return json.loads(v)
+ return v
+
+ @property
+ def ports(self) -> list[int]:
+ return [self.es_port_1, self.es_port_2, self.es_port_3]
+
+ def auth_credentials(self) -> tuple[str, str]:
+ """convenience for requests auth=(user, password)."""
+ return (self.user, self.password.get_secret_value())
+
+ def get_ssl_certs_paths(self) -> tuple[str, str]:
+ """convenience for requests cert=(cert_path, key_path)."""
+ return (self.elastic_node_cert_pem_path, self.elastic_node_cert_key_path)
+
+ def get_kibana_ssl_certs_path(self) -> tuple[str, str]:
+ return (self.kibana_client_cert_pem_path, self.kibana_client_cert_key_path)
\ No newline at end of file
diff --git a/nifi/user_scripts/dto/nifi_api_config.py b/nifi/user_scripts/dto/nifi_api_config.py
new file mode 100644
index 000000000..869444c87
--- /dev/null
+++ b/nifi/user_scripts/dto/nifi_api_config.py
@@ -0,0 +1,47 @@
+import os
+from pathlib import Path
+
+CERTS_ROOT = Path(__file__).resolve().parents[3] / "security" / "certificates"
+
+
+class NiFiAPIConfig:
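+    """Connection settings for the NiFi and NiFi Registry REST APIs; credentials come from
+    the NIFI_SINGLE_USER_CREDENTIALS_* env vars, client certs from security/certificates."""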
+
+ def __init__(self):
+ self.nifi_url_scheme = "https"
+ self.nifi_host = "localhost"
+ self.nifi_port = 8443
+ self.nifi_registry_port = 18443
+ self.nifi_username = os.environ.get("NIFI_SINGLE_USER_CREDENTIALS_USERNAME", "admin")
+ self.nifi_password = os.environ.get("NIFI_SINGLE_USER_CREDENTIALS_PASSWORD", "cogstackNiFi")
+ self.root_cert_ca_path = (CERTS_ROOT / "root" / "root-ca.pem").as_posix()
+ self.nifi_cert_pem_path = (CERTS_ROOT / "nifi" / "nifi.pem").as_posix()
+ self.nifi_cert_key_path = (CERTS_ROOT / "nifi" / "nifi.key").as_posix()
+ self.verify_ssl = True
+
+ @property
+ def nifi_base_url(self) -> str:
+ """Full NiFi base URL, e.g. https://localhost:8443"""
+ return f"{self.nifi_url_scheme}://{self.nifi_host}:{self.nifi_port}"
+
+ @property
+ def nifi_api_url(self) -> str:
+ """NiFi REST API root, e.g. https://localhost:8443/nifi-api"""
+ return f"{self.nifi_base_url}/nifi-api"
+
+ @property
+ def nifi_registry_base_url(self) -> str:
+ """NiFi Registry REST API root, e.g. https://localhost:18443/nifi-registry/"""
+ return f"{self.nifi_url_scheme}://{self.nifi_host}:{self.nifi_registry_port}/nifi-registry/"
+
+ @property
+ def nifi_registry_api_url(self) -> str:
+ """nifi registry rest api root, e.g. https://localhost:18443/nifi-registry/nifi-registry-api"""
+ return f"{self.nifi_url_scheme}://{self.nifi_host}:{self.nifi_registry_port}/nifi-registry-api/"
+
+ def auth_credentials(self) -> tuple[str, str]:
+ """convenience for requests auth=(user, password)."""
+ return (self.nifi_username, self.nifi_password)
+
+ def get_ssl_certs_paths(self) -> tuple[str, str]:
+ """convenience for requests cert=(cert_path, key_path)."""
+ return (self.nifi_cert_pem_path, self.nifi_cert_key_path)
diff --git a/nifi/user_scripts/dto/service_health.py b/nifi/user_scripts/dto/service_health.py
new file mode 100644
index 000000000..878467afc
--- /dev/null
+++ b/nifi/user_scripts/dto/service_health.py
@@ -0,0 +1,51 @@
+from datetime import datetime
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ServiceHealth(BaseModel):
+ """
+ Base health check model shared by all services.
+ """
+
+ service: str = Field(..., description="Service name, e.g. NiFi, PostgreSQL, OpenSearch/ElasticSearch, etc.")
+ status: Literal["healthy", "unhealthy", "degraded"] = Field(description="Current service status",
+ default="unhealthy")
+
+ message: str | None = Field(default=None, description="Optional status message")
+ timestamp: datetime | None = Field(default_factory=datetime.now)
+ avg_processing_ms: float | None = Field(default=None)
+ service_info: str | None = Field(default=None)
+ connected: bool = Field(default=False)
+ latency_ms: float = Field(default=0.0, description="Ping latency in milliseconds")
+
+    model_config = ConfigDict(extra="ignore")
+
+class MLServiceHealth(ServiceHealth):
+ model_name: str | None = Field(None, description="Name of the ML model")
+ model_version: str | None = Field(None, description="Version of the ML model")
+ model_card: str | None = Field(None, description="URL or path to the model card")
+
+class NiFiHealth(ServiceHealth):
+ active_threads: int | None = Field(default=None, description="Number of active threads")
+ queued_bytes: int | None = Field(default=None, description="Total queued bytes")
+ queued_count: int | None = Field(default=None, description="Number of queued flowfiles")
+
+class ElasticHealth(ServiceHealth):
+ cluster_status: str | None = Field(default=None, description="Cluster health status")
+ node_count: int | None = Field(default=None)
+ active_shards: int | None = Field(default=None)
+
+class DatabaseHealth(ServiceHealth):
+ version: str | None = Field(default=None, description="Database version, e.g PgSQL 17, MSSQL 21, etc.")
+ db_name: str | None = Field(default=None, description="Database name")
+
+class MedCATTrainerHealth(ServiceHealth):
+ """Health check model for MedCAT Trainer web service."""
+ app_version: str | None = Field(None, description="MedCAT Trainer app version")
+
+class CogstackCohortHealth(ServiceHealth):
+ """Health check model for CogStack Cohort service."""
+ pass
diff --git a/nifi/user_scripts/legacy_scripts/__init__.py b/nifi/user_scripts/legacy_scripts/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user-scripts/legacy_scripts/annotation_creator.py b/nifi/user_scripts/legacy_scripts/annotation_creator.py
similarity index 93%
rename from nifi/user-scripts/legacy_scripts/annotation_creator.py
rename to nifi/user_scripts/legacy_scripts/annotation_creator.py
index 6db6be63a..441fdba46 100644
--- a/nifi/user-scripts/legacy_scripts/annotation_creator.py
+++ b/nifi/user_scripts/legacy_scripts/annotation_creator.py
@@ -3,7 +3,11 @@
import sys
import traceback
-from utils.sqlite_query import check_db_exists, connect_and_query, create_db_from_file
+from nifi.user_scripts.utils.db.sqlite_query import (
+ check_db_exists,
+ connect_and_query,
+ create_db_from_file,
+)
ANNOTATION_DB_SQL_FILE_PATH = "/opt/cogstack-db/sqlite/schemas/annotations_nlp_create_schema.sql"
diff --git a/nifi/user-scripts/legacy_scripts/annotation_manager.py b/nifi/user_scripts/legacy_scripts/annotation_manager.py
similarity index 97%
rename from nifi/user-scripts/legacy_scripts/annotation_manager.py
rename to nifi/user_scripts/legacy_scripts/annotation_manager.py
index ffe124372..80f40582d 100644
--- a/nifi/user-scripts/legacy_scripts/annotation_manager.py
+++ b/nifi/user_scripts/legacy_scripts/annotation_manager.py
@@ -4,7 +4,12 @@
import sys
import traceback
-from utils.sqlite_query import check_db_exists, connect_and_query, create_connection, create_db_from_file
+from nifi.user_scripts.utils.db.sqlite_query import (
+ check_db_exists,
+ connect_and_query,
+ create_connection,
+ create_db_from_file,
+)
global DOCUMENT_ID_FIELD_NAME
global DOCUMENT_TEXT_FIELD_NAME
diff --git a/nifi/user-scripts/legacy_scripts/annotation_manager_docs.py b/nifi/user_scripts/legacy_scripts/annotation_manager_docs.py
similarity index 95%
rename from nifi/user-scripts/legacy_scripts/annotation_manager_docs.py
rename to nifi/user_scripts/legacy_scripts/annotation_manager_docs.py
index 844ca6acd..c9d9d3c96 100644
--- a/nifi/user-scripts/legacy_scripts/annotation_manager_docs.py
+++ b/nifi/user_scripts/legacy_scripts/annotation_manager_docs.py
@@ -3,7 +3,11 @@
import sys
import traceback
-from utils.sqlite_query import check_db_exists, connect_and_query, create_db_from_file
+from nifi.user_scripts.utils.db.sqlite_query import (
+ check_db_exists,
+ connect_and_query,
+ create_db_from_file,
+)
global DOCUMENT_ID_FIELD_NAME
global DOCUMENT_TEXT_FIELD_NAME
diff --git a/nifi/user-scripts/legacy_scripts/anonymise_doc.py b/nifi/user_scripts/legacy_scripts/anonymise_doc.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/anonymise_doc.py
rename to nifi/user_scripts/legacy_scripts/anonymise_doc.py
diff --git a/nifi/user-scripts/legacy_scripts/flowfile_to_attribute_with_content.py b/nifi/user_scripts/legacy_scripts/flowfile_to_attribute_with_content.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/flowfile_to_attribute_with_content.py
rename to nifi/user_scripts/legacy_scripts/flowfile_to_attribute_with_content.py
diff --git a/nifi/user-scripts/legacy_scripts/ingest_into_es.py b/nifi/user_scripts/legacy_scripts/ingest_into_es.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/ingest_into_es.py
rename to nifi/user_scripts/legacy_scripts/ingest_into_es.py
diff --git a/nifi/user-scripts/legacy_scripts/parse-anns-from-nlp-response-bulk.py b/nifi/user_scripts/legacy_scripts/parse-anns-from-nlp-response-bulk.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/parse-anns-from-nlp-response-bulk.py
rename to nifi/user_scripts/legacy_scripts/parse-anns-from-nlp-response-bulk.py
diff --git a/nifi/user-scripts/legacy_scripts/parse-es-db-result-for-nlp-request-bulk.py b/nifi/user_scripts/legacy_scripts/parse-es-db-result-for-nlp-request-bulk.py
similarity index 98%
rename from nifi/user-scripts/legacy_scripts/parse-es-db-result-for-nlp-request-bulk.py
rename to nifi/user_scripts/legacy_scripts/parse-es-db-result-for-nlp-request-bulk.py
index 0e42174e1..731c6edb1 100644
--- a/nifi/user-scripts/legacy_scripts/parse-es-db-result-for-nlp-request-bulk.py
+++ b/nifi/user_scripts/legacy_scripts/parse-es-db-result-for-nlp-request-bulk.py
@@ -98,7 +98,7 @@ def process(self, inputStream, outputStream):
session.transfer(flowFile, REL_FAILURE)
finally:
if LOG_INVALID_RECORDS:
- log_file_path = os.path.join(str(os.environ.get("NIFI_USER_SCRIPT_LOGS_DIR", "/opt/nifi/user-scripts/logs/")), str(LOG_FILE_NAME))
+ log_file_path = os.path.join(str(os.environ.get("NIFI_USER_SCRIPT_LOGS_DIR", "/opt/nifi/user_scripts/logs/")), str(LOG_FILE_NAME))
_out_list = ','.join(str(x) for x in invalid_record_ids)
if os.path.exists(log_file_path) and len(invalid_record_ids) > 0:
with open(log_file_path, "a+") as log_file:
diff --git a/nifi/user-scripts/legacy_scripts/parse-json-to-avro.py b/nifi/user_scripts/legacy_scripts/parse-json-to-avro.py
similarity index 91%
rename from nifi/user-scripts/legacy_scripts/parse-json-to-avro.py
rename to nifi/user_scripts/legacy_scripts/parse-json-to-avro.py
index 9e434b650..285a762a1 100644
--- a/nifi/user-scripts/legacy_scripts/parse-json-to-avro.py
+++ b/nifi/user_scripts/legacy_scripts/parse-json-to-avro.py
@@ -8,7 +8,7 @@
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
-log_file_path = "/opt/nifi/user-scripts/logs/parse_json/parse-json-to-avro_file_"
+log_file_path = "/opt/nifi/user_scripts/logs/parse_json/parse-json-to-avro_file_"
time = str(time.now().timestamp())
@@ -39,7 +39,7 @@
file_id = str(uuid.uuid4().hex)
-tmp_file_path = os.path.join("/opt/nifi/user-scripts/tmp/" + file_id + ".avro")
+tmp_file_path = os.path.join("/opt/nifi/user_scripts/tmp/" + file_id + ".avro")
with open(tmp_file_path, mode="wb+") as tmp_file:
writer = DataFileWriter(tmp_file, DatumWriter(), avro_schema)
diff --git a/nifi/user-scripts/legacy_scripts/parse-tika-result-json-to-avro.py b/nifi/user_scripts/legacy_scripts/parse-tika-result-json-to-avro.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/parse-tika-result-json-to-avro.py
rename to nifi/user_scripts/legacy_scripts/parse-tika-result-json-to-avro.py
diff --git a/nifi/user-scripts/legacy_scripts/prepare-db-record-for-tika-request-single.py b/nifi/user_scripts/legacy_scripts/prepare-db-record-for-tika-request-single.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/prepare-db-record-for-tika-request-single.py
rename to nifi/user_scripts/legacy_scripts/prepare-db-record-for-tika-request-single.py
diff --git a/nifi/user-scripts/legacy_scripts/prepare-file-for-tika-request-single-keep-db-fields.py b/nifi/user_scripts/legacy_scripts/prepare-file-for-tika-request-single-keep-db-fields.py
similarity index 100%
rename from nifi/user-scripts/legacy_scripts/prepare-file-for-tika-request-single-keep-db-fields.py
rename to nifi/user_scripts/legacy_scripts/prepare-file-for-tika-request-single-keep-db-fields.py
diff --git a/nifi/user-scripts/clean_doc.py b/nifi/user_scripts/processors/clean_doc.py
similarity index 96%
rename from nifi/user-scripts/clean_doc.py
rename to nifi/user_scripts/processors/clean_doc.py
index ff6baf011..7b4ac6b65 100644
--- a/nifi/user-scripts/clean_doc.py
+++ b/nifi/user_scripts/processors/clean_doc.py
@@ -29,7 +29,7 @@
# removes any PII from the text field
for i in range(len(records)):
- if TEXT_FIELD_NAME in records[i].keys():
+ if TEXT_FIELD_NAME in records[i]:
clean_text = records[i][TEXT_FIELD_NAME]
for pattern, repl in PII_PATTERNS:
clean_text = re.sub(pattern, repl, clean_text, flags=re.IGNORECASE)
diff --git a/nifi/user-scripts/cogstack_cohort_generate_data.py b/nifi/user_scripts/processors/cogstack_cohort_generate_data.py
similarity index 99%
rename from nifi/user-scripts/cogstack_cohort_generate_data.py
rename to nifi/user_scripts/processors/cogstack_cohort_generate_data.py
index 0fc8ad2bf..c2baae320 100644
--- a/nifi/user-scripts/cogstack_cohort_generate_data.py
+++ b/nifi/user_scripts/processors/cogstack_cohort_generate_data.py
@@ -8,8 +8,8 @@
from datetime import datetime, timezone
from multiprocessing import Pool, Queue
-from utils.ethnicity_map import ethnicity_map
-from utils.generic import chunk, dict2json_file, dict2jsonl_file
+from nifi.user_scripts.utils.data.ethnicity_map import ethnicity_map
+from nifi.user_scripts.utils.generic import chunk, dict2json_file, dict2jsonl_file
# default values from /deploy/nifi.env
NIFI_USER_SCRIPT_LOGS_DIR = os.getenv("NIFI_USER_SCRIPT_LOGS_DIR", "")
@@ -28,7 +28,7 @@
PATIENT_DEATH_DATE_FIELD_NAME = "deathdate"
PATIENT_DEATH_DATE_BACKUP_FIELD_NAME = ""
-OUTPUT_FOLDER_PATH = os.path.join(os.getenv("NIFI_DATA_PATH", "/opt/data/"), "cogstack-cohort")
+OUTPUT_FOLDER_PATH = os.path.join(os.getenv("NIFI_DATA_PATH", "/data/"), "cogstack-cohort")
# this is a json exported by NiFi to some path in the NIFI_DATA_PATH
INPUT_PATIENT_RECORDS_PATH = ""
diff --git a/nifi/user-scripts/cogstack_cohort_generate_random_data.py b/nifi/user_scripts/processors/cogstack_cohort_generate_random_data.py
similarity index 98%
rename from nifi/user-scripts/cogstack_cohort_generate_random_data.py
rename to nifi/user_scripts/processors/cogstack_cohort_generate_random_data.py
index cb5fcbe48..b21c5d4d8 100644
--- a/nifi/user-scripts/cogstack_cohort_generate_random_data.py
+++ b/nifi/user_scripts/processors/cogstack_cohort_generate_random_data.py
@@ -5,7 +5,7 @@
from collections import Counter, defaultdict
from datetime import datetime
-from utils.ethnicity_map import ethnicity_map
+from nifi.user_scripts.utils.data.ethnicity_map import ethnicity_map
ANNOTATION_DOCUMENT_ID_FIELD_NAME = "meta.docid"
@@ -15,7 +15,7 @@
PATIENT_BIRTH_DATE_FIELD_NAME = "birthdate"
PATIENT_DEATH_DATE_FIELD_NAME = "deathdate"
-OUTPUT_FOLDER_PATH = os.path.join(os.getenv("NIFI_DATA_PATH", "/opt/data/"), "cogstack-cohort")
+OUTPUT_FOLDER_PATH = os.path.join(os.getenv("NIFI_DATA_PATH", "/data/"), "cogstack-cohort")
# this is a json exported by NiFi to some path in the NIFI_DATA_PATH
INPUT_PATIENT_RECORDS_PATH = ""
diff --git a/nifi/user_scripts/processors/convert_record_parquet_to_json.py b/nifi/user_scripts/processors/convert_record_parquet_to_json.py
new file mode 100644
index 000000000..cfa173f42
--- /dev/null
+++ b/nifi/user_scripts/processors/convert_record_parquet_to_json.py
@@ -0,0 +1,39 @@
+import io
+import json
+import sys
+import traceback
+from logging import Logger
+
+from pyarrow import parquet
+
+from nifi.user_scripts.utils.generic import get_logger
+from nifi.user_scripts.utils.serialization.parquet_json_data_types_converter import (
+ parquet_json_data_type_convert,
+)
+
+logger: Logger = get_logger(__name__)
+input_byte_buffer: io.BytesIO = io.BytesIO(sys.stdin.buffer.read())
+
+output_buffer = sys.stdout.buffer
+
+try:
+ parquet_file = parquet.ParquetFile(input_byte_buffer)
+
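+    # stream in 10k-row batches so large Parquet files are never fully materialised;
+    # each record becomes one NDJSON line on stdout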
+ for batch in parquet_file.iter_batches(batch_size=10000):
+ for record in batch.to_pylist():
+ output_buffer.write(json.dumps(
+ record,
+ ensure_ascii=False,
+ separators=(",", ":"),
+ default=parquet_json_data_type_convert
+ ).encode("utf-8"))
+ output_buffer.write(b"\n")
+
+
+except Exception as exception:
+ logger.error("Exception during Parquet file processing: " + traceback.format_exc())
+ raise exception
+
+finally:
+ input_byte_buffer.close()
+ output_buffer.close()
diff --git a/nifi/user_scripts/processors/elastic_schema_converter.py b/nifi/user_scripts/processors/elastic_schema_converter.py
new file mode 100644
index 000000000..ea880673d
--- /dev/null
+++ b/nifi/user_scripts/processors/elastic_schema_converter.py
@@ -0,0 +1,64 @@
+import json
+import sys
+import traceback
+from collections import defaultdict
+from logging import Logger
+
+from nifi.user_scripts.utils.generic import get_logger
+
+logger: Logger = get_logger(__name__)
+
+origin_index_mapping = json.loads(sys.stdin.read())
+
+INPUT_INDEX_NAME = ""
+OUTPUT_INDEX_NAME = ""
+OUTPUT_FILE_NAME = ""
+JSON_FIELD_MAPPER_SCHEMA_FILE_PATH = ""
+TRANSFORM_KEYS_LOWER_CASE = False
+
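+# script properties arrive as key=value command-line arguments, as with the other user scripts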
+for arg in sys.argv:
+ _arg = arg.split("=", 1)
+ _arg[0] = _arg[0].lower()
+ if _arg[0] == "input_index_name":
+ INPUT_INDEX_NAME = _arg[1]
+ if _arg[0] == "output_index_name":
+ OUTPUT_INDEX_NAME = _arg[1]
+ if _arg[0] == "output_file_name":
+ OUTPUT_FILE_NAME = _arg[1]
+ if _arg[0] == "json_field_mapper_schema_file_path":
+ JSON_FIELD_MAPPER_SCHEMA_FILE_PATH = _arg[1]
+
+try:
+ json_field_mapper: dict = {}
+ with open(JSON_FIELD_MAPPER_SCHEMA_FILE_PATH) as file:
+ json_field_mapper = json.load(file)
+
+    output_index_mapping: dict = defaultdict(dict)
+
+    # the get-mapping API returns {index_name: {"mappings": ...}}; unwrap it,
+    # preferring an explicitly supplied input index name over the first key
+    if "mappings" not in origin_index_mapping:
+        origin_index_name = INPUT_INDEX_NAME if INPUT_INDEX_NAME else list(origin_index_mapping.keys())[0]
+        origin_index_mapping = origin_index_mapping[origin_index_name]
+
+    for origin_field_name, origin_field_es_properties in origin_index_mapping["mappings"]["properties"].items():
+        curr_field_name = str(origin_field_name).lower() if TRANSFORM_KEYS_LOWER_CASE else origin_field_name
+        if curr_field_name not in json_field_mapper:
+            continue
+        mapped_field = json_field_mapper[curr_field_name]
+        # a string mapping renames a flat field; a dict mapping renames nested sub-fields
+        if isinstance(mapped_field, str):
+            output_index_mapping[mapped_field] = origin_field_es_properties
+        elif isinstance(mapped_field, dict):
+            for nested_field_name, nested_field_value in origin_field_es_properties.get("properties", {}).items():
+                if nested_field_name in mapped_field:
+                    output_index_mapping[curr_field_name][mapped_field[nested_field_name]] = nested_field_value
+
+
+except Exception as exception:
+ logger.error("Exception during flowfile processing: " + traceback.format_exc())
+ raise exception
+
+# Output cleaned JSON as UTF-8
+sys.stdout.buffer.write(json.dumps(output_index_mapping, ensure_ascii=False).encode("utf-8"))
diff --git a/nifi/user-scripts/generate_location.py b/nifi/user_scripts/processors/generate_location.py
similarity index 85%
rename from nifi/user-scripts/generate_location.py
rename to nifi/user_scripts/processors/generate_location.py
index 5d88f9af7..4eb9426f7 100644
--- a/nifi/user-scripts/generate_location.py
+++ b/nifi/user_scripts/processors/generate_location.py
@@ -37,21 +37,21 @@ def poly_creator(city: str):
def main():
input_stream = sys.stdin.read()
+ log_file_path = os.path.join(NIFI_USER_SCRIPT_LOGS_DIR, str(LOG_FILE_NAME))
+ output_stream = []
try:
- log_file_path = os.path.join(NIFI_USER_SCRIPT_LOGS_DIR, str(LOG_FILE_NAME))
patients = json.loads(input_stream)
locations = [poly_creator(location) for location in LOCATIONS.split(",")]
-
- output_stream = []
for patient in patients:
to_append = {}
id = patient["_source"][SUBJECT_ID_FIELD_NAME]
- idx = randrange(len(locations)) # pick a random location specified
- lat, lon, _ = rc.coordinates_randomizer(polygon = locations[idx], num_locations = 1) # generate latitude and longitude
-
+ # pick a random location specified
+ idx = randrange(len(locations))
+ # generate latitude and longitude
+ lat, lon, _ = rc.coordinates_randomizer(polygon = locations[idx], num_locations = 1)
to_append[SUBJECT_ID_FIELD_NAME] = id
to_append[LOCATION_NAME_FIELD] = "POINT (" + str(lon[0]) + " " + str(lat[0]) + ")"
output_stream.append(to_append)
@@ -62,8 +62,8 @@ def main():
else:
with open(log_file_path, "a+") as log_file:
log_file.write("\n" + str(traceback.print_exc()))
- finally:
- return output_stream
+ return output_stream
sys.stdout.write(json.dumps(main()))
+
diff --git a/nifi/user-scripts/get_files_from_storage.py b/nifi/user_scripts/processors/get_files_from_storage.py
similarity index 97%
rename from nifi/user-scripts/get_files_from_storage.py
rename to nifi/user_scripts/processors/get_files_from_storage.py
index f1aefbbb2..5e4bdde7f 100644
--- a/nifi/user-scripts/get_files_from_storage.py
+++ b/nifi/user_scripts/processors/get_files_from_storage.py
@@ -15,7 +15,7 @@
# we are looking at anything after the 1st arg (which is the script name)
# example args:
# [
-# '/opt/nifi/user-scripts/get_files_from_storage.py', 'root_project_data_dir=/opt/data/',
+# '/opt/nifi/user_scripts/get_files_from_storage.py', 'root_project_data_dir=/opt/data/',
# 'folder_pattern=.*\\d{4}\\/\\d{2}\\/\\d{2}', 'folder_to_ingest=2022',
# 'file_id_csv_column_name_match=file_name_id_no_ext'
# ]
@@ -57,7 +57,7 @@
# This is the DATA directory inside the postgres database Docker image, or it could be a folder on the local system
processed_folder_dump = "processed_" + folder_to_ingest
-processed_folder_dump_path = os.path.join(str(os.environ.get("NIFI_USER_SCRIPT_LOGS_DIR", "/opt/nifi/user-scripts/logs/")), processed_folder_dump)
+processed_folder_dump_path = os.path.join(str(os.environ.get("NIFI_USER_SCRIPT_LOGS_DIR", "/opt/nifi/user_scripts/logs/")), processed_folder_dump)
processed_folder_dump_path = processed_folder_dump_path.replace("\"", "").replace("\'", "")
# log file name
@@ -170,7 +170,8 @@ def get_files_and_metadata():
if generate_pseudo_doc_id is not False:
_file_id_dict["document_Pseudo_Id"] = str(uuid.uuid4().hex)
- txt_file_df = pandas.concat([txt_file_df, pandas.DataFrame.from_dict([_file_id_dict], orient="columns")])
+ txt_file_df = pandas.concat([txt_file_df,
+ pandas.DataFrame.from_dict([_file_id_dict], orient="columns")])
folders_ingested[root].append(file_id)
else:
diff --git a/nifi/user-scripts/tests/generate_big_ann_file.py b/nifi/user_scripts/tests/generate_big_ann_file.py
similarity index 100%
rename from nifi/user-scripts/tests/generate_big_ann_file.py
rename to nifi/user_scripts/tests/generate_big_ann_file.py
diff --git a/nifi/user-scripts/tests/generate_files.py b/nifi/user_scripts/tests/generate_files.py
similarity index 100%
rename from nifi/user-scripts/tests/generate_files.py
rename to nifi/user_scripts/tests/generate_files.py
diff --git a/nifi/user-scripts/tests/get_ingested_files.py b/nifi/user_scripts/tests/get_ingested_files.py
similarity index 100%
rename from nifi/user-scripts/tests/get_ingested_files.py
rename to nifi/user_scripts/tests/get_ingested_files.py
diff --git a/nifi/user_scripts/tests/nifi/test_nifi.py b/nifi/user_scripts/tests/nifi/test_nifi.py
new file mode 100644
index 000000000..12488430e
--- /dev/null
+++ b/nifi/user_scripts/tests/nifi/test_nifi.py
@@ -0,0 +1,146 @@
+import json
+import os
+import tempfile
+import unittest
+from pathlib import Path
+
+import requests
+from nifi.user_scripts.dto.nifi_api_config import NiFiAPIConfig
+from nifi.user_scripts.dto.database_config import DatabaseConfig
+from nifi.user_scripts.dto.service_health import NiFiHealth, DatabaseHealth
+from nipyapi import config as nipy_config
+from nipyapi import security, versioning
+from nifi.user_scripts.utils.nifi.nifi_api_client import NiFiClient, NiFiRegistryClient
+from nifi.user_scripts.utils.health.service import check_postgres
+
+
+class TestServices(unittest.TestCase):
+ """Service connectivity and health checks."""
+
+ @classmethod
+    def setUpClass(cls):
+        # no active fixtures yet; the setup below is kept commented out for reference,
+        # and a statement is required to keep the method body valid
+        pass
+
+ # cls.pg_cfg = PGConfig()
+ # cls.nifi_api_config = NiFiAPIConfig()
+ # cls.nifi_client = NiFiClient(config=cls.nifi_api_config, login_on_init=False)
+ # cls.nifi_registry_client = NiFiRegistryClient(config=cls.nifi_api_config)
+ # cls.pg_config = PGConfig()
+ # cls.registry_bucket_name = os.environ.get("NIFI_REGISTRY_BUCKET", "cogstack")
+ # cls.flow_name = "opensearch_ingest_docs_db_to_es"
+ # cls.template_path = (
+ # Path(__file__).resolve().parents[4]
+ # / "nifi"
+ # / "user-templates"
+ # / f"{cls.flow_name}.json"
+ # )
+ # cls.es_hosts = os.environ.get("OPENSEARCH_URLS", "http://localhost:9200")
+ # cls.es_username = os.environ.get("OPENSEARCH_USERNAME", "admin")
+ # cls.es_password = os.environ.get("OPENSEARCH_PASSWORD", "admin")
+
+ #@classmethod
+ #def _configure_nipyapi(cls) -> None:
+ # """Apply SSL + host config so nipyapi uses the same creds as the raw client."""
+ # nipy_config.nifi_config.host = cls.nifi_api_config.nifi_api_url
+ # nipy_config.registry_config.host = cls.nifi_api_config.nifi_registry_api_url
+ #
+ # for cfg in (nipy_config.nifi_config, nipy_config.registry_config):
+ # cfg.verify_ssl = cls.nifi_api_config.VERIFY_SSL
+ # cfg.cert_file = cls.nifi_api_config.NIFI_CERT_PEM_PATH
+ # cfg.key_file = cls.nifi_api_config.NIFI_CERT_KEY_PATH
+ # cfg.ssl_ca_cert = cls.nifi_api_config.ROOT_CERT_CA_PATH
+ #
+ #def _prepare_snapshot_with_env_defaults(self) -> Path:
+ # """
+ # Load the opensearch template and pre-fill controller service properties
+ # using env/default configs so the flow can start without manual clicks.
+ # """
+ # with self.template_path.open() as fp:
+ # snapshot = json.load(fp)
+ #
+ # db_url = f"jdbc:postgresql://{self.pg_cfg.host}:{self.pg_cfg.port}/{self.pg_cfg.db}"
+ #
+ # for controller_service in snapshot.get("flowContents", {}).get("controllerServices", []):
+ # name = controller_service.get("name")
+ # properties = controller_service.setdefault("properties", {})
+ #
+ # if name == "DBCPConnectionPool":
+ # properties["Database Connection URL"] = db_url
+ # properties["Database User"] = self.pg_cfg.user
+ # properties["Password"] = self.pg_cfg.password
+ # elif name == "ElasticSearchClientServiceImpl":
+ # properties["el-cs-http-hosts"] = self.es_hosts
+ # properties["el-cs-username"] = self.es_username
+ # properties["el-cs-password"] = self.es_password
+ #
+ # fd, tmp_path = tempfile.mkstemp(suffix=".json", prefix="nifi-template-")
+ # with os.fdopen(fd, "w") as tmp_file:
+ # json.dump(snapshot, tmp_file)
+ #
+ # return Path(tmp_path)
+ #
+ #def test_nifi_health(self) -> None:
+ # result = self.nifi_client._login()
+ # self.assertTrue(result)
+ #
+ #def test_nifi_registry_health(self) -> None:
+ # result = requests.head(
+ # url=self.nifi_api_config.nifi_registry_base_url,
+ # auth=self.nifi_api_config.auth_credentials(),
+ # cert=self.nifi_api_config.get_nifi_ssl_certs_paths(),
+ # verify=self.nifi_api_config.ROOT_CERT_CA_PATH,
+ # timeout=15,
+ # )
+ # self.assertEqual(result.status_code, 200)
+ #
+ #def test_postgres_health(self):
+ # result, latency, err = check_postgres(self.pg_config)
+ # self.assertTrue(result, f"PostgreSQL unhealthy: {err}")
+ # print(f"✅ PostgreSQL OK, latency {latency:.2f} ms")
+ #
+ #def test_import_opensearch_template_and_configure_controller_services(self) -> None:
+ # """
+ # Bring the opensearch template into the local NiFi Registry bucket and
+ # patch the controller services so they use local PG/ES credentials.
+ # """
+ # self.assertTrue(self.nifi_client._login())
+ # self._configure_nipyapi()
+ #
+ # security.service_login(
+ # service="registry",
+ # username=self.nifi_api_config.NIFI_USERNAME,
+ # password=self.nifi_api_config.NIFI_PASSWORD,
+ # )
+ #
+ # bucket = versioning.get_bucket(self.registry_bucket_name)
+ # if bucket is None:
+ # bucket = versioning.create_bucket(
+ # bucket_name=self.registry_bucket_name,
+ # bucket_desc="Auto-created for test imports",
+ # )
+ #
+ # flow = versioning.get_flow_in_bucket(
+ # bucket_id=bucket.identifier,
+ # identifier=self.flow_name,
+ # identifier_type="name",
+ # )
+ # if flow is None:
+ # flow = versioning.create_flow(
+ # bucket_id=bucket.identifier,
+ # flow_name=self.flow_name,
+ # desc="Auto-imported from user-templates",
+ # )
+ #
+ # snapshot_path = self._prepare_snapshot_with_env_defaults()
+ #
+ # try:
+ # snapshot = versioning.import_flow_version(
+ # bucket_id=bucket.identifier,
+ # flow_id=flow.identifier,
+ # file_path=str(snapshot_path),
+ # )
+ # finally:
+ # snapshot_path.unlink(missing_ok=True)
+ #
+ # self.assertIsNotNone(snapshot)
+ #
diff --git a/nifi/user_scripts/tests/nifi/test_opensearch_ingest.py b/nifi/user_scripts/tests/nifi/test_opensearch_ingest.py
new file mode 100644
index 000000000..a26238800
--- /dev/null
+++ b/nifi/user_scripts/tests/nifi/test_opensearch_ingest.py
@@ -0,0 +1,25 @@
+import unittest
+from io import BytesIO
+
+
+class DummyFlowFile:
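+    """Minimal in-memory stand-in for a NiFi flowfile, backed by BytesIO."""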
+ def __init__(self, content: str):
+ self._data = BytesIO(content.encode())
+
+ def read(self):
+ return self._data.getvalue()
+
+ def write(self, data):
+ self._data = BytesIO(data)
+ return self
+
+class UppercaseProcessor:
+    """Toy processor used only by this test: uppercases the flowfile content."""
+
+    def transform(self, context, flowfile):
+        return flowfile.write(flowfile.read().upper())
+
+class TestMyProcessor(unittest.TestCase):
+    def test_uppercase(self):
+        proc = UppercaseProcessor()
+ ff_in = DummyFlowFile("hello nifi")
+ ff_out = proc.transform({}, ff_in)
+
+ self.assertEqual(ff_out.read().decode(), "HELLO NIFI")
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/nifi/user_scripts/tests/nifi/test_service_health.py b/nifi/user_scripts/tests/nifi/test_service_health.py
new file mode 100644
index 000000000..8799270e4
--- /dev/null
+++ b/nifi/user_scripts/tests/nifi/test_service_health.py
@@ -0,0 +1,54 @@
+import unittest
+
+from pydantic import SecretStr
+
+from nifi.user_scripts.dto.database_config import DatabaseConfig
+from nifi.user_scripts.dto.elastic_config import ElasticConfig
+from nifi.user_scripts.dto.nifi_api_config import NiFiAPIConfig
+from nifi.user_scripts.dto.service_health import DatabaseHealth, ElasticHealth, NiFiHealth
+from nifi.user_scripts.utils.generic import get_logger
+from nifi.user_scripts.utils.nifi.nifi_api_client import NiFiClient, NiFiRegistryClient
+from nifi.user_scripts.utils.health.service import check_elasticsearch, check_kibana, check_postgres
+
+
+class TestServices(unittest.TestCase):
+ """Service connectivity and health checks."""
+
+ logger = get_logger(__name__)
+
+ @classmethod
+ def setUpClass(cls):
+ cls.nifi_api_config: NiFiAPIConfig = NiFiAPIConfig()
+        cls.nifi_client: NiFiClient = NiFiClient(config=cls.nifi_api_config, health_check_on_init=False)
+ cls.nifi_registry_client: NiFiRegistryClient = NiFiRegistryClient(config=cls.nifi_api_config)
+ cls.pg_config: DatabaseConfig = DatabaseConfig(port=5554)
+ cls.elastic_config: ElasticConfig = ElasticConfig(user="admin",
+ hosts=["https://localhost:9200"],
+ password=SecretStr("admin"),
+ kibana_host="https://localhost:5601",
+ kibana_version="opensearch-dashboards")
+
+ def test_nifi_health(self) -> None:
+ health: NiFiHealth = self.nifi_client.health_check()
+ self.assertTrue(health.connected)
+ self.assertEqual(health.status, "healthy")
+
+ def test_nifi_registry_health(self) -> None:
+ nifi_health: NiFiHealth = self.nifi_registry_client.health_check()
+ self.assertTrue(nifi_health.connected)
+ self.assertEqual(nifi_health.status, "healthy")
+
+ def test_postgres_health(self):
+ database_health: DatabaseHealth = check_postgres(self.pg_config)
+ self.assertTrue(database_health.connected)
+ self.assertEqual(database_health.status, "healthy")
+
+ def test_elastic_health(self):
+ elastic_health: ElasticHealth = check_elasticsearch(self.elastic_config)
+ self.assertTrue(elastic_health.connected)
+ self.assertEqual(elastic_health.status, "healthy")
+
+ def test_kibana_health(self):
+ elastic_health: ElasticHealth = check_kibana(self.elastic_config)
+ self.assertTrue(elastic_health.connected)
+ self.assertEqual(elastic_health.status, "healthy")
\ No newline at end of file
diff --git a/nifi/user-scripts/tests/test_files/ex1.pdf b/nifi/user_scripts/tests/resources/ex1.pdf
old mode 100755
new mode 100644
similarity index 100%
rename from nifi/user-scripts/tests/test_files/ex1.pdf
rename to nifi/user_scripts/tests/resources/ex1.pdf
diff --git a/nifi/user_scripts/tests/test_avro.py b/nifi/user_scripts/tests/test_avro.py
new file mode 100644
index 000000000..7f893747e
--- /dev/null
+++ b/nifi/user_scripts/tests/test_avro.py
@@ -0,0 +1,61 @@
+import io
+import json
+
+import avro.schema
+from avro.datafile import DataFileWriter
+from avro.io import DatumWriter
+
+"""
+ Use this script to test avro schemas etc with python3
+"""
+
+with open("../user_schemas/legacy/cogstack_common_schema_mapping.json") as mapping_file:
+    json_mapper_schema = json.load(mapping_file)
+
+with open("../user_schemas/legacy/cogstack_common_schema_full.avsc", "rb") as schema_file:
+    avro_cogstack_schema = avro.schema.parse(schema_file.read(), validate_enum_symbols=False)
+
+test_records = [{ "docid" : "1",
+ "sampleid" : 1041,
+ "dct" : "2020-05-11 10:52:25.273518",
+ "binarydoc": "blablabla" },
+ { "docid" : "1",
+ "sampleid" : 1041,
+ "dct" : "2020-05-11 10:52:25.273518",
+ "binarydoc": "blablabla" }]
+
+schema_fields = avro_cogstack_schema.props["fields"]
+dict_fields_types = {}
+for field in schema_fields:
+ dict_fields_types[field.name] = ""
+ tmp_list = json.loads(str(field.type))
+ if len(tmp_list) > 1 and type(tmp_list) is not str:
+ if type(tmp_list[1]) is dict:
+ dict_fields_types[field.name] = tmp_list[1]["type"]
+ else:
+ dict_fields_types[field.name] = tmp_list[1]
+ else:
+ dict_fields_types[field.name] = field.type
+
+available_mapping_keys = {}
+for k, v in json_mapper_schema.items():
+ if v:
+ available_mapping_keys[k] = v
+
+bytes_io = io.BytesIO(bytes("", encoding="UTF-8"))
+
+type_mapping = {"boolean": "bool", "long": "int", "int": "int", "float" : "float", "byte":"bytes", "string": "str", "double": "float"}
+
+
+print(avro_cogstack_schema)
+
+with DataFileWriter(bytes_io, DatumWriter(), avro_cogstack_schema) as writer:
+ # re-map the value to the new keys
+
+ for _record in test_records:
+ record = {}
+
+ for k, v in available_mapping_keys.items():
+ if v in _record.keys():
+ record[k] = _record[v] #getattr(__builtins__, type_mapping[dict_fields_types[k]])(_record[v])
+
+ writer.append(record)
diff --git a/nifi/user_scripts/tmp/.gitignore b/nifi/user_scripts/tmp/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user_scripts/utils/__init__.py b/nifi/user_scripts/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user_scripts/utils/bootstrap_external_lib_imports.py b/nifi/user_scripts/utils/bootstrap_external_lib_imports.py
new file mode 100644
index 000000000..385c91f5a
--- /dev/null
+++ b/nifi/user_scripts/utils/bootstrap_external_lib_imports.py
@@ -0,0 +1,25 @@
+import os
+import sys
+from pathlib import Path
+
+
+def running_in_docker() -> bool:
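+    """Best-effort container detection: /.dockerenv exists, or PID 1's cgroup mentions docker/containerd."""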
+ if os.path.exists("/.dockerenv"):
+ return True
+ try:
+ with open("/proc/1/cgroup", "rt") as f:
+ return any("docker" in line or "containerd" in line for line in f)
+ except FileNotFoundError:
+ return False
+
+
+# Ensure the repo root (parent of the nifi package) is on the import path.
+if running_in_docker():
+ framework_dir = os.getenv(
+ "NIFI_PYTHON_FRAMEWORK_SOURCE_DIRECTORY",
+ "/opt/nifi/nifi-current/python/framework",
+ )
+ sys.path.insert(0, framework_dir)
+else:
+ repo_root = Path(__file__).resolve().parents[3]
+ sys.path.insert(0, str(repo_root))
diff --git a/nifi/user_scripts/utils/codecs/__init__.py b/nifi/user_scripts/utils/codecs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user-scripts/utils/cerner_blob.py b/nifi/user_scripts/utils/codecs/cerner_blob.py
similarity index 96%
rename from nifi/user-scripts/utils/cerner_blob.py
rename to nifi/user_scripts/utils/codecs/cerner_blob.py
index 4b59444bb..cfd299bf1 100644
--- a/nifi/user-scripts/utils/cerner_blob.py
+++ b/nifi/user_scripts/utils/codecs/cerner_blob.py
@@ -1,5 +1,3 @@
-from typing import List
-
class LzwItem:
def __init__(self, _prefix: int = 0, _suffix: int = 0) -> None:
@@ -10,8 +8,8 @@ def __init__(self, _prefix: int = 0, _suffix: int = 0) -> None:
class DecompressLzwCernerBlob:
def __init__(self) -> None:
self.MAX_CODES: int = 8192
- self.tmp_decompression_buffer: List[int] = [0] * self.MAX_CODES
- self.lzw_lookup_table: List[LzwItem] = [LzwItem()] * self.MAX_CODES
+ self.tmp_decompression_buffer: list[int] = [0] * self.MAX_CODES
+ self.lzw_lookup_table: list[LzwItem] = [LzwItem()] * self.MAX_CODES
self.tmp_buffer_index: int = 0
self.current_byte_buffer_index: int = 0
diff --git a/nifi/user_scripts/utils/data/__init__.py b/nifi/user_scripts/utils/data/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user-scripts/utils/ethnicity_map.py b/nifi/user_scripts/utils/data/ethnicity_map.py
similarity index 100%
rename from nifi/user-scripts/utils/ethnicity_map.py
rename to nifi/user_scripts/utils/data/ethnicity_map.py
diff --git a/nifi/user_scripts/utils/db/__init__.py b/nifi/user_scripts/utils/db/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user-scripts/utils/pgsql_query.py b/nifi/user_scripts/utils/db/pgsql_query.py
similarity index 71%
rename from nifi/user-scripts/utils/pgsql_query.py
rename to nifi/user_scripts/utils/db/pgsql_query.py
index 12e32d96e..33c639f2f 100644
--- a/nifi/user-scripts/utils/pgsql_query.py
+++ b/nifi/user_scripts/utils/db/pgsql_query.py
@@ -1,6 +1,6 @@
-import psycopg2
+import psycopg
-conn = psycopg2.connect(
+conn = psycopg.connect(
host="localhost",
database="suppliers",
user="YourUsername",
diff --git a/nifi/user-scripts/utils/sqlite_query.py b/nifi/user_scripts/utils/db/sqlite_query.py
similarity index 100%
rename from nifi/user-scripts/utils/sqlite_query.py
rename to nifi/user_scripts/utils/db/sqlite_query.py
diff --git a/nifi/user-scripts/utils/generic.py b/nifi/user_scripts/utils/generic.py
similarity index 98%
rename from nifi/user-scripts/utils/generic.py
rename to nifi/user_scripts/utils/generic.py
index 5d5be2ead..963f3773e 100644
--- a/nifi/user-scripts/utils/generic.py
+++ b/nifi/user_scripts/utils/generic.py
@@ -79,7 +79,7 @@ def dict2jsonl_file(input_dict: dict | defaultdict, file_path: str) -> None:
print('', file=outfile)
-def get_logger(name: str) -> logging.Logger:
+def get_logger(name: str, propagate: bool = False) -> logging.Logger:
"""Return a configured logger shared across all NiFi clients."""
level_name = os.getenv("NIFI_LOG_LEVEL", "INFO").upper()
level = getattr(logging, level_name, logging.INFO)
@@ -94,7 +94,7 @@ def get_logger(name: str) -> logging.Logger:
handler.setFormatter(fmt)
logger.addHandler(handler)
logger.setLevel(level)
- logger.propagate = False
+ logger.propagate = propagate
return logger
def download_file_from_url(url: str, output_path: str, ssl_verify: bool = False, chunk_size: int = 8192) -> None:
diff --git a/nifi/user_scripts/utils/health/__init__.py b/nifi/user_scripts/utils/health/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user_scripts/utils/health/service.py b/nifi/user_scripts/utils/health/service.py
new file mode 100644
index 000000000..c4db670e8
--- /dev/null
+++ b/nifi/user_scripts/utils/health/service.py
@@ -0,0 +1,114 @@
+import time
+
+import psycopg
+import requests
+from opensearchpy import OpenSearch
+
+from nifi.user_scripts.dto.database_config import DatabaseConfig
+from nifi.user_scripts.dto.elastic_config import ElasticConfig
+from nifi.user_scripts.dto.service_health import DatabaseHealth, ElasticHealth
+
+from ..generic import get_logger
+
+logger = get_logger(__name__)
+
+def check_kibana(config: ElasticConfig) -> ElasticHealth:
+
+ elastic_health: ElasticHealth = ElasticHealth(service=config.kibana_version)
+ start = time.perf_counter()
+
+ try:
+ if config.kibana_version == "kibana":
+ raise NotImplementedError
+
+ response = requests.get(config.kibana_host + "/api/status",
+ auth=config.auth_credentials(),
+ timeout=config.timeout,
+ cert=config.get_kibana_ssl_certs_path(),
+ verify=config.elastic_root_cert_ca_path
+ )
+
+ elastic_health.latency_ms = (time.perf_counter() - start) * 1000
+ elastic_health.connected = response.ok
+
+ if response.status_code == 200:
+ elastic_health.status = "healthy"
+ logger.info(f"✅ {config.kibana_version} OK, latency {elastic_health.latency_ms:.2f} ms")
+ else:
+ elastic_health.message = f"❌ Failed to query {config.kibana_version}"
+
+ except Exception as e:
+ elastic_health.message = str(e)
+ logger.error(f"❌ Failed to query {config.kibana_version}: %s", str(e))
+
+ return elastic_health
+
+def check_elasticsearch(config: ElasticConfig) -> ElasticHealth:
+
+ elastic_health: ElasticHealth = ElasticHealth(service=config.elasticsearch_version)
+ start = time.perf_counter()
+
+ try:
+ elastic_connection = OpenSearch(hosts=config.hosts,
+ use_ssl=config.verify_ssl,
+ verify_certs=False,
+ http_auth=config.auth_credentials(),
+ ssl_show_warn=False,
+ ssl_assert_hostname=False,
+                                        ca_certs=config.elastic_root_cert_ca_path,
+ client_cert=config.elastic_node_cert_pem_path,
+ client_key=config.elastic_node_cert_key_path
+ )
+
+ if config.elasticsearch_version == "elasticsearch":
+ raise NotImplementedError
+
+ if elastic_connection.ping():
+ elastic_health.connected = True
+ elastic_health.status = "healthy"
+ elastic_health.service_info = elastic_connection.nodes.info()
+ elastic_health.latency_ms = (time.perf_counter() - start) * 1000
+ logger.info(f"✅ {config.elasticsearch_version} OK, latency {elastic_health.latency_ms:.2f} ms")
+ else:
+ elastic_health.message = f"❌ Failed to query {config.elasticsearch_version}"
+ except Exception as e:
+ elastic_health.message = str(e)
+        logger.error("❌ Failed to query %s: %s", config.elasticsearch_version, str(e))
+
+ return elastic_health
+
+def check_postgres(config: DatabaseConfig) -> DatabaseHealth:
+
+ start = time.perf_counter()
+ database_health = DatabaseHealth(service="cogstack-samples-db",
+ db_name=config.database_name,
+ version=None
+ )
+
+ try:
+ with psycopg.connect(
+ host=config.host,
+ port=config.port,
+ user=config.username,
+ password=config.password.get_secret_value(),
+ dbname=config.database_name,
+ connect_timeout=config.timeout,
+ ) as connection, connection.cursor() as cursor:
+ cursor.execute("SELECT version();")
+ result = cursor.fetchone()
+
+ if result and result[0]:
+ database_health.version = result[0]
+ database_health.status = "healthy"
+ database_health.connected = True
+ database_health.latency_ms = (time.perf_counter() - start) * 1000
+ logger.info(f"✅ PostgreSQL OK, latency {database_health.latency_ms:.2f} ms")
+ else:
+ database_health.message = "No version returned from database"
+ database_health.status = "unhealthy"
+ database_health.connected = True
+
+ except Exception as e:
+ database_health.message = str(e)
+        logger.error("❌ Failed to query PostgreSQL: %s", str(e))
+ return database_health
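+
+
+# Usage sketch (illustrative, not invoked anywhere in this change; assumes the
+# DTOs above can be constructed from the deployment environment):
+#
+#     db_health = check_postgres(DatabaseConfig())
+#     print(db_health.status, db_health.latency_ms)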
diff --git a/nifi/user_scripts/utils/lint_env.py b/nifi/user_scripts/utils/lint_env.py
new file mode 100644
index 000000000..723927f63
--- /dev/null
+++ b/nifi/user_scripts/utils/lint_env.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Lightweight env file validator used by deploy/export_env_vars.sh."""
+
+from __future__ import annotations
+
+import sys
+from collections.abc import Iterable
+from pathlib import Path
+
+PORT_SUFFIXES = ("_PORT", "_OUTPUT_PORT", "_INPUT_PORT")
+BOOL_SUFFIXES = ("_ENABLED", "_SSL_ENABLED", "_BAKE")
+BOOL_VALUES = {"true", "false", "1", "0", "yes", "no", "on", "off"}
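+# Illustrative examples (the variable names below are hypothetical): a line such
+# as MY_SERVICE_PORT=8443 passes the port check, while MY_SERVICE_PORT=abc fails;
+# MY_SERVICE_SSL_ENABLED=true passes the boolean check, while =maybe is flagged.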
+
+
+def strip_quotes(value: str) -> str:
+ if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
+ return value[1:-1]
+ return value
+
+
+def parse_env_file(path: Path) -> tuple[list[str], list[str], list[tuple[str, str, int]]]:
+ errors: list[str] = []
+ warnings: list[str] = []
+ entries: list[tuple[str, str, int]] = []
+
+ for lineno, raw_line in enumerate(path.read_text().splitlines(), start=1):
+ line = raw_line.strip()
+ if not line or line.startswith("#"):
+ continue
+
+ if line.startswith("export "):
+ line = line[len("export ") :].strip()
+
+ if "=" not in line:
+ errors.append(f"{path}:{lineno}: missing '=' (got: {raw_line})")
+ continue
+
+ key, value = line.split("=", 1)
+ key = key.strip()
+ value = value.strip()
+
+ if not key:
+ errors.append(f"{path}:{lineno}: empty key (got: {raw_line})")
+ continue
+
+ entries.append((key, value, lineno))
+
+ seen = {}
+ for key, _, lineno in entries:
+ if key in seen:
+ warnings.append(f"{path}:{lineno}: duplicate key '{key}' (also on line {seen[key]})")
+ else:
+ seen[key] = lineno
+
+ return errors, warnings, entries
+
+
+def validate_entries(path: Path, entries: Iterable[tuple[str, str, int]]) -> list[str]:
+ errors: list[str] = []
+
+ for key, value, lineno in entries:
+ normalized = strip_quotes(value)
+
+ if any(key.endswith(suffix) for suffix in PORT_SUFFIXES) and not normalized.isdigit():
+ errors.append(f"{path}:{lineno}: '{key}' should be an integer port (got '{value}')")
+
+ if any(key.endswith(suffix) for suffix in BOOL_SUFFIXES) and normalized.lower() not in BOOL_VALUES:
+ errors.append(
+ f"{path}:{lineno}: '{key}' should be one of {sorted(BOOL_VALUES)} (got '{value}')"
+ )
+
+ return errors
+
+
+def main(args: list[str]) -> int:
+ if not args:
+ script = Path(__file__).name
+        print(f"Usage: {script} <env_file> [<env_file> ...]")
+ return 1
+
+ warnings: list[str] = []
+ errors: list[str] = []
+ checked_files = 0
+
+ for path_str in args:
+ path = Path(path_str).resolve()
+ if not path.exists():
+ warnings.append(f"Skipping missing env file: {path}")
+ continue
+
+ checked_files += 1
+ parse_errors, parse_warnings, entries = parse_env_file(path)
+ errors.extend(parse_errors)
+ warnings.extend(parse_warnings)
+ errors.extend(validate_entries(path, entries))
+
+ for warning in warnings:
+ print(f"⚠️ {warning}")
+
+ if errors:
+ print("❌ Env validation failed:")
+ for err in errors:
+ print(f" - {err}")
+ return 1
+
+ print(f"✅ Env validation passed ({checked_files} files checked)")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
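+
+# Invocation sketch (mirrors how deploy/export_env_vars.sh is expected to call
+# this script; the exact file list is an assumption):
+#
+#     python3 nifi/user_scripts/utils/lint_env.py deploy/database.env deploy/elasticsearch.env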
diff --git a/nifi/user_scripts/utils/nifi/__init__.py b/nifi/user_scripts/utils/nifi/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user-scripts/utils/helpers/base_nifi_processor.py b/nifi/user_scripts/utils/nifi/base_nifi_processor.py
similarity index 61%
rename from nifi/user-scripts/utils/helpers/base_nifi_processor.py
rename to nifi/user_scripts/utils/nifi/base_nifi_processor.py
index 05e0786b6..8e28ef3bb 100644
--- a/nifi/user-scripts/utils/helpers/base_nifi_processor.py
+++ b/nifi/user_scripts/utils/nifi/base_nifi_processor.py
@@ -10,12 +10,14 @@
)
from nifiapi.relationship import Relationship
from py4j.java_gateway import JavaObject, JVMView
-from utils.generic import parse_value
+
+from ..generic import parse_value
def _make_wrapper_method(name):
- """Return a function that delegates to the base's implementation on self."""
+ """Return a wrapper that delegates to the base class implementation."""
def wrapper(self, *args, **kwargs):
+ """Call the named base class method on the current instance."""
# call Base class implementation
base_impl = getattr(super(self.__class__, self), name, None)
if base_impl is None:
@@ -26,20 +28,22 @@ def wrapper(self, *args, **kwargs):
def nifi_processor(*, processor_details: dict | None = None):
"""
- NOTE (4-11-2025): at the moment this decorator is a bit useless as the curre
- NiFi version does not support automatic discovery of processor details from Python processors
- it only scans for the Java nested class "ProcessorDetails" and stops there, limited
- discovery capabilities for now. Hopefully in future versions this can be used.
-
- Class decorator that injects:
- - class Java with implements set
- - class ProcessorDetails (optional)
- - thin wrappers for getPropertyDescriptors, getRelationships, transform
- Use like:
- @nifi_processor(processor_details={"version":"0.1.0"})
- class MyProc(BaseNiFiProcessor): ...
+ Class decorator that injects NiFi-required nested classes and method wrappers.
+
+ Note (2025-04-11): NiFi currently does not support automatic discovery of
+ processor details from Python processors. It scans for the Java nested class
+ "ProcessorDetails" and stops there, so discovery is limited for now.
+
+ Args:
+ processor_details: Optional attributes to add to the ProcessorDetails
+ nested class.
+
+ Use like:
+ @nifi_processor(processor_details={"version": "0.1.0"})
+ class MyProc(BaseNiFiProcessor): ...
"""
def decorator(cls):
+ """Mutate the class to add NiFi metadata and wrapper methods."""
# Inject Java if missing (exact nested-class syntax NiFi looks for)
if not hasattr(cls, "Java"):
class Java:
@@ -90,6 +94,15 @@ def __init__(self, jvm: JVMView):
self.logger: Logger = logging.getLogger(self.__class__.__name__)
self.process_context: ProcessContext
+ self.REL_SUCCESS = Relationship(
+ name="success",
+ description="All FlowFiles processed successfully.",
+ )
+ self.REL_FAILURE = Relationship(
+ name="failure",
+ description="FlowFiles that failed processing.",
+ )
+
self._properties: list = [
PropertyDescriptor(name="sample_property_one",
description="sample property one description",
@@ -98,16 +111,7 @@ def __init__(self, jvm: JVMView):
validators=StandardValidators.BOOLEAN_VALIDATOR),
]
- self._relationships = [
- Relationship(
- name="success",
- description="All FlowFiles processed successfully."
- ),
- Relationship(
- name="failure",
- description="FlowFiles that failed processing."
- )
- ]
+ self._relationships = [self.REL_SUCCESS, self.REL_FAILURE]
self.descriptors: list[PropertyDescriptor] = self._properties
self.relationships: list[Relationship] = self._relationships
@@ -115,16 +119,39 @@ def __init__(self, jvm: JVMView):
self.logger.info(f"Initialized {self.__class__.__name__} processor.")
def getRelationships(self) -> list[Relationship]:
+ """
+ Return the list of relationships supported by the processor.
+
+ Returns:
+ A list of Relationship objects exposed to NiFi.
+ """
return self.relationships
def getPropertyDescriptors(self) -> list[PropertyDescriptor]:
+ """
+ Return the property descriptors supported by the processor.
+
+ Returns:
+ A list of PropertyDescriptor objects exposed to NiFi.
+ """
return self.descriptors
def set_logger(self, logger: Logger):
+ """
+ Replace the logger instance used by this processor.
+
+ Args:
+ logger: Logger instance to use for subsequent log entries.
+ """
self.logger = logger
def set_properties(self, properties: dict) -> None:
- """Populate class attributes from NiFi property map."""
+ """
+ Populate class attributes from a NiFi property map.
+
+ Args:
+ properties: Mapping of NiFi PropertyDescriptor to value.
+ """
for k, v in properties.items():
name = k.name if hasattr(k, "name") else str(k)
val = parse_value(v)
@@ -139,15 +166,29 @@ def build_failure_result(
*,
include_flowfile_attributes: bool = False,
) -> FlowFileTransformResult:
+ """
+ Build a failure FlowFileTransformResult with exception metadata.
+
+ Args:
+ flowFile: The FlowFile being processed.
+ exception: The exception raised during processing.
+ include_flowfile_attributes: If true, include all FlowFile attributes.
+
+ Returns:
+ A FlowFileTransformResult targeting the failure relationship.
+ """
+
exception_name = type(exception).__name__
exception_message = str(exception)
exception_value = (
f"{exception_name}: {exception_message}" if exception_message else exception_name
)
+
attributes = {}
if include_flowfile_attributes:
attributes = {k: str(v) for k, v in flowFile.getAttributes().items()}
attributes["exception"] = exception_value
+
return FlowFileTransformResult(
relationship="failure",
attributes=attributes,
@@ -156,23 +197,29 @@ def build_failure_result(
def onScheduled(self, context: ProcessContext) -> None:
"""
- Called automatically by NiFi once when the processor is scheduled to run
- (i.e., enabled or started). This method is used for initializing and
- allocating resources that should persist across multiple FlowFile
- executions.
+ Called once when the processor is scheduled (enabled or started).
- Typical use cases include:
- - Loading static data from disk (e.g., CSV lookup tables, configuration files)
- - Establishing external connections (e.g., databases, APIs)
- - Building in-memory caches or models used by onTrigger/transform()
+ Use this hook to initialize and allocate resources that should persist
+ across multiple FlowFile executions, such as loading static data,
+ establishing connections, or building caches used by transform().
- The resources created here remain in memory for the lifetime of the
- processor and are shared by all concurrent FlowFile executions on this
- node. They should be lightweight and thread-safe. To release or clean up
- such resources, use the @OnStopped method, which NiFi calls when the
- processor is disabled or stopped.
+ Resources created here live for the processor lifetime and are shared
+ across concurrent executions. They should be lightweight and
+ thread-safe. Clean up in @OnStopped when the processor is disabled.
"""
pass
- def transform(self, context: ProcessContext, flowFile: JavaObject):
+ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTransformResult:
+ """
+ Process a FlowFile and return a FlowFileTransformResult.
+
+ Subclasses must override this method to implement processor logic.
+
+ Args:
+ context: The NiFi ProcessContext for this invocation.
+ flowFile: The FlowFile being processed.
+
+ Raises:
+ NotImplementedError: Always, until overridden by a subclass.
+ """
raise NotImplementedError
diff --git a/nifi/user_scripts/utils/nifi/nifi_api_client.py b/nifi/user_scripts/utils/nifi/nifi_api_client.py
new file mode 100644
index 000000000..9c24c97c0
--- /dev/null
+++ b/nifi/user_scripts/utils/nifi/nifi_api_client.py
@@ -0,0 +1,143 @@
+import time
+from logging import Logger
+
+import requests
+from nipyapi import canvas, security
+from nipyapi.nifi import ApiClient, ProcessGroupsApi
+from nipyapi.nifi.configuration import Configuration as NiFiConfiguration
+from nipyapi.nifi.models.process_group_entity import ProcessGroupEntity
+from nipyapi.nifi.models.processor_entity import ProcessorEntity
+from nipyapi.registry import ApiClient as RegistryApiClient
+from nipyapi.registry import BucketsApi
+from nipyapi.registry.configuration import Configuration as RegistryConfiguration
+
+from nifi.user_scripts.dto.nifi_api_config import NiFiAPIConfig
+from nifi.user_scripts.dto.service_health import NiFiHealth
+
+from ..generic import get_logger
+
+
+class NiFiRegistryClient:
+
+    def __init__(self, config: NiFiAPIConfig, health_check_on_init: bool = True) -> None:
+ self.config = config or NiFiAPIConfig()
+ self.nipyapi_config = RegistryConfiguration()
+ self.nipyapi_config.host = self.config.nifi_registry_api_url
+ self.nipyapi_config.verify_ssl = self.config.verify_ssl
+ self.nipyapi_config.cert_file = self.config.nifi_cert_pem_path # type: ignore
+ self.nipyapi_config.key_file = self.config.nifi_cert_key_path # type: ignore
+ self.nipyapi_config.ssl_ca_cert = self.config.root_cert_ca_path # type: ignore
+
+ self.logger: Logger = get_logger(self.__class__.__name__)
+
+        self.api_client = RegistryApiClient(self.nipyapi_config.host)
+        self.buckets_api = BucketsApi(self.api_client)
+
+        if health_check_on_init:
+            self.health_check()
+
+ def list_buckets(self):
+ buckets = self.buckets_api.get_buckets()
+ for b in buckets:
+ self.logger.info("Bucket: %s (%s)", b.name, b.identifier)
+ return buckets
+
+ def health_check(self, timeout: int = 15) -> NiFiHealth:
+ start = time.perf_counter()
+ nifi_health = NiFiHealth(
+ service="nifi-registry",
+ service_info=self.config.nifi_registry_base_url
+ )
+
+ try:
+ response = requests.head(
+ url=self.config.nifi_registry_base_url,
+ auth=self.config.auth_credentials(),
+ cert=self.config.get_ssl_certs_paths(),
+ verify=self.config.root_cert_ca_path,
+ timeout=timeout
+ )
+
+ nifi_health.latency_ms = (time.perf_counter() - start) * 1000
+ nifi_health.connected = response.ok
+
+ if response.status_code == 200:
+ nifi_health.status = "healthy"
+                self.logger.info(f"✅ NiFi Registry reachable, latency {nifi_health.latency_ms:.2f} ms")
+ else:
+ nifi_health.message = f"❌ Unexpected status code {response.status_code}"
+
+ except Exception as exc:
+ nifi_health.message = str(exc)
+            self.logger.error("❌ Failed to reach NiFi Registry: %s", exc)
+
+ return nifi_health
+
+
+class NiFiClient:
+
+    def __init__(self, config: NiFiAPIConfig, health_check_on_init: bool = True) -> None:
+ self.config = config or NiFiAPIConfig()
+ self.nipyapi_config = NiFiConfiguration()
+ self.nipyapi_config.host = self.config.nifi_api_url
+ self.nipyapi_config.verify_ssl = self.config.verify_ssl
+ self.nipyapi_config.cert_file = self.config.nifi_cert_pem_path # type: ignore
+ self.nipyapi_config.key_file = self.config.nifi_cert_key_path # type: ignore
+ self.nipyapi_config.ssl_ca_cert = self.config.root_cert_ca_path # type: ignore
+
+ self.logger: Logger = get_logger(self.__class__.__name__)
+
+ self.api_client = ApiClient(self.nipyapi_config)
+ self.process_group_api = ProcessGroupsApi(self.api_client)
+
+        if health_check_on_init:
+ self.health_check()
+
+ def health_check(self) -> NiFiHealth:
+ start = time.perf_counter()
+ nifi_health = NiFiHealth(
+ service="nifi",
+ service_info=self.config.nifi_api_url
+ )
+
+ try:
+ result = security.service_login(
+ service='nifi',
+ username=self.config.nifi_username,
+ password=self.config.nifi_password
+ )
+
+ nifi_health.connected = bool(result)
+ nifi_health.latency_ms = (time.perf_counter() - start) * 1000
+
+ if result:
+ nifi_health.status = "healthy"
+ self.logger.info(f"✅ Logged in to NiFi, latency {nifi_health.latency_ms:.2f} ms")
+ else:
+ nifi_health.message = "Authentication returned False"
+                self.logger.error("❌ Failed to log in to NiFi")
+
+ except Exception as exc:
+ nifi_health.message = str(exc)
+            self.logger.error("❌ Failed to log in to NiFi: %s", exc)
+
+ return nifi_health
+
+ def get_root_process_group_id(self) -> str:
+ return canvas.get_root_pg_id()
+
+    def get_process_group_by_name(self, process_group_name: str) -> ProcessGroupEntity | list[ProcessGroupEntity] | None:
+        return canvas.get_process_group(process_group_name, identifier_type="name")
+
+ def get_process_group_by_id(self, process_group_id: str) -> ProcessGroupEntity:
+ return canvas.get_process_group(process_group_id, identifier_type="id")
+
+ def start_process_group(self, process_group_id: str) -> bool:
+ return canvas.schedule_process_group(process_group_id, True)
+
+ def stop_process_group(self, process_group_id: str) -> bool:
+ return canvas.schedule_process_group(process_group_id, False)
+
+ def get_child_process_groups_from_parent_id(self, parent_process_group_id: str) -> list[ProcessGroupEntity]:
+ parent_pg = canvas.get_process_group(parent_process_group_id, identifier_type="id")
+ return canvas.list_all_process_groups(parent_pg.id)
+
+ def get_all_processors_in_process_group(self, process_group_id: str) -> list[ProcessorEntity]:
+ return canvas.list_all_processors(process_group_id)
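+
+
+# Usage sketch (illustrative; assumes nipyapi's module-level configuration
+# points at the same NiFi instance as self.nipyapi_config, since the canvas
+# helpers above rely on nipyapi's global config):
+#
+#     client = NiFiClient(NiFiAPIConfig())
+#     root_id = client.get_root_process_group_id()
+#     for pg in client.get_child_process_groups_from_parent_id(root_id):
+#         client.stop_process_group(pg.id)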
diff --git a/nifi/user_scripts/utils/serialization/__init__.py b/nifi/user_scripts/utils/serialization/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nifi/user-scripts/utils/helpers/avro_json_encoder.py b/nifi/user_scripts/utils/serialization/avro_json_encoder.py
similarity index 100%
rename from nifi/user-scripts/utils/helpers/avro_json_encoder.py
rename to nifi/user_scripts/utils/serialization/avro_json_encoder.py
diff --git a/nifi/user_scripts/utils/serialization/parquet_json_data_types_converter.py b/nifi/user_scripts/utils/serialization/parquet_json_data_types_converter.py
new file mode 100644
index 000000000..52f066a61
--- /dev/null
+++ b/nifi/user_scripts/utils/serialization/parquet_json_data_types_converter.py
@@ -0,0 +1,13 @@
+import base64
+from datetime import date, datetime
+from decimal import Decimal
+
+
+def parquet_json_data_type_convert(field_value):
+ if isinstance(field_value, (datetime, date)): # noqa: UP038
+ return field_value.isoformat()
+ if isinstance(field_value, Decimal):
+ return str(field_value)
+ if isinstance(field_value, (bytes, bytearray, memoryview)): # noqa: UP038
+ return base64.b64encode(bytes(field_value)).decode("ascii")
+ return str(field_value)
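+
+
+# Illustrative examples:
+#     parquet_json_data_type_convert(Decimal("12.50"))   -> "12.50"
+#     parquet_json_data_type_convert(date(2024, 1, 31))  -> "2024-01-31"
+#     parquet_json_data_type_convert(b"\x00")            -> "AA=="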
diff --git a/nifi/user_templates/azure_blobs_parquet_to_opensearch.json b/nifi/user_templates/azure_blobs_parquet_to_opensearch.json
new file mode 100644
index 000000000..5105de944
--- /dev/null
+++ b/nifi/user_templates/azure_blobs_parquet_to_opensearch.json
@@ -0,0 +1,1900 @@
+{
+ "flowContents": {
+ "identifier": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "instanceIdentifier": "bd30bb46-019b-1000-a394-23186386d413",
+ "name": "azure_blobs_parquet_to_opensearch",
+ "comments": "",
+ "position": {
+ "x": -1304.0,
+ "y": -2192.0
+ },
+ "processGroups": [],
+ "remoteProcessGroups": [],
+ "processors": [
+ {
+ "identifier": "7b442720-10fc-3fb9-a233-b4c922dc5af5",
+ "instanceIdentifier": "aceec62f-019b-1000-b767-ae30b679e5b7",
+ "name": "RouteOnAttribute-parquetFiles",
+ "comments": "",
+ "position": {
+ "x": -680.0,
+ "y": -1616.0
+ },
+ "type": "org.apache.nifi.processors.standard.RouteOnAttribute",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-standard-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Routing Strategy": "Route to 'match' if all match",
+ "filename": "${filename:endsWith(\".parquet\")}"
+ },
+ "propertyDescriptors": {
+ "Routing Strategy": {
+ "name": "Routing Strategy",
+ "displayName": "Routing Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "filename": {
+ "name": "filename",
+ "displayName": "filename",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": true
+ }
+ },
+ "style": {},
+ "schedulingPeriod": "0 sec",
+ "schedulingStrategy": "TIMER_DRIVEN",
+ "executionNode": "ALL",
+ "penaltyDuration": "30 sec",
+ "yieldDuration": "1 sec",
+ "bulletinLevel": "WARN",
+ "runDurationMillis": 25,
+ "concurrentlySchedulableTaskCount": 1,
+ "autoTerminatedRelationships": [
+ "unmatched"
+ ],
+ "scheduledState": "ENABLED",
+ "retryCount": 10,
+ "retriedRelationships": [],
+ "backoffMechanism": "PENALIZE_FLOWFILE",
+ "maxBackoffPeriod": "10 mins",
+ "componentType": "PROCESSOR",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "49cae84d-fbcd-392d-a87b-3353b91a283d",
+ "instanceIdentifier": "12bf93dd-019b-1000-9922-ddf71f2d1c72",
+ "name": "FetchAzureBlobStorage_v12",
+ "comments": "",
+ "position": {
+ "x": -680.0,
+ "y": -1376.0
+ },
+ "type": "org.apache.nifi.processors.azure.storage.FetchAzureBlobStorage_v12",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-azure-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Range Start": null,
+ "Container Name": "${azure.container}",
+ "Blob Name": "${azure.blobname}",
+ "Client-Side Encryption Key ID": null,
+ "Storage Credentials": "c35e15f9-c1c6-34f5-bbd5-52e93dcdfc7b",
+ "Client-Side Encryption Key Type": "NONE",
+ "Range Length": null,
+ "Proxy Configuration Service": null
+ },
+ "propertyDescriptors": {
+ "Range Start": {
+ "name": "Range Start",
+ "displayName": "Range Start",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Container Name": {
+ "name": "Container Name",
+ "displayName": "Container Name",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Blob Name": {
+ "name": "Blob Name",
+ "displayName": "Blob Name",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Client-Side Encryption Key ID": {
+ "name": "Client-Side Encryption Key ID",
+ "displayName": "Client-Side Encryption Key ID",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Storage Credentials": {
+ "name": "Storage Credentials",
+ "displayName": "Storage Credentials",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Client-Side Encryption Key Type": {
+ "name": "Client-Side Encryption Key Type",
+ "displayName": "Client-Side Encryption Key Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Client-Side Encryption Local Key": {
+ "name": "Client-Side Encryption Local Key",
+ "displayName": "Client-Side Encryption Local Key",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Range Length": {
+ "name": "Range Length",
+ "displayName": "Range Length",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Proxy Configuration Service": {
+ "name": "Proxy Configuration Service",
+ "displayName": "Proxy Configuration Service",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "style": {},
+ "schedulingPeriod": "0 sec",
+ "schedulingStrategy": "TIMER_DRIVEN",
+ "executionNode": "ALL",
+ "penaltyDuration": "30 sec",
+ "yieldDuration": "1 sec",
+ "bulletinLevel": "WARN",
+ "runDurationMillis": 0,
+ "concurrentlySchedulableTaskCount": 1,
+ "autoTerminatedRelationships": [],
+ "scheduledState": "ENABLED",
+ "retryCount": 10,
+ "retriedRelationships": [],
+ "backoffMechanism": "PENALIZE_FLOWFILE",
+ "maxBackoffPeriod": "10 mins",
+ "componentType": "PROCESSOR",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "c2626114-337b-37a2-afb5-01597b41a99b",
+ "instanceIdentifier": "a2f5360a-019b-1000-5267-0c0cbf02b041",
+ "name": "PutElasticsearchRecord",
+ "comments": "",
+ "position": {
+ "x": -680.0,
+ "y": -2080.0
+ },
+ "type": "org.apache.nifi.processors.elasticsearch.PutElasticsearchRecord",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-elasticsearch-restapi-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Timestamp Value": null,
+ "Max JSON Field String Length": "20 MB",
+ "Dynamic Templates Record Path": null,
+ "Index Operation": "create",
+ "Index": "${azure.blobname:substringBefore('/'):toLower()}",
+ "Record Reader": "f0308333-144e-394f-98ca-f444ee91a2a7",
+ "Date Format": null,
+ "Retain Record Path ID Field": "false",
+ "Time Format": null,
+ "Result Record Writer": "839d0da7-1f72-36da-aefb-c8123d21c751",
+ "Treat Not Found as Success": "true",
+ "Group Results by Bulk Error Type": "false",
+ "Scripted Upsert Record Path": null,
+ "Index Operation Record Path": null,
+ "Type Record Path": null,
+ "ID Record Path": null,
+ "Log Error Responses": "true",
+ "Script Record Path": null,
+ "Output Error Responses": "true",
+ "Retain Record Timestamp": "false",
+ "Timestamp Format": null,
+ "Type": null,
+ "Batch Size": "10000",
+ "Client Service": "441a8cf7-4c90-3b4e-ab0b-8facbb51b4db",
+ "Index Record Path": null,
+ "Timestamp Record Path": null
+ },
+ "propertyDescriptors": {
+ "Timestamp Value": {
+ "name": "Timestamp Value",
+ "displayName": "Timestamp Value",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Max JSON Field String Length": {
+ "name": "Max JSON Field String Length",
+ "displayName": "Max JSON Field String Length",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Dynamic Templates Record Path": {
+ "name": "Dynamic Templates Record Path",
+ "displayName": "Dynamic Templates Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Index Operation": {
+ "name": "Index Operation",
+ "displayName": "Index Operation",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Index": {
+ "name": "Index",
+ "displayName": "Index",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Record Reader": {
+ "name": "Record Reader",
+ "displayName": "Record Reader",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Date Format": {
+ "name": "Date Format",
+ "displayName": "Date Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Retain Record Path ID Field": {
+ "name": "Retain Record Path ID Field",
+ "displayName": "Retain Record Path ID Field",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Time Format": {
+ "name": "Time Format",
+ "displayName": "Time Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Result Record Writer": {
+ "name": "Result Record Writer",
+ "displayName": "Result Record Writer",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Treat Not Found as Success": {
+ "name": "Treat Not Found as Success",
+ "displayName": "Treat Not Found as Success",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Group Results by Bulk Error Type": {
+ "name": "Group Results by Bulk Error Type",
+ "displayName": "Group Results by Bulk Error Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Scripted Upsert Record Path": {
+ "name": "Scripted Upsert Record Path",
+ "displayName": "Scripted Upsert Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Index Operation Record Path": {
+ "name": "Index Operation Record Path",
+ "displayName": "Index Operation Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Type Record Path": {
+ "name": "Type Record Path",
+ "displayName": "Type Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "ID Record Path": {
+ "name": "ID Record Path",
+ "displayName": "ID Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Log Error Responses": {
+ "name": "Log Error Responses",
+ "displayName": "Log Error Responses",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Script Record Path": {
+ "name": "Script Record Path",
+ "displayName": "Script Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Output Error Responses": {
+ "name": "Output Error Responses",
+ "displayName": "Output Error Responses",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Retain Record Timestamp": {
+ "name": "Retain Record Timestamp",
+ "displayName": "Retain Record Timestamp",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Timestamp Format": {
+ "name": "Timestamp Format",
+ "displayName": "Timestamp Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Type": {
+ "name": "Type",
+ "displayName": "Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Batch Size": {
+ "name": "Batch Size",
+ "displayName": "Batch Size",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Client Service": {
+ "name": "Client Service",
+ "displayName": "Client Service",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Index Record Path": {
+ "name": "Index Record Path",
+ "displayName": "Index Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Timestamp Record Path": {
+ "name": "Timestamp Record Path",
+ "displayName": "Timestamp Record Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "style": {},
+ "schedulingPeriod": "0 sec",
+ "schedulingStrategy": "TIMER_DRIVEN",
+ "executionNode": "ALL",
+ "penaltyDuration": "30 sec",
+ "yieldDuration": "1 sec",
+ "bulletinLevel": "INFO",
+ "runDurationMillis": 0,
+ "concurrentlySchedulableTaskCount": 1,
+ "autoTerminatedRelationships": [
+ "original",
+ "successful"
+ ],
+ "scheduledState": "ENABLED",
+ "retryCount": 10,
+ "retriedRelationships": [],
+ "backoffMechanism": "PENALIZE_FLOWFILE",
+ "maxBackoffPeriod": "10 mins",
+ "componentType": "PROCESSOR",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "22b8f0dd-5e91-37e8-b483-2e672300eeb9",
+ "instanceIdentifier": "12bf12c5-019b-1000-8fad-fa2a6785fc4e",
+ "name": "ListAzureBlobStorage_v12",
+ "comments": "",
+ "position": {
+ "x": -1304.0,
+ "y": -1376.0
+ },
+ "type": "org.apache.nifi.processors.azure.storage.ListAzureBlobStorage_v12",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-azure-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Minimum File Age": "0 sec",
+ "Entity Tracking Initial Listing Target": "all",
+ "Maximum File Age": null,
+ "Entity Tracking State Cache": null,
+ "Maximum File Size": null,
+ "Storage Credentials": "c35e15f9-c1c6-34f5-bbd5-52e93dcdfc7b",
+ "Proxy Configuration Service": null,
+ "Container Name": "container_name",
+ "Blob Name Prefix": null,
+ "Record Writer": null,
+ "Minimum File Size": "0 B",
+ "Listing Strategy": "timestamps",
+ "Entity Tracking Time Window": "3 hours"
+ },
+ "propertyDescriptors": {
+ "Minimum File Age": {
+ "name": "Minimum File Age",
+ "displayName": "Minimum File Age",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Entity Tracking Initial Listing Target": {
+ "name": "Entity Tracking Initial Listing Target",
+ "displayName": "Entity Tracking Initial Listing Target",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Maximum File Age": {
+ "name": "Maximum File Age",
+ "displayName": "Maximum File Age",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Entity Tracking State Cache": {
+ "name": "Entity Tracking State Cache",
+ "displayName": "Entity Tracking State Cache",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Maximum File Size": {
+ "name": "Maximum File Size",
+ "displayName": "Maximum File Size",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Storage Credentials": {
+ "name": "Storage Credentials",
+ "displayName": "Storage Credentials",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Proxy Configuration Service": {
+ "name": "Proxy Configuration Service",
+ "displayName": "Proxy Configuration Service",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Container Name": {
+ "name": "Container Name",
+ "displayName": "Container Name",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Blob Name Prefix": {
+ "name": "Blob Name Prefix",
+ "displayName": "Blob Name Prefix",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Record Writer": {
+ "name": "Record Writer",
+ "displayName": "Record Writer",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Minimum File Size": {
+ "name": "Minimum File Size",
+ "displayName": "Minimum File Size",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Listing Strategy": {
+ "name": "Listing Strategy",
+ "displayName": "Listing Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Entity Tracking Time Window": {
+ "name": "Entity Tracking Time Window",
+ "displayName": "Entity Tracking Time Window",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "style": {},
+ "schedulingPeriod": "1 min",
+ "schedulingStrategy": "TIMER_DRIVEN",
+ "executionNode": "PRIMARY",
+ "penaltyDuration": "30 sec",
+ "yieldDuration": "1 sec",
+ "bulletinLevel": "WARN",
+ "runDurationMillis": 0,
+ "concurrentlySchedulableTaskCount": 1,
+ "autoTerminatedRelationships": [],
+ "scheduledState": "ENABLED",
+ "retryCount": 10,
+ "retriedRelationships": [],
+ "backoffMechanism": "PENALIZE_FLOWFILE",
+ "maxBackoffPeriod": "10 mins",
+ "componentType": "PROCESSOR",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "ce4bef66-5d14-32f0-897d-539b09f5a5ac",
+ "instanceIdentifier": "b454d1ad-019b-1000-2c9e-809b0fd80136",
+ "name": "ExecuteStreamCommand-ParquetToJson",
+ "comments": "",
+ "position": {
+ "x": -680.0,
+ "y": -1816.0
+ },
+ "type": "org.apache.nifi.processors.standard.ExecuteStreamCommand",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-standard-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Max Attribute Length": "256",
+ "Command Arguments Strategy": "Command Arguments Property",
+ "Output MIME Type": "application/x-ndjson",
+ "Working Directory": "/opt/nifi/user_scripts/processors/",
+ "Command Path": "python3.11",
+ "Output Destination Attribute": null,
+ "Argument Delimiter": ";",
+ "Ignore STDIN": "false",
+ "Command Arguments": "convert_record_parquet_to_json.py"
+ },
+ "propertyDescriptors": {
+ "Max Attribute Length": {
+ "name": "Max Attribute Length",
+ "displayName": "Max Attribute Length",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Command Arguments Strategy": {
+ "name": "Command Arguments Strategy",
+ "displayName": "Command Arguments Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Output MIME Type": {
+ "name": "Output MIME Type",
+ "displayName": "Output MIME Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Working Directory": {
+ "name": "Working Directory",
+ "displayName": "Working Directory",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Command Path": {
+ "name": "Command Path",
+ "displayName": "Command Path",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Output Destination Attribute": {
+ "name": "Output Destination Attribute",
+ "displayName": "Output Destination Attribute",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Argument Delimiter": {
+ "name": "Argument Delimiter",
+ "displayName": "Argument Delimiter",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Ignore STDIN": {
+ "name": "Ignore STDIN",
+ "displayName": "Ignore STDIN",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Command Arguments": {
+ "name": "Command Arguments",
+ "displayName": "Command Arguments",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "style": {},
+ "schedulingPeriod": "0 sec",
+ "schedulingStrategy": "TIMER_DRIVEN",
+ "executionNode": "ALL",
+ "penaltyDuration": "30 sec",
+ "yieldDuration": "1 sec",
+ "bulletinLevel": "WARN",
+ "runDurationMillis": 0,
+ "concurrentlySchedulableTaskCount": 1,
+ "autoTerminatedRelationships": [
+ "original"
+ ],
+ "scheduledState": "ENABLED",
+ "retryCount": 10,
+ "retriedRelationships": [],
+ "backoffMechanism": "PENALIZE_FLOWFILE",
+ "maxBackoffPeriod": "10 mins",
+ "componentType": "PROCESSOR",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ }
+ ],
+ "inputPorts": [],
+ "outputPorts": [],
+ "connections": [
+ {
+ "identifier": "3916e55c-d569-32d0-bb8e-9d220fd5868b",
+ "instanceIdentifier": "a2f57578-019b-1000-0fec-af6bdde922c0",
+ "name": "",
+ "source": {
+ "id": "c2626114-337b-37a2-afb5-01597b41a99b",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "PutElasticsearchRecord",
+ "comments": "",
+ "instanceIdentifier": "a2f5360a-019b-1000-5267-0c0cbf02b041"
+ },
+ "destination": {
+ "id": "1f850d26-6468-343d-a705-bd6e872dc9b4",
+ "type": "FUNNEL",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "Funnel",
+ "comments": "",
+ "instanceIdentifier": "a2f2838e-019b-1000-7b4d-437e60164c83"
+ },
+ "labelIndex": 0,
+ "zIndex": 8,
+ "selectedRelationships": [
+ "failure",
+ "retry",
+ "errors"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "2.5 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "fe505c47-18e2-395c-aedf-b7ab32754fd9",
+ "instanceIdentifier": "b45604ea-019b-1000-5e72-a935e1eecb48",
+ "name": "",
+ "source": {
+ "id": "ce4bef66-5d14-32f0-897d-539b09f5a5ac",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "ExecuteStreamCommand-ParquetToJson",
+ "comments": "",
+ "instanceIdentifier": "b454d1ad-019b-1000-2c9e-809b0fd80136"
+ },
+ "destination": {
+ "id": "c2626114-337b-37a2-afb5-01597b41a99b",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "PutElasticsearchRecord",
+ "comments": "",
+ "instanceIdentifier": "a2f5360a-019b-1000-5267-0c0cbf02b041"
+ },
+ "labelIndex": 0,
+ "zIndex": 16,
+ "selectedRelationships": [
+ "output stream"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "2e2b7e84-d655-3ef5-b23f-2b3a949e4660",
+ "instanceIdentifier": "acefe222-019b-1000-50cd-81b8134fb2ea",
+ "name": "",
+ "source": {
+ "id": "7b442720-10fc-3fb9-a233-b4c922dc5af5",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "RouteOnAttribute-parquetFiles",
+ "comments": "",
+ "instanceIdentifier": "aceec62f-019b-1000-b767-ae30b679e5b7"
+ },
+ "destination": {
+ "id": "ce4bef66-5d14-32f0-897d-539b09f5a5ac",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "ExecuteStreamCommand-ParquetToJson",
+ "comments": "",
+ "instanceIdentifier": "b454d1ad-019b-1000-2c9e-809b0fd80136"
+ },
+ "labelIndex": 0,
+ "zIndex": 12,
+ "selectedRelationships": [
+ "matched"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "40b811d1-7ad0-30f5-b394-37080365d484",
+ "instanceIdentifier": "12f7175a-019b-1000-f077-2bccd9d1f77f",
+ "name": "",
+ "source": {
+ "id": "49cae84d-fbcd-392d-a87b-3353b91a283d",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "FetchAzureBlobStorage_v12",
+ "comments": "",
+ "instanceIdentifier": "12bf93dd-019b-1000-9922-ddf71f2d1c72"
+ },
+ "destination": {
+ "id": "f86fb032-4ec9-3c1f-95ca-cf0a490d05fc",
+ "type": "FUNNEL",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "Funnel",
+ "comments": "",
+ "instanceIdentifier": "12f70e62-019b-1000-456a-d6238f34c803"
+ },
+ "labelIndex": 0,
+ "zIndex": 3,
+ "selectedRelationships": [
+ "failure"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "a7062be3-1d80-3e33-ba0d-1d57b47df4ab",
+ "instanceIdentifier": "12f70808-019b-1000-3f84-1aa24ca7f81e",
+ "name": "",
+ "source": {
+ "id": "49cae84d-fbcd-392d-a87b-3353b91a283d",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "FetchAzureBlobStorage_v12",
+ "comments": "",
+ "instanceIdentifier": "12bf93dd-019b-1000-9922-ddf71f2d1c72"
+ },
+ "destination": {
+ "id": "7b442720-10fc-3fb9-a233-b4c922dc5af5",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "RouteOnAttribute-parquetFiles",
+ "comments": "",
+ "instanceIdentifier": "aceec62f-019b-1000-b767-ae30b679e5b7"
+ },
+ "labelIndex": 0,
+ "zIndex": 2,
+ "selectedRelationships": [
+ "success"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "72f6bca4-63f6-3622-8b8b-dd5458457c11",
+ "instanceIdentifier": "b72e03a3-019b-1000-1d5a-84a4b073816e",
+ "name": "",
+ "source": {
+ "id": "c2626114-337b-37a2-afb5-01597b41a99b",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "PutElasticsearchRecord",
+ "comments": "",
+ "instanceIdentifier": "a2f5360a-019b-1000-5267-0c0cbf02b041"
+ },
+ "destination": {
+ "id": "ed73259d-0c68-3bbc-b291-471f4549ca23",
+ "type": "FUNNEL",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "Funnel",
+ "comments": "",
+ "instanceIdentifier": "b72df80b-019b-1000-0328-c137f4566c4f"
+ },
+ "labelIndex": 0,
+ "zIndex": 19,
+ "selectedRelationships": [
+ "error_responses"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "af7284d5-f746-3522-9ca4-ac603d5cfe54",
+ "instanceIdentifier": "12c7e7da-019b-1000-f977-dc837ca741aa",
+ "name": "",
+ "source": {
+ "id": "22b8f0dd-5e91-37e8-b483-2e672300eeb9",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "ListAzureBlobStorage_v12",
+ "comments": "",
+ "instanceIdentifier": "12bf12c5-019b-1000-8fad-fa2a6785fc4e"
+ },
+ "destination": {
+ "id": "49cae84d-fbcd-392d-a87b-3353b91a283d",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "FetchAzureBlobStorage_v12",
+ "comments": "",
+ "instanceIdentifier": "12bf93dd-019b-1000-9922-ddf71f2d1c72"
+ },
+ "labelIndex": 0,
+ "zIndex": 1,
+ "selectedRelationships": [
+ "success"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "9ad78242-8ef8-3bad-a140-6a6fc2823136",
+ "instanceIdentifier": "b455e6d6-019b-1000-7dcf-2c5311212e8d",
+ "name": "",
+ "source": {
+ "id": "ce4bef66-5d14-32f0-897d-539b09f5a5ac",
+ "type": "PROCESSOR",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "ExecuteStreamCommand-ParquetToJson",
+ "comments": "",
+ "instanceIdentifier": "b454d1ad-019b-1000-2c9e-809b0fd80136"
+ },
+ "destination": {
+ "id": "d62beed4-d9a3-3117-b48d-60b2d3bfea74",
+ "type": "FUNNEL",
+ "groupId": "b3dd314f-e399-39f2-a5c6-bbc776589132",
+ "name": "Funnel",
+ "comments": "",
+ "instanceIdentifier": "b1a2a265-019b-1000-73cf-d1d352d0db2f"
+ },
+ "labelIndex": 0,
+ "zIndex": 15,
+ "selectedRelationships": [
+ "nonzero status"
+ ],
+ "backPressureObjectThreshold": 10000,
+ "backPressureDataSizeThreshold": "1 GB",
+ "flowFileExpiration": "0 sec",
+ "prioritizers": [],
+ "bends": [],
+ "loadBalanceStrategy": "DO_NOT_LOAD_BALANCE",
+ "partitioningAttribute": "",
+ "loadBalanceCompression": "DO_NOT_COMPRESS",
+ "componentType": "CONNECTION",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ }
+ ],
+ "labels": [
+ {
+ "identifier": "bd2c4d70-019b-1000-eb42-d372d94add8b",
+ "instanceIdentifier": "c70655f6-fbef-37b6-b678-be5cb13a2c1e",
+ "position": {
+ "x": -1000.0,
+ "y": -1816.0
+ },
+        "label": "Custom script to convert files from Parquet to JSON (NDJSON, specifically). We use this instead of a native processor because the current NiFi version is buggy with files over 100 MB, and the custom Python processor currently implemented, \"CogStackConvertParquetToJson\", has a similar issue.",
+ "zIndex": 2,
+ "width": 312.0,
+ "height": 144.0,
+ "style": {
+ "font-size": "12px"
+ },
+ "componentType": "LABEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "161e6c6d-bac4-3370-9eab-67b5cd74ee36",
+ "instanceIdentifier": "bd2e59ac-019b-1000-aee5-f778be711553",
+ "position": {
+ "x": -680.0,
+ "y": -2192.0
+ },
+        "label": "We grab the index name for the specific record via the NiFi Expression Language: ${azure.blobname:substringBefore('/'):toLower()}",
+ "zIndex": 4,
+ "width": 352.0,
+ "height": 104.0,
+ "style": {
+ "font-size": "12px"
+ },
+ "componentType": "LABEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "b7c5c895-2803-3b8b-9694-849d8acef785",
+ "instanceIdentifier": "13354bc6-019b-1000-d8c6-2574f9e0003f",
+ "position": {
+ "x": -1304.0,
+ "y": -1488.0
+ },
+        "label": "Note: do not set Record Writer if you want one blob -> one FlowFile; if you do set it, the processor will pile listings into a single FlowFile and FetchAzureBlobStorage will error out.",
+ "zIndex": 1,
+ "width": 352.0,
+ "height": 96.0,
+ "style": {
+ "font-size": "12px"
+ },
+ "componentType": "LABEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "282b8e6d-03be-3fec-9215-dc9a7cb7a43e",
+ "instanceIdentifier": "bd2b2ad4-019b-1000-f5ff-5d7cca5ca409",
+ "position": {
+ "x": -680.0,
+ "y": -1240.0
+ },
+        "label": "The Azure container and blob names are read via Expression Language from the FlowFiles already generated by the ListAzureBlobStorage processor.",
+ "zIndex": 3,
+ "width": 352.0,
+ "height": 88.0,
+ "style": {
+ "font-size": "12px"
+ },
+ "componentType": "LABEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "fc49dc24-9d2f-3e32-b9f2-08d6bee2115d",
+ "instanceIdentifier": "b4a21b93-019b-1000-8ce2-b13f6d19498e",
+ "position": {
+ "x": -312.0,
+ "y": -1616.0
+ },
+ "label": "Only get .parquet files. Everything else, terminate.",
+ "zIndex": 2,
+ "width": 312.0,
+ "height": 32.0,
+ "style": {
+ "font-size": "12px"
+ },
+ "componentType": "LABEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ }
+ ],
+ "funnels": [
+ {
+ "identifier": "f86fb032-4ec9-3c1f-95ca-cf0a490d05fc",
+ "instanceIdentifier": "12f70e62-019b-1000-456a-d6238f34c803",
+ "position": {
+ "x": -40.0,
+ "y": -1336.0
+ },
+ "componentType": "FUNNEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "1f850d26-6468-343d-a705-bd6e872dc9b4",
+ "instanceIdentifier": "a2f2838e-019b-1000-7b4d-437e60164c83",
+ "position": {
+ "x": -40.0,
+ "y": -2032.0
+ },
+ "componentType": "FUNNEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "d62beed4-d9a3-3117-b48d-60b2d3bfea74",
+ "instanceIdentifier": "b1a2a265-019b-1000-73cf-d1d352d0db2f",
+ "position": {
+ "x": -40.0,
+ "y": -1776.0
+ },
+ "componentType": "FUNNEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ },
+ {
+ "identifier": "ed73259d-0c68-3bbc-b291-471f4549ca23",
+ "instanceIdentifier": "b72df80b-019b-1000-0328-c137f4566c4f",
+ "position": {
+ "x": -1064.0,
+ "y": -2040.0
+ },
+ "componentType": "FUNNEL",
+ "groupIdentifier": "b3dd314f-e399-39f2-a5c6-bbc776589132"
+ }
+ ],
+ "controllerServices": [
+ {
+ "identifier": "f0308333-144e-394f-98ca-f444ee91a2a7",
+ "instanceIdentifier": "a2f601e6-019b-1000-0b57-2e77f5f75fbc",
+ "name": "JsonTreeReader",
+ "comments": "",
+ "type": "org.apache.nifi.json.JsonTreeReader",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-record-serialization-services-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Schema Branch": null,
+ "Schema Reference Reader": null,
+ "Schema Text": "${avro.schema}",
+ "Starting Field Name": null,
+ "Max String Length": "20 MB",
+ "Schema Inference Cache": null,
+ "Starting Field Strategy": "ROOT_NODE",
+ "Schema Registry": null,
+ "Schema Access Strategy": "infer-schema",
+ "Schema Name": "${schema.name}",
+ "Timestamp Format": null,
+ "Date Format": null,
+ "Schema Application Strategy": "SELECTED_PART",
+ "Schema Version": null,
+ "Time Format": null,
+ "Allow Comments": "false"
+ },
+ "propertyDescriptors": {
+ "Schema Branch": {
+ "name": "Schema Branch",
+ "displayName": "Schema Branch",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Reference Reader": {
+ "name": "Schema Reference Reader",
+ "displayName": "Schema Reference Reader",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Text": {
+ "name": "Schema Text",
+ "displayName": "Schema Text",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Starting Field Name": {
+ "name": "Starting Field Name",
+ "displayName": "Starting Field Name",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Max String Length": {
+ "name": "Max String Length",
+ "displayName": "Max String Length",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Inference Cache": {
+ "name": "Schema Inference Cache",
+ "displayName": "Schema Inference Cache",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Starting Field Strategy": {
+ "name": "Starting Field Strategy",
+ "displayName": "Starting Field Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Registry": {
+ "name": "Schema Registry",
+ "displayName": "Schema Registry",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Access Strategy": {
+ "name": "Schema Access Strategy",
+ "displayName": "Schema Access Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Name": {
+ "name": "Schema Name",
+ "displayName": "Schema Name",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Timestamp Format": {
+ "name": "Timestamp Format",
+ "displayName": "Timestamp Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Date Format": {
+ "name": "Date Format",
+ "displayName": "Date Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Application Strategy": {
+ "name": "Schema Application Strategy",
+ "displayName": "Schema Application Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Version": {
+ "name": "Schema Version",
+ "displayName": "Schema Version",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Time Format": {
+ "name": "Time Format",
+ "displayName": "Time Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Allow Comments": {
+ "name": "Allow Comments",
+ "displayName": "Allow Comments",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "controllerServiceApis": [
+ {
+ "type": "org.apache.nifi.serialization.RecordReaderFactory",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-standard-services-api-nar",
+ "version": "2.7.2"
+ }
+ }
+ ],
+ "scheduledState": "DISABLED",
+ "bulletinLevel": "WARN",
+ "componentType": "CONTROLLER_SERVICE",
+ "groupIdentifier": "bd30bb46-019b-1000-a394-23186386d413"
+ },
+ {
+ "identifier": "c35e15f9-c1c6-34f5-bbd5-52e93dcdfc7b",
+ "instanceIdentifier": "12c4d4a3-019b-1000-6c90-72ea0dda41d8",
+ "name": "AzureStorageCredentialsControllerService_v12-RIO_BLOBS",
+ "comments": "",
+ "type": "org.apache.nifi.services.azure.storage.AzureStorageCredentialsControllerService_v12",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-azure-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Endpoint Suffix": "blob.core.windows.net",
+ "Credentials Type": "ACCOUNT_KEY",
+ "Proxy Configuration Service": null
+ },
+ "propertyDescriptors": {
+ "Service Principal Client ID": {
+ "name": "Service Principal Client ID",
+ "displayName": "Service Principal Client ID",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Service Principal Client Secret": {
+ "name": "Service Principal Client Secret",
+ "displayName": "Service Principal Client Secret",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Storage Account Name": {
+ "name": "Storage Account Name",
+ "displayName": "Storage Account Name",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Managed Identity Client ID": {
+ "name": "Managed Identity Client ID",
+ "displayName": "Managed Identity Client ID",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Service Principal Tenant ID": {
+ "name": "Service Principal Tenant ID",
+ "displayName": "Service Principal Tenant ID",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Endpoint Suffix": {
+ "name": "Endpoint Suffix",
+ "displayName": "Endpoint Suffix",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Credentials Type": {
+ "name": "Credentials Type",
+ "displayName": "Credentials Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Account Key": {
+ "name": "Account Key",
+ "displayName": "Account Key",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "SAS Token": {
+ "name": "SAS Token",
+ "displayName": "SAS Token",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Proxy Configuration Service": {
+ "name": "Proxy Configuration Service",
+ "displayName": "Proxy Configuration Service",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "controllerServiceApis": [
+ {
+ "type": "org.apache.nifi.services.azure.storage.AzureStorageCredentialsService_v12",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-azure-services-api-nar",
+ "version": "2.7.2"
+ }
+ }
+ ],
+ "scheduledState": "DISABLED",
+ "bulletinLevel": "WARN",
+ "componentType": "CONTROLLER_SERVICE",
+ "groupIdentifier": "bd30bb46-019b-1000-a394-23186386d413"
+ },
+ {
+ "identifier": "441a8cf7-4c90-3b4e-ab0b-8facbb51b4db",
+ "instanceIdentifier": "a2f5bd7a-019b-1000-84fa-874b9c14e835",
+ "name": "ElasticSearchClientServiceImpl",
+ "comments": "",
+ "type": "org.apache.nifi.elasticsearch.ElasticSearchClientServiceImpl",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-elasticsearch-client-service-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Sniffer Request Timeout": "1 sec",
+ "Connect Timeout": "5000",
+ "HTTP Hosts": "https://elasticsearch-1:9200",
+ "Path Prefix": null,
+ "API Key ID": null,
+ "Enable Compression": "false",
+ "Read Timeout": "60000",
+ "Sniff on Failure": "false",
+ "Sniffer Failure Delay": "1 min",
+ "OAuth2 Access Token Provider": null,
+ "Suppress Null and Empty Values": "always-suppress",
+ "Node Selector": "ANY",
+ "Proxy Configuration Service": null,
+ "Strict Deprecation": "false",
+ "Sniff Cluster Nodes": "false",
+ "SSL Context Service": "273c0798-af30-33d7-9cd7-9f1c316b4c8d",
+ "Send Meta Header": "true",
+ "Username": "admin",
+ "Authorization Scheme": "BASIC",
+ "Sniffer Interval": "5 mins",
+ "Run As User": null,
+ "Character Set": "UTF-8"
+ },
+ "propertyDescriptors": {
+ "Sniffer Request Timeout": {
+ "name": "Sniffer Request Timeout",
+ "displayName": "Sniffer Request Timeout",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Connect Timeout": {
+ "name": "Connect Timeout",
+ "displayName": "Connect Timeout",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "API Key ID": {
+ "name": "API Key ID",
+ "displayName": "API Key ID",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Enable Compression": {
+ "name": "Enable Compression",
+ "displayName": "Enable Compression",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Sniff on Failure": {
+ "name": "Sniff on Failure",
+ "displayName": "Sniff on Failure",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "OAuth2 Access Token Provider": {
+ "name": "OAuth2 Access Token Provider",
+ "displayName": "OAuth2 Access Token Provider",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Node Selector": {
+ "name": "Node Selector",
+ "displayName": "Node Selector",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Proxy Configuration Service": {
+ "name": "Proxy Configuration Service",
+ "displayName": "Proxy Configuration Service",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Strict Deprecation": {
+ "name": "Strict Deprecation",
+ "displayName": "Strict Deprecation",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "JWT Shared Secret": {
+ "name": "JWT Shared Secret",
+ "displayName": "JWT Shared Secret",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "API Key": {
+ "name": "API Key",
+ "displayName": "API Key",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Send Meta Header": {
+ "name": "Send Meta Header",
+ "displayName": "Send Meta Header",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Authorization Scheme": {
+ "name": "Authorization Scheme",
+ "displayName": "Authorization Scheme",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Password": {
+ "name": "Password",
+ "displayName": "Password",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "HTTP Hosts": {
+ "name": "HTTP Hosts",
+ "displayName": "HTTP Hosts",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Path Prefix": {
+ "name": "Path Prefix",
+ "displayName": "Path Prefix",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Read Timeout": {
+ "name": "Read Timeout",
+ "displayName": "Read Timeout",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Sniffer Failure Delay": {
+ "name": "Sniffer Failure Delay",
+ "displayName": "Sniffer Failure Delay",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Suppress Null and Empty Values": {
+ "name": "Suppress Null and Empty Values",
+ "displayName": "Suppress Null and Empty Values",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Sniff Cluster Nodes": {
+ "name": "Sniff Cluster Nodes",
+ "displayName": "Sniff Cluster Nodes",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "SSL Context Service": {
+ "name": "SSL Context Service",
+ "displayName": "SSL Context Service",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Username": {
+ "name": "Username",
+ "displayName": "Username",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Sniffer Interval": {
+ "name": "Sniffer Interval",
+ "displayName": "Sniffer Interval",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Run As User": {
+ "name": "Run As User",
+ "displayName": "Run As User",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Character Set": {
+ "name": "Character Set",
+ "displayName": "Character Set",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "controllerServiceApis": [
+ {
+ "type": "org.apache.nifi.elasticsearch.ElasticSearchClientService",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-elasticsearch-client-service-api-nar",
+ "version": "2.7.2"
+ }
+ }
+ ],
+ "scheduledState": "DISABLED",
+ "bulletinLevel": "WARN",
+ "componentType": "CONTROLLER_SERVICE",
+ "groupIdentifier": "bd30bb46-019b-1000-a394-23186386d413"
+ },
+ {
+ "identifier": "839d0da7-1f72-36da-aefb-c8123d21c751",
+ "instanceIdentifier": "12cb4a70-019b-1000-282d-6bf1553998e3",
+ "name": "JsonRecordSetWriter",
+ "comments": "",
+ "type": "org.apache.nifi.json.JsonRecordSetWriter",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-record-serialization-services-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Schema Branch": null,
+ "Schema Reference Reader": null,
+ "Schema Text": "${avro.schema}",
+ "Allow Scientific Notation": "false",
+ "Compression Level": "1",
+ "Output Grouping": "output-array",
+ "Schema Registry": null,
+ "Schema Access Strategy": "inherit-record-schema",
+ "Schema Name": "${schema.name}",
+ "Timestamp Format": null,
+ "Date Format": null,
+ "Pretty Print JSON": "false",
+ "Suppress Null Values": "never-suppress",
+ "Schema Write Strategy": "no-schema",
+ "Compression Format": "none",
+ "Schema Cache": null,
+ "Schema Version": null,
+ "Time Format": null,
+ "Schema Reference Writer": null
+ },
+ "propertyDescriptors": {
+ "Schema Branch": {
+ "name": "Schema Branch",
+ "displayName": "Schema Branch",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Reference Reader": {
+ "name": "Schema Reference Reader",
+ "displayName": "Schema Reference Reader",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Text": {
+ "name": "Schema Text",
+ "displayName": "Schema Text",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Allow Scientific Notation": {
+ "name": "Allow Scientific Notation",
+ "displayName": "Allow Scientific Notation",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Compression Level": {
+ "name": "Compression Level",
+ "displayName": "Compression Level",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Output Grouping": {
+ "name": "Output Grouping",
+ "displayName": "Output Grouping",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Registry": {
+ "name": "Schema Registry",
+ "displayName": "Schema Registry",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Access Strategy": {
+ "name": "Schema Access Strategy",
+ "displayName": "Schema Access Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Name": {
+ "name": "Schema Name",
+ "displayName": "Schema Name",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Timestamp Format": {
+ "name": "Timestamp Format",
+ "displayName": "Timestamp Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Date Format": {
+ "name": "Date Format",
+ "displayName": "Date Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Pretty Print JSON": {
+ "name": "Pretty Print JSON",
+ "displayName": "Pretty Print JSON",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Suppress Null Values": {
+ "name": "Suppress Null Values",
+ "displayName": "Suppress Null Values",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Write Strategy": {
+ "name": "Schema Write Strategy",
+ "displayName": "Schema Write Strategy",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Compression Format": {
+ "name": "Compression Format",
+ "displayName": "Compression Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Cache": {
+ "name": "Schema Cache",
+ "displayName": "Schema Cache",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Version": {
+ "name": "Schema Version",
+ "displayName": "Schema Version",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Time Format": {
+ "name": "Time Format",
+ "displayName": "Time Format",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Schema Reference Writer": {
+ "name": "Schema Reference Writer",
+ "displayName": "Schema Reference Writer",
+ "identifiesControllerService": true,
+ "sensitive": false,
+ "dynamic": false
+ }
+ },
+ "controllerServiceApis": [
+ {
+ "type": "org.apache.nifi.serialization.RecordSetWriterFactory",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-standard-services-api-nar",
+ "version": "2.7.2"
+ }
+ }
+ ],
+ "scheduledState": "DISABLED",
+ "bulletinLevel": "WARN",
+ "componentType": "CONTROLLER_SERVICE",
+ "groupIdentifier": "bd30bb46-019b-1000-a394-23186386d413"
+ },
+ {
+ "identifier": "273c0798-af30-33d7-9cd7-9f1c316b4c8d",
+ "instanceIdentifier": "a2f6b3f8-019b-1000-7c3a-602b41f32ebf",
+ "name": "StandardSSLContextService",
+ "comments": "",
+ "type": "org.apache.nifi.ssl.StandardSSLContextService",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-ssl-context-service-nar",
+ "version": "2.7.2"
+ },
+ "properties": {
+ "Truststore Type": "PKCS12",
+ "Keystore Type": "PKCS12",
+ "Truststore Filename": "/security/certificates/elastic/opensearch/elasticsearch/elasticsearch-1/elasticsearch-1-truststore.key",
+ "TLS Protocol": "TLS",
+ "Keystore Filename": "/security/certificates/elastic/opensearch/elasticsearch/elasticsearch-1/elasticsearch-1-keystore.jks"
+ },
+ "propertyDescriptors": {
+ "Truststore Type": {
+ "name": "Truststore Type",
+ "displayName": "Truststore Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Key Password": {
+ "name": "Key Password",
+ "displayName": "Key Password",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Keystore Type": {
+ "name": "Keystore Type",
+ "displayName": "Keystore Type",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Truststore Filename": {
+ "name": "Truststore Filename",
+ "displayName": "Truststore Filename",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false,
+ "resourceDefinition": {
+ "cardinality": "SINGLE",
+ "resourceTypes": [
+ "FILE"
+ ]
+ }
+ },
+ "Keystore Password": {
+ "name": "Keystore Password",
+ "displayName": "Keystore Password",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "Truststore Password": {
+ "name": "Truststore Password",
+ "displayName": "Truststore Password",
+ "identifiesControllerService": false,
+ "sensitive": true,
+ "dynamic": false
+ },
+ "TLS Protocol": {
+ "name": "TLS Protocol",
+ "displayName": "TLS Protocol",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false
+ },
+ "Keystore Filename": {
+ "name": "Keystore Filename",
+ "displayName": "Keystore Filename",
+ "identifiesControllerService": false,
+ "sensitive": false,
+ "dynamic": false,
+ "resourceDefinition": {
+ "cardinality": "SINGLE",
+ "resourceTypes": [
+ "FILE"
+ ]
+ }
+ }
+ },
+ "controllerServiceApis": [
+ {
+ "type": "org.apache.nifi.ssl.SSLContextProvider",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-standard-services-api-nar",
+ "version": "2.7.2"
+ }
+ },
+ {
+ "type": "org.apache.nifi.ssl.SSLContextService",
+ "bundle": {
+ "group": "org.apache.nifi",
+ "artifact": "nifi-standard-services-api-nar",
+ "version": "2.7.2"
+ }
+ }
+ ],
+ "scheduledState": "DISABLED",
+ "bulletinLevel": "WARN",
+ "componentType": "CONTROLLER_SERVICE",
+ "groupIdentifier": "bd30bb46-019b-1000-a394-23186386d413"
+ }
+ ],
+ "defaultFlowFileExpiration": "0 sec",
+ "defaultBackPressureObjectThreshold": 10000,
+ "defaultBackPressureDataSizeThreshold": "1 GB",
+ "scheduledState": "ENABLED",
+ "executionEngine": "INHERITED",
+ "maxConcurrentTasks": 1,
+ "statelessFlowTimeout": "1 min",
+ "flowFileConcurrency": "UNBOUNDED",
+ "flowFileOutboundPolicy": "STREAM_WHEN_AVAILABLE",
+ "componentType": "PROCESS_GROUP"
+ },
+ "externalControllerServices": {},
+ "parameterContexts": {},
+ "flowEncodingVersion": "1.0",
+ "parameterProviders": {},
+ "latest": false
+}
\ No newline at end of file
diff --git a/nifi/user-templates/dt4h/annotate_dt4h_ann_manager.xml b/nifi/user_templates/dt4h/annotate_dt4h_ann_manager.xml
similarity index 100%
rename from nifi/user-templates/dt4h/annotate_dt4h_ann_manager.xml
rename to nifi/user_templates/dt4h/annotate_dt4h_ann_manager.xml
diff --git a/nifi/user-templates/dt4h/raw_ingest_dt4h.xml b/nifi/user_templates/dt4h/raw_ingest_dt4h.xml
similarity index 100%
rename from nifi/user-templates/dt4h/raw_ingest_dt4h.xml
rename to nifi/user_templates/dt4h/raw_ingest_dt4h.xml
diff --git a/nifi/user-templates/legacy/CogStack_Cohort_create_source_docs.xml b/nifi/user_templates/legacy/CogStack_Cohort_create_source_docs.xml
similarity index 100%
rename from nifi/user-templates/legacy/CogStack_Cohort_create_source_docs.xml
rename to nifi/user_templates/legacy/CogStack_Cohort_create_source_docs.xml
diff --git a/nifi/user-templates/legacy/Common_schema_example_ingest.xml b/nifi/user_templates/legacy/Common_schema_example_ingest.xml
similarity index 100%
rename from nifi/user-templates/legacy/Common_schema_example_ingest.xml
rename to nifi/user_templates/legacy/Common_schema_example_ingest.xml
diff --git a/nifi/user-templates/legacy/DEID_sample_pipeline.xml b/nifi/user_templates/legacy/DEID_sample_pipeline.xml
similarity index 100%
rename from nifi/user-templates/legacy/DEID_sample_pipeline.xml
rename to nifi/user_templates/legacy/DEID_sample_pipeline.xml
diff --git a/nifi/user-templates/legacy/Generate_location_ES.xml b/nifi/user_templates/legacy/Generate_location_ES.xml
similarity index 100%
rename from nifi/user-templates/legacy/Generate_location_ES.xml
rename to nifi/user_templates/legacy/Generate_location_ES.xml
diff --git a/nifi/user-templates/legacy/Grab_non_annotated_docs.xml b/nifi/user_templates/legacy/Grab_non_annotated_docs.xml
similarity index 100%
rename from nifi/user-templates/legacy/Grab_non_annotated_docs.xml
rename to nifi/user_templates/legacy/Grab_non_annotated_docs.xml
diff --git a/nifi/user-templates/legacy/HealTAC_23.xml b/nifi/user_templates/legacy/HealTAC_23.xml
similarity index 100%
rename from nifi/user-templates/legacy/HealTAC_23.xml
rename to nifi/user_templates/legacy/HealTAC_23.xml
diff --git a/nifi/user-templates/legacy/OS_annotate_per_doc.xml b/nifi/user_templates/legacy/OS_annotate_per_doc.xml
similarity index 100%
rename from nifi/user-templates/legacy/OS_annotate_per_doc.xml
rename to nifi/user_templates/legacy/OS_annotate_per_doc.xml
diff --git a/nifi/user-templates/legacy/OpenSearch_Ingest_DB_OCR_service_to_ES.xml b/nifi/user_templates/legacy/OpenSearch_Ingest_DB_OCR_service_to_ES.xml
similarity index 100%
rename from nifi/user-templates/legacy/OpenSearch_Ingest_DB_OCR_service_to_ES.xml
rename to nifi/user_templates/legacy/OpenSearch_Ingest_DB_OCR_service_to_ES.xml
diff --git a/nifi/user-templates/legacy/OpenSearch_ingest_annotate_DB_MedCATService_to_ES.xml b/nifi/user_templates/legacy/OpenSearch_ingest_annotate_DB_MedCATService_to_ES.xml
similarity index 100%
rename from nifi/user-templates/legacy/OpenSearch_ingest_annotate_DB_MedCATService_to_ES.xml
rename to nifi/user_templates/legacy/OpenSearch_ingest_annotate_DB_MedCATService_to_ES.xml
diff --git a/nifi/user-templates/legacy/OpenSearch_ingest_annotate_DB_to_ES_and_DB_ann_manager.xml b/nifi/user_templates/legacy/OpenSearch_ingest_annotate_DB_to_ES_and_DB_ann_manager.xml
similarity index 100%
rename from nifi/user-templates/legacy/OpenSearch_ingest_annotate_DB_to_ES_and_DB_ann_manager.xml
rename to nifi/user_templates/legacy/OpenSearch_ingest_annotate_DB_to_ES_and_DB_ann_manager.xml
diff --git a/nifi/user-templates/legacy/OpenSearch_ingest_annotate_ES_MedCATService_to_ES.xml b/nifi/user_templates/legacy/OpenSearch_ingest_annotate_ES_MedCATService_to_ES.xml
similarity index 100%
rename from nifi/user-templates/legacy/OpenSearch_ingest_annotate_ES_MedCATService_to_ES.xml
rename to nifi/user_templates/legacy/OpenSearch_ingest_annotate_ES_MedCATService_to_ES.xml
diff --git a/nifi/user-templates/legacy/OpenSearch_ingest_docs_DB_to_ES.xml b/nifi/user_templates/legacy/OpenSearch_ingest_docs_DB_to_ES.xml
similarity index 100%
rename from nifi/user-templates/legacy/OpenSearch_ingest_docs_DB_to_ES.xml
rename to nifi/user_templates/legacy/OpenSearch_ingest_docs_DB_to_ES.xml
diff --git a/nifi/user-templates/legacy/Raw_file_read_from_disk_ocr_custom.xml b/nifi/user_templates/legacy/Raw_file_read_from_disk_ocr_custom.xml
similarity index 100%
rename from nifi/user-templates/legacy/Raw_file_read_from_disk_ocr_custom.xml
rename to nifi/user_templates/legacy/Raw_file_read_from_disk_ocr_custom.xml
diff --git a/nifi/user-templates/opensearch_docs_ingest_annotations_to_es.json b/nifi/user_templates/opensearch_docs_ingest_annotations_to_es.json
similarity index 100%
rename from nifi/user-templates/opensearch_docs_ingest_annotations_to_es.json
rename to nifi/user_templates/opensearch_docs_ingest_annotations_to_es.json
diff --git a/nifi/user-templates/opensearch_ingest_docs_db_ocr_service_to_es.json b/nifi/user_templates/opensearch_ingest_docs_db_ocr_service_to_es.json
similarity index 100%
rename from nifi/user-templates/opensearch_ingest_docs_db_ocr_service_to_es.json
rename to nifi/user_templates/opensearch_ingest_docs_db_ocr_service_to_es.json
diff --git a/nifi/user-templates/opensearch_ingest_docs_db_to_es.json b/nifi/user_templates/opensearch_ingest_docs_db_to_es.json
similarity index 100%
rename from nifi/user-templates/opensearch_ingest_docs_db_to_es.json
rename to nifi/user_templates/opensearch_ingest_docs_db_to_es.json
diff --git a/pyproject.toml b/pyproject.toml
index 6fae60f28..2b563d437 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.ruff]
line-length = 120
-exclude = ["nifi/user-scripts/legacy_scripts", "services"]
+exclude = ["nifi/user_scripts/legacy_scripts", "services"]
target-version = "py311"
indent-width = 4
@@ -25,15 +25,30 @@ fixable = ["ALL"]
[tool.mypy]
plugins = ["pydantic.mypy"]
+python_version = "3.11"
ignore_missing_imports = true
strict = false
files = "."
mypy_path = "./typings/"
+warn_unused_configs = true
-[tool.isort]
-line_length = 120
-skip = ["venv", "venv-test", "envs", "docker", "models"]
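+# Package discovery: ship only the Python code under nifi/; static data
+# folders (configs, drivers, schemas, templates) are excluded from the build.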
+[tool.setuptools.packages.find]
+include = ["nifi*"]
+exclude = [
+ "*egg-info*",
+ "build*",
+ "dist*",
+ "nifi/conf*",
+ "nifi/drivers*",
+ "nifi/user_schemas*",
+ "nifi/user_templates*",
+]
+
+[project]
+name = "cogstack_nifi"
+version = "0.0.1"
+requires-python = ">=3.11"
-[tool.flake8]
-max-line-length = 120
-exclude = ["venv", "venv-test", "envs", "docker", "models"]
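+# PEP 517 build configuration; with setuptools>=61 the [project] table above
+# is sufficient metadata for 'pip install -e .' style editable installs.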
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
diff --git a/security/env/users_database.env b/security/env/users_database.env
index 9da92b17c..b2140eccf 100644
--- a/security/env/users_database.env
+++ b/security/env/users_database.env
@@ -3,8 +3,8 @@ POSTGRES_USER_SAMPLES=test
POSTGRES_PASSWORD_SAMPLES=test
# Production DATABASE user
-POSTGRES_USER=admin
-POSTGRES_PASSWORD=admin
+DATABASE_USER=admin
+DATABASE_PASSWORD=admin
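+# (renamed from POSTGRES_USER/POSTGRES_PASSWORD to follow the DATABASE_* variable naming used by the deployment env files)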
# Production DATABASE MSSQL user
MSSQL_SA_USER=sa
diff --git a/security/env/users_elasticsearch.env b/security/env/users_elasticsearch.env
index 918334a3f..3ba87ca1a 100644
--- a/security/env/users_elasticsearch.env
+++ b/security/env/users_elasticsearch.env
@@ -42,4 +42,3 @@ ES_LOGSTASH_PASS=kibanaserver
ES_KIBANARO_PASS=kibanaserver
ES_READALL_PASS=kibanaserver
ES_SNAPSHOTRESTORE_PASS=kibanaserver
-
diff --git a/security/es_roles/opensearch/config.yml b/security/es_roles/opensearch/config.yml
index 1493a0d7f..e4c94bed9 100644
--- a/security/es_roles/opensearch/config.yml
+++ b/security/es_roles/opensearch/config.yml
@@ -65,13 +65,14 @@ config:
# Set filtered_alias_mode to 'nowarn' to allow more than 2 filtered aliases per index silently
#filtered_alias_mode: warn
#do_not_fail_on_forbidden: false
- #kibana:
+ kibana:
+ # this must match whatever index name is set in the 'opensearchDashboards.index' property in services/kibana/config/opensearch.yml
+ index: '.opensearch_dashboards'
# Kibana multitenancy
#multitenancy_enabled: true
#private_tenant_enabled: true
#default_tenant: ""
#server_username: kibanaserver
- #index: '.kibana'
http:
anonymous_auth_enabled: false
xff:
diff --git a/security/es_roles/opensearch/roles.yml b/security/es_roles/opensearch/roles.yml
index ced314709..776938061 100644
--- a/security/es_roles/opensearch/roles.yml
+++ b/security/es_roles/opensearch/roles.yml
@@ -479,10 +479,20 @@ dashboards_system_access:
- index_patterns:
- ".opensearch_dashboards*"
allowed_actions:
+ - "system:admin/system_index"
- "indices:admin/*"
- "indices:data/read/*"
- "indices:data/write/*"
- "indices:admin/create"
+ - index_patterns:
+ - ".opensearch-notifications*"
+ allowed_actions:
+ - "system:admin/system_index"
+ - "indices:admin/*"
+ - "indices:data/read/*"
+ - "indices:data/write/*"
+ - "indices:admin/create"
+
tenant_permissions:
- tenant_patterns:
- "*"
diff --git a/security/scripts/create_opensearch_client_admin_certs.sh b/security/scripts/create_opensearch_client_admin_certs.sh
index 7a8528808..1d864ab8e 100644
--- a/security/scripts/create_opensearch_client_admin_certs.sh
+++ b/security/scripts/create_opensearch_client_admin_certs.sh
@@ -49,7 +49,7 @@ echo "==========================================================================
CA_ROOT_CERT="${ROOT_CERTIFICATES_FOLDER}"$ROOT_CERTIFICATE_NAME".pem"
CA_ROOT_KEY="${ROOT_CERTIFICATES_FOLDER}"$ROOT_CERTIFICATE_NAME".key"
-EXT_FILE= "${SECURITY_TEMPLATES_FOLDER}ssl-extensions-x509.cnf"
+EXT_FILE="${SECURITY_TEMPLATES_FOLDER}ssl-extensions-x509.cnf"
# === Client cert ===
echo "Generating a key for: $ES_CLIENT_CERT_NAME"
diff --git a/services/elasticsearch/config/opensearch.yml b/services/elasticsearch/config/opensearch.yml
index f1bba783e..e2fe5ffb8 100644
--- a/services/elasticsearch/config/opensearch.yml
+++ b/services/elasticsearch/config/opensearch.yml
@@ -181,7 +181,20 @@ plugins.security.audit.config.enable_ssl_client_auth: true
# Indices configured as system indices can be accessed by only super-admin and no role will provide access to these indices.
# Enable system indices
plugins.security.system_indices.enabled: true
+plugins.security.system_indices.permission.enabled: true
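+# With the permission model enabled, regular roles can be granted access to system indices
+# via the 'system:admin/system_index' action (see security/es_roles/opensearch/roles.yml).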
# Specify a list of indices to mark as system. These indices will only be visible / mutable by members of the above setting, in addition to needing permission to the index via a normal role.
-# plugins.security.system_indices.indices: ['.opendistro-alerting-config', '.opendistro-ism-*', '.opendistro-reports-*', '.opensearch-notifications-*', '.opensearch-notebooks', '.opensearch-observability', '.opendistro-asynchronous-search-response*', '.replication-metadata-store']
-#plugins.security.system_indices.indices: [".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opendistro-notifications-*", ".opendistro-notebooks", ".opendistro-asynchronous-search-response*"]
+plugins.security.system_indices.indices: [
+ ".opensearch_dashboards*",
+ ".opensearch-alerting-config",
+ ".opensearch-notifications-*",
+ ".opendistro-ism-*",
+ ".opendistro-reports-*",
+ ".opendistro-asynchronous-search-response*",
+ ".replication-metadata-store",
+ ".opendistro-notebooks",
+ ".opendistro-anomaly-results*",
+ ".opendistro-anomaly-detector*",
+ ".opendistro-anomaly-checkpoints",
+ ".opendistro-anomaly-detection-state"
+]
diff --git a/services/kibana/config/opensearch.yml b/services/kibana/config/opensearch.yml
index e8c4df486..1d395d761 100644
--- a/services/kibana/config/opensearch.yml
+++ b/services/kibana/config/opensearch.yml
@@ -28,7 +28,7 @@ opensearch.hosts: ${ELASTICSEARCH_HOSTS}
opensearch_security.multitenancy.enabled: true
opensearch_security.multitenancy.tenants.enable_global: true
opensearch_security.multitenancy.tenants.enable_private: true
-opensearch_security.multitenancy.tenants.preferred: ["Private", "Global"]
+opensearch_security.multitenancy.tenants.preferred: ["Global"]
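+# Default users into the Global tenant; private tenants remain enabled above but are no longer preferred.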
opensearch_security.readonly_mode.roles: ["kibana_read_only"]
opensearch_security.multitenancy.enable_filter: true
opensearch_security.cookie.secure: true
@@ -38,7 +38,7 @@ opensearch_security.cookie.secure: true
-# We changed this from .opensearch_dashboards to .kibana because of a permission bug in OpenSearch Dashboards 2.19.x and 3.0.0
-# report: https://github.com/opensearch-project/security/issues/5360
+# Reverted to .opensearch_dashboards: this must match 'kibana.index' in security/es_roles/opensearch/config.yml.
+# The earlier switch to .kibana worked around a permission bug in OpenSearch Dashboards 2.19.x and 3.0.0:
+# https://github.com/opensearch-project/security/issues/5360
-opensearchDashboards.index: ".kibana"
+opensearchDashboards.index: ".opensearch_dashboards"
# The default application to load.
#opensearchDashboards.defaultAppId: "home"
@@ -131,3 +131,16 @@ opensearch.requestHeadersAllowlist: ["securitytenant", "Authorization"]
# Set the value of this setting to true to capture region blocked warnings and errors
# for your map rendering services.
# map.showRegionBlockedWarning: false
+
+
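+# Enforce permissions on saved objects; required for the workspace feature enabled below.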
+savedObjects.permission.enabled: true
+
+# Set the value to true to enable multiple data source feature
+data_source.enabled: true
+# Set the value to true to enable workspace feature
+workspace.enabled: true
+# Set the value to true to enable explore feature
+explore.enabled: true
+
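+# Users and groups granted dashboard-admin rights once saved-object permissions are enabled.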
+opensearchDashboards.dashboardAdmin.users: ["admin"]
+opensearchDashboards.dashboardAdmin.groups: ["admin","all_access"]
diff --git a/services/nginx/config/nginx.conf b/services/nginx/config/nginx.conf
index bc300b65b..567069aa4 100644
--- a/services/nginx/config/nginx.conf
+++ b/services/nginx/config/nginx.conf
@@ -136,7 +136,6 @@ http {
server {
listen 8443 ssl;
server_name nginx.local;
-
proxy_ssl_server_name on;
@@ -154,8 +153,8 @@ http {
location / {
proxy_set_header Host nifi;
- proxy_set_header X-Real-IP nifi;
- proxy_set_header X-ProxyHost nifi;
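+ # Forward the real client address and requested host instead of the hardcoded upstream name.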
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-ProxyHost $host;
proxy_set_header X-ProxyPort 8443;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-ProxyScheme $scheme;
@@ -178,7 +177,7 @@ http {
location ^~ /nifi-api/ {
proxy_set_header Host nifi;
- proxy_set_header X-Real-IP nifi;
+ proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-ProxyHost $host;
proxy_set_header X-ProxyPort 8443;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
@@ -201,7 +200,7 @@ http {
location ^~ /media/ {
proxy_set_header Host nifi;
- proxy_set_header X-Real-IP nifi;
+ proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-ProxyHost $host;
proxy_set_header X-ProxyPort 8443;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
@@ -211,7 +210,7 @@ http {
location ^~ /content-viewer/ {
proxy_set_header Host nifi;
- proxy_set_header X-Real-IP nifi;
+ proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-ProxyHost $host;
proxy_set_header X-ProxyPort 8443;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;