
Commit c69f047

Merge upstream/develop

Signed-off-by: Kurt Heiss <kheiss@nvidia.com>
Made-with: Cursor

2 parents: 9826944 + e09ddb7

125 files changed
Lines changed: 5681 additions & 2789 deletions


.github/workflows/ci-pipeline.yml
Lines changed: 179 additions & 16 deletions (large diff not rendered by default)

README.md
Lines changed: 4 additions & 4 deletions

@@ -99,7 +99,7 @@ This modular design ensures efficient query processing, accurate retrieval of in

  - Response Generation (Inference)

- - [NVIDIA NIM llama-3.3-nemotron-super-49b-v1.5](https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5)
+ - [NVIDIA NIM nemotron-3-super-120b-a12b](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b)

  - Retriever and Extraction Models

@@ -108,7 +108,7 @@ This modular design ensures efficient query processing, accurate retrieval of in
  - [NeMo Retriever Page Elements NIM](https://build.nvidia.com/nvidia/nemotron-page-elements-v3)
  - [NeMo Retriever Table Structure NIM](https://build.nvidia.com/nvidia/nemotron-table-structure-v1)
  - [NeMo Retriever Graphic Elements NIM](https://build.nvidia.com/nvidia/nemotron-graphic-elements-v1)
- - [NeMo Retriever OCR NIM](https://build.nvidia.com/nvidia/nemoretriever-ocr)
+ - [Nemotron OCR NIM](https://build.nvidia.com/nvidia/nemotron-ocr)

  - Optional NIMs

@@ -124,7 +124,7 @@ This modular design ensures efficient query processing, accurate retrieval of in

  - **RAG Orchestrator Server** – Coordinates interactions between the user, retrievers, vector database, and inference models, ensuring multi-turn and context-aware query handling. This is [LangChain](https://www.langchain.com/)-based.

- - **Vector Database (accelerated with NVIDIA cuVS)** – Stores and searches embeddings at scale with GPU-accelerated indexing and retrieval for low-latency performance. You can use [Milvus Vector Database](https://milvus.io/) or [Elasticsearch](https://www.elastic.co/elasticsearch/vector-database).
+ - **Vector Database (accelerated with NVIDIA cuVS)** – Stores and searches embeddings at scale with GPU-accelerated indexing and retrieval for low-latency performance. The default is [Elasticsearch](https://www.elastic.co/elasticsearch/vector-database). Another alternative is [Milvus](https://milvus.io/) (GPU-accelerated).

  - **NeMo Retriever Extraction** – A high-performance ingestion microservice for parsing multimodal content. For more information about the ingestion pipeline, see [NeMo Retriever Extraction Overview](https://docs.nvidia.com/nemo/retriever/latest/extraction/overview/)

@@ -229,5 +229,5 @@ The following models that are built with Llama are governed by the Llama 3.2 Com

  ## Additional Information

- The [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/) for the llama-3.1-nemotron-nano-vl-8b-v1, llama-3.1-nemoguard-8b-content-safety and llama-3.1-nemoguard-8b-topic-control models. The [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/) for the nvidia/llama-nemotron-embed-1b-v2, nvidia/llama-nemotron-rerank-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1 models. The [Llama 3.3 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/LICENSE) for the llama-3.3-nemotron-super-49b-v1.5 models. Built with Llama. Apache 2.0 for NVIDIA Ingest and for the nemoretriever-page-elements-v2, nemotron-table-structure-v1, nemotron-graphic-elements-v1, paddleocr and nemoretriever-ocr-v1 models.
+ The [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/) for the llama-3.1-nemotron-nano-vl-8b-v1, llama-3.1-nemoguard-8b-content-safety and llama-3.1-nemoguard-8b-topic-control models. The [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/) for the nvidia/llama-nemotron-embed-1b-v2, nvidia/llama-nemotron-rerank-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1 models. The [Llama 3.3 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/LICENSE) for the llama-3.3-nemotron-super-49b-v1.5 models. Built with Llama. Apache 2.0 for NVIDIA Ingest and for the nemoretriever-page-elements-v2, nemotron-table-structure-v1, nemotron-graphic-elements-v1, paddleocr and nemotron-ocr-v1 models.

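The default response-generation model referenced in the README changes from llama-3.3-nemotron-super-49b-v1.5 to nemotron-3-super-120b-a12b. As an illustrative sketch only (assuming the NVIDIA-hosted API catalog is used and an API key is exported as NVIDIA_API_KEY; the chat/completions route and key variable are assumptions, not part of this commit), the new model can be exercised through the OpenAI-compatible endpoint:

# Hypothetical smoke test against the hosted API catalog; adjust key handling to your deployment.
curl https://integrate.api.nvidia.com/v1/chat/completions \
  -H "Authorization: Bearer $NVIDIA_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "nvidia/nemotron-3-super-120b-a12b",
    "messages": [{"role": "user", "content": "Summarize what a RAG pipeline does in one sentence."}],
    "max_tokens": 256
  }'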
deploy/compose/.env
Lines changed: 9 additions & 9 deletions

@@ -24,10 +24,10 @@ export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=nim-llm:8000
  export SUMMARY_LLM_SERVERURL=nim-llm:8000
  export APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
  export APP_RANKING_SERVERURL=nemotron-ranking-ms:8000
- export OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
- export OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
+ export OCR_GRPC_ENDPOINT=nemotron-ocr:8001
+ export OCR_HTTP_ENDPOINT=http://nemotron-ocr:8000/v1/infer
  export OCR_INFER_PROTOCOL=grpc
- export OCR_MODEL_NAME=scene_text_ensemble
+ export OCR_MODEL_NAME=pipeline
  export YOLOX_GRPC_ENDPOINT=page-elements:8001
  export YOLOX_INFER_PROTOCOL=grpc
  export YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=graphic-elements:8001

@@ -41,23 +41,23 @@ export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=grpc

  # export APP_EMBEDDINGS_SERVERURL=https://integrate.api.nvidia.com/v1
  # export APP_LLM_SERVERURL=""
- # export APP_LLM_MODELNAME=nvidia/llama-3.3-nemotron-super-49b-v1.5
- # export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/llama-3.3-nemotron-super-49b-v1.5
+ # export APP_LLM_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+ # export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/nemotron-3-super-120b-a12b
  # export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=""
- # export SUMMARY_LLM="nvidia/llama-3.3-nemotron-super-49b-v1.5"
+ # export SUMMARY_LLM="nvidia/nemotron-3-super-120b-a12b"
  # export APP_RANKING_SERVERURL=""
  # export SUMMARY_LLM_SERVERURL=""
- # export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
+ # export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-ocr-v1
  # export OCR_INFER_PROTOCOL=http
- # export OCR_MODEL_NAME=scene_text_ensemble
+ # export OCR_MODEL_NAME=pipeline
  # export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
  # export YOLOX_INFER_PROTOCOL=http
  # export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
  # export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
  # export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
  # export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
  # export APP_QUERYREWRITER_SERVERURL=""
- # export APP_QUERYREWRITER_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1.5"
+ # export APP_QUERYREWRITER_MODELNAME="nvidia/nemotron-3-super-120b-a12b"


  # ==========================

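The commented-out block in this .env file lists the NVIDIA-hosted alternatives to the self-hosted NIM endpoints. A minimal sketch (assuming the deployment already has the required NVIDIA API credentials configured, which this file does not show): switching OCR extraction from the local nemotron-ocr container to the build.nvidia.com hosted endpoint amounts to exporting the hosted values before bringing up the stack. The same pattern applies to the YOLOX page-elements, graphic-elements, and table-structure endpoints.

# Use the hosted Nemotron OCR NIM instead of the local container (values taken from the commented defaults above)
export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-ocr-v1
export OCR_INFER_PROTOCOL=http
export OCR_MODEL_NAME=pipeline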
deploy/compose/docker-compose-ingestor-server.yaml
Lines changed: 14 additions & 14 deletions

@@ -30,10 +30,10 @@ services:
  ##===Vector DB specific configurations===
  # URL on which vectorstore is hosted
  # For custom operators, point to your service (e.g., http://your-custom-vdb:1234)
- APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530}
+ APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://elasticsearch:9200}
  # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch".
  # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`.
- APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"}
+ APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"elasticsearch"}

  # Type of vectordb search to be used
  APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid

@@ -44,10 +44,10 @@ services:
  # Weight for sparse vector search in case of "weighted" Hybrid Search
  APP_VECTORSTORE_SPARSE_WEIGHT: ${APP_VECTORSTORE_SPARSE_WEIGHT:-0.5}

- # Boolean to enable GPU index for milvus vectorstore specific to nvingest
- APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True}
- # Boolean to control GPU search for milvus vectorstore specific to nvingest
- APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True}
+ # Milvus only (ignored for Elasticsearch). Set True when using Milvus + GPU.
+ APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-False}
+ # Milvus only (ignored for Elasticsearch). Set True when using Milvus + GPU.
+ APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-False}
  # Username for vector store
  APP_VECTORSTORE_USERNAME: ${APP_VECTORSTORE_USERNAME:-""}
  APP_VECTORSTORE_PASSWORD: ${APP_VECTORSTORE_PASSWORD:-""}

@@ -124,7 +124,7 @@ services:
  ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True}

  # Choose the summary model to use for document summary
- SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3.3-nemotron-super-49b-v1.5}
+ SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/nemotron-3-super-120b-a12b}
  SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-${APP_LLM_SERVERURL-"nim-llm:8000"}}
  SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-9000}
  SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-400}

@@ -140,15 +140,15 @@ services:
  REDIS_DB: ${REDIS_DB:-0}
  ENABLE_REDIS_BACKEND: ${ENABLE_REDIS_BACKEND:-False}

- # Bulk upload to MinIO
- ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True}
  TEMP_DIR: ${TEMP_DIR:-/tmp-data}
  INGESTOR_SERVER_DATA_DIR: ${INGESTOR_SERVER_DATA_DIR:-/data/}

  # NV-Ingest Batch Mode Configurations
  NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16}
  NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4}
  ENABLE_NV_INGEST_DYNAMIC_BATCHING: ${ENABLE_NV_INGEST_DYNAMIC_BATCHING:-True}
+ # Max memory budget (MB) for a single ingestion job; used for dynamic batch sizing
+ INGESTION_MAX_MEMORY_BUDGET_MB: ${INGESTION_MAX_MEMORY_BUDGET_MB:-1024}

  # Tracing
  APP_TRACING_ENABLED: ${APP_TRACING_ENABLED:-"False"}

@@ -169,7 +169,7 @@ services:
  - "6379:6379"

  nv-ingest-ms-runtime:
- image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.2
+ image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.3.0
  # cpuset: "0-15" # Uncomment to restrict this container to CPU cores 0–15
  shm_size: 40gb # Should be at minimum 30% of assigned memory per Ray documentation
  volumes:

@@ -220,12 +220,12 @@ services:
  - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48}
  - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317
  # Self-hosted ocr endpoints.
- - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-nemoretriever-ocr:8001}
- - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-http://nemoretriever-ocr:8000/v1/infer}
+ - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-nemotron-ocr:8001}
+ - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-http://nemotron-ocr:8000/v1/infer}
  - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-grpc}
- - OCR_MODEL_NAME=${OCR_MODEL_NAME:-scene_text_ensemble}
+ - OCR_MODEL_NAME=${OCR_MODEL_NAME:-pipeline}
  # build.nvidia.com hosted ocr endpoints.
- #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
+ #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-ocr-v1
  #- OCR_INFER_PROTOCOL=http
  - PDF_SPLIT_PAGE_COUNT=${PDF_SPLIT_PAGE_COUNT:-32}
  - REDIS_INGEST_TASK_QUEUE=ingest_task_queue

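Because the compose file reads these settings with ${VAR:-default} substitution, the new Elasticsearch defaults can be overridden from the shell without editing the YAML. A minimal sketch for running the ingestor against Milvus with GPU indexing instead (assuming a Milvus service is still reachable at its previous default address, http://milvus:19530; the exact compose invocation may differ per deployment):

# Revert to the Milvus vector store and re-enable the GPU index/search flags for this session
export APP_VECTORSTORE_URL=http://milvus:19530
export APP_VECTORSTORE_NAME=milvus
export APP_VECTORSTORE_ENABLEGPUINDEX=True
export APP_VECTORSTORE_ENABLEGPUSEARCH=True
docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d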
deploy/compose/docker-compose-rag-server.yaml
Lines changed: 19 additions & 15 deletions

@@ -31,10 +31,10 @@ services:
  ##===Vector DB specific configurations===
  # URL on which vectorstore is hosted
  # For custom operators, point to your service (e.g., http://your-custom-vdb:1234)
- APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530}
+ APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://elasticsearch:9200}
  # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch".
  # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`.
- APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"}
+ APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"elasticsearch"}
  # Type of index to be used for vectorstore
  APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"}

@@ -47,8 +47,8 @@ services:
  # Weight for sparse vector search in case of "weighted" Hybrid Search
  APP_VECTORSTORE_SPARSE_WEIGHT: ${APP_VECTORSTORE_SPARSE_WEIGHT:-0.5}

- # Boolean to control GPU search for milvus vectorstore specific to rag-server
- APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True}
+ # Milvus only (ignored for Elasticsearch). Set True when using Milvus + GPU.
+ APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-False}
  # ef: Parameter controlling query time/accuracy trade-off. Higher ef leads to more accurate but slower search.
  APP_VECTORSTORE_EF: ${APP_VECTORSTORE_EF:-100} # Must be greater or equal to VECTOR_DB_TOPK
  # Username for vector store

@@ -66,27 +66,33 @@ services:
  # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt
  VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100}

+ # Fetch full page context: when True, fetches ALL chunks for retrieved pages and organizes by page
+ # Useful for PDFs where we have page numbers in file
+ APP_FETCH_FULL_PAGE_CONTEXT: ${APP_FETCH_FULL_PAGE_CONTEXT:-false}
+ # N pages before/after each retrieved page (0=disabled, 1=+/-1 page). Requires APP_FETCH_FULL_PAGE_CONTEXT=true
+ APP_FETCH_NEIGHBORING_PAGES: ${APP_FETCH_NEIGHBORING_PAGES:-0}
+
  ##===LLM Model specific configurations===
- APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3.3-nemotron-super-49b-v1.5"}
+ APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/nemotron-3-super-120b-a12b"}
  # url on which llm model is hosted. If "", Nvidia hosted API is used
  APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"}
  # LLM model parameters
- LLM_MAX_TOKENS: ${LLM_MAX_TOKENS:-32768}
+ LLM_MAX_TOKENS: ${LLM_MAX_TOKENS:-131072}
  LLM_TEMPERATURE: ${LLM_TEMPERATURE:-0}
  LLM_TOP_P: ${LLM_TOP_P:-1.0}

- # Reasoning configuration (supported by Nemotron 3 and other reasoning models)
- LLM_ENABLE_THINKING: ${LLM_ENABLE_THINKING:-false}
- LLM_REASONING_BUDGET: ${LLM_REASONING_BUDGET:-0}
- LLM_LOW_EFFORT: ${LLM_LOW_EFFORT:-false}
+ # Reasoning configuration (enabled by default for Nemotron 3 Super)
+ LLM_ENABLE_THINKING: ${LLM_ENABLE_THINKING:-true}
+ LLM_REASONING_BUDGET: ${LLM_REASONING_BUDGET:-256}
+ LLM_LOW_EFFORT: ${LLM_LOW_EFFORT:-true}

  ##===Query Rewriter Model specific configurations===
- APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"nvidia/llama-3.3-nemotron-super-49b-v1.5"}
+ APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"nvidia/nemotron-3-super-120b-a12b"}
  # url on which query rewriter model is hosted. If "", Nvidia hosted API is used
  APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm:8000"}

  ##===Filter Expression Generator Model specific configurations===
- APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3.3-nemotron-super-49b-v1.5"}
+ APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/nemotron-3-super-120b-a12b"}
  # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used
  APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"}
  # enable filter expression generator for natural language to filter expression conversion

@@ -189,7 +195,7 @@ services:
  # Minimum groundedness score threshold (0-2)
  RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1}
  # reflection llm
- REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3.3-nemotron-super-49b-v1.5"}
+ REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/nemotron-3-super-120b-a12b"}
  # reflection llm server url. If "", Nvidia hosted API is used
  REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"}
  # enable iterative query decomposition

@@ -220,7 +226,6 @@ services:
  # Environment variables for Vite build
  VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1}
  VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1}
- VITE_MILVUS_URL: http://milvus:19530
  DOWNLOAD_LEGAL_COMPLIANCE: ${DOWNLOAD_LEGAL_COMPLIANCE:-false}
  ports:
  - "8090:3000"

@@ -230,7 +235,6 @@ services:
  # Runtime environment variables for Vite
  VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1}
  VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1}
- VITE_MILVUS_URL: http://milvus:19530
  depends_on:
  - rag-server


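The reasoning settings above are now enabled by default for the Nemotron 3 Super model, and the new page-context options default to off. Both can still be tuned per deployment through the same ${VAR:-default} substitution. A minimal sketch (the override values here are only illustrative, not recommendations from this commit) that turns thinking off for latency-sensitive serving and enables full-page context with one neighboring page:

# Override the new reasoning and page-context defaults for this session
export LLM_ENABLE_THINKING=false
export LLM_REASONING_BUDGET=0
export APP_FETCH_FULL_PAGE_CONTEXT=true
export APP_FETCH_NEIGHBORING_PAGES=1
docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d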