NVIDIA
diff --git a/.env.example b/.env.example
Lines changed: 17 additions & 0 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 12 additions & 0 deletions
diff --git a/docker-compose.yaml b/docker-compose.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
Lines changed: 4 additions & 0 deletions
diff --git a/nemo_retriever/src/nemo_retriever/graph/tabular_fetch_embeddings_operator.py b/nemo_retriever/src/nemo_retriever/graph/tabular_fetch_embeddings_operator.py
Lines changed: 38 additions & 0 deletions
diff --git a/nemo_retriever/src/nemo_retriever/graph/tabular_schema_extract_operator.py b/nemo_retriever/src/nemo_retriever/graph/tabular_schema_extract_operator.py
Lines changed: 57 additions & 0 deletions
diff --git a/nemo_retriever/src/nemo_retriever/ingestor.py b/nemo_retriever/src/nemo_retriever/ingestor.py
Lines changed: 1 addition & 1 deletion
diff --git a/nemo_retriever/src/nemo_retriever/params/__init__.py b/nemo_retriever/src/nemo_retriever/params/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py
Lines changed: 21 additions & 0 deletions
diff --git a/nemo_retriever/src/nemo_retriever/retriever.py b/nemo_retriever/src/nemo_retriever/retriever.py
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Neo4j connection settings
+# Copy this file to .env and fill in your values:
+#   cp .env.example .env
+#
+# bolt://neo4j:7687     — use when running inside Docker (service name as host)
+# bolt://localhost:7687 — use when running Python on your host machine
+NEO4J_URI=bolt://localhost:7687
+NEO4J_USERNAME=neo4j
+NEO4J_PASSWORD=your_password_here
+
+# LLM (NVIDIA NIM) settings
+LLM_INVOKE_URL=https://integrate.api.nvidia.com/v1
+LLM_API_KEY=your_nvidia_api_key_here
+LLM_MODEL=meta/llama-3.1-70b-instruct
+
+# DuckDB
+DUCKDB_PATH=./spider2.duckdb
@@ -1,5 +1,8 @@
 # Put new items at the bottom of this list!!!
 
+# macOS
+.DS_Store
+
 # Created by https://www.gitignore.io/api/vim,c++,cmake,python,synology
 
 ### C++ ###
@@ -248,4 +251,13 @@ lancedb/
 outputs/
 models/
 
+# DuckDB database files (generated by setup_spider2.py)
+*.duckdb
+*.duckdb.wal
+
+# Local environment variables (credentials) — never commit
+.env
+
 nemo_retriever/run_results/
+
+nemo_retriever/src/nemo_retriever/relational_db/generate_sql/spider2-lite.jsonl
@@ -520,3 +520,26 @@ services:
       - "milvus"
     profiles:
       - retrieval
+
+  # Neo4j graph database; only started when the `graph` profile is enabled.
+  neo4j:
+    # NOTE(review): `latest` is a floating tag, so the server version can change
+    # between pulls — consider pinning to a specific release.
+    image: neo4j:latest
+    container_name: neo4j
+    ports:
+      - "7474:7474"   # Browser UI
+      - "7687:7687"   # Bolt protocol
+    environment:
+      # NOTE(review): the official image is understood to accept only `neo4j` as
+      # the initial admin user in NEO4J_AUTH — confirm before overriding
+      # NEO4J_USERNAME.
+      NEO4J_AUTH: ${NEO4J_USERNAME:-neo4j}/${NEO4J_PASSWORD:-neo4jpassword}
+      NEO4J_PLUGINS: '["apoc"]'
+    volumes:
+      - neo4j_data:/data
+    healthcheck:
+      # Probe with the same interpolated credentials as NEO4J_AUTH above so the
+      # check does not fail when the username or password is overridden.
+      test: ["CMD", "cypher-shell", "-u", "${NEO4J_USERNAME:-neo4j}", "-p", "${NEO4J_PASSWORD:-neo4jpassword}", "RETURN 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 30s
+    profiles:
+      - graph
+
+volumes:
+  neo4j_data:
@@ -78,6 +78,10 @@ dependencies = [
   "scipy>=1.11.0",
   "nvidia-ml-py",
   "vllm==0.16.0",
+  "duckdb>=1.2.0",
+  "duckdb-engine>=0.13.0",
+  "neo4j>=5.0",
+  "langchain-nvidia-ai-endpoints>=0.3.0",
 ]
 
 [project.optional-dependencies]
 
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Graph operator: fetch tabular entity descriptions from Neo4j into an embedding-ready DataFrame."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pandas as pd
+
+from nemo_retriever.graph.abstract_operator import AbstractOperator
+from nemo_retriever.graph.cpu_operator import CPUOperator
+
+
+class TabularFetchEmbeddingsOp(AbstractOperator, CPUOperator):
+    """Load tabular entity descriptions from Neo4j as an embedding-ready DataFrame.
+
+    The operator's input is discarded: :meth:`process` always delegates to
+    ``fetch_tabular_embedding_dataframe`` and hands back whatever it returns.
+    That frame is documented to carry the columns
+    ``text``, ``_embed_modality``, ``path``, ``page_number``, ``metadata``.
+
+    Because this layout matches what the unstructured pipeline emits, the
+    standard :class:`~nemo_retriever.text_embed.operators._BatchEmbedActor`
+    can be chained directly after this operator.
+    """
+
+    def preprocess(self, data: Any, **kwargs: Any) -> Any:
+        """Pass the incoming payload through untouched."""
+        return data
+
+    def process(self, data: Any, **kwargs: Any) -> pd.DataFrame:
+        """Ignore *data* and return the freshly fetched DataFrame."""
+        # The import is deferred until the operator actually runs.
+        from nemo_retriever.tabular_data.ingestion.embeddings import (
+            fetch_tabular_embedding_dataframe as _fetch_frame,
+        )
+
+        embedding_frame = _fetch_frame()
+        return embedding_frame
+
+    def postprocess(self, data: Any, **kwargs: Any) -> Any:
+        """Return the processed result unchanged."""
+        return data
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Graph operator: extract relational DB schema and store it in Neo4j."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pandas as pd
+
+from nemo_retriever.graph.abstract_operator import AbstractOperator
+from nemo_retriever.graph.cpu_operator import CPUOperator
+from nemo_retriever.params import TabularExtractParams
+
+
+class TabularSchemaExtractOp(AbstractOperator, CPUOperator):
+    """Pull schema entities out of a relational DB and persist them in Neo4j.
+
+    The work happens in two stages:
+    1. Schema metadata (tables, columns, views, PKs, FKs) is read from the
+       database through the
+       :class:`~nemo_retriever.tabular_data.sql_database.SQLDatabase`
+       connector carried by *tabular_params*.
+    2. The extracted entities are written to Neo4j as graph nodes and
+       relationships.
+
+    The returned DataFrame is always empty; it exists only so this operator
+    can be chained with downstream operators
+    (e.g. :class:`TabularFetchEmbeddingsOp`) via ``>>``.  All meaningful
+    state lives in Neo4j once this step has run.
+    """
+
+    def __init__(
+        self,
+        *,
+        tabular_params: TabularExtractParams | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Forward all arguments to the base classes and keep *tabular_params* as the fallback."""
+        super().__init__(tabular_params=tabular_params, **kwargs)
+        self._tabular_params = tabular_params
+
+    def preprocess(self, data: Any, **kwargs: Any) -> TabularExtractParams | None:
+        """Choose the params to run with.
+
+        Params passed in as *data* take precedence; any other input falls
+        back to the params given at construction time (possibly ``None``).
+        """
+        return data if isinstance(data, TabularExtractParams) else self._tabular_params
+
+    def process(self, data: TabularExtractParams | None, **kwargs: Any) -> pd.DataFrame:
+        """Extract the schema, store it in Neo4j, and return an empty DataFrame."""
+        # The import is deferred until the operator actually runs.
+        from nemo_retriever.tabular_data.ingestion.extract_data import (
+            extract_tabular_db_data as _extract_schema,
+            store_relational_db_in_neo4j as _store_schema,
+        )
+
+        _store_schema(data=_extract_schema(params=data))
+        return pd.DataFrame()
+
+    def postprocess(self, data: Any, **kwargs: Any) -> Any:
+        """Return the processed result unchanged."""
+        return data
@@ -23,11 +23,11 @@
 from nemo_retriever.params import DedupParams
 from nemo_retriever.params import EmbedParams
 from nemo_retriever.params import ExtractParams
-from nemo_retriever.params import TextChunkParams
 from nemo_retriever.params import IngestExecuteParams
 from nemo_retriever.params import IngestorCreateParams
 from nemo_retriever.params import RunMode
 from nemo_retriever.params import StoreParams
+from nemo_retriever.params import TextChunkParams
 from nemo_retriever.params import VdbUploadParams
 
 
 
@@ -25,6 +25,7 @@
 from .models import RemoteRetryParams
 from .models import RunMode
 from .models import StoreParams
+from .models import TabularExtractParams
 from .models import TableParams
 from .models import TextChunkParams
 from .models import VdbUploadParams
@@ -53,6 +54,7 @@
     "RemoteRetryParams",
     "RunMode",
     "StoreParams",
+    "TabularExtractParams",
     "TableParams",
     "TextChunkParams",
     "VdbUploadParams",
 
@@ -9,6 +9,7 @@
 import warnings
 
 
+from nemo_retriever.tabular_data.sql_database import SQLDatabase
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 RunMode = Literal["inprocess", "batch", "fused", "online"]
@@ -349,3 +350,23 @@ class InfographicParams(_ParamsModel):
     output_column: str = "infographic_elements_v1"
     num_detections_column: str = "infographic_elements_v1_num_detections"
     counts_by_label_column: str = "infographic_elements_v1_counts_by_label"
+
+
+# ---------------------------------------------------------------------------
+# Structured (database) ingestion params
+# ---------------------------------------------------------------------------
+
+
+class TabularExtractParams(_ParamsModel):
+    """Params for step 1: extract schema metadata and write to Neo4j.
+
+    Covers SQLAlchemy reflection of a live database and/or parsing of
+    pre-existing SQL DDL/query files.  Produces Database, Schema, Table,
+    Column, View and Query nodes together with their relationships.
+    The Neo4j connection is provided by get_neo4j_conn() (see
+    tabular_data.neo4j) and is not configured here.
+    """
+
+    # ``extra="forbid"`` makes pydantic reject unknown fields.
+    # ``arbitrary_types_allowed`` is presumably required because SQLDatabase
+    # is not itself a pydantic type — TODO confirm.
+    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
+
+    # Database connector to extract the schema from; defaults to None.
+    # NOTE(review): how downstream code treats a missing connector is not
+    # visible here — verify against extract_tabular_db_data.
+    connector: Optional[SQLDatabase] = None
@@ -319,6 +319,12 @@ def queries(
 
         return results
 
+    def generate_sql(self, query: str) -> str:
+        """Translate a natural-language question into a SQL query string.
+
+        Thin wrapper: *query* is forwarded unchanged to
+        ``nemo_retriever.tabular_data.retrieval.generate_sql`` and that
+        function's result is returned as-is.  The import is deferred until
+        the method is called.
+        """
+        from nemo_retriever.tabular_data.retrieval import generate_sql as _generate_sql
+
+        sql_text = _generate_sql(query)
+        return sql_text
+
 
 # Backward compatibility alias.
 retriever = Retriever