Skip to content

Commit 8906dd8

Browse files
tomer-levin-nvliavnaveDinaLaptiipolazilberftatiana-nv
authored
tabular data ingestion (#1720)
Co-authored-by: Liav Nave <lnave@nvidia.com> Co-authored-by: dlaptii <dlaptii@nvidia.com> Co-authored-by: polazilber <pzilberman@nvidia.com> Co-authored-by: Tatiana Frenklach <tfrenklach@nvidia.com> Co-authored-by: Yuval Shkolar <yshkolar@nvidia.com> Co-authored-by: Lio Fleishman <lfleishman@nvidia.com>
1 parent 28c2575 commit 8906dd8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+4197
-1
lines changed

.env.example

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Neo4j connection settings
2+
# Copy this file to .env and fill in your values:
3+
# cp .env.example .env
4+
#
5+
# bolt://neo4j:7687 — use when running inside Docker (service name as host)
6+
# bolt://localhost:7687 — use when running Python on your host machine
7+
NEO4J_URI=bolt://localhost:7687
8+
NEO4J_USERNAME=neo4j
9+
NEO4J_PASSWORD=your_password_here
10+
11+
# LLM (NVIDIA NIM) settings
12+
LLM_INVOKE_URL=https://integrate.api.nvidia.com/v1
13+
LLM_API_KEY=your_nvidia_api_key_here
14+
LLM_MODEL=meta/llama-3.1-70b-instruct
15+
16+
# DuckDB
17+
DUCKDB_PATH=./spider2.duckdb

.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Put new items at the bottom of this list!!!
22

3+
# macOS
4+
.DS_Store
5+
36
# Created by https://www.gitignore.io/api/vim,c++,cmake,python,synology
47

58
### C++ ###
@@ -248,4 +251,13 @@ lancedb/
248251
outputs/
249252
models/
250253

254+
# DuckDB database files (generated by setup_spider2.py)
255+
*.duckdb
256+
*.duckdb.wal
257+
258+
# Local environment variables (credentials) — never commit
259+
.env
260+
251261
nemo_retriever/run_results/
262+
263+
nemo_retriever/src/nemo_retriever/relational_db/generate_sql/spider2-lite.jsonl

docker-compose.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,3 +520,26 @@ services:
520520
- "milvus"
521521
profiles:
522522
- retrieval
523+
524+
neo4j:
525+
image: neo4j:latest
526+
container_name: neo4j
527+
ports:
528+
- "7474:7474" # Browser UI
529+
- "7687:7687" # Bolt protocol
530+
environment:
531+
NEO4J_AUTH: ${NEO4J_USERNAME:-neo4j}/${NEO4J_PASSWORD:-neo4jpassword}
532+
NEO4J_PLUGINS: '["apoc"]'
533+
volumes:
534+
- neo4j_data:/data
535+
healthcheck:
536+
test: ["CMD", "cypher-shell", "-u", "neo4j", "-p", "${NEO4J_PASSWORD:-neo4jpassword}", "RETURN 1"]
537+
interval: 30s
538+
timeout: 10s
539+
retries: 5
540+
start_period: 30s
541+
profiles:
542+
- graph
543+
544+
volumes:
545+
neo4j_data:

nemo_retriever/pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ dependencies = [
7878
"scipy>=1.11.0",
7979
"nvidia-ml-py",
8080
"vllm==0.16.0",
81+
"duckdb>=1.2.0",
82+
"duckdb-engine>=0.13.0",
83+
"neo4j>=5.0",
84+
"langchain-nvidia-ai-endpoints>=0.3.0",
8185
]
8286

8387
[project.optional-dependencies]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2+
# All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Graph operator: fetch tabular entity descriptions from Neo4j into an embedding-ready DataFrame."""
6+
7+
from __future__ import annotations
8+
9+
from typing import Any
10+
11+
import pandas as pd
12+
13+
from nemo_retriever.graph.abstract_operator import AbstractOperator
14+
from nemo_retriever.graph.cpu_operator import CPUOperator
15+
16+
17+
class TabularFetchEmbeddingsOp(AbstractOperator, CPUOperator):
18+
"""Fetch all tabular entity descriptions from Neo4j into an embedding-ready DataFrame.
19+
20+
This operator ignores its input — it always queries Neo4j directly and
21+
returns a fresh DataFrame with columns:
22+
``text``, ``_embed_modality``, ``path``, ``page_number``, ``metadata``.
23+
24+
The output schema matches the format produced by the unstructured pipeline,
25+
so the standard :class:`~nemo_retriever.text_embed.operators._BatchEmbedActor`
26+
can be chained directly after this operator.
27+
"""
28+
29+
def preprocess(self, data: Any, **kwargs: Any) -> Any:
30+
return data
31+
32+
def process(self, data: Any, **kwargs: Any) -> pd.DataFrame:
33+
from nemo_retriever.tabular_data.ingestion.embeddings import fetch_tabular_embedding_dataframe
34+
35+
return fetch_tabular_embedding_dataframe()
36+
37+
def postprocess(self, data: Any, **kwargs: Any) -> Any:
38+
return data
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2+
# All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Graph operator: extract relational DB schema and store it in Neo4j."""
6+
7+
from __future__ import annotations
8+
9+
from typing import Any
10+
11+
import pandas as pd
12+
13+
from nemo_retriever.graph.abstract_operator import AbstractOperator
14+
from nemo_retriever.graph.cpu_operator import CPUOperator
15+
from nemo_retriever.params import TabularExtractParams
16+
17+
18+
class TabularSchemaExtractOp(AbstractOperator, CPUOperator):
19+
"""Extract schema entities from a relational DB and write them to Neo4j.
20+
21+
Combines two steps:
22+
1. Pull schema metadata (tables, columns, views, PKs, FKs) from the
23+
database via the :class:`~nemo_retriever.tabular_data.sql_database.SQLDatabase`
24+
connector stored in *tabular_params*.
25+
2. Write the extracted entities as graph nodes and relationships into Neo4j.
26+
27+
The operator produces an empty DataFrame as output so it can be chained
28+
with downstream operators (e.g. :class:`TabularFetchEmbeddingsOp`) via
29+
``>>``. All meaningful state lives in Neo4j after this step.
30+
"""
31+
32+
def __init__(
33+
self,
34+
*,
35+
tabular_params: TabularExtractParams | None = None,
36+
**kwargs: Any,
37+
) -> None:
38+
super().__init__(tabular_params=tabular_params, **kwargs)
39+
self._tabular_params = tabular_params
40+
41+
def preprocess(self, data: Any, **kwargs: Any) -> TabularExtractParams | None:
42+
if isinstance(data, TabularExtractParams):
43+
return data
44+
return self._tabular_params
45+
46+
def process(self, data: TabularExtractParams | None, **kwargs: Any) -> pd.DataFrame:
47+
from nemo_retriever.tabular_data.ingestion.extract_data import (
48+
extract_tabular_db_data,
49+
store_relational_db_in_neo4j,
50+
)
51+
52+
schema_data = extract_tabular_db_data(params=data)
53+
store_relational_db_in_neo4j(data=schema_data)
54+
return pd.DataFrame()
55+
56+
def postprocess(self, data: Any, **kwargs: Any) -> Any:
57+
return data

nemo_retriever/src/nemo_retriever/ingestor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323
from nemo_retriever.params import DedupParams
2424
from nemo_retriever.params import EmbedParams
2525
from nemo_retriever.params import ExtractParams
26-
from nemo_retriever.params import TextChunkParams
2726
from nemo_retriever.params import IngestExecuteParams
2827
from nemo_retriever.params import IngestorCreateParams
2928
from nemo_retriever.params import RunMode
3029
from nemo_retriever.params import StoreParams
30+
from nemo_retriever.params import TextChunkParams
3131
from nemo_retriever.params import VdbUploadParams
3232

3333

nemo_retriever/src/nemo_retriever/params/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from .models import RemoteRetryParams
2626
from .models import RunMode
2727
from .models import StoreParams
28+
from .models import TabularExtractParams
2829
from .models import TableParams
2930
from .models import TextChunkParams
3031
from .models import VdbUploadParams
@@ -53,6 +54,7 @@
5354
"RemoteRetryParams",
5455
"RunMode",
5556
"StoreParams",
57+
"TabularExtractParams",
5658
"TableParams",
5759
"TextChunkParams",
5860
"VdbUploadParams",

nemo_retriever/src/nemo_retriever/params/models.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import warnings
1010

1111

12+
from nemo_retriever.tabular_data.sql_database import SQLDatabase
1213
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
1314

1415
RunMode = Literal["inprocess", "batch", "fused", "online"]
@@ -349,3 +350,23 @@ class InfographicParams(_ParamsModel):
349350
output_column: str = "infographic_elements_v1"
350351
num_detections_column: str = "infographic_elements_v1_num_detections"
351352
counts_by_label_column: str = "infographic_elements_v1_counts_by_label"
353+
354+
355+
# ---------------------------------------------------------------------------
356+
# Structured (database) ingestion params
357+
# ---------------------------------------------------------------------------
358+
359+
360+
class TabularExtractParams(_ParamsModel):
361+
"""Params for step 1: extract schema metadata and write to Neo4j.
362+
363+
Covers SQLAlchemy reflection of a live database and/or parsing of
364+
pre-existing SQL DDL/query files. Produces Database, Schema, Table,
365+
Column, View and Query nodes together with their relationships.
366+
The Neo4j connection is provided by get_neo4j_conn() (see
367+
tabular_data.neo4j) and is not configured here.
368+
"""
369+
370+
model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
371+
372+
connector: Optional[SQLDatabase] = None

nemo_retriever/src/nemo_retriever/retriever.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,12 @@ def queries(
319319

320320
return results
321321

322+
def generate_sql(self, query: str) -> str:
323+
"""Generate a SQL query for a given natural language query."""
324+
from nemo_retriever.tabular_data.retrieval import generate_sql
325+
326+
return generate_sql(query)
327+
322328

323329
# Backward compatibility alias.
324330
retriever = Retriever

0 commit comments

Comments
 (0)