Skip to content

Commit 3060ee0

Browse files
authored
Merge branch 'main' into shkolar/fix-mac
2 parents 00837ef + 81fabbe commit 3060ee0

18 files changed

+663
-530
lines changed

nemo_retriever/harness/HANDOFF.md

Lines changed: 63 additions & 224 deletions
Large diffs are not rendered by default.

nemo_retriever/harness/test_configs.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
active:
44
dataset: jp20
55
preset: single_gpu
6+
run_mode: batch
67
query_csv: data/jp20_query_gt.csv
78
input_type: pdf
89
recall_required: true

nemo_retriever/src/nemo_retriever/graph/content_transforms.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import pandas as pd
1212

13+
from nemo_retriever.io.image_store import resolve_image_b64
1314
from nemo_retriever.ocr.ocr import _crop_b64_image_by_norm_bbox
1415
from nemo_retriever.params.models import IMAGE_MODALITIES
1516

@@ -71,7 +72,7 @@ def explode_content_to_rows(
7172
batch_df = batch_df.copy()
7273
if text_mod in IMAGE_MODALITIES and "page_image" in batch_df.columns:
7374
batch_df["_image_b64"] = batch_df["page_image"].apply(
74-
lambda page_image: page_image.get("image_b64") if isinstance(page_image, dict) else None
75+
lambda page_image: resolve_image_b64(page_image) if isinstance(page_image, dict) else None
7576
)
7677
batch_df["_embed_modality"] = text_mod
7778
return batch_df
@@ -84,7 +85,7 @@ def explode_content_to_rows(
8485
page_image = row_dict.get("page_image")
8586
page_image_b64: Optional[str] = None
8687
if any_images and isinstance(page_image, dict):
87-
page_image_b64 = page_image.get("image_b64")
88+
page_image_b64 = resolve_image_b64(page_image)
8889

8990
page_text = row_dict.get(text_column)
9091
if isinstance(page_text, str) and page_text.strip():
@@ -103,6 +104,7 @@ def explode_content_to_rows(
103104
for item in content_list:
104105
if not isinstance(item, dict):
105106
continue
107+
item_b64 = resolve_image_b64(item) if struct_mod in IMAGE_MODALITIES else None
106108
# Emit rows for text and (optionally) caption fields.
107109
for field, content_type in [("text", column), ("caption", f"{column}_caption")]:
108110
value = item.get(field, "")
@@ -112,15 +114,18 @@ def explode_content_to_rows(
112114
content_row[text_column] = value.strip()
113115
content_row["_embed_modality"] = struct_mod
114116
content_row["_content_type"] = content_type
115-
if struct_mod in IMAGE_MODALITIES and page_image_b64:
116-
bbox = item.get("bbox_xyxy_norm")
117-
if bbox and len(bbox) == 4:
118-
cropped_b64, _ = _crop_b64_image_by_norm_bbox(page_image_b64, bbox_xyxy_norm=bbox)
119-
content_row["_image_b64"] = cropped_b64
117+
if struct_mod in IMAGE_MODALITIES:
118+
if item_b64:
119+
content_row["_image_b64"] = item_b64
120+
elif page_image_b64:
121+
bbox = item.get("bbox_xyxy_norm")
122+
if bbox and len(bbox) == 4:
123+
cropped_b64, _ = _crop_b64_image_by_norm_bbox(page_image_b64, bbox_xyxy_norm=bbox)
124+
content_row["_image_b64"] = cropped_b64
125+
else:
126+
content_row["_image_b64"] = page_image_b64
120127
else:
121-
content_row["_image_b64"] = page_image_b64
122-
elif struct_mod in IMAGE_MODALITIES:
123-
content_row["_image_b64"] = None
128+
content_row["_image_b64"] = None
124129
new_rows.append(content_row)
125130
exploded_any = True
126131

@@ -155,7 +160,7 @@ def collapse_content_to_page_rows(
155160
if modality in IMAGE_MODALITIES:
156161
if "page_image" in batch_df.columns:
157162
batch_df["_image_b64"] = batch_df["page_image"].apply(
158-
lambda page_image: page_image.get("image_b64") if isinstance(page_image, dict) else None
163+
lambda page_image: resolve_image_b64(page_image) if isinstance(page_image, dict) else None
159164
)
160165
else:
161166
batch_df["_image_b64"] = None

nemo_retriever/src/nemo_retriever/harness/artifacts.py

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,116 @@
55
from __future__ import annotations
66

77
import json
8+
import re
89
import subprocess
910
from datetime import datetime, timezone
1011
from pathlib import Path
1112
from typing import Any
1213

1314
NEMO_RETRIEVER_ROOT = Path(__file__).resolve().parents[3]
1415
DEFAULT_ARTIFACTS_ROOT = NEMO_RETRIEVER_ROOT / "artifacts"
16+
_COMMIT_RE = re.compile(r"^[0-9a-fA-F]{7,40}$")
1517

1618

1719
def now_timestr() -> str:
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS_UTC``."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y%m%d_%H%M%S_UTC")
1921

2022

23+
def _normalize_commit(value: str | None) -> str | None:
    """Return the first seven characters of *value* if it looks like a commit hash.

    The input is stripped of surrounding whitespace and checked against
    ``_COMMIT_RE``. ``None``, empty strings and anything that does not match
    the pattern yield ``None``.
    """
    candidate = value.strip() if value else ""
    return candidate[:7] if _COMMIT_RE.match(candidate) else None
28+
29+
30+
def _resolve_git_dir(repo_root: Path) -> Path | None:
31+
dot_git = repo_root / ".git"
32+
if dot_git.is_dir():
33+
return dot_git
34+
if not dot_git.is_file():
35+
return None
36+
try:
37+
raw = dot_git.read_text(encoding="utf-8").strip()
38+
except Exception:
39+
return None
40+
if not raw.startswith("gitdir:"):
41+
return None
42+
gitdir_text = raw.split(":", 1)[1].strip()
43+
git_dir = Path(gitdir_text).expanduser()
44+
if not git_dir.is_absolute():
45+
git_dir = (repo_root / git_dir).resolve()
46+
return git_dir
47+
48+
49+
def _read_packed_ref(git_dir: Path, ref_name: str) -> str | None:
    """Look up *ref_name* in ``git_dir / "packed-refs"``.

    Blank lines and lines starting with ``#`` or ``^`` are skipped. Each
    remaining line is split at its first space into a commit and a ref name.
    The first entry whose ref equals *ref_name* and whose commit passes
    ``_normalize_commit`` is returned in normalized form.

    Returns:
        The normalized commit, or ``None`` when the file is absent, cannot be
        read, or holds no usable entry for *ref_name*.
    """
    packed_path = git_dir / "packed-refs"
    if not packed_path.exists():
        return None

    try:
        entries = packed_path.read_text(encoding="utf-8").splitlines()
        for raw_entry in entries:
            entry = raw_entry.strip()
            if not entry or entry.startswith(("#", "^")):
                continue
            sha, _, name = entry.partition(" ")
            if name.strip() != ref_name:
                continue
            short = _normalize_commit(sha)
            if short is not None:
                return short
    except Exception:
        return None
    return None
66+
67+
68+
def _candidate_ref_dirs(git_dir: Path) -> list[Path]:
    """Return the directories that may hold refs for *git_dir*, most specific first.

    A linked worktree keeps ``HEAD`` in its own git dir but stores branch refs
    and ``packed-refs`` in the shared directory named by its ``commondir``
    file. When that file exists, the directory it names is appended.
    """
    dirs = [git_dir]
    try:
        common_text = (git_dir / "commondir").read_text(encoding="utf-8").strip()
    except (OSError, UnicodeError):
        # No commondir file (the usual case) or it is unreadable.
        return dirs
    if common_text:
        common_dir = Path(common_text)
        if not common_dir.is_absolute():
            # A relative commondir is relative to the git dir holding it.
            common_dir = (git_dir / common_dir).resolve()
        if common_dir != git_dir:
            dirs.append(common_dir)
    return dirs


def _read_head_commit(repo_root: Path) -> str | None:
    """Read the current commit straight from the git metadata of *repo_root*.

    Used when the ``git`` executable cannot answer. ``HEAD`` is read from the
    directory returned by ``_resolve_git_dir``. If it holds ``ref: <name>``,
    the ref is looked up as a loose file and then in ``packed-refs``, first in
    the git dir and then in its common dir (so linked worktrees on a branch
    resolve). Otherwise ``HEAD`` itself is treated as the commit.

    Returns:
        The normalized short commit, or ``None`` if nothing usable is found.
    """
    git_dir = _resolve_git_dir(repo_root)
    if git_dir is None:
        return None

    head_path = git_dir / "HEAD"
    if not head_path.exists():
        return None
    try:
        head_value = head_path.read_text(encoding="utf-8").strip()
    except Exception:
        return None

    if not head_value.startswith("ref:"):
        # Detached HEAD: the file holds the commit hash directly.
        return _normalize_commit(head_value)

    ref_name = head_value.split(":", 1)[1].strip()
    for ref_dir in _candidate_ref_dirs(git_dir):
        ref_path = ref_dir / ref_name
        if ref_path.exists():
            try:
                normalized = _normalize_commit(ref_path.read_text(encoding="utf-8"))
            except Exception:
                # Best effort: fall through to packed-refs for this directory.
                normalized = None
            if normalized is not None:
                return normalized
        packed = _read_packed_ref(ref_dir, ref_name)
        if packed is not None:
            return packed
    return None
94+
95+
2196
def last_commit() -> str:
    """Return the short commit hash of the repository, or ``"unknown"``.

    ``git rev-parse --short HEAD`` is tried first, run from the parent of
    ``NEMO_RETRIEVER_ROOT``. If git cannot be run, exits non-zero, or prints
    something ``_normalize_commit`` rejects, the commit is read from the git
    metadata files via ``_read_head_commit``.

    Returns:
        The normalized commit string, or ``"unknown"`` when both strategies
        fail.
    """
    repo_root = NEMO_RETRIEVER_ROOT.parent
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=str(repo_root),
            check=False,
            capture_output=True,
            text=True,
        )
    except Exception:
        # Best effort: failing to launch git must not abort the caller.
        result = None

    if result is not None and result.returncode == 0:
        normalized = _normalize_commit(result.stdout)
        if normalized is not None:
            return normalized

    fallback = _read_head_commit(repo_root)
    if fallback is not None:
        return fallback
    return "unknown"
36118

37119

38120
def get_artifacts_root(base_dir: str | None = None) -> Path:

nemo_retriever/src/nemo_retriever/harness/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
REPO_ROOT = NEMO_RETRIEVER_ROOT.parent
1616
DEFAULT_TEST_CONFIG_PATH = NEMO_RETRIEVER_ROOT / "harness" / "test_configs.yaml"
1717
DEFAULT_NIGHTLY_CONFIG_PATH = NEMO_RETRIEVER_ROOT / "harness" / "nightly_config.yaml"
18+
VALID_RUN_MODES = {"batch", "inprocess"}
1819
VALID_EVALUATION_MODES = {"recall", "beir"}
1920
VALID_RECALL_ADAPTERS = {"none", "page_plus_one", "financebench_json"}
2021
VALID_BEIR_LOADERS = {"vidore_hf"}
@@ -54,6 +55,7 @@ class HarnessConfig:
5455
dataset_dir: str
5556
dataset_label: str
5657
preset: str
58+
run_mode: str = "batch"
5759

5860
query_csv: str | None = None
5961
input_type: str = "pdf"
@@ -114,6 +116,9 @@ def validate(self) -> list[str]:
114116
if self.query_csv is not None and not Path(self.query_csv).exists():
115117
errors.append(f"query_csv does not exist: {self.query_csv}")
116118

119+
if self.run_mode not in VALID_RUN_MODES:
120+
errors.append(f"run_mode must be one of {sorted(VALID_RUN_MODES)}")
121+
117122
if self.evaluation_mode not in VALID_EVALUATION_MODES:
118123
errors.append(f"evaluation_mode must be one of {sorted(VALID_EVALUATION_MODES)}")
119124

@@ -263,6 +268,7 @@ def _apply_env_overrides(config_dict: dict[str, Any]) -> None:
263268
"HARNESS_DATASET": ("dataset", str),
264269
"HARNESS_DATASET_DIR": ("dataset_dir", str),
265270
"HARNESS_PRESET": ("preset", str),
271+
"HARNESS_RUN_MODE": ("run_mode", str),
266272
"HARNESS_QUERY_CSV": ("query_csv", str),
267273
"HARNESS_INPUT_TYPE": ("input_type", str),
268274
"HARNESS_RECALL_REQUIRED": ("recall_required", _parse_bool),

nemo_retriever/src/nemo_retriever/harness/parsers.py

Lines changed: 0 additions & 86 deletions
This file was deleted.

0 commit comments

Comments
 (0)