Skip to content

Commit f3a489e

Browse files
charlesbluca and claude authored
feat(harness): respect "Use Default Heuristics" preset option in harness run (#1804)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 8906dd8 commit f3a489e

File tree

3 files changed

+84
-42
lines changed

3 files changed

+84
-42
lines changed

nemo_retriever/src/nemo_retriever/harness/config.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class HarnessConfig:
8282
extract_page_as_image: bool = True
8383
extract_infographics: bool = False
8484
write_detection_file: bool = False
85+
use_heuristics: bool = False
8586

8687
pdf_extract_workers: int = 8
8788
pdf_extract_num_cpus: float = 2.0
@@ -158,12 +159,15 @@ def validate(self) -> list[str]:
158159
if self.embed_granularity not in VALID_EMBED_GRANULARITIES:
159160
errors.append(f"embed_granularity must be one of {sorted(VALID_EMBED_GRANULARITIES)}")
160161

162+
_ZERO_ALLOWED_WORKERS = {f for f in TUNING_FIELDS if f.endswith("_workers")} if self.use_heuristics else set()
161163
for name in TUNING_FIELDS:
162164
val = getattr(self, name)
163165
if name.startswith("gpu_") and float(val) < 0.0:
164166
errors.append(f"{name} must be >= 0.0")
165-
elif name.endswith("_workers") and int(val) < 1:
166-
errors.append(f"{name} must be >= 1")
167+
elif name.endswith("_workers"):
168+
min_val = 0 if name in _ZERO_ALLOWED_WORKERS else 1
169+
if int(val) < min_val:
170+
errors.append(f"{name} must be >= {min_val}")
167171

168172
return errors
169173

@@ -281,6 +285,7 @@ def _apply_env_overrides(config_dict: dict[str, Any]) -> None:
281285
"HARNESS_EXTRACT_PAGE_AS_IMAGE": ("extract_page_as_image", _parse_bool),
282286
"HARNESS_EXTRACT_INFOGRAPHICS": ("extract_infographics", _parse_bool),
283287
"HARNESS_WRITE_DETECTION_FILE": ("write_detection_file", _parse_bool),
288+
"HARNESS_USE_HEURISTICS": ("use_heuristics", _parse_bool),
284289
}
285290

286291
for key in TUNING_FIELDS:

nemo_retriever/src/nemo_retriever/harness/run.py

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -206,38 +206,45 @@ def _build_command(cfg: HarnessConfig, artifact_dir: Path, run_id: str) -> tuple
206206
cfg.input_type,
207207
"--evaluation-mode",
208208
cfg.evaluation_mode,
209-
"--pdf-extract-tasks",
210-
str(cfg.pdf_extract_workers),
211-
"--pdf-extract-cpus-per-task",
212-
str(cfg.pdf_extract_num_cpus),
213-
"--pdf-extract-batch-size",
214-
str(cfg.pdf_extract_batch_size),
215-
"--pdf-split-batch-size",
216-
str(cfg.pdf_split_batch_size),
217-
"--page-elements-batch-size",
218-
str(cfg.page_elements_batch_size),
219-
"--page-elements-actors",
220-
str(cfg.page_elements_workers),
221-
"--ocr-actors",
222-
str(cfg.ocr_workers),
223-
"--ocr-batch-size",
224-
str(cfg.ocr_batch_size),
225-
"--embed-actors",
226-
str(cfg.embed_workers),
227-
"--embed-batch-size",
228-
str(cfg.embed_batch_size),
229-
"--page-elements-cpus-per-actor",
230-
str(cfg.page_elements_cpus_per_actor),
231-
"--ocr-cpus-per-actor",
232-
str(cfg.ocr_cpus_per_actor),
233-
"--embed-cpus-per-actor",
234-
str(cfg.embed_cpus_per_actor),
235-
"--page-elements-gpus-per-actor",
236-
str(cfg.gpu_page_elements),
237-
"--ocr-gpus-per-actor",
238-
str(cfg.gpu_ocr),
239-
"--embed-gpus-per-actor",
240-
str(cfg.gpu_embed),
209+
]
210+
211+
if not cfg.use_heuristics:
212+
cmd += [
213+
"--pdf-extract-tasks",
214+
str(cfg.pdf_extract_workers),
215+
"--pdf-extract-cpus-per-task",
216+
str(cfg.pdf_extract_num_cpus),
217+
"--pdf-extract-batch-size",
218+
str(cfg.pdf_extract_batch_size),
219+
"--pdf-split-batch-size",
220+
str(cfg.pdf_split_batch_size),
221+
"--page-elements-batch-size",
222+
str(cfg.page_elements_batch_size),
223+
"--page-elements-actors",
224+
str(cfg.page_elements_workers),
225+
"--ocr-actors",
226+
str(cfg.ocr_workers),
227+
"--ocr-batch-size",
228+
str(cfg.ocr_batch_size),
229+
"--embed-actors",
230+
str(cfg.embed_workers),
231+
"--embed-batch-size",
232+
str(cfg.embed_batch_size),
233+
"--page-elements-cpus-per-actor",
234+
str(cfg.page_elements_cpus_per_actor),
235+
"--ocr-cpus-per-actor",
236+
str(cfg.ocr_cpus_per_actor),
237+
"--embed-cpus-per-actor",
238+
str(cfg.embed_cpus_per_actor),
239+
"--page-elements-gpus-per-actor",
240+
str(cfg.gpu_page_elements),
241+
"--ocr-gpus-per-actor",
242+
str(cfg.gpu_ocr),
243+
"--embed-gpus-per-actor",
244+
str(cfg.gpu_embed),
245+
]
246+
247+
cmd += [
241248
"--embed-model-name",
242249
cfg.embed_model_name,
243250
"--embed-modality",
@@ -470,6 +477,7 @@ def _run_single(cfg: HarnessConfig, artifact_dir: Path, run_id: str, tags: list[
470477
"extract_page_as_image": cfg.extract_page_as_image,
471478
"extract_infographics": cfg.extract_infographics,
472479
"write_detection_file": cfg.write_detection_file,
480+
"use_heuristics": cfg.use_heuristics,
473481
"lancedb_uri": _resolve_lancedb_uri(cfg, artifact_dir),
474482
"tuning": {field: getattr(cfg, field) for field in sorted(TUNING_FIELDS)},
475483
},

nemo_retriever/tests/test_harness_run.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,6 @@ def test_build_command_uses_hidden_detection_file_by_default(tmp_path: Path) ->
7878
assert "element" in cmd
7979
assert "--extract-page-as-image" in cmd
8080
assert "--no-extract-page-as-image" not in cmd
81-
assert "--pdf-extract-workers" not in cmd
82-
assert "--pdf-extract-num-cpus" not in cmd
83-
assert "--page-elements-workers" not in cmd
84-
assert "--ocr-workers" not in cmd
85-
assert "--embed-workers" not in cmd
86-
assert "--gpu-page-elements" not in cmd
87-
assert "--gpu-ocr" not in cmd
88-
assert "--gpu-embed" not in cmd
8981
assert detection_file.parent == runtime_dir
9082
assert detection_file.name == ".detection_summary.json"
9183
assert effective_query_csv == query_csv
@@ -232,6 +224,42 @@ def test_build_command_passes_audio_recall_options(tmp_path: Path) -> None:
232224
assert cmd[cmd.index("--audio-split-interval") + 1] == "45"
233225

234226

227+
def test_build_command_omits_tuning_flags_when_use_heuristics(tmp_path: Path) -> None:
228+
dataset_dir = tmp_path / "dataset"
229+
dataset_dir.mkdir()
230+
query_csv = tmp_path / "query.csv"
231+
query_csv.write_text("q,s,p\nx,y,1\n", encoding="utf-8")
232+
233+
cfg = HarnessConfig(
234+
dataset_dir=str(dataset_dir),
235+
dataset_label="jp20",
236+
preset="single_gpu",
237+
query_csv=str(query_csv),
238+
use_heuristics=True,
239+
)
240+
cmd, _runtime_dir, _detection_file, _effective_query_csv = _build_command(cfg, tmp_path, run_id="r1")
241+
242+
assert "--pdf-extract-tasks" not in cmd
243+
assert "--pdf-extract-cpus-per-task" not in cmd
244+
assert "--pdf-extract-batch-size" not in cmd
245+
assert "--pdf-split-batch-size" not in cmd
246+
assert "--page-elements-batch-size" not in cmd
247+
assert "--page-elements-actors" not in cmd
248+
assert "--ocr-actors" not in cmd
249+
assert "--ocr-batch-size" not in cmd
250+
assert "--embed-actors" not in cmd
251+
assert "--embed-batch-size" not in cmd
252+
assert "--page-elements-cpus-per-actor" not in cmd
253+
assert "--ocr-cpus-per-actor" not in cmd
254+
assert "--embed-cpus-per-actor" not in cmd
255+
assert "--page-elements-gpus-per-actor" not in cmd
256+
assert "--ocr-gpus-per-actor" not in cmd
257+
assert "--embed-gpus-per-actor" not in cmd
258+
# non-tuning flags still present
259+
assert "--embed-model-name" in cmd
260+
assert "--evaluation-mode" in cmd
261+
262+
235263
def test_normalize_recall_metric_key_removes_duplicate_prefix() -> None:
236264
assert _normalize_recall_metric_key("recall@1") == "recall_1"
237265
assert _normalize_recall_metric_key("recall@10") == "recall_10"
@@ -565,6 +593,7 @@ def _fake_run_subprocess(_cmd: list[str], metrics) -> int:
565593
"extract_page_as_image": cfg.extract_page_as_image,
566594
"extract_infographics": cfg.extract_infographics,
567595
"write_detection_file": True,
596+
"use_heuristics": cfg.use_heuristics,
568597
"lancedb_uri": str((artifact_dir / "lancedb").resolve()),
569598
"tuning": {field: getattr(cfg, field) for field in sorted(harness_run.TUNING_FIELDS)},
570599
},

0 commit comments

Comments
 (0)