diff --git a/ai4rag/components/assets_generator/pattern_builder.py b/ai4rag/components/assets_generator/pattern_builder.py index 95b7235..a0dab8b 100644 --- a/ai4rag/components/assets_generator/pattern_builder.py +++ b/ai4rag/components/assets_generator/pattern_builder.py @@ -2,10 +2,243 @@ # Copyright IBM Corp. 2026 # SPDX-License-Identifier: Apache-2.0 # ----------------------------------------------------------------------------- +"""Build Responses API pattern definitions from HPO experiment results.""" + +import re + +from ai4rag.components.assets_generator.prompt_filters import ( + GROUNDING_PREFIXES, + USER_GROUNDING_SKIP_PREFIXES, + USER_RAG_GROUNDING_PREFIXES, + is_citation_related_line, + strip_ogx_runtime_instructions, +) + +_USER_QUERY_PLACEHOLDER = "" +_EMPTY_SYSTEM_FALLBACK = "You are a helpful assistant." +_EXPORT_SLOT_MARKERS = ("{reference_documents}", "{question}", "{multilingual_support}") + +# Suffix lines after ``{reference_documents}``: drop structural wrappers (e.g. ``[End]``). +_DOCUMENT_SLOT_MARKERS = frozenset({"[Document]", "[End]", "Documents:", "Context:"}) + +# Document and question slot markers +_DOCUMENT_LABELS = ("Documents:", "Context:", "[Document]") +_QUESTION_PREFIXES = ("Question:", "Q:", "[conversation]:") + + +def _join_answer_scaffold_blocks(lines: list[str]) -> str: + """Group lines into paragraph blocks, starting a new block when an answer-scaffold line appears. + + Scaffold lines are specifically in the form ``Answer (...)`` — e.g. + ``"Answer (max 150 words):"`` — as produced by HPO prompt templates. + Other leading text such as ``"Answer:"`` or ``"Response:"`` does NOT + trigger a new block. + """ + if not lines: + return "" + + blocks: list[str] = [] + current_block: list[str] = [] + for line in lines: + if line.startswith("Answer (") and current_block: + blocks.append("\n".join(current_block)) + current_block = [line] + else: + current_block.append(line) + if current_block: + blocks.append("\n".join(current_block)) + return "\n\n".join(blocks) + + +def _should_skip_redundant_user_line(stripped: str, system_has_grounding: bool) -> bool: + """Return whether a user-template line duplicates system policy for export.""" + if is_citation_related_line(stripped): + return True + return system_has_grounding and any( + stripped.startswith(prefix) for prefix in GROUNDING_PREFIXES + USER_RAG_GROUNDING_PREFIXES + ) + + +def _should_skip_user_export_line(stripped: str) -> bool: + """Return whether a merged user line is OGX-owned and must not be exported.""" + if any(stripped.startswith(prefix) for prefix in USER_GROUNDING_SKIP_PREFIXES): + return True + return is_citation_related_line(stripped) + + +def _strip_document_slot_prefix(prefix: str) -> str: + """Remove structural labels that wrap the reference-documents slot.""" + for label in _DOCUMENT_LABELS: + if prefix == label: + return "" + if prefix.endswith(label): + return prefix[: -len(label)].strip() + return prefix + + +def _extract_static_suffix_line(stripped: str) -> str | None: + """Return static instruction text from one post-documents template line.""" + if not stripped or stripped == ":" or stripped in _DOCUMENT_SLOT_MARKERS: + return None + if "{question}" in stripped: + without_question = stripped.replace("{question}", "").strip() + for question_prefix in _QUESTION_PREFIXES: + if without_question.startswith(question_prefix): + without_question = without_question[len(question_prefix) :].strip() + without_question = without_question.lstrip(":.").strip() + return without_question or None + if stripped.startswith(_QUESTION_PREFIXES): + return None + if "{multilingual_support}" in stripped: + return None + return stripped + + +def _extract_static_user_from_reference_slot(text: str) -> str: + """Extract static instructions from a template that contains ``{reference_documents}``.""" + before, after = text.split("{reference_documents}", 1) + parts: list[str] = [] + prefix = _strip_document_slot_prefix(before.strip()) + if prefix: + parts.append(prefix) + + suffix_lines = [ + line_text + for line_text in (_extract_static_suffix_line(line.strip()) for line in after.splitlines()) + if line_text + ] + if suffix_lines: + parts.append("\n".join(suffix_lines)) + return "\n\n".join(parts).strip() + + +def _system_has_grounding_policy(system: str) -> bool: + """Return whether the system prompt already states an explicit document-only grounding rule. + + Uses the same prefix list as sentence-level filtering so that adding a new + OGX phrase to ``GROUNDING_PREFIXES`` automatically covers system detection too. + Does NOT match descriptive personas like "retrieval-augmented assistant" without + an explicit grounding constraint. + + Checks at sentence granularity to avoid false positives from embedded substrings. + """ + sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", system)] + return any(any(sent.lower().startswith(p.lower()) for p in GROUNDING_PREFIXES) for sent in sentences) + + +def _filter_static_user_for_responses(system: str, static_user: str) -> str: + """Drop user-template lines that duplicate system policy for Responses export. + + Pass 1 of 2: compare against ``original_system`` (author intent before OGX + stripping). Removes user lines that repeat grounding or citation policy already + present in the HPO system prompt. + """ + if not static_user.strip(): + return "" + + system_has_grounding = _system_has_grounding_policy(system) + + filtered_lines: list[str] = [] + for line in static_user.splitlines(): + stripped = line.strip() + if not stripped or _should_skip_redundant_user_line(stripped, system_has_grounding): + continue + filtered_lines.append(stripped) + + return _join_answer_scaffold_blocks(filtered_lines) + + +def _adapt_system_for_responses_export(system: str) -> str: + """Drop OGX-runtime retrieval/citation text from the HPO system prompt.""" + return strip_ogx_runtime_instructions(system) + + +def _adapt_static_user_for_responses_export(static_user: str) -> str: + """Drop merged user supplements that OGX injects at file_search runtime. + + Pass 2 of 2: strip OGX-runtime phrases from user lines that survived pass 1. + """ + if not static_user.strip(): + return "" + + adapted_lines: list[str] = [] + for line in static_user.splitlines(): + stripped = line.strip() + if not stripped or _should_skip_user_export_line(stripped): + continue + cleaned = strip_ogx_runtime_instructions(stripped) + if cleaned: + adapted_lines.append(cleaned) + + return _join_answer_scaffold_blocks(adapted_lines) + + +def _extract_static_user_instructions(user_message_text: str) -> str: + """Return static instruction text from a HPO user template. + + Strips runtime slots (retrieved documents, question) that Responses API + supplies via ``file_search`` and the user ``input`` message respectively. + + All current templates use {reference_documents} placeholder format. + """ + if not user_message_text: + return "" + + text = str(user_message_text).strip() + if "{reference_documents}" not in text: + return "" + + return _extract_static_user_from_reference_slot(text) + + +def _is_placeholder_only_export(text: str) -> bool: + """Return whether export text contains only unresolved HPO template slots.""" + cleaned = text.strip() + if not cleaned: + return True + for marker in _EXPORT_SLOT_MARKERS: + cleaned = cleaned.replace(marker, "") + return not cleaned.strip() + + +def build_responses_system_input(generation: dict) -> str: + """Build Responses API system input aligned with HPO chat/completion prompts. + + HPO sends ``system_message_text`` plus a formatted ``user_message_text`` + (rules, documents, question). Responses uses ``file_search`` for documents + and a separate user message for the question. Non-redundant supplements + from the user template are merged into export; retrieval framing, chunk + presentation, and citation instructions owned by OGX ``config.yaml`` are + stripped rather than rephrased into the exported system input. + """ + original_system = (generation.get("system_message_text") or "").strip() + exported_system = _adapt_system_for_responses_export(original_system) + user_template = generation.get("user_message_text") or "" + + # Pass 1: dedupe vs original_system; pass 2: strip OGX-owned user supplements. + static_user = _adapt_static_user_for_responses_export( + _filter_static_user_for_responses( + original_system, + _extract_static_user_instructions(user_template), + ), + ) + + if exported_system and static_user: + result = f"{exported_system}\n\n{static_user}" + else: + result = exported_system or static_user + + # Fallback for completely empty patterns (rare edge case) + if not result or not result.strip() or _is_placeholder_only_export(result): + return _EMPTY_SYSTEM_FALLBACK + + return result + + def build_pattern_json( pattern: dict, ) -> dict: - """Update pattern information with detected language and responses template. + """Update pattern information with responses template. Parameters ---------- @@ -13,29 +246,50 @@ def build_pattern_json( A single evaluation result object carrying ``indexing_params``, ``rag_params``, ``pattern_name``, ``collection``, etc. + Notes + ----- + ``pattern["settings"]["generation"]`` must include ``model_id``, + ``temperature``, ``max_completion_tokens``, ``system_message_text``, and + ``user_message_text`` (as produced by the experiment payload). + Returns ------- dict Pattern definition suitable for JSON serialisation. """ - pattern["settings"]["responses_template"] = { - "model": pattern["settings"]["generation"]["model_id"], + generation = pattern["settings"]["generation"] + system_input = build_responses_system_input(generation) + + responses_template = { + "model": generation["model_id"], "stream": False, "store": False, - "input": "", - "instructions": pattern["settings"]["generation"]["system_message_text"], + "input": [ + { + "content": [{"text": system_input, "type": "input_text"}], + "role": "system", + }, + {"content": [{"text": _USER_QUERY_PLACEHOLDER, "type": "input_text"}], "role": "user"}, + ], + "tool_choice": {"type": "file_search"}, "tools": [ { "type": "file_search", "vector_store_ids": [pattern["settings"]["vector_store_binding"]["vector_store_id"]], - "ranking_options": { - "max_num_results": pattern["settings"]["retrieval"]["number_of_chunks"], - }, + "max_num_results": pattern["settings"]["retrieval"]["number_of_chunks"], }, ], "include": ["file_search_call.results"], } + # Only include temperature and max_output_tokens if they are not None + if generation.get("temperature") is not None: + responses_template["temperature"] = generation["temperature"] + if generation.get("max_completion_tokens") is not None: + responses_template["max_output_tokens"] = generation["max_completion_tokens"] + + pattern["settings"]["responses_template"] = responses_template + retrieval_settings = pattern["settings"]["retrieval"] search_mode = retrieval_settings.get("search_mode") ranker_strategy = retrieval_settings.get("ranker_strategy") @@ -43,8 +297,20 @@ def build_pattern_json( ranker_alpha = retrieval_settings.get("ranker_alpha") if search_mode == "hybrid" and ranker_strategy == "rrf" and ranker_k is not None and ranker_k > 0: - pattern["settings"]["responses_template"]["tools"][0]["ranking_options"]["impact_factor"] = ranker_k + pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] = { + "ranker": "rrf", + "impact_factor": ranker_k, + } elif search_mode == "hybrid" and ranker_strategy == "weighted" and ranker_alpha is not None and ranker_alpha != 1: - pattern["settings"]["responses_template"]["tools"][0]["ranking_options"]["alpha"] = ranker_alpha + # ``ranker_alpha == 1.0`` intentionally falls through to ``else`` (semantic-only default). + pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] = { + "ranker": "weighted", + "alpha": ranker_alpha, + } + else: + pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] = { + "ranker": "weighted", + "alpha": 1.0, + } return pattern diff --git a/ai4rag/components/assets_generator/prompt_filters.py b/ai4rag/components/assets_generator/prompt_filters.py new file mode 100644 index 0000000..efceb27 --- /dev/null +++ b/ai4rag/components/assets_generator/prompt_filters.py @@ -0,0 +1,268 @@ +# ----------------------------------------------------------------------------- +# Copyright IBM Corp. 2026 +# SPDX-License-Identifier: Apache-2.0 +# ----------------------------------------------------------------------------- +"""Filter HPO prompts to remove OGX runtime injection duplicates. + +OGX (OpenSearch GenAI eXperience) injects grounding, citation, and retrieval +instructions at runtime via benchmarking/rag/config.yaml. HPO (HyperParameter +Optimization) templates sometimes include similar phrases that must be removed +during Responses API export to avoid duplication. + +This module provides filtering functions to strip OGX-owned content while +preserving HPO-specific persona, policy, and answer formatting rules. + +Note +---- +OGX phrase lists must stay synchronized with benchmarking/rag/config.yaml. +If OGX updates their injection strings, update the constants below. +""" + +import re + +from ai4rag.search_space.src.model_props import _RAG_CITATION_INSTRUCTION + +# ============================================================================ +# OGX Runtime Injection Strings +# ============================================================================ +# Source: benchmarking/rag/config.yaml +# These phrases are injected by OGX at file_search runtime. +# Export must NOT duplicate them in responses_template.input[system]. +# ============================================================================ + +# Citation-related phrases +CITATION_PREFIXES = ( + "You MUST cite sources", + "Cite sources immediately", +) +CITATION_SUBSTRINGS = ( + "[1], [2]", + "<|file-id|>", + "cite as <|", + "file citations", + "document numbers for every factual claim", +) +# HPO citation fragments for filtering (uses _RAG_CITATION_INSTRUCTION from model_props) +HPO_CITATION_FRAGMENTS = ( + _RAG_CITATION_INSTRUCTION, + "You MUST cite sources using [1], [2], etc.", + "You MUST cite sources using [1], [2].", +) + +# Grounding/retrieval-related phrases +# Used in: sentence-level filtering (sentence_is_ogx_duplicative) and +# system grounding detection (_system_has_grounding_policy in pattern_builder.py) +GROUNDING_PREFIXES = ( + "Answer ONLY using information from the documents", + "Answer ONLY using information from documents retrieved", + "Answer using ONLY the provided documents", + "Answer using ONLY information from documents", + "Do not use outside knowledge", + "If the retrieved documents do not contain", + "If the documents do not contain", +) +# Used in: substring matching within sentences for partial phrase detection +GROUNDING_SUBSTRINGS = ( + "documents below", + "retrieved via file search", + "retrieved to help answer the user", + "supporting information only in answering", +) +# Used in: whole-phrase removal from system prompts (strip_ogx_runtime_instructions) +SYSTEM_GROUNDING_PHRASES = ( + "Answer using ONLY the provided documents.", + "Answer using ONLY information from documents retrieved via file search.", +) + +# File search tool markers +# Used in: detecting OGX tool result wrappers in sentence-level filtering +FILE_SEARCH_MARKERS = ( + "file_search tool found", + "BEGIN of file_search tool results", + "END of file_search tool results", + "The above results were retrieved to help answer", + "Use them as supporting information only", + "Do not add extra punctuation. Use only the file IDs", +) + +# User template duplicate detection +# Used in: Pass 2 filtering (_should_skip_user_export_line in pattern_builder.py) +# OGX-owned lines that must never be exported regardless of system prompt content +USER_GROUNDING_SKIP_PREFIXES = ( + "Answer ONLY using information from the documents below", + "Do not use outside knowledge", + "If the documents do not contain the answer", +) +# Used in: Pass 1 filtering (_should_skip_redundant_user_line in pattern_builder.py) +# Only suppressed when system prompt already has grounding policy to avoid duplication +USER_RAG_GROUNDING_PREFIXES = ( + "You are a specialized Retrieval Augmented Generation", + "Prioritize correctness and ensure your response is grounded", +) + +# Combined line prefixes for sentence-level filtering +OGX_DUPLICATIVE_LINE_PREFIXES = CITATION_PREFIXES + GROUNDING_PREFIXES + FILE_SEARCH_MARKERS + +# Combined substrings for partial-match filtering +OGX_DUPLICATIVE_SUBSTRINGS = CITATION_SUBSTRINGS + GROUNDING_SUBSTRINGS + + +def collapse_whitespace(text: str) -> str: + """Collapse repeated interior spaces after phrase removal. + + Parameters + ---------- + text : str + Text potentially containing multiple consecutive spaces. + + Returns + ------- + str + Text with interior whitespace collapsed to single spaces, stripped. + """ + return re.sub(r" +", " ", text).strip() + + +def is_sentence_ogx_duplicative(sentence: str) -> bool: + """Return whether a sentence duplicates OGX file_search runtime injection. + + Parameters + ---------- + sentence : str + Single sentence to check. + + Returns + ------- + bool + True if sentence matches OGX injection patterns. + """ + stripped = sentence.strip().rstrip(".") + if not stripped: + return True + if any(stripped.startswith(prefix.rstrip(".")) for prefix in OGX_DUPLICATIVE_LINE_PREFIXES): + return True + normalized = stripped.lower() + return any(fragment.lower() in normalized for fragment in OGX_DUPLICATIVE_SUBSTRINGS) + + +def is_citation_related_line(line: str) -> bool: + """Return whether an entire line should be dropped as citation guidance. + + Parameters + ---------- + line : str + Line of text to check. + + Returns + ------- + bool + True if line contains only citation instructions owned by OGX. + """ + stripped = line.strip() + if not stripped: + return False + lower = stripped.lower() + if any(stripped.startswith(prefix) for prefix in CITATION_PREFIXES): + return True + if any(fragment.lower() in lower for fragment in HPO_CITATION_FRAGMENTS): + return True + return any(sub.lower() in lower for sub in CITATION_SUBSTRINGS) + + +def filter_ogx_duplicative_sentences(line: str) -> str: + """Remove OGX-duplicative sentences while keeping persona or policy sentences. + + Handles multi-sentence lines by filtering at sentence granularity. + + Parameters + ---------- + line : str + Line potentially containing multiple sentences. + + Returns + ------- + str + Line with OGX-duplicative sentences removed, or empty string if all filtered. + """ + stripped = line.strip() + if not stripped or is_citation_related_line(stripped): + return "" + + # Split on ". " only — avoids breaking abbreviations such as "i.e.," + parts = [part.strip() for part in stripped.split(". ") if part.strip()] + if len(parts) <= 1: + if is_sentence_ogx_duplicative(stripped.rstrip(".")): + return "" + return stripped + + kept = [part.rstrip(".") for part in parts if not is_sentence_ogx_duplicative(part.rstrip("."))] + if not kept: + return "" + + result = ". ".join(kept) + if stripped.endswith("."): + result += "." + return result + + +def normalize_answer_scaffold(line: str) -> str: + """Drop citation hints from answer scaffolds; OGX owns citation via annotations. + + Parameters + ---------- + line : str + Line potentially containing answer scaffold with citation hints. + + Returns + ------- + str + Line with ", with citations" and "with citations" removed, whitespace normalized. + """ + normalized = re.sub(r",?\s*with citations,?\s*", "", line) + return collapse_whitespace(normalized) + + +def strip_ogx_runtime_instructions(text: str) -> str: + """Remove text that OGX injects via file_search config at inference time. + + This is the main filtering function that orchestrates all OGX deduplication. + + Parameters + ---------- + text : str + Raw HPO prompt text (system or user message). + + Returns + ------- + str + Filtered text with OGX-duplicative content removed. + """ + if not text.strip(): + return "" + + for phrase in SYSTEM_GROUNDING_PHRASES: + text = text.replace(phrase, "").replace(phrase.rstrip("."), "") + text = collapse_whitespace(text) + + lines: list[str] = [] + for line in text.splitlines(): + stripped = line.strip() + if not stripped: + if lines and lines[-1] != "": + lines.append("") + continue + if is_citation_related_line(stripped): + continue + + cleaned = filter_ogx_duplicative_sentences(stripped) + for fragment in HPO_CITATION_FRAGMENTS: + if fragment in cleaned: + cleaned = cleaned.replace(fragment, "").strip() + break + cleaned = normalize_answer_scaffold(cleaned) + if cleaned: + lines.append(cleaned) + + result = "\n".join(lines) + result = re.sub(r"\n{3,}", "\n\n", result) + return result.strip() diff --git a/ai4rag/core/experiment/experiment.py b/ai4rag/core/experiment/experiment.py index 975d47b..f6f0c5d 100644 --- a/ai4rag/core/experiment/experiment.py +++ b/ai4rag/core/experiment/experiment.py @@ -338,6 +338,8 @@ def run_single_evaluation(self, rag_params: RAGParamsType) -> float: "retrieval": retrieval_params, "generation": { "model_id": foundation_model.model_id, + "temperature": foundation_model.params.temperature, + "max_completion_tokens": foundation_model.params.max_completion_tokens, "context_template_text": context_template_text, "user_message_text": user_message_text, "system_message_text": system_message_text, diff --git a/ai4rag/rag/chunking/docling_chunker.py b/ai4rag/rag/chunking/docling_chunker.py index bcf50f9..4aedb16 100644 --- a/ai4rag/rag/chunking/docling_chunker.py +++ b/ai4rag/rag/chunking/docling_chunker.py @@ -100,7 +100,7 @@ def split_documents(self, documents: Sequence[DoclingDocument]) -> list[AI4RAGCh } if chunk.meta.headings: - metadata["headings"] = chunk.meta.headings + metadata["headings"] = " > ".join(chunk.meta.headings) all_chunks.append(AI4RAGChunk(text=text, metadata=metadata)) diff --git a/tests/unit/ai4rag/assets_generator/test_pattern_builder.py b/tests/unit/ai4rag/assets_generator/test_pattern_builder.py index 0c57dae..e664ba7 100644 --- a/tests/unit/ai4rag/assets_generator/test_pattern_builder.py +++ b/tests/unit/ai4rag/assets_generator/test_pattern_builder.py @@ -9,6 +9,12 @@ import pytest from ai4rag.components.assets_generator import build_pattern_json +from ai4rag.components.assets_generator.pattern_builder import ( + _is_placeholder_only_export, + build_responses_system_input, +) +from ai4rag.components.assets_generator.prompt_filters import normalize_answer_scaffold +from ai4rag.search_space.src.model_props import get_system_message_text, get_user_message_text # --------------------------------------------------------------------------- # Helpers @@ -41,6 +47,8 @@ def _make_pattern(**overrides) -> dict: }, "generation": { "model_id": "ibm/granite-3-8b-instruct", + "temperature": 0.7, + "max_completion_tokens": 1024, "system_message_text": "Answer based on context only.", "user_message_text": "Context: {reference_documents}\nQ: {question}", "context_template_text": "{document}", @@ -56,6 +64,22 @@ def _make_pattern(**overrides) -> dict: return base +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("", True), + (" ", True), + ("{reference_documents}", True), + ("{reference_documents}\n{question}", True), + ("foo {reference_documents}", False), + ("You are a helpful assistant.", False), + ], +) +def test_is_placeholder_only_export(text: str, expected: bool): + """Placeholder-only export text must trigger the empty-input fallback path.""" + assert _is_placeholder_only_export(text) == expected + + # --------------------------------------------------------------------------- # build_pattern_json -- responses_template generation # --------------------------------------------------------------------------- @@ -70,15 +94,26 @@ def test_adds_responses_template(self): result = build_pattern_json(pattern) rt = result["settings"]["responses_template"] + generation = result["settings"]["generation"] + expected_system = build_responses_system_input(generation) + assert rt["model"] == "ibm/granite-3-8b-instruct" assert rt["stream"] is False assert rt["store"] is False - assert rt["input"] == "" - assert rt["instructions"] == "Answer based on context only." + assert rt["input"] == [ + { + "content": [{"text": expected_system, "type": "input_text"}], + "role": "system", + }, + {"content": [{"text": "", "type": "input_text"}], "role": "user"}, + ] + assert rt["max_output_tokens"] == 1024 + assert rt["temperature"] == 0.7 + assert rt["tool_choice"] == {"type": "file_search"} assert len(rt["tools"]) == 1 assert rt["tools"][0]["type"] == "file_search" assert "test_collection_001" in rt["tools"][0]["vector_store_ids"] - assert rt["tools"][0]["ranking_options"]["max_num_results"] == 5 + assert rt["tools"][0]["max_num_results"] == 5 assert rt["include"] == ["file_search_call.results"] def test_returns_same_dict(self): @@ -88,7 +123,7 @@ def test_returns_same_dict(self): assert result is pattern def test_hybrid_rrf_ranking_options(self): - """Hybrid search with RRF ranker must merge impact_factor into ranking_options.""" + """Hybrid search with RRF ranker must set ranker and impact_factor in ranking_options.""" pattern = _make_pattern() pattern["settings"]["retrieval"]["search_mode"] = "hybrid" pattern["settings"]["retrieval"]["ranker_strategy"] = "rrf" @@ -97,11 +132,11 @@ def test_hybrid_rrf_ranking_options(self): build_pattern_json(pattern) ro = pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] - assert ro["impact_factor"] == 60 - assert ro["max_num_results"] == 5 + assert ro == {"ranker": "rrf", "impact_factor": 60} + assert pattern["settings"]["responses_template"]["tools"][0]["max_num_results"] == 5 def test_hybrid_weighted_ranking_options(self): - """Hybrid search with weighted ranker must merge alpha into ranking_options.""" + """Hybrid search with weighted ranker must set ranker and alpha in ranking_options.""" pattern = _make_pattern() pattern["settings"]["retrieval"]["search_mode"] = "hybrid" pattern["settings"]["retrieval"]["ranker_strategy"] = "weighted" @@ -110,16 +145,308 @@ def test_hybrid_weighted_ranking_options(self): build_pattern_json(pattern) ro = pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] - assert ro["alpha"] == 0.7 - assert ro["max_num_results"] == 5 + assert ro == {"ranker": "weighted", "alpha": 0.7} + assert pattern["settings"]["responses_template"]["tools"][0]["max_num_results"] == 5 - def test_simple_retrieval_has_only_max_num_results(self): - """Simple retrieval must have ranking_options with only max_num_results.""" + def test_simple_retrieval_default_ranking_options(self): + """Vector-only search simulates semantic retrieval via weighted ranker alpha=1.0.""" pattern = _make_pattern() build_pattern_json(pattern) ro = pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] - assert ro == {"max_num_results": 5} + assert ro == {"ranker": "weighted", "alpha": 1.0} + assert pattern["settings"]["responses_template"]["tools"][0]["max_num_results"] == 5 + + def test_hybrid_weighted_alpha_one_uses_default_ranking(self): + """Hybrid weighted with alpha=1.0 uses the default semantic-only simulation branch.""" + pattern = _make_pattern() + pattern["settings"]["retrieval"]["search_mode"] = "hybrid" + pattern["settings"]["retrieval"]["ranker_strategy"] = "weighted" + pattern["settings"]["retrieval"]["ranker_alpha"] = 1.0 + + build_pattern_json(pattern) + + ro = pattern["settings"]["responses_template"]["tools"][0]["ranking_options"] + assert ro == {"ranker": "weighted", "alpha": 1.0} + + def test_export_system_input_merges_non_redundant_user_rules(self): + """Legacy user supplements merge; redundant grounding and citations are omitted.""" + pattern = _make_pattern() + pattern["settings"]["generation"][ + "system_message_text" + ] = "You are a retrieval-augmented assistant. Answer using ONLY the provided documents." + pattern["settings"]["generation"]["user_message_text"] = ( + "Answer ONLY using information from the documents below. " + "Do not use outside knowledge.\n" + "You MUST cite sources using [1], [2], etc.\n\n" + "Documents:\n{reference_documents}\n\n" + "Question: {question}\n\n" + "Answer (max 150 words, with citations):\n" + "You MUST write your entire answer in English only." + ) + + build_pattern_json(pattern) + + system_text = pattern["settings"]["responses_template"]["input"][0]["content"][0]["text"] + assert "retrieval-augmented assistant" in system_text + assert "retrieved via file search" not in system_text + assert "provided documents" not in system_text.lower() + assert "Answer ONLY using information from the documents below" not in system_text + assert "must cite sources" not in system_text.lower() + assert "file citations" not in system_text.lower() + assert "max 150 words" in system_text + assert "with citations" not in system_text.lower() + assert "English only" in system_text + assert "{reference_documents}" not in system_text + assert "{question}" not in system_text + + def test_export_system_input_skips_duplicate_citation_and_keeps_answer_scaffold(self): + """Citation lines are stripped; answer scaffold and language policy still merge.""" + pattern = _make_pattern() + pattern["settings"]["generation"][ + "system_message_text" + ] = "You are a retrieval-augmented assistant. You MUST cite sources using [1], [2]." + pattern["settings"]["generation"]["user_message_text"] = ( + "You MUST cite sources using [1], [2], etc.\n\n" + "Documents:\n{reference_documents}\n\n" + "Question: {question}\n\n" + "Answer (max 150 words, with citations):\n" + "You MUST write your entire answer in English only." + ) + + build_pattern_json(pattern) + + system_text = pattern["settings"]["responses_template"]["input"][0]["content"][0]["text"] + assert "must cite sources" not in system_text.lower() + assert "max 150 words" in system_text + assert "English only" in system_text + + def test_build_responses_system_input_strips_ogx_prefix(self): + """Legacy grounding and citation lines are omitted; persona supplements are kept.""" + generation = { + "system_message_text": "Short system prefix.", + "user_message_text": ( + "Answer ONLY using information from the documents below.\n" + "You MUST cite sources using [1], [2].\n\n" + "Context: {reference_documents}\n\n" + "Question: {question}\n" + ), + } + system_input = build_responses_system_input(generation) + assert system_input == "Short system prefix." + assert "retrieved via file search" not in system_input + assert "must cite sources" not in system_input.lower() + assert "documents below" not in system_input + + def test_build_pattern_json_uses_export_parity_system_input(self): + """build_pattern_json must use build_responses_system_input(), not raw system text.""" + model_id = "ibm/granite-3-8b-instruct" + expected = build_responses_system_input( + { + "system_message_text": get_system_message_text(model_id), + "user_message_text": get_user_message_text(model_id, language="English"), + } + ) + + pattern = _make_pattern() + pattern["settings"]["generation"]["model_id"] = model_id + pattern["settings"]["generation"]["system_message_text"] = get_system_message_text(model_id) + pattern["settings"]["generation"]["user_message_text"] = get_user_message_text(model_id, language="English") + + build_pattern_json(pattern) + + actual = pattern["settings"]["responses_template"]["input"][0]["content"][0]["text"] + assert actual == expected + assert actual != pattern["settings"]["generation"]["system_message_text"] + assert "Granite Chat" in actual + assert "retrieval-augmented assistant" in actual + assert "You MUST respond in English" in actual + + @pytest.mark.parametrize( + "model_id", + [ + "unknown-model", + "ibm/granite-3-8b-instruct", + "meta-llama/llama-3-1-8b-instruct", + "mistralai/mistral-large", + "openai/gpt-oss-120b", + ], + ) + def test_export_omits_ogx_duplicative_prompt_text(self, model_id: str): + """Export must not duplicate citation/retrieval text that OGX injects at file_search runtime.""" + generation = { + "system_message_text": get_system_message_text(model_id), + "user_message_text": get_user_message_text(model_id, language="English"), + } + system_text = build_responses_system_input(generation) + + assert "[1], [2]" not in system_text + assert "must cite sources" not in system_text.lower() + assert "file citations" not in system_text.lower() + assert "documents below" not in system_text + assert "retrieved via file search" not in system_text + assert "retrieved to help answer" not in system_text.lower() + assert "<|file-id|>" not in system_text + assert "cite sources immediately" not in system_text.lower() + assert "supporting information only" not in system_text.lower() + assert "{reference_documents}" not in system_text + assert "{question}" not in system_text + assert "[End]" not in system_text + + def test_export_omits_ogx_config_yaml_instruction_text(self): + """Export must not contain verbatim OGX annotation/context template phrases.""" + generation = { + "system_message_text": ( + "You are a retrieval-augmented assistant. " + "Cite sources immediately at the end of sentences before punctuation." + ), + "user_message_text": ( + "The above results were retrieved to help answer the user's query. " + "Use them as supporting information only in answering this query.\n" + "Documents:\n{reference_documents}\n\n" + "Question: {question}\n" + ), + } + system_text = build_responses_system_input(generation) + assert system_text == "You are a retrieval-augmented assistant." + + @pytest.mark.parametrize( + "model_id", + [ + "meta-llama/llama-3-1-8b-instruct", + "mistralai/mistral-large", + "openai/gpt-oss-120b", + "ibm/granite-3-8b-instruct", + ], + ) + def test_export_merges_unified_rag_instructions(self, model_id: str): + """All model families use unified RAG instructions after PR #81.""" + generation = { + "system_message_text": get_system_message_text(model_id), + "user_message_text": get_user_message_text(model_id, language="English"), + } + system_text = build_responses_system_input(generation) + # All models now use unified RAG structure from PR #81 + assert "retrieval-augmented assistant" in system_text + assert "max 150 words" in system_text + assert "You MUST respond in English" in system_text + + def test_build_responses_system_input_handles_empty_inputs(self): + """When both system and user are empty or contain only placeholders, return fallback.""" + # Case 1: Completely empty + generation = { + "system_message_text": "", + "user_message_text": "", + } + system_text = build_responses_system_input(generation) + assert system_text == "You are a helpful assistant." + + # Case 2: Only placeholders in user template + generation = { + "system_message_text": "", + "user_message_text": "{reference_documents}\n{question}", + } + system_text = build_responses_system_input(generation) + assert system_text == "You are a helpful assistant." + + # Case 4: Unresolved question slot only (cookbook-style minimal user template) + generation = { + "system_message_text": "", + "user_message_text": "{question}", + } + system_text = build_responses_system_input(generation) + assert system_text == "You are a helpful assistant." + + # Case 3: Only OGX-duplicative content that gets stripped + generation = { + "system_message_text": "Answer using ONLY the provided documents.", + "user_message_text": ( + "Answer ONLY using information from the documents below.\n" + "You MUST cite sources using [1], [2].\n" + "{reference_documents}\n{question}" + ), + } + system_text = build_responses_system_input(generation) + assert system_text == "You are a helpful assistant." + + def test_strip_ogx_runtime_partial_sentence_removal(self): + """OGX sentences in a multi-sentence system prompt are removed; others are kept.""" + generation = { + "system_message_text": ( + "You are an expert assistant. " "Answer using ONLY the provided documents. " "Be concise." + ), + "user_message_text": "", + } + result = build_responses_system_input(generation) + assert "You are an expert assistant" in result + assert "Be concise" in result + assert "provided documents" not in result.lower() + + def test_user_grounding_merges_when_system_is_persona_only(self): + """Persona-only system must not suppress non-OGX user supplements (e.g. RAG block).""" + generation = { + "system_message_text": "You are a retrieval-augmented assistant. Use your best judgment.", + "user_message_text": ( + "You are a specialized Retrieval Augmented Generation (RAG) assistant. " + "Prioritize correctness and ensure your response is grounded in the documents.\n" + "{reference_documents}\n{question}" + ), + } + result = build_responses_system_input(generation) + assert "retrieval-augmented assistant" in result + assert "specialized Retrieval Augmented Generation" in result + + def test_extract_static_user_pure_text_no_slots(self): + """Templates without {reference_documents} are invalid and return empty user text.""" + generation = { + "system_message_text": "Short system.", + "user_message_text": "Always respond in a formal tone.", + } + result = build_responses_system_input(generation) + # Invalid template (no {reference_documents}) → system only + assert result == "Short system." + + def test_normalize_answer_scaffold_strips_with_citations(self): + """Answer scaffolds must not retain citation hints owned by OGX.""" + assert normalize_answer_scaffold("Answer (max 150 words, with citations):") == "Answer (max 150 words):" + + def test_build_pattern_json_requires_generation_model_id(self): + """Malformed generation payloads must raise KeyError for required fields.""" + pattern = _make_pattern() + del pattern["settings"]["generation"]["model_id"] + with pytest.raises(KeyError): + build_pattern_json(pattern) + + def test_system_grounding_detection_requires_explicit_policy(self): + """Grounding detection must require explicit 'ONLY' constraint, not just persona.""" + generation_persona_only = { + "system_message_text": "You are a retrieval-augmented assistant. Use your best judgment.", + "user_message_text": "Answer ONLY using information from the documents below.\n{reference_documents}\n{question}", + } + result_persona = build_responses_system_input(generation_persona_only) + # Persona-only system should NOT trigger grounding suppression + assert "retrieval-augmented assistant" in result_persona + assert "use your best judgment" in result_persona.lower() + + generation_explicit = { + "system_message_text": "Answer using ONLY the provided documents.", + "user_message_text": "Answer ONLY using information from the documents below.\n{reference_documents}\n{question}", + } + result_explicit = build_responses_system_input(generation_explicit) + # Explicit grounding system SHOULD suppress redundant user grounding + # Both prompts are OGX-duplicative, so fallback is used + assert result_explicit == "You are a helpful assistant." + + def test_system_grounding_detection_uses_grounding_prefixes(self): + """Grounding detection must cover all ``_GROUNDING_PREFIXES`` entries.""" + generation = { + "system_message_text": "Answer ONLY using information from documents retrieved via file search.", + "user_message_text": ( + "Answer ONLY using information from the documents below.\n{reference_documents}\n{question}" + ), + } + result = build_responses_system_input(generation) + assert result == "You are a helpful assistant." def test_preserves_existing_pattern_fields(self): """Existing pattern fields (name, chunking, embedding, etc.) must not be altered.""" @@ -131,3 +458,38 @@ def test_preserves_existing_pattern_fields(self): assert pattern["name"] == original_name assert pattern["settings"]["chunking"] == original_chunking + + def test_omits_temperature_when_none(self): + """Temperature field must be omitted when None to avoid sending null to API.""" + pattern = _make_pattern() + pattern["settings"]["generation"]["temperature"] = None + + build_pattern_json(pattern) + + assert "temperature" not in pattern["settings"]["responses_template"] + # max_output_tokens should still be present + assert "max_output_tokens" in pattern["settings"]["responses_template"] + + def test_omits_max_output_tokens_when_none(self): + """max_output_tokens field must be omitted when None to avoid sending null to API.""" + pattern = _make_pattern() + pattern["settings"]["generation"]["max_completion_tokens"] = None + + build_pattern_json(pattern) + + assert "max_output_tokens" not in pattern["settings"]["responses_template"] + # temperature should still be present + assert "temperature" in pattern["settings"]["responses_template"] + + def test_system_grounding_detection_no_false_positive_on_embedded_substring(self): + """Grounding detection must not match embedded substrings, only sentence prefixes.""" + generation = { + "system_message_text": "Use only relevant information. All documents do not contain PII.", + "user_message_text": "Answer ONLY using information from the documents below.\n{reference_documents}\n{question}", + } + result = build_responses_system_input(generation) + + # "documents do not contain" is in _GROUNDING_PREFIXES but appears mid-sentence + # Should NOT suppress user grounding since system doesn't start with a grounding prefix + assert "Use only relevant information" in result + assert "All documents do not contain PII" in result diff --git a/tests/unit/ai4rag/rag/chunking/test_docling_chunker.py b/tests/unit/ai4rag/rag/chunking/test_docling_chunker.py index ffbb3af..47ea827 100644 --- a/tests/unit/ai4rag/rag/chunking/test_docling_chunker.py +++ b/tests/unit/ai4rag/rag/chunking/test_docling_chunker.py @@ -78,20 +78,24 @@ def test_contextualize_true_includes_headings(self, doc_with_sections): heading_chunks = [c for c in chunks if c.metadata.get("headings")] assert len(heading_chunks) > 0 for chunk in heading_chunks: - assert chunk.metadata["headings"][0] in chunk.text + # headings is now a string like "Introduction > Section 1" + first_heading = chunk.metadata["headings"].split(" > ")[0] + assert first_heading in chunk.text def test_contextualize_false_excludes_headings(self, doc_with_sections): chunker = DoclingChunker(contextualize=False) chunks = chunker.split_documents([doc_with_sections]) heading_chunks = [c for c in chunks if c.metadata.get("headings")] for chunk in heading_chunks: - heading = chunk.metadata["headings"][0] - assert not chunk.text.startswith(heading) + # headings is now a string like "Introduction > Section 1" + first_heading = chunk.metadata["headings"].split(" > ")[0] + assert not chunk.text.startswith(first_heading) def test_headings_in_metadata(self, chunker, doc_with_sections): chunks = chunker.split_documents([doc_with_sections]) heading_chunks = [c for c in chunks if c.metadata.get("headings")] - heading_values = [c.metadata["headings"][0] for c in heading_chunks] + # headings is now a string like "Introduction" or "Introduction > Section 1" + heading_values = [c.metadata["headings"].split(" > ")[0] for c in heading_chunks] assert "Introduction" in heading_values assert "Methods" in heading_values assert "Results" in heading_values