diff --git a/docs/examples/intrinsics/README.md b/docs/examples/intrinsics/README.md index 180114bef..c6036289b 100644 --- a/docs/examples/intrinsics/README.md +++ b/docs/examples/intrinsics/README.md @@ -31,6 +31,19 @@ Detects when model outputs contain hallucinated information. ### query_rewrite.py Rewrites queries for better retrieval or understanding. +### uncertainty.py +Estimates the model's certainty about answering a question. + +### requirement_check.py +Detect if text adheres to provided requirements. + +### factuality_detection.py +Detects if the model's output is factually incorrect relative to context. + +### factuality_correction.py +Corrects a factually incorrect response relative to context. + + ## Concepts Demonstrated - **Intrinsic Functions**: Specialized model capabilities beyond text generation @@ -48,7 +61,7 @@ from mellea.stdlib.components import Intrinsic import mellea.stdlib.functional as mfuncs # Create backend and adapter -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-8b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") adapter = IntrinsicAdapter("requirement_check", base_model_name=backend.base_model_name) backend.add_adapter(adapter) @@ -71,9 +84,12 @@ out, new_ctx = mfuncs.act( - **context_relevance**: Assess context-query relevance - **hallucination_detection**: Detect hallucinated content - **query_rewrite**: Improve query formulation +- **uncertainty**: Estimate certainty about answering a question +- **factuality_detection**: Detect factually incorrect responses +- **factuality_correction**: Correct factually incorrect responses ## Related Documentation - See `mellea/stdlib/components/intrinsic/` for intrinsic implementations - See `mellea/backends/adapters/` for adapter system -- See `docs/dev/intrinsics_and_adapters.md` for architecture details \ No newline at end of file +- See `docs/dev/intrinsics_and_adapters.md` for architecture details diff --git 
a/docs/examples/intrinsics/factuality_correction.py b/docs/examples/intrinsics/factuality_correction.py new file mode 100644 index 000000000..62777d595 --- /dev/null +++ b/docs/examples/intrinsics/factuality_correction.py @@ -0,0 +1,36 @@ +# pytest: huggingface, requires_heavy_ram, llm + +"""Example usage of the factuality correction intrinsic. + +To run this script from the root of the Mellea source tree, use the command: +``` +uv run python docs/examples/intrinsics/factuality_correction.py +``` +""" + +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Document, Message +from mellea.stdlib.context import ChatContext +from mellea.stdlib.components.intrinsic import guardian + +user_text = "Is Ozzy Osbourne still alive?" +response_text = "Yes, Ozzy Osbourne is alive in 2025 and preparing for another world tour, continuing to amaze fans with his energy and resilience." +document = Document( + # Context says Ozzy Osbourne is dead, but the response says he is alive. + "Ozzy Osbourne passed away on July 22, 2025, at the age of 76 from a heart attack. " + "He died at his home in Buckinghamshire, England, with contributing conditions " + "including coronary artery disease and Parkinson's disease. His final " + "performance took place earlier that month in Birmingham." +) + +# Create the backend. +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") +context = ( + ChatContext() + .add(document) + .add(Message("user", user_text)) + .add(Message("assistant", response_text)) +)
+ +result = guardian.factuality_correction(context, backend) +print(f"Result of factuality correction: {result}") # corrected response string diff --git a/docs/examples/intrinsics/factuality_detection.py b/docs/examples/intrinsics/factuality_detection.py new file mode 100644 index 000000000..52e3abb2a --- /dev/null +++ b/docs/examples/intrinsics/factuality_detection.py @@ -0,0 +1,37 @@ +# pytest: huggingface, requires_heavy_ram, llm + +"""Example usage of the factuality detection intrinsic. + +To run this script from the root of the Mellea source tree, use the command: +``` +uv run python docs/examples/intrinsics/factuality_detection.py +``` +""" + +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Document, Message +from mellea.stdlib.context import ChatContext +from mellea.stdlib.components.intrinsic import guardian + +user_text = "Is Ozzy Osbourne still alive?" +response_text = "Yes, Ozzy Osbourne is alive in 2025 and preparing for another world tour, continuing to amaze fans with his energy and resilience." + +document = Document( + # Context says Ozzy Osbourne is dead, but the response says he is alive. + "Ozzy Osbourne passed away on July 22, 2025, at the age of 76 from a heart attack. " + "He died at his home in Buckinghamshire, England, with contributing conditions " + "including coronary artery disease and Parkinson's disease. His final " + "performance took place earlier that month in Birmingham." +) + +# Create the backend. 
+backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") +context = ( + ChatContext() + .add(document) + .add(Message("user", user_text)) + .add(Message("assistant", response_text)) +) + +result = guardian.factuality_detection(context, backend) +print(f"Result of factuality detection: {result}") # string "yes" or "no" diff --git a/docs/examples/intrinsics/requirement_check.py b/docs/examples/intrinsics/requirement_check.py new file mode 100644 index 000000000..00df9944b --- /dev/null +++ b/docs/examples/intrinsics/requirement_check.py @@ -0,0 +1,51 @@ +# pytest: huggingface, requires_heavy_ram, llm + +"""Example usage of the requirement check intrinsic. + +Intrinsic function that determines if the text satisfies the given requirements. + +To run this script from the root of the Mellea source tree, use the command: +``` +uv run python docs/examples/intrinsics/requirement_check.py +``` +""" + +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Message +from mellea.stdlib.components.intrinsic import core +from mellea.stdlib.context import ChatContext + +user_text = "Invite for an IBM office party." +response_text = """ +Dear Team, + +To celebrate our recent successes and take a well-deserved moment to recharge, +you are cordially invited to a team social. Please join us for an evening of +live music, appetizers, and drinks as we recognize our collective wins. + +Event Details +* **Date:** Saturday, April 25, 2026 +* **Time:** 6:00 PM +* **Location:** Ryan’s Bar, Chelsea, NY +* **Highlights:** Live entertainment and refreshments + +RSVP +To ensure we have an accurate headcount for catering, please confirm your +attendance by **Friday, April 10, 2026**. + +We look forward to seeing everyone there and celebrating our hard work together. + +**Best regards,** +[Your Name/Management Team] +""" +requirement = "Use a professional tone." 
+ +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") +context = ( + ChatContext() + .add(Message("user", user_text)) + .add(Message("assistant", response_text)) +) + +result = core.requirement_check(context, backend, requirement) +print(f"Requirements Satisfied: {result}") # float between 0.0 and 1.0 diff --git a/docs/examples/intrinsics/uncertainty.py b/docs/examples/intrinsics/uncertainty.py new file mode 100644 index 000000000..b0ade6e88 --- /dev/null +++ b/docs/examples/intrinsics/uncertainty.py @@ -0,0 +1,27 @@ +# pytest: huggingface, requires_heavy_ram, llm + +"""Example usage of the uncertainty/certainty intrinsic. + +Evaluates how certain the model is about its response to a user question. +The context should contain a user question followed by an assistant answer. + +To run this script from the root of the Mellea source tree, use the command: +``` +uv run python docs/examples/intrinsics/uncertainty.py +``` +""" + +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Message +from mellea.stdlib.components.intrinsic import core +from mellea.stdlib.context import ChatContext + +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") +context = ( + ChatContext() + .add(Message("user", "What is the square root of 4?")) + .add(Message("assistant", "The square root of 4 is 2.")) +) + +result = core.check_certainty(context, backend) +print(f"Certainty score: {result}") diff --git a/mellea/backends/adapters/catalog.py b/mellea/backends/adapters/catalog.py index e60223493..b78c18ae8 100644 --- a/mellea/backends/adapters/catalog.py +++ b/mellea/backends/adapters/catalog.py @@ -55,14 +55,15 @@ class IntriniscsCatalogEntry(pydantic.BaseModel): _RAG_REPO = "ibm-granite/granite-lib-rag-r1.0" _CORE_REPO = "ibm-granite/rag-intrinsics-lib" +_CORE_R1_REPO = "ibm-granite/granitelib-core-r1.0" _INTRINSICS_CATALOG_ENTRIES = [ ############################################ # Core Intrinsics 
############################################ - IntriniscsCatalogEntry(name="requirement_check", repo_id=_CORE_REPO), - IntriniscsCatalogEntry(name="uncertainty", repo_id=_CORE_REPO), + IntriniscsCatalogEntry(name="requirement_check", repo_id=_CORE_R1_REPO), + IntriniscsCatalogEntry(name="uncertainty", repo_id=_CORE_R1_REPO), ############################################ # RAG Intrinsics ############################################ diff --git a/mellea/stdlib/components/intrinsic/_util.py b/mellea/stdlib/components/intrinsic/_util.py new file mode 100644 index 000000000..2d5740088 --- /dev/null +++ b/mellea/stdlib/components/intrinsic/_util.py @@ -0,0 +1,57 @@ +"""Shared utilities for intrinsic convenience wrappers.""" + +import json + +from ....backends import ModelOption +from ....backends.adapters import AdapterMixin, AdapterType, IntrinsicAdapter +from ....stdlib import functional as mfuncs +from ...context import ChatContext +from .intrinsic import Intrinsic + + +def call_intrinsic( + intrinsic_name: str, + context: ChatContext, + backend: AdapterMixin, + /, + kwargs: dict | None = None, +): + """Shared code for invoking intrinsics. + + :returns: Result of the call in JSON format. + """ + # Adapter needs to be present in the backend before it can be invoked. + # We must create the Adapter object in order to determine whether we need to create + # the Adapter object. + base_model_name = backend.base_model_name + if base_model_name is None: + raise ValueError("Backend has no model ID") + adapter = IntrinsicAdapter( + intrinsic_name, adapter_type=AdapterType.LORA, base_model_name=base_model_name + ) + if adapter.qualified_name not in backend.list_adapters(): + backend.add_adapter(adapter) + + # Create the AST node for the action we wish to perform. + intrinsic = Intrinsic(intrinsic_name, intrinsic_kwargs=kwargs) + + # Execute the AST node. 
+ model_output_thunk, _ = mfuncs.act( + intrinsic, + context, + backend, + model_options={ModelOption.TEMPERATURE: 0.0}, + # No rejection sampling, please + strategy=None, + ) + + # act() can return a future. Don't know how to handle one from non-async code. + assert model_output_thunk.is_computed() + + # Output of an Intrinsic action is the string representation of the output of the + # intrinsic. Parse the string. + result_str = model_output_thunk.value + if result_str is None: + raise ValueError("Model output is None.") + result_json = json.loads(result_str) + return result_json diff --git a/mellea/stdlib/components/intrinsic/core.py b/mellea/stdlib/components/intrinsic/core.py new file mode 100644 index 000000000..d0a06da46 --- /dev/null +++ b/mellea/stdlib/components/intrinsic/core.py @@ -0,0 +1,52 @@ +"""Intrinsic functions for core model capabilities.""" + +from ....backends.adapters import AdapterMixin +from ...components import Message +from ...context import ChatContext +from ._util import call_intrinsic + + +def check_certainty(context: ChatContext, backend: AdapterMixin) -> float: + """Estimate the model's certainty about its last response. + + Intrinsic function that evaluates how certain the model is about the + assistant's response to a user's question. The context should end with + a user question followed by an assistant answer. + + :param context: Chat context containing user question and assistant answer. + :param backend: Backend instance that supports LoRA/aLoRA adapters. + + :return: Certainty score as a float (higher = more certain). + """ + result_json = call_intrinsic("uncertainty", context, backend) + return result_json["certainty"] + + +_EVALUATION_PROMPT = ( + "Please verify if the assistant's generation satisfies the user's " + "requirements or not and reply with a binary label accordingly. 
" + 'Respond with a json {"score": "yes"} if the constraints are ' + 'satisfied or respond with {"score": "no"} if the constraints are not ' + "satisfied." +) + + +def requirement_check( + context: ChatContext, backend: AdapterMixin, requirement: str +) -> float: + """Detect if text adheres to provided requirements. + + Intrinsic function that determines if the text satisfies the given + requirements. Appends an evaluation prompt to the context following + the format specified by the Granite Guardian requirement checker model card. + + :param context: Chat context containing user question and assistant answer. + :param backend: Backend instance that supports LoRA/aLoRA adapters. + :param requirement: set of requirements to satisfy + + :return: Score as a float between 0.0 and 1.0 (higher = more likely satisfied). + """ + eval_message = f": {requirement}\n{_EVALUATION_PROMPT}" + context = context.add(Message("user", eval_message)) + result_json = call_intrinsic("requirement_check", context, backend) + return result_json["requirement_check"]["score"] diff --git a/mellea/stdlib/components/intrinsic/guardian.py b/mellea/stdlib/components/intrinsic/guardian.py new file mode 100644 index 000000000..ba5e70717 --- /dev/null +++ b/mellea/stdlib/components/intrinsic/guardian.py @@ -0,0 +1,57 @@ +"""Intrinsic functions for the Guardian component.""" + +from ....backends.adapters import AdapterMixin +from ...components import Message +from ...context import ChatContext +from ._util import call_intrinsic + + +def factuality_detection(context: ChatContext, backend: AdapterMixin) -> float: + """Determine is the last response is factually incorrect. + + Intrinsic function that evaluates the factuality of the + assistant's response to a user's question. The context should end with + a user question followed by an assistant answer. + + :param context: Chat context containing user question and assistant answer. + :param backend: Backend instance that supports LoRA/aLoRA adapters. 
+ + :return: Factuality score as a "yes/no" label (yes = factually incorrect). + """ + + detector_message = """ +As a judge agent, your role is to help assess whether the provided text meets the given judging criteria, utilizing all available information, including conversations, documents, and tools. + +### Criteria: A factually incorrect response occurs when the assistant's message contains one or more factual claims that are unsupported by, inconsistent with, or directly contradicted by the information provided in the documents or context. This includes situations where the assistant: introduces details not grounded in the context, misstates or distorts facts contained within the context, misinterprets the meaning or implications of the context, supplies erroneous or conflicting information relative to the context. Even if only a small portion of the response contains such inaccuracies, the overall message is considered factually incorrect. + +### Scoring Schema: If the last assistant's text meets the criteria, return 'yes'; otherwise, return 'no'. +""" + + context = context.add(Message("user", detector_message)) + result_json = call_intrinsic("factuality_detection", context, backend) + return result_json["score"] + +def factuality_correction(context: ChatContext, backend: AdapterMixin) -> str: + """Corrects the last response so that it is factually correct relative + to the given contextual information. + + Intrinsic function that corrects the assistant's response to a user's + question relative to the given context. + + :param context: Chat context containing user question and assistant answer. + :param backend: Backend instance that supports LoRA/aLoRA adapters. + + :return: Corrected assistant response. + """ + + corrector_message = """ +As a judge agent, your role is to help assess whether the provided text meets the given judging criteria, utilizing all available information, including conversations, documents, and tools. 
+ +### Criteria: A factually incorrect response occurs when the assistant's message contains one or more factual claims that are unsupported by, inconsistent with, or directly contradicted by the information provided in the documents or context. This includes situations where the assistant: introduces details not grounded in the context, misstates or distorts facts contained within the context, misinterprets the meaning or implications of the context, supplies erroneous or conflicting information relative to the context. Even if only a small portion of the response contains such inaccuracies, the overall message is considered factually incorrect. + +### Scoring Schema: If the last assistant's text meets the criteria, return a corrected version of the assistant's message based on the given context; otherwise, return 'none'. +""" + + context = context.add(Message("user", corrector_message)) + result_json = call_intrinsic("factuality_correction", context, backend) + return result_json["correction"] diff --git a/mellea/stdlib/components/intrinsic/rag.py b/mellea/stdlib/components/intrinsic/rag.py index cf14d24b1..14a06c227 100644 --- a/mellea/stdlib/components/intrinsic/rag.py +++ b/mellea/stdlib/components/intrinsic/rag.py @@ -1,15 +1,12 @@ """Intrinsic functions related to retrieval-augmented generation.""" import collections.abc -import json -from ....backends import ModelOption -from ....backends.adapters import AdapterMixin, AdapterType, IntrinsicAdapter -from ....stdlib import functional as mfuncs +from ....backends.adapters import AdapterMixin from ...components import Document from ...context import ChatContext from ..chat import Message -from .intrinsic import Intrinsic +from ._util import call_intrinsic _ANSWER_RELEVANCE_CORRECTION_METHODS = { "Excessive unnecessary information": "removing the excessive information from the " @@ -30,54 +27,6 @@ so it's important to stick to in-domain prompts.""" -def _call_intrinsic( - intrinsic_name: str, - context: 
ChatContext, - backend: AdapterMixin, - /, - kwargs: dict | None = None, -): - """Shared code for invoking intrinsics. - - :returns: Result of the call in JSON format. - """ - # Adapter needs to be present in the backend before it can be invoked. - # We must create the Adapter object in order to determine whether we need to create - # the Adapter object. - base_model_name = backend.base_model_name - if base_model_name is None: - raise ValueError("Backend has no model ID") - adapter = IntrinsicAdapter( - intrinsic_name, adapter_type=AdapterType.LORA, base_model_name=base_model_name - ) - if adapter.qualified_name not in backend.list_adapters(): - backend.add_adapter(adapter) - - # Create the AST node for the action we wish to perform. - intrinsic = Intrinsic(intrinsic_name, intrinsic_kwargs=kwargs) - - # Execute the AST node. - model_output_thunk, _ = mfuncs.act( - intrinsic, - context, - backend, - model_options={ModelOption.TEMPERATURE: 0.0}, - # No rejection sampling, please - strategy=None, - ) - - # act() can return a future. Don't know how to handle one from non-async code. - assert model_output_thunk.is_computed() - - # Output of an Intrinsic action is the string representation of the output of the - # intrinsic. Parse the string. - result_str = model_output_thunk.value - if result_str is None: - raise ValueError("Model output is None.") - result_json = json.loads(result_str) - return result_json - - def check_answerability( question: str, documents: collections.abc.Iterable[Document], @@ -101,7 +50,7 @@ def check_answerability( Returns: Answerability score as a floating-point value from 0 to 1. """ - result_json = _call_intrinsic( + result_json = call_intrinsic( "answerability", context.add(Message("user", question, documents=list(documents))), backend, @@ -126,7 +75,7 @@ def rewrite_question( Returns: Rewritten version of ``question``. 
""" - result_json = _call_intrinsic( + result_json = call_intrinsic( "query_rewrite", context.add(Message("user", question)), backend ) return result_json["rewritten_question"] @@ -155,7 +104,7 @@ def clarify_query( Clarification question string (e.g., "Do you mean A or B?"), or the string "CLEAR" if no clarification is needed. """ - result_json = _call_intrinsic( + result_json = call_intrinsic( "query_clarification", context.add(Message("user", question, documents=list(documents))), backend, @@ -190,7 +139,7 @@ def find_citations( ``citation_end``, ``citation_text``. Begin and end offsets are character offsets into their respective UTF-8 strings. """ - result_json = _call_intrinsic( + result_json = call_intrinsic( "citations", context.add(Message("assistant", response, documents=list(documents))), backend, @@ -217,7 +166,7 @@ def check_context_relevance( Returns: Context relevance score as a floating-point value from 0 to 1. """ - result_json = _call_intrinsic( + result_json = call_intrinsic( "context_relevance", context.add(Message("user", question)), backend, @@ -252,7 +201,7 @@ def flag_hallucinated_content( ``response_end``, ``response_text``, ``faithfulness_likelihood``, ``explanation``. 
""" - result_json = _call_intrinsic( + result_json = call_intrinsic( "hallucination_detection", context.add(Message("assistant", response, documents=list(documents))), backend, @@ -289,7 +238,7 @@ def rewrite_answer_for_relevance( # * answer_relevance_analysis # * answer_relevance_category # * answer_relevance_likelihood - result_json = _call_intrinsic( + result_json = call_intrinsic( "answer_relevance_classifier", context.add(Message("assistant", response, documents=list(documents))), backend, @@ -305,7 +254,7 @@ def rewrite_answer_for_relevance( result_json["answer_relevance_category"] ] - result_json = _call_intrinsic( + result_json = call_intrinsic( "answer_relevance_rewriter", context.add(Message("assistant", response, documents=list(documents))), backend, diff --git a/test/stdlib/components/intrinsic/test_core.py b/test/stdlib/components/intrinsic/test_core.py new file mode 100644 index 000000000..b3fa8feb6 --- /dev/null +++ b/test/stdlib/components/intrinsic/test_core.py @@ -0,0 +1,94 @@ +"""Tests of the code in ``mellea.stdlib.intrinsics.core``""" + +import gc +import json +import os +import pathlib + +import pytest +import torch + +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Message +from mellea.stdlib.components.intrinsic import core +from mellea.stdlib.context import ChatContext + +# Skip entire module in CI since all tests are qualitative +pytestmark = [ + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping core intrinsic tests in CI - all qualitative tests", + ), + pytest.mark.huggingface, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.llm, +] + +DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata" +"""Location of data files for the tests in this file.""" + + +BASE_MODEL = "ibm-granite/granite-4.0-micro" + + +@pytest.fixture(name="backend", scope="module") +def _backend(): + """Backend used by the tests in this file. 
Module-scoped to avoid reloading the 3B model for each test.""" + # Prevent thrashing if the default device is CPU + torch.set_num_threads(4) + + backend_ = LocalHFBackend(model_id=BASE_MODEL) + yield backend_ + + # Code after yield is cleanup code. + # Free GPU memory with extreme prejudice. + del backend_ + gc.collect() + gc.collect() + gc.collect() + torch.cuda.empty_cache() + + +def _read_input_json(file_name: str): + """Read test data from JSON and convert to a ChatContext. + + Returns the context and the raw JSON data (for accessing extra fields + like ``requirement``). + """ + with open(DATA_ROOT / "input_json" / file_name, encoding="utf-8") as f: + json_data = json.load(f) + + context = ChatContext() + for m in json_data["messages"]: + context = context.add(Message(m["role"], m["content"])) + return context, json_data + + +@pytest.mark.qualitative +def test_certainty(backend): + """Verify that the uncertainty/certainty intrinsic functions properly.""" + context, _ = _read_input_json("uncertainty.json") + + result = core.check_certainty(context, backend) + assert 0.0 <= result <= 1.0 + + result2 = core.check_certainty(context, backend) + assert 0.0 <= result2 <= 1.0 + + +@pytest.mark.qualitative +def test_requirement_check(backend): + """Verify that the requirement check intrinsic functions properly.""" + context, json_data = _read_input_json("requirement_check.json") + requirement = json_data["requirement"] + + result = core.requirement_check(context, backend, requirement) + assert 0.0 <= result <= 1.0 + + result2 = core.requirement_check(context, backend, requirement) + assert 0.0 <= result2 <= 1.0 + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/stdlib/components/intrinsic/test_guardian.py b/test/stdlib/components/intrinsic/test_guardian.py new file mode 100644 index 000000000..fc19374e6 --- /dev/null +++ b/test/stdlib/components/intrinsic/test_guardian.py @@ -0,0 +1,88 @@ +"""Tests of the code in 
``mellea.stdlib.intrinsics.core``""" + +import gc +import json +import os +import pathlib + +import pytest +import torch + +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Message, Document +from mellea.stdlib.components.intrinsic import guardian +from mellea.stdlib.context import ChatContext + +# Skip entire module in CI since all tests are qualitative +pytestmark = [ + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping core intrinsic tests in CI - all qualitative tests", + ), + pytest.mark.huggingface, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.llm, +] + +DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata" +"""Location of data files for the tests in this file.""" + + +BASE_MODEL = "ibm-granite/granite-4.0-micro" + + +@pytest.fixture(name="backend", scope="module") +def _backend(): + """Backend used by the tests in this file. Module-scoped to avoid reloading the 3B model for each test.""" + # Prevent thrashing if the default device is CPU + torch.set_num_threads(4) + + backend_ = LocalHFBackend(model_id=BASE_MODEL) + yield backend_ + + # Code after yield is cleanup code. + # Free GPU memory with extreme prejudice. + del backend_ + gc.collect() + gc.collect() + gc.collect() + torch.cuda.empty_cache() + + +def _read_input_json(file_name: str): + """Read test data from JSON and convert to a ChatContext. + + Returns the context and the raw JSON data (for accessing extra fields + like ``requirement``). 
+ """ + with open(DATA_ROOT / "input_json" / file_name, encoding="utf-8") as f: + json_data = json.load(f) + + context = ChatContext() + for m in json_data["messages"]: + context = context.add(Message(m["role"], m["content"])) + + if "extra_body" in json_data and "documents" in json_data["extra_body"]: + for d in json_data["extra_body"]["documents"]: + context.add(Document(text=d["text"], doc_id=d["doc_id"])) + return context, json_data + +@pytest.mark.qualitative +def test_factuality_detection(backend): + """Verify that the factuality detection intrinsic functions properly.""" + context, _ = _read_input_json("factuality_detection.json") + + result = guardian.factuality_detection(context, backend) + assert result == "yes" or result == "no" + +@pytest.mark.qualitative +def test_factuality_correction(backend): + """Verify that the factuality correction intrinsic functions properly.""" + context, _ = _read_input_json("factuality_correction.json") + + result = guardian.factuality_correction(context, backend) + assert isinstance(result, str) + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 17b306910..3a242bd70 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -197,7 +197,7 @@ def test_answer_relevance_classifier(backend): """Verify that the first phase of the answer relevance flow behaves as expectee.""" context, answer, docs = _read_input_json("answer_relevance.json") - result_json = rag._call_intrinsic( + result_json = rag.call_intrinsic( "answer_relevance_classifier", context.add(Message("assistant", answer, documents=list(docs))), backend, diff --git a/test/stdlib/components/intrinsic/testdata/input_json/factuality_correction.json b/test/stdlib/components/intrinsic/testdata/input_json/factuality_correction.json new file mode 100644 index 000000000..a21302ad5 --- /dev/null +++ 
b/test/stdlib/components/intrinsic/testdata/input_json/factuality_correction.json @@ -0,0 +1,20 @@ +{ + "messages": [ + { + "role": "user", + "content": "Is Ozzy Osbourne still alive?" + }, + { + "role": "assistant", + "content": "Yes, Ozzy Osbourne is alive in 2025 and preparing for another world tour, continuing to amaze fans with his energy and resilience." + } + ], + "extra_body": { + "documents": [ + { + "doc_id": "1", + "text": "Ozzy Osbourne passed away on July 22, 2025, at the age of 76 from a heart attack. He died at his home in Buckinghamshire, England, with contributing conditions including coronary artery disease and Parkinson's disease. His final performance took place earlier that month in Birmingham." + } + ] + } +} \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/testdata/input_json/factuality_detection.json b/test/stdlib/components/intrinsic/testdata/input_json/factuality_detection.json new file mode 100644 index 000000000..a21302ad5 --- /dev/null +++ b/test/stdlib/components/intrinsic/testdata/input_json/factuality_detection.json @@ -0,0 +1,20 @@ +{ + "messages": [ + { + "role": "user", + "content": "Is Ozzy Osbourne still alive?" + }, + { + "role": "assistant", + "content": "Yes, Ozzy Osbourne is alive in 2025 and preparing for another world tour, continuing to amaze fans with his energy and resilience." + } + ], + "extra_body": { + "documents": [ + { + "doc_id": "1", + "text": "Ozzy Osbourne passed away on July 22, 2025, at the age of 76 from a heart attack. He died at his home in Buckinghamshire, England, with contributing conditions including coronary artery disease and Parkinson's disease. His final performance took place earlier that month in Birmingham." 
+ } + ] + } +} \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/testdata/input_json/requirement_check.json b/test/stdlib/components/intrinsic/testdata/input_json/requirement_check.json new file mode 100644 index 000000000..e8c2065d8 --- /dev/null +++ b/test/stdlib/components/intrinsic/testdata/input_json/requirement_check.json @@ -0,0 +1,13 @@ +{ + "messages": [ + { + "role": "user", + "content": "Invite for an IBM office party." + }, + { + "role": "assistant", + "content": "\nDear Team,\n\nTo celebrate our recent successes and take a well-deserved moment to recharge,\nyou are cordially invited to a team social. Please join us for an evening of\nlive music, appetizers, and drinks as we recognize our collective wins.\n\nEvent Details\n* **Date:** Saturday, April 25, 2026\n* **Time:** 6:00 PM\n* **Location:** Ryan's Bar, Chelsea, NY\n* **Highlights:** Live entertainment and refreshments\n\nRSVP\nTo ensure we have an accurate headcount for catering, please confirm your\nattendance by **Friday, April 10, 2026**.\n\nWe look forward to seeing everyone there and celebrating our hard work together.\n\n**Best regards,**\n[Your Name/Management Team]\n" + } + ], + "requirement": "Use a professional tone." +} diff --git a/test/stdlib/components/intrinsic/testdata/input_json/uncertainty.json b/test/stdlib/components/intrinsic/testdata/input_json/uncertainty.json new file mode 100644 index 000000000..045c0e667 --- /dev/null +++ b/test/stdlib/components/intrinsic/testdata/input_json/uncertainty.json @@ -0,0 +1,12 @@ +{ + "messages": [ + { + "role": "user", + "content": "What is the square root of 4?" + }, + { + "role": "assistant", + "content": "The square root of 4 is 2." + } + ] +} \ No newline at end of file