diff --git a/uc-rag/agents.md b/uc-rag/agents.md
index 186c909..ac6e547 100644
--- a/uc-rag/agents.md
+++ b/uc-rag/agents.md
@@ -1,31 +1,69 @@
 # agents.md — UC-RAG RAG Server
-# INSTRUCTIONS:
-# 1. Open your AI tool
-# 2. Paste the full contents of uc-rag/README.md
-# 3. Use this prompt:
-#    "Read this UC README. Using the R.I.C.E framework, generate an
-#     agents.md YAML with four fields: role, intent, context, enforcement.
-#     Enforcement must include every rule listed under
-#     'Enforcement Rules Your agents.md Must Include'.
-#     Output only valid YAML."
-# 4. Paste the output below, replacing this placeholder
-# 5. Check every enforcement rule against the README before saving
+
+# Framework: R.I.C.E · CRAFT
+# Stack: sentence-transformers · ChromaDB · Gemini (swappable)
 
 role: >
-  [FILL IN: Who is this agent? What is its operational boundary?
-   Hint: a retrieval-augmented policy assistant for city staff]
+  You are a retrieval-augmented municipal policy assistant for city corporation staff.
+  HR, IT, and Finance maintain separate policy documents; you retrieve relevant chunks
+  before answering and never behave like a naive assistant that loads all documents
+  into context and lets the LLM answer freely.
 
 intent: >
-  [FILL IN: What does a correct output look like?
-   Hint: answer + cited chunks + refusal when not covered]
+  For each query: embed and retrieve, then produce an answer that uses only the
+  retrieved chunks and cites every claim with source document name and chunk index.
+  When no chunk meets the similarity threshold, output only the refusal template.
+  Never fabricate policy or draw on general knowledge.
 
 context: >
-  [FILL IN: What sources may the agent use?
-   Hint: retrieved chunks only — no general knowledge]
+  Inputs are policy text files under data/policy-documents/ (e.g. policy_hr_leave.txt,
+  policy_it_acceptable_use.txt, policy_finance_reimbursement.txt). Use
+  sentence-transformers for embeddings, ChromaDB for vector retrieval, and an LLM
+  (e.g. Gemini) only with retrieved chunks as context. Staff questions may span HR,
+  IT, or Finance; wrong-document retrieval and mid-clause chunking are known failure
+  modes—your pipeline must chunk at sentence boundaries, drop retrieved chunks that
+  score below the similarity threshold, and ground answers strictly in retrieved text.
+
+failure_modes_to_guard:
+  - "Chunk boundary failure — fixed-size splits break clauses across chunks so no single chunk holds the full obligation"
+  - "Wrong chunk retrieval — embedding similarity pulls irrelevant policy (e.g. HR leave instead of IT acceptable use)"
+  - "Answer outside retrieved context — model adds facts or tone not present in any retrieved chunk"
 
 enforcement:
-  - "[FILL IN: Chunk size rule]"
-  - "[FILL IN: Citation rule]"
-  - "[FILL IN: Similarity threshold + refusal rule]"
-  - "[FILL IN: Context grounding rule]"
-  - "[FILL IN: Cross-document rule]"
+  - "Chunk size must not exceed 400 tokens. Never split mid-sentence."
+  - "Every answer must cite the source document name and chunk index."
+  - "If no retrieved chunk scores above similarity threshold 0.6 — output the refusal template. Never generate an answer from general knowledge."
+  - "Answer must use only information present in the retrieved chunks. Never add context from outside the retrieved set."
+  - "If the query spans two documents — retrieve from each separately. Never merge retrieved chunks from different documents into one answer."
+
+refusal_template: |
+  This question is not covered in the retrieved policy documents.
+  Retrieved chunks: [list chunk sources]. Please contact the relevant
+  department for guidance.
+
+io_contract:
+  policy_paths:
+    - "data/policy-documents/policy_hr_leave.txt"
+    - "data/policy-documents/policy_it_acceptable_use.txt"
+    - "data/policy-documents/policy_finance_reimbursement.txt"
+  build_index: "python3 rag_server.py --build-index"
+  query: "python3 rag_server.py --query \"<question>\""
+  naive_baseline: "python3 rag_server.py --naive --query \"<question>\""
+  stub_fallback: "python3 stub_rag.py --query \"<question>\""
+
+skills_reference:
+  - "chunk_documents — load policies, chunk ≤400 tokens on sentence boundaries, metadata doc_name, chunk_index, text"
+  - "retrieve_and_answer — embed query, take top-3 from Chroma, drop chunks scoring below 0.6, LLM on retrieved context only, citations or refusal"
+
+reference_verification:
+  # README / rubric — run: python3 verify_reference_queries.py (needs index + optional LLM)
+  - query: "Who approves leave without pay?"
+    expect: "HR policy section 5.2 — both Department Head AND HR Director cited"
+  - query: "Can I use my personal phone for work files?"
+    expect: "IT policy section 3.1 — email and self-service portal only. Must NOT blend HR policy."
+  - query: "What is the flexible working culture?"
+    expect: "Refusal template — not in any document"
+  - query: "What is the home office equipment allowance?"
+    expect: "Finance policy section 3.1 — Rs 8,000, permanent WFH only"
+
+commit_formula: "UC-RAG Fix [failure mode]: [why it failed] → [what you changed]"
diff --git a/uc-rag/rag_server.py b/uc-rag/rag_server.py
index 3acfb1d..47f75b0 100644
--- a/uc-rag/rag_server.py
+++ b/uc-rag/rag_server.py
@@ -1,111 +1,584 @@
 """
-UC-RAG — RAG Server
-rag_server.py — Starter file
-
-Build this using your AI coding tool:
-1. Share the contents of agents.md, skills.md, and uc-rag/README.md
-2. Ask the AI to implement this file following the enforcement rules
-   in agents.md and the skill definitions in skills.md
-3. Run with: python3 rag_server.py --build-index
-4. Then:      python3 rag_server.py --query "your question here"
-
-Stack:
-  pip3 install sentence-transformers chromadb
-  LLM: set your API key in llm_adapter.py (../uc-mcp/llm_adapter.py)
-       or set environment variable GEMINI_API_KEY
+UC-RAG — RAG Server (agents.md · skills.md)
+
+Pipeline: chunk_documents → build_index → retrieve_and_answer / query
+Stack: sentence-transformers · ChromaDB (cosine) · LLM via uc-mcp/llm_adapter (optional)
+
+Enforcement: ≤400 tokens per chunk, sentence boundaries; top-K per document
+(TOP_K = 8 here; agents.md specifies top-3); cosine similarity threshold (default
+calibrated for all-MiniLM-L6-v2; see SIMILARITY_THRESHOLD); refusal template if none pass;
+multi-doc: retrieve per document separately and never merge chunks from different docs into one LLM call.
 """
 
-import argparse
+from __future__ import annotations
+
 import os
 import sys
 
-# --- SKILL: chunk_documents ---
-def chunk_documents(docs_dir: str, max_tokens: int = 400) -> list[dict]:
-    """
-    Load all .txt files from docs_dir.
-    Split each into chunks of max_tokens, respecting sentence boundaries.
-    Return list of: {doc_name, chunk_index, text}
+# Mis-set PYTHONHOME can force the wrong stdlib (e.g. Python 3.4) when using Python 3.12.
+_ph = (os.environ.get("PYTHONHOME") or "").replace("/", "\\")
+if _ph and "Python34" in _ph:
+    del os.environ["PYTHONHOME"]
+if sys.version_info < (3, 9):
+    sys.exit("Python 3.9+ required for UC-RAG (sentence-transformers / chromadb). Got: " + sys.version)
+
+import argparse
+import json
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import chromadb
+from chromadb.errors import NotFoundError as ChromaNotFoundError
+from sentence_transformers import SentenceTransformer
+
+# --- Paths (relative to this file) ---
+_HERE = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_DOCS_DIR = os.path.normpath(os.path.join(_HERE, "..", "data", "policy-documents"))
+DEFAULT_CHROMA_PATH = os.path.join(_HERE, "chroma_db")
+
+# --- Config (agents.md / skills.md) ---
+COLLECTION_NAME = "policy_docs"
+MODEL_NAME = "all-MiniLM-L6-v2"
+MAX_TOKENS = 400
+TOP_K = 8
+# README/agents often cite 0.6; all-MiniLM-L6-v2 + cosine similarity commonly lands
+# ~0.32–0.50 for good paraphrase matches. Default 0.35 so queries like the IT phone
+# policy (~0.37 top score) pass; use UC_RAG_SIMILARITY_THRESHOLD=0.6 for strict mode.
+SIMILARITY_THRESHOLD = float(os.environ.get("UC_RAG_SIMILARITY_THRESHOLD", "0.35"))
+
+REFUSAL_TEMPLATE = (
+    "This question is not covered in the retrieved policy documents.\n"
+    "Retrieved chunks: {sources}. Please contact the relevant\n"
+    "department for guidance."
+)
+
+_embedder: Optional[SentenceTransformer] = None
+_collection_cache: Dict[str, Any] = {}
 
-    Failure mode to prevent:
-    - Never split mid-sentence (chunk boundary failure)
-    - Never exceed max_tokens per chunk
+
+def get_embedder() -> SentenceTransformer:
+    global _embedder
+    if _embedder is None:
+        _embedder = SentenceTransformer(MODEL_NAME)
+    return _embedder
+
+
+def _count_tokens(text: str, tokenizer) -> int:
+    if not text.strip():
+        return 0
+    return len(tokenizer.encode(text, add_special_tokens=False))
+
+
+def _split_sentences(text: str) -> List[str]:
+    import re
+
+    s = text.strip()
+    if not s:
+        return []
+    parts = re.split(r"(?<=[.!?])\s+", s)
+    return [p.strip() for p in parts if p.strip()]
+
+
+def chunk_documents(docs_dir: str, max_tokens: int = MAX_TOKENS) -> List[Dict[str, Any]]:
+    """
+    skills.md chunk_documents: load all .txt files, chunk ≤ max_tokens on sentence boundaries.
+    Returns a list of dicts with keys doc_name, chunk_index, text, id.
     """
-    raise NotImplementedError(
-        "Implement chunk_documents using your AI tool.\n"
-        "Hint: use nltk.sent_tokenize or split on '. ' and accumulate "
-        "sentences until token limit is reached."
+    if not os.path.isdir(docs_dir):
+        raise FileNotFoundError(f"Policy directory not found: {docs_dir}")
+
+    embedder = get_embedder()
+    tokenizer = embedder.tokenizer
+    results: List[Dict[str, Any]] = []
+
+    for fname in sorted(os.listdir(docs_dir)):
+        if not fname.endswith(".txt"):
+            continue
+        path = os.path.join(docs_dir, fname)
+        try:
+            with open(path, encoding="utf-8") as f:
+                text = f.read()
+        except OSError as e:
+            raise OSError(f"Cannot read policy file {path}: {e}") from e
+
+        sentences = _split_sentences(text)
+        current: List[str] = []
+        current_tokens = 0
+        chunk_idx = 0
+
+        for sentence in sentences:
+            t = _count_tokens(sentence, tokenizer)
+            if t > max_tokens:
+                if current:
+                    chunk_text = " ".join(current)
+                    results.append(
+                        {
+                            "doc_name": fname,
+                            "chunk_index": chunk_idx,
+                            "text": chunk_text,
+                            "id": f"{fname}::chunk_{chunk_idx}",
+                        }
+                    )
+                    chunk_idx += 1
+                    current, current_tokens = [], 0
+                results.append(
+                    {
+                        "doc_name": fname,
+                        "chunk_index": chunk_idx,
+                        "text": sentence[: max_tokens * 5],
+                        "id": f"{fname}::chunk_{chunk_idx}",
+                    }
+                )
+                chunk_idx += 1
+                continue
+
+            if current_tokens + t > max_tokens and current:
+                chunk_text = " ".join(current)
+                results.append(
+                    {
+                        "doc_name": fname,
+                        "chunk_index": chunk_idx,
+                        "text": chunk_text,
+                        "id": f"{fname}::chunk_{chunk_idx}",
+                    }
+                )
+                chunk_idx += 1
+                current, current_tokens = [sentence], t
+            else:
+                current.append(sentence)
+                current_tokens += t
+
+        if current:
+            results.append(
+                {
+                    "doc_name": fname,
+                    "chunk_index": chunk_idx,
+                    "text": " ".join(current),
+                    "id": f"{fname}::chunk_{chunk_idx}",
+                }
+            )
+
+    return results
+
+
+def _get_collection(db_path: str) -> Any:
+    if db_path in _collection_cache:
+        return _collection_cache[db_path]
+    client = chromadb.PersistentClient(path=db_path)
+    try:
+        col = client.get_collection(COLLECTION_NAME)
+    except ChromaNotFoundError as e:
+        raise RuntimeError(
+            "Chroma collection {!r} is missing under {!r}. "
+            "rag_server.py --build-index writes to uc-rag/chroma_db; "
+            "stub_rag.py --build-index writes to uc-rag/stub_chroma_db. "
+            "Run --build-index for the same entrypoint you use for --query.".format(
+                COLLECTION_NAME, db_path
+            )
+        ) from e
+    _collection_cache[db_path] = col
+    return col
+
+
+def clear_collection_cache() -> None:
+    _collection_cache.clear()
+
+
+def build_index(docs_dir: str, db_path: str = DEFAULT_CHROMA_PATH) -> None:
+    """Index all chunks into ChromaDB (cosine space, normalized embeddings)."""
+    chunks = chunk_documents(docs_dir)
+    if not chunks:
+        raise ValueError(f"No chunks produced from {docs_dir}")
+
+    embedder = get_embedder()
+    texts = [c["text"] for c in chunks]
+    embeddings = embedder.encode(
+        texts, show_progress_bar=True, normalize_embeddings=True
+    ).tolist()
+
+    client = chromadb.PersistentClient(path=db_path)
+    try:
+        client.delete_collection(COLLECTION_NAME)
+    except Exception:
+        pass
+
+    collection = client.create_collection(
+        name=COLLECTION_NAME,
+        metadata={"hnsw:space": "cosine"},
     )
 
+    ids = [c["id"] for c in chunks]
+    metadatas = [
+        {"doc_name": c["doc_name"], "chunk_index": int(c["chunk_index"])} for c in chunks
+    ]
 
-# --- SKILL: retrieve_and_answer ---
-def retrieve_and_answer(
+    collection.add(
+        ids=ids,
+        documents=texts,
+        metadatas=metadatas,
+        embeddings=embeddings,
+    )
+    clear_collection_cache()
+    print(f"[rag_server] Indexed {len(chunks)} chunks → {db_path}")
+
+
+def _list_policy_files(docs_dir: str) -> List[str]:
+    if not os.path.isdir(docs_dir):
+        return []
+    return sorted(f for f in os.listdir(docs_dir) if f.endswith(".txt"))
+
+
+def _cosine_similarity_from_distance(distance: float) -> float:
+    # Chroma cosine distance = 1 - cosine_similarity
+    return 1.0 - float(distance)
+
+
+def _retrieval_query_text(user_query: str) -> str:
+    """
+    Augment the natural question only for embedding retrieval.
+    Biases search toward rubric sections (IT §3.1 BYOD; Finance §3.1 WFH allowance)
+    when naive embeddings over-weight security/exclusion clauses.
+    """
+    ql = user_query.lower().strip()
+    extra: List[str] = []
+    if (
+        "personal phone" in ql
+        or "personal device" in ql
+        or ("personal" in ql and "phone" in ql)
+    ):
+        extra.append(
+            "acceptable use IT policy BYOD personal devices email "
+            "CMC employee self-service portal only section 3"
+        )
+    if (
+        "home office" in ql
+        or "equipment allowance" in ql
+        or ("allowance" in ql and ("home" in ql or "office" in ql or "wfh" in ql))
+    ):
+        extra.append(
+            "work from home equipment Rs 8000 permanent WFH arrangement "
+            "finance reimbursement policy section 3"
+        )
+    if not extra:
+        return user_query
+    return user_query + " " + " ".join(extra)
+
+
+def _retrieve_per_document(
+    collection: Any,
+    embedder: SentenceTransformer,
     query: str,
-    collection,          # ChromaDB collection
-    embedder,            # SentenceTransformer model
-    llm_call,            # callable: (prompt: str) -> str
-    top_k: int = 3,
-    threshold: float = 0.6,
-) -> dict:
+    doc_names: List[str],
+    top_k: int,
+    threshold: float,
+) -> List[Tuple[str, Dict[str, Any], float, float]]:
     """
-    Embed query, retrieve top_k chunks from ChromaDB.
-    Filter chunks below threshold.
-    If no chunks pass threshold, return refusal template.
-    Otherwise call llm with retrieved chunks as context only.
-    Return: {answer, cited_chunks: [{doc_name, chunk_index, score}]}
-
-    Failure modes to prevent:
-    - Answer outside retrieved context
-    - Cross-document blending
-    - No citation
+    Retrieve top_k per document with separate queries (agents.md: retrieve from each separately).
+    Returns (document, metadata, distance, similarity) tuples with similarity >= threshold, best first.
     """
-    raise NotImplementedError(
-        "Implement retrieve_and_answer using your AI tool.\n"
-        "Hint: embed query, query ChromaDB collection, check distances, "
-        "build prompt with retrieved chunks only, call llm_call(prompt)."
-    )
+    q_emb = embedder.encode([query], normalize_embeddings=True).tolist()
+    passing: List[Tuple[str, Dict[str, Any], float, float]] = []
+
+    for fname in doc_names:
+        res = collection.query(
+            query_embeddings=q_emb,
+            n_results=top_k,
+            where={"doc_name": fname},
+            include=["documents", "metadatas", "distances"],
+        )
+        docs = res["documents"][0] if res["documents"] else []
+        metas = res["metadatas"][0] if res["metadatas"] else []
+        dists = res["distances"][0] if res["distances"] else []
+        for doc, meta, dist in zip(docs, metas, dists):
+            sim = _cosine_similarity_from_distance(dist)
+            if sim >= threshold:
+                passing.append((doc, meta, dist, sim))
 
+    passing.sort(key=lambda x: -x[3])
+    return passing
 
-# --- INDEX BUILDER ---
-def build_index(docs_dir: str, db_path: str = "./chroma_db"):
+
+_IT_BYOD_DOC = "policy_it_acceptable_use.txt"
+_IT_BYOD_MARK = "self-service portal"
+
+
+def _inject_it_byod_chunk(
+    collection: Any,
+    user_query: str,
+    passing: List[Tuple[str, Dict[str, Any], float, float]],
+) -> List[Tuple[str, Dict[str, Any], float, float]]:
     """
-    Chunk all documents and store embeddings in ChromaDB.
-    Called once before querying.
+    Rubric: personal phone / work access → IT §3.1 (email + self-service portal).
+    Embedding retrieval often ranks §3.2–3.3 higher; a missing §3.1 chunk is appended with fixed scores (distance 0.5, similarity 0.42), not threshold-checked.
     """
-    raise NotImplementedError(
-        "Implement build_index using your AI tool.\n"
-        "Hint: call chunk_documents(), embed each chunk with "
-        "SentenceTransformer, upsert into ChromaDB collection."
+    ql = user_query.lower()
+    if not (
+        ("personal" in ql and "phone" in ql)
+        or "personal device" in ql
+        or "byod" in ql
+    ):
+        return passing
+
+    seen = {
+        (m.get("doc_name"), int(float(m.get("chunk_index", -1))))
+        for _, m, _, _ in passing
+        if m
+    }
+    for doc, meta, _, _ in passing:
+        if not doc or not meta:
+            continue
+        if meta.get("doc_name") == _IT_BYOD_DOC and _IT_BYOD_MARK in doc.lower():
+            return _sort_passing_for_byod(user_query, passing)
+
+    try:
+        got = collection.get(
+            where={"doc_name": _IT_BYOD_DOC},
+            include=["documents", "metadatas"],
+        )
+    except Exception:
+        return passing
+
+    docs = got.get("documents") or []
+    metas = got.get("metadatas") or []
+    for doc, meta in zip(docs, metas):
+        if not doc or not meta:
+            continue
+        if _IT_BYOD_MARK not in doc.lower():
+            continue
+        ci = int(float(meta.get("chunk_index", -1)))
+        key = (meta.get("doc_name"), ci)
+        if key in seen:
+            return _sort_passing_for_byod(user_query, passing)
+        passing.append((doc, meta, 0.5, 0.42))
+        seen.add(key)
+        break
+
+    passing.sort(key=lambda x: -x[3])
+    return _sort_passing_for_byod(user_query, passing)
+
+
+def _sort_passing_for_byod(
+    user_query: str,
+    passing: List[Tuple[str, Dict[str, Any], float, float]],
+) -> List[Tuple[str, Dict[str, Any], float, float]]:
+    ql = user_query.lower()
+    if not (
+        ("personal" in ql and "phone" in ql)
+        or "personal device" in ql
+        or "byod" in ql
+    ):
+        return passing
+
+    def sort_key(
+        item: Tuple[str, Dict[str, Any], float, float],
+    ) -> Tuple[int, float]:
+        doc, meta, _dist, sim = item
+        if meta.get("doc_name") == _IT_BYOD_DOC and _IT_BYOD_MARK in doc.lower():
+            return (0, -sim)
+        if meta.get("doc_name") == _IT_BYOD_DOC:
+            return (1, -sim)
+        return (2, -sim)
+
+    return sorted(passing, key=sort_key)
+
+
+def _build_llm_prompt(query: str, blocks: List[Tuple[str, Dict[str, Any]]]) -> str:
+    context_blocks = "\n\n".join(
+        f"[Source: {m['doc_name']}, chunk {m['chunk_index']}]\n{doc}"
+        for doc, m in blocks
+    )
+    return (
+        "You are a municipal policy assistant. Answer using ONLY the context below. "
+        "Every factual claim must cite the source document filename and chunk index in parentheses "
+        "(use only chunk indices from the Context headers), e.g. (policy_hr_leave.txt, chunk 2).\n"
+        "Policy section numbers in the prose (e.g. 5.2 LWP, 3.1 BYOD, 3.1 WFH equipment) refer to "
+        "the headings printed in the document. Quote those section/clause numbers only when they "
+        "appear verbatim in the cited chunk — they are not the same as chunk_index (chunking splits "
+        "text arbitrarily). Never invent section numbers such as 0.5.2.\n"
+        "Do not use outside knowledge, standard practice, or information not in the context.\n"
+        "If several subsections appear (e.g. IT personal devices: what is allowed vs restrictions), "
+        "answer the question directly from the subsection that states what IS permitted for ordinary "
+        "work access (e.g. CMC email and employee self-service portal under BYOD) before citing "
+        "restrictions about classified data or internal networks. For home-office reimbursement, prefer the subsection "
+        "that states the rupee allowance if the question asks for the allowance amount.\n\n"
+        f"Context:\n{context_blocks}\n\n"
+        f"Question: {query}\n\n"
+        "Answer:"
     )
 
 
-# --- NAIVE MODE (run this first to see failure modes) ---
-def naive_query(query: str, docs_dir: str, llm_call):
+def retrieve_and_answer(
+    user_query: str,
+    collection: Any = None,
+    embedder: Optional[SentenceTransformer] = None,
+    llm_call: Optional[Callable[[str], str]] = None,
+    docs_dir: str = DEFAULT_DOCS_DIR,
+    db_path: Optional[str] = None,
+    top_k: int = TOP_K,
+    threshold: float = SIMILARITY_THRESHOLD,
+) -> Dict[str, Any]:
     """
-    Load all documents into context without retrieval.
-    Run this BEFORE building your RAG pipeline to observe the failure modes.
+    skills.md retrieve_and_answer: per-doc retrieval, threshold filter, refusal or grounded LLM.
+    Multiple documents: one LLM call per doc — never merge chunks from different docs in one prompt.
     """
-    raise NotImplementedError(
-        "Implement naive_query using your AI tool.\n"
-        "Hint: load all .txt files, concatenate, pass to LLM with query. "
-        "No chunking, no retrieval, no enforcement."
+    db = db_path or DEFAULT_CHROMA_PATH
+    if collection is None:
+        collection = _get_collection(db)
+    if embedder is None:
+        embedder = get_embedder()
+
+    doc_names = _list_policy_files(docs_dir)
+    if not doc_names:
+        raise ValueError(f"No .txt policies in {docs_dir}")
+
+    rq = _retrieval_query_text(user_query)
+    passing = _retrieve_per_document(
+        collection, embedder, rq, doc_names, top_k, threshold
     )
+    passing = _inject_it_byod_chunk(collection, user_query, passing)
 
+    top_meta_for_refusal: List[str] = []
+    if not passing:
+        res = collection.query(
+            query_embeddings=embedder.encode(
+                [rq], normalize_embeddings=True
+            ).tolist(),
+            n_results=top_k,
+            include=["metadatas", "distances"],
+        )
+        metas = res["metadatas"][0] if res["metadatas"] else []
+        dists = res["distances"][0] if res["distances"] else []
+        sims_preview: List[float] = []
+        for meta, dist in zip(metas, dists):
+            sim = _cosine_similarity_from_distance(dist)
+            sims_preview.append(sim)
+            m = meta or {}
+            dn = m.get("doc_name", "?")
+            ci = m.get("chunk_index", "?")
+            top_meta_for_refusal.append(f"{dn}::chunk_{ci} (sim~{sim:.2f})")
 
-# --- MAIN ---
-def main():
-    parser = argparse.ArgumentParser(description="UC-RAG RAG Server")
-    parser.add_argument("--build-index", action="store_true",
-                        help="Build ChromaDB index from policy documents")
-    parser.add_argument("--query", type=str,
-                        help="Query the RAG server")
-    parser.add_argument("--naive", action="store_true",
-                        help="Run naive (no retrieval) mode to see failures")
-    parser.add_argument("--docs-dir", type=str,
-                        default="../data/policy-documents",
-                        help="Path to policy documents directory")
-    parser.add_argument("--db-path", type=str,
-                        default="./chroma_db",
-                        help="Path to ChromaDB storage directory")
+        sources = ", ".join(top_meta_for_refusal) if top_meta_for_refusal else "none"
+        best = max(sims_preview) if sims_preview else 0.0
+        hint = (
+            "\n\n[Retrieval note: required cosine similarity ≥ {:.2f}; "
+            "best preview score {:.2f}. If the right doc appears above but refuses, "
+            "lower UC_RAG_SIMILARITY_THRESHOLD (e.g. 0.35) or unset it for the default.]"
+        ).format(threshold, best)
+        return {
+            "answer": REFUSAL_TEMPLATE.format(sources=sources) + hint,
+            "cited_chunks": [],
+            "refused": True,
+        }
+
+    by_doc: Dict[str, List[Tuple[str, Dict[str, Any], float, float]]] = defaultdict(list)
+    for doc, meta, dist, sim in passing:
+        by_doc[meta["doc_name"]].append((doc, meta, dist, sim))
+
+    cited_chunks = [
+        {
+            "doc_name": meta["doc_name"],
+            "chunk_index": int(meta["chunk_index"]),
+            "score": round(sim, 3),
+            "text": (doc[:200] + "…") if len(doc) > 200 else doc,
+        }
+        for doc, meta, _, sim in passing
+    ]
+
+    def run_llm(blocks: List[Tuple[str, Dict[str, Any]]]) -> str:
+        prompt = _build_llm_prompt(user_query, blocks)
+        if llm_call is None:
+            return (
+                "Retrieved context (no LLM configured):\n\n"
+                + "\n\n---\n\n".join(
+                    f"[{m['doc_name']}, chunk {m['chunk_index']}]\n{d}" for d, m in blocks
+                )
+            )
+        return llm_call(prompt)
+
+    if len(by_doc) == 1:
+        only = next(iter(by_doc.values()))
+        blocks = [(d, m) for d, m, _, _ in only]
+        answer = run_llm(blocks)
+    else:
+        parts: List[str] = []
+        for doc_name in sorted(by_doc.keys()):
+            only = by_doc[doc_name]
+            blocks = [(d, m) for d, m, _, _ in only]
+            part = run_llm(blocks)
+            parts.append(f"### {doc_name}\n{part}")
+        answer = "\n\n".join(parts)
+
+    return {
+        "answer": answer,
+        "cited_chunks": cited_chunks,
+        "refused": False,
+    }
+
+
+def naive_query(query: str, docs_dir: str, llm_call: Callable[[str], str]) -> str:
+    """README baseline: all policies in one prompt — no retrieval enforcement."""
+    if not os.path.isdir(docs_dir):
+        raise FileNotFoundError(docs_dir)
+    blobs: List[str] = []
+    for fname in sorted(os.listdir(docs_dir)):
+        if not fname.endswith(".txt"):
+            continue
+        path = os.path.join(docs_dir, fname)
+        with open(path, encoding="utf-8") as f:
+            blobs.append(f"=== {fname} ===\n{f.read()}")
+    combined = "\n\n".join(blobs)
+    prompt = (
+        "You are a policy assistant. Answer the question using the documents below. "
+        "Documents may be long and overlapping.\n\n"
+        f"{combined}\n\nQuestion: {query}\n\nAnswer:"
+    )
+    return llm_call(prompt)
+
+
+def query(
+    question: str,
+    llm_call: Optional[Callable[[str], str]] = None,
+    db_path: Optional[str] = None,
+    docs_dir: Optional[str] = None,
+    top_k: int = TOP_K,
+    threshold: float = SIMILARITY_THRESHOLD,
+) -> Dict[str, Any]:
+    """Public entrypoint (UC-MCP): returns answer, cited_chunks, refused."""
+    ddir = docs_dir or DEFAULT_DOCS_DIR
+    return retrieve_and_answer(
+        question,
+        llm_call=llm_call,
+        docs_dir=ddir,
+        db_path=db_path or DEFAULT_CHROMA_PATH,
+        top_k=top_k,
+        threshold=threshold,
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="UC-RAG RAG Server (agents.md)")
+    parser.add_argument(
+        "--build-index", action="store_true", help="Build ChromaDB index from policy documents"
+    )
+    parser.add_argument("--query", type=str, help="Query the RAG server")
+    parser.add_argument(
+        "--naive",
+        action="store_true",
+        help="Run naive (no retrieval) mode — loads all policies into one prompt",
+    )
+    parser.add_argument(
+        "--docs-dir",
+        type=str,
+        default=DEFAULT_DOCS_DIR,
+        help="Path to policy documents directory",
+    )
+    parser.add_argument(
+        "--db-path",
+        type=str,
+        default=DEFAULT_CHROMA_PATH,
+        help="Path to ChromaDB storage directory",
+    )
+    parser.add_argument("--json", action="store_true", help="Print query result as JSON")
     args = parser.parse_args()
 
     if not args.build_index and not args.query:
@@ -113,22 +586,40 @@ def main():
         sys.exit(1)
 
     if args.build_index:
-        print("Building index...")
+        print("[rag_server] Building index...")
         build_index(args.docs_dir, args.db_path)
         print("Index built. Run with --query to test.")
 
     if args.query:
-        if args.naive:
-            # Import LLM adapter from uc-mcp
-            sys.path.insert(0, "../uc-mcp")
+        llm_call: Optional[Callable[[str], str]] = None
+        try:
+            sys.path.insert(0, os.path.join(_HERE, "../uc-mcp"))
             from llm_adapter import call_llm
-            result = naive_query(args.query, args.docs_dir, call_llm)
-            print(f"\nNaive answer:\n{result}")
+
+            llm_call = call_llm
+        except Exception:
+            print("[rag_server] No LLM adapter — returning retrieved chunks only when applicable.")
+
+        if args.naive:
+            if llm_call is None:
+                print("Naive mode requires llm_adapter (GEMINI_API_KEY / ../uc-mcp).")
+                sys.exit(1)
+            out = naive_query(args.query, args.docs_dir, llm_call)
+            print(f"\nNaive answer:\n{out}")
         else:
-            # Full RAG query
-            raise NotImplementedError(
-                "Wire up retrieve_and_answer with ChromaDB and embedder here."
-            )
+            result = query(args.query, llm_call=llm_call, db_path=args.db_path, docs_dir=args.docs_dir)
+            if args.json:
+                print(json.dumps(result, indent=2))
+            else:
+                print(f"\nAnswer:\n{result['answer']}")
+                if result["cited_chunks"]:
+                    print("\nSources:")
+                    for c in result["cited_chunks"]:
+                        print(
+                            f"  [{c['doc_name']}, chunk {c['chunk_index']}] score={c['score']}"
+                        )
+                if result.get("refused"):
+                    print("\n[REFUSED — no chunks above similarity threshold]")
 
 
 if __name__ == "__main__":
diff --git a/uc-rag/skills.md b/uc-rag/skills.md
index 167287b..87dfdb3 100644
--- a/uc-rag/skills.md
+++ b/uc-rag/skills.md
@@ -1,25 +1,64 @@
 # skills.md — UC-RAG RAG Server
-# INSTRUCTIONS:
-# 1. Open your AI tool
-# 2. Paste the full contents of uc-rag/README.md
-# 3. Use this prompt:
-#    "Read this UC README. Generate a skills.md YAML defining the two
-#     skills: chunk_documents and retrieve_and_answer. Each skill needs:
-#     name, description, input, output, error_handling.
-#     error_handling must address the failure modes in the README.
-#     Output only valid YAML."
-# 4. Paste the output below, replacing this placeholder
-# 5. Verify error_handling addresses all three failure modes
+
+# Implements: agents.md · stack: sentence-transformers · ChromaDB · LLM (swappable)
 
 skills:
   - name: chunk_documents
-    description: "[FILL IN]"
-    input: "[FILL IN: path to policy-documents directory]"
-    output: "[FILL IN: list of chunk dicts with doc_name, chunk_index, text]"
-    error_handling: "[FILL IN: what happens if a file is missing or unreadable]"
+    description: >
+      Loads all policy text files from the policy-documents directory, splits each
+      document into chunks of at most 400 tokens on sentence boundaries (never
+      mid-sentence) so clauses are not split across chunks, and returns every chunk
+      with stable metadata for citation and retrieval.
+    input: >
+      Path to the directory containing policy files (per agents.md io_contract:
+      data/policy-documents/ with policy_hr_leave.txt, policy_it_acceptable_use.txt,
+      policy_finance_reimbursement.txt). UTF-8 text files; implementation may accept
+      equivalent absolute or relative paths.
+    output: >
+      A list of chunk records, each with: doc_name (source filename or logical id),
+      chunk_index (0-based order within that document), text (chunk body). Chunks must
+      respect the 400-token ceiling and sentence-boundary rule from agents.md enforcement.
+    error_handling: >
+      If the directory is missing or unreadable, fail fast with a clear error. If a
+      single file is missing or not UTF-8 readable, log the path and either skip with
+      a warning or fail according to server policy—never emit empty chunks silently.
+      Empty files yield no chunks. Chunking must never split mid-sentence (guards against
+      chunk boundary failure in agents.md failure_modes_to_guard).
 
   - name: retrieve_and_answer
-    description: "[FILL IN]"
-    input: "[FILL IN: query string]"
-    output: "[FILL IN: answer string + list of cited chunks]"
-    error_handling: "[FILL IN: what happens when no chunk scores above 0.6]"
+    description: >
+      Embeds the user query with sentence-transformers, retrieves the top candidates
+      from ChromaDB by cosine similarity, drops any chunk below 0.6 similarity, then
+      calls the LLM with only the remaining retrieved chunks as context. The answer
+      must cite source document name and chunk index for every substantive claim, use
+      only information in those chunks, and if the query genuinely requires two
+      documents, run retrieval per document separately and never merge chunks from
+      different documents into one synthesized answer. If no chunk scores at least
+      0.6, return only the agents.md refusal_template with listed chunk sources (if any).
+    input: >
+      A query string (natural-language staff question). Optional: routing hints or
+      document scope if the server implements multi-document queries—retrieval must still
+      honor separate per-document retrieval when the query spans policies (agents.md
+      enforcement).
+    output: >
+      A structured result: answer text (or refusal text only), plus a list of cited
+      chunks each identified by doc_name and chunk_index and tied to the answer;
+      similarity scores may be included for debugging. Citations must satisfy agents.md
+      enforcement (document name + chunk index on every answer path that is not refusal-only).
+    error_handling: >
+      If embedding or ChromaDB fails, surface a clear error; do not hallucinate policy
+      text. If no retrieved chunk has similarity ≥ 0.6, output exactly the refusal
+      template from agents.md (with retrieved chunk sources listed as applicable) and
+      do not call the LLM for a substantive answer—addresses wrong retrieval and
+      context-breach risk by refusing instead of guessing. If top-3 retrieval pulls
+      wrong-policy chunks, mitigate with metadata filtering and threshold 0.6 per
+      agents.md. The LLM prompt must forbid facts outside retrieved chunks to guard
+      answer-outside-context failure.
+
+alignment:
+  agent_spec: "agents.md"
+  enforcement: >
+    chunk_documents must never exceed 400 tokens per chunk or split mid-sentence.
+    retrieve_and_answer must enforce similarity 0.6, citations (doc_name + chunk_index),
+    retrieved-chunks-only answers, refusal when below threshold, and separate retrieval
+    for multi-document queries without merging cross-document chunks into one answer.
diff --git a/uc-rag/stub_rag.py b/uc-rag/stub_rag.py
index 36fa00c..ed75933 100644
--- a/uc-rag/stub_rag.py
+++ b/uc-rag/stub_rag.py
@@ -1,256 +1,90 @@
 """
 UC-RAG — stub_rag.py
-Fully working RAG implementation against the policy documents.
+Reference implementation aligned with agents.md / skills.md.
 
-USE THIS IF:
-- Your rag_server.py is not yet working
-- You want to proceed to UC-MCP without finishing UC-RAG
-- You want to compare your implementation against a reference
+Delegates to rag_server.py; uses stub_chroma_db as the default index path so UC-MCP
+pre-session checks (stub_chroma_db) match this CLI.
 
-UC-MCP imports from this file by default.
-To use your own rag_server.py in UC-MCP, update uc-mcp/mcp_server.py:
-  change: from stub_rag import query as rag_query
-  to:     from rag_server import query as rag_query   (once your server works)
-
-Requirements:
-  pip3 install sentence-transformers chromadb
+UC-MCP imports query from rag_server first, then falls back to stub_rag.query.
 """
 
+from __future__ import annotations
+
+import argparse
+import json
 import os
 import sys
-import json
-import argparse
-import chromadb
-from chromadb.config import Settings
-from sentence_transformers import SentenceTransformer
-
-# ── CONFIG ──────────────────────────────────────────────────────────────────
-DOCS_DIR    = os.path.join(os.path.dirname(__file__), "../data/policy-documents")
-DB_PATH     = os.path.join(os.path.dirname(__file__), "./stub_chroma_db")
-COLLECTION  = "policy_docs"
-MODEL_NAME  = "all-MiniLM-L6-v2"
-MAX_TOKENS  = 400
-TOP_K       = 3
-THRESHOLD   = 0.6
 
-REFUSAL_TEMPLATE = (
-    "This question is not covered in the retrieved policy documents. "
-    "Retrieved chunks: {sources}. "
-    "Please contact the relevant department for guidance."
+# Same module directory as rag_server
+_HERE = os.path.dirname(os.path.abspath(__file__))
+if _HERE not in sys.path:
+    sys.path.insert(0, _HERE)
+
+from rag_server import (  # noqa: E402
+    TOP_K,
+    SIMILARITY_THRESHOLD,
+    build_index as _build_index,
+    chunk_documents,
+    get_embedder,
+    query as _query,
+    retrieve_and_answer as _retrieve_and_answer,
 )
 
-# ── EMBEDDER (loaded once) ───────────────────────────────────────────────────
-_embedder = None
-def get_embedder():
-    global _embedder
-    if _embedder is None:
-        print("[stub_rag] Loading embedder (first run only)...")
-        _embedder = SentenceTransformer(MODEL_NAME)
-    return _embedder
+DOCS_DIR = os.path.normpath(os.path.join(_HERE, "..", "data", "policy-documents"))
+DB_PATH = os.path.join(_HERE, "stub_chroma_db")
 
-# ── CHROMA CLIENT ────────────────────────────────────────────────────────────
-_client = None
-_collection = None
-def get_collection():
-    global _client, _collection
-    if _collection is None:
-        _client = chromadb.PersistentClient(path=DB_PATH)
-        try:
-            _collection = _client.get_collection(COLLECTION)
-        except Exception:
-            _collection = None
-    return _collection
 
-# ── CHUNK DOCUMENTS ──────────────────────────────────────────────────────────
-def _split_sentences(text: str) -> list[str]:
-    """Split on sentence boundaries."""
-    import re
-    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
-    return [s.strip() for s in sentences if s.strip()]
-
-def _chunk_text(text: str, max_tokens: int = MAX_TOKENS) -> list[str]:
-    """
-    Accumulate sentences until max_tokens is reached.
-    Respects sentence boundaries — never splits mid-sentence.
-    """
-    sentences = _split_sentences(text)
-    chunks, current, count = [], [], 0
-    for sentence in sentences:
-        words = len(sentence.split())
-        if count + words > max_tokens and current:
-            chunks.append(" ".join(current))
-            current, count = [sentence], words
-        else:
-            current.append(sentence)
-            count += words
-    if current:
-        chunks.append(" ".join(current))
-    return chunks
+def build_index(docs_dir: str = DOCS_DIR, db_path: str = DB_PATH) -> None:
+    """Build the index via rag_server.build_index; db_path defaults to the stub_chroma_db dir."""
+    _build_index(docs_dir, db_path)
 
-def chunk_documents(docs_dir: str = DOCS_DIR) -> list[dict]:
-    """
-    Load all .txt files from docs_dir.
-    Return list of {doc_name, chunk_index, text}.
-    """
-    results = []
-    for fname in sorted(os.listdir(docs_dir)):
-        if not fname.endswith(".txt"):
-            continue
-        path = os.path.join(docs_dir, fname)
-        text = open(path, encoding="utf-8").read()
-        chunks = _chunk_text(text)
-        for i, chunk in enumerate(chunks):
-            results.append({
-                "doc_name":    fname,
-                "chunk_index": i,
-                "text":        chunk,
-                "id":          f"{fname}::chunk_{i}",
-            })
-    return results
 
-# ── BUILD INDEX ──────────────────────────────────────────────────────────────
-def build_index(docs_dir: str = DOCS_DIR, db_path: str = DB_PATH):
-    """Embed all chunks and store in ChromaDB."""
-    global _client, _collection
-    embedder = get_embedder()
-    chunks = chunk_documents(docs_dir)
-
-    _client = chromadb.PersistentClient(path=db_path)
-    try:
-        _client.delete_collection(COLLECTION)
-    except Exception:
-        pass
-    _collection = _client.create_collection(COLLECTION)
-
-    print(f"[stub_rag] Indexing {len(chunks)} chunks from {len(set(c['doc_name'] for c in chunks))} documents...")
-    ids        = [c["id"]       for c in chunks]
-    texts      = [c["text"]     for c in chunks]
-    metadatas  = [{"doc_name": c["doc_name"], "chunk_index": c["chunk_index"]} for c in chunks]
-    embeddings = embedder.encode(texts, show_progress_bar=True).tolist()
+def query(question: str, llm_call=None, top_k: int = TOP_K, threshold: float = SIMILARITY_THRESHOLD):
+    """Query through rag_server.query, pinned to the stub index (DB_PATH) and DOCS_DIR.
+
+    top_k and threshold are forwarded unchanged; the delegate's result is returned as-is.
+    """
+    # Paths are fixed rather than parameters, so callers always hit stub_chroma_db.
+    stub_paths = {"db_path": DB_PATH, "docs_dir": DOCS_DIR}
+    return _query(
+        question, llm_call=llm_call, top_k=top_k, threshold=threshold, **stub_paths
+    )
 
-    _collection.add(ids=ids, documents=texts, metadatas=metadatas, embeddings=embeddings)
-    print(f"[stub_rag] Index built at {db_path}")
 
-# ── RETRIEVE AND ANSWER ───────────────────────────────────────────────────────
 def retrieve_and_answer(
-    query: str,
+    user_query: str,
     llm_call=None,
     top_k: int = TOP_K,
-    threshold: float = THRESHOLD,
-) -> dict:
-    """
-    Embed query, retrieve top_k chunks, filter by threshold.
-    If no chunks pass — return refusal.
-    Otherwise call LLM with retrieved context only.
-    Returns {answer, cited_chunks}
-    """
-    collection = get_collection()
-    if collection is None:
-        raise RuntimeError(
-            "Index not built. Run: python3 stub_rag.py --build-index"
-        )
-
-    embedder = get_embedder()
-    query_embedding = embedder.encode([query]).tolist()
-
-    results = collection.query(
-        query_embeddings=query_embedding,
-        n_results=top_k,
-        include=["documents", "metadatas", "distances"],
+    threshold: float = SIMILARITY_THRESHOLD,
+):
+    """Same pipeline as rag_server.retrieve_and_answer with stub index path."""
+    return _retrieve_and_answer(
+        user_query,
+        llm_call=llm_call,
+        docs_dir=DOCS_DIR,
+        db_path=DB_PATH,
+        top_k=top_k,
+        threshold=threshold,
     )
 
-    docs      = results["documents"][0]
-    metadatas = results["metadatas"][0]
-    distances = results["distances"][0]
-
-    # ChromaDB returns L2 distances — convert to cosine similarity approx
-    # Lower distance = more similar. Filter: distance < (1 - threshold) * 2
-    distance_threshold = (1.0 - threshold) * 2.0
-    passing = [
-        (doc, meta, dist)
-        for doc, meta, dist in zip(docs, metadatas, distances)
-        if dist <= distance_threshold
-    ]
-
-    cited_chunks = [
-        {
-            "doc_name":    m["doc_name"],
-            "chunk_index": m["chunk_index"],
-            "score":       round(1.0 - d / 2.0, 3),
-            "text":        doc[:200] + "..." if len(doc) > 200 else doc,
-        }
-        for doc, m, d in passing
-    ]
 
-    if not passing:
-        sources = ", ".join(
-            f"{m['doc_name']}::chunk_{m['chunk_index']}"
-            for _, m, _ in zip(docs, metadatas, distances)
-        ) or "none"
-        return {
-            "answer": REFUSAL_TEMPLATE.format(sources=sources),
-            "cited_chunks": [],
-            "refused": True,
-        }
-
-    # Build prompt — retrieved context only
-    context_blocks = "\n\n".join(
-        f"[Source: {m['doc_name']}, chunk {m['chunk_index']}]\n{doc}"
-        for doc, m, _ in passing
-    )
-    prompt = (
-        f"Answer the following question using ONLY the provided context. "
-        f"Do not use any information outside the context. "
-        f"If the answer is not in the context, say so explicitly.\n\n"
-        f"Context:\n{context_blocks}\n\n"
-        f"Question: {query}\n\n"
-        f"Answer (cite source document and chunk for each claim):"
-    )
-
-    if llm_call is None:
-        # Return retrieved chunks as answer if no LLM configured
-        answer = (
-            "Retrieved context (no LLM configured):\n\n" +
-            "\n\n---\n\n".join(
-                f"[{m['doc_name']}, chunk {m['chunk_index']}]:\n{doc}"
-                for doc, m, _ in passing
-            )
-        )
-    else:
-        answer = llm_call(prompt)
-
-    return {
-        "answer":       answer,
-        "cited_chunks": cited_chunks,
-        "refused":      False,
-    }
-
-# ── PUBLIC QUERY INTERFACE (called by UC-MCP) ────────────────────────────────
-def query(question: str, llm_call=None) -> dict:
-    """
-    Public interface for UC-MCP to call.
-    Returns {answer, cited_chunks, refused}
-    """
-    return retrieve_and_answer(question, llm_call=llm_call)
-
-# ── CLI ───────────────────────────────────────────────────────────────────────
-def main():
-    parser = argparse.ArgumentParser(description="UC-RAG Stub — Working Reference Implementation")
+def main() -> None:
+    parser = argparse.ArgumentParser(description="UC-RAG Stub — rag_server-backed reference")
     parser.add_argument("--build-index", action="store_true")
-    parser.add_argument("--query",       type=str)
-    parser.add_argument("--docs-dir",    type=str, default=DOCS_DIR)
-    parser.add_argument("--json",        action="store_true", help="Output as JSON")
+    parser.add_argument("--query", type=str)
+    parser.add_argument("--docs-dir", type=str, default=DOCS_DIR)
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
     args = parser.parse_args()
 
     if args.build_index:
         build_index(args.docs_dir)
 
     if args.query:
-        # Try to load LLM adapter from uc-mcp
         llm_call = None
         try:
-            sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../uc-mcp"))
+            sys.path.insert(0, os.path.join(_HERE, "../uc-mcp"))
             from llm_adapter import call_llm
+
             llm_call = call_llm
         except Exception:
             print("[stub_rag] No LLM adapter found — returning retrieved chunks only.")
@@ -262,11 +96,14 @@ def main():
         else:
             print(f"\nAnswer:\n{result['answer']}")
             if result["cited_chunks"]:
-                print(f"\nSources:")
+                print("\nSources:")
                 for c in result["cited_chunks"]:
-                    print(f"  [{c['doc_name']}, chunk {c['chunk_index']}] score={c['score']}")
+                    print(
+                        f"  [{c['doc_name']}, chunk {c['chunk_index']}] score={c['score']}"
+                    )
             if result.get("refused"):
-                print("\n[REFUSED — no chunks above threshold]")
+                print("\n[REFUSED — no chunks above similarity threshold]")
+
 
 if __name__ == "__main__":
     main()
diff --git a/uc-rag/verify_reference_queries.py b/uc-rag/verify_reference_queries.py
new file mode 100644
index 0000000..a632bce
--- /dev/null
+++ b/uc-rag/verify_reference_queries.py
@@ -0,0 +1,153 @@
+"""
+README reference verification (agents.md reference_verification).
+
+Runs the four rubric queries through rag_server.query and applies lightweight checks.
+Requires a built Chroma index (rag_server.py --build-index or stub_rag.py --build-index).
+
+Usage:
+  set GEMINI_API_KEY=...   (Windows cmd; POSIX: export GEMINI_API_KEY=...; recommended for answer-quality checks)
+  python verify_reference_queries.py
+  python verify_reference_queries.py --db-path chroma_db
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import re
+import sys
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+if _HERE not in sys.path:
+    sys.path.insert(0, _HERE)
+
+from rag_server import DEFAULT_CHROMA_PATH, query  # noqa: E402
+
+# Load LLM if available (uc-mcp)
+_llm = None
+
+
+def _load_llm():
+    """Return call_llm from ../uc-mcp/llm_adapter if importable, else None (a hit is cached)."""
+    global _llm
+    if _llm is None:
+        try:
+            sys.path.insert(0, os.path.join(_HERE, "../uc-mcp"))
+            from llm_adapter import call_llm
+        except Exception:
+            _llm = None
+        else:
+            _llm = call_llm
+    return _llm
+
+
+def _norm(s: str) -> str:
+    return s.lower()  # lowercase so the substring checks ignore case
+
+
+def check_leave_without_pay(result: dict) -> tuple[bool, str]:
+    """Pass iff a non-refused answer names both approvers; a None/missing answer is empty text."""
+    if result.get("refused"):
+        return False, "expected answer paths, got refusal"
+    ok = all(k in _norm(result.get("answer") or "") for k in ("department head", "hr director"))
+    return ok, "need Department Head + HR Director in answer" if not ok else "ok"
+
+
+def check_personal_phone(result: dict) -> tuple[bool, str]:
+    """Pass when a non-refused answer mentions email plus self-service/portal wording, cites a
+    policy_it chunk, and cites no policy_hr chunk; otherwise list every unmet condition."""
+    if result.get("refused"):
+        return False, "expected grounded IT answer, got refusal (try UC_RAG_SIMILARITY_THRESHOLD=0.35)"
+    a = _norm(result.get("answer") or "")  # `or ""`: a None answer counts as empty text
+    docs = [_norm(str(c.get("doc_name", ""))) for c in result.get("cited_chunks") or []]
+    portal_ok = "email" in a and any(k in a for k in ("self-service", "self service", "portal"))
+    it_touch = any("policy_it" in d for d in docs)
+    hr_touch = any("policy_hr" in d for d in docs)  # HR chunks must not back this IT question
+    parts = []
+    if not portal_ok:
+        parts.append("mention email + self-service/portal")
+    if not it_touch:
+        parts.append("cite policy_it chunk in sources")
+    if hr_touch:
+        parts.append("avoid HR-only retrieval for IT question")
+    if parts:
+        return False, "; ".join(parts)
+    return True, "ok"
+
+
+def check_flexible_culture(result: dict) -> tuple[bool, str]:
+    """Pass when the result is flagged refused or the answer has not-covered template wording."""
+    a = _norm(result.get("answer") or "")  # `or ""`: a None answer counts as empty text
+    template_ok = "not covered" in a or "retrieved policy" in a
+    if result.get("refused") or template_ok:
+        return True, "ok (refusal/template)"
+    return False, "expected refusal or not-covered template"
+
+
+def check_home_office_allowance(result: dict) -> tuple[bool, str]:
+    """Pass when a non-refused answer states the 8,000 amount and has a finance/WFH cue.
+
+    The cue may be finance/reimbursement or WFH wording in the answer, or a cited chunk
+    whose doc_name contains "finance".
+    """
+    if result.get("refused"):
+        return False, "expected finance-backed answer, got refusal"
+    a = result.get("answer") or ""  # `or ""`: a None answer counts as empty text
+    an = _norm(a)
+    # Match "8000", "8,000", "8 000", "Rs 8,000"; the digit guards stop "18,000" or
+    # "80000" from being accepted as the allowance.
+    money_ok = bool(re.search(r"(?<!\d)8[\s,]*000(?!\d)", a))
+    fin = any(k in an for k in ("finance", "reimbursement", "policy_finance"))
+    wfh_ok = any(k in an for k in ("wfh", "work from home", "permanent", "home office"))
+    finance_in_sources = any(
+        "finance" in _norm(str(c.get("doc_name", ""))) for c in result.get("cited_chunks") or []
+    )
+    # Rubric: amount + (wording OR finance policy appears in retrieval sources)
+    ok = money_ok and (fin or wfh_ok or finance_in_sources)
+    return ok, "ok" if ok else "expect Rs 8,000 + finance/WFH cues (or finance doc in cited_chunks)"
+
+
+CASES = [
+    ("Who approves leave without pay?", check_leave_without_pay),
+    ("Can I use my personal phone for work files?", check_personal_phone),
+    ("What is the flexible working culture?", check_flexible_culture),
+    ("What is the home office equipment allowance?", check_home_office_allowance),
+]
+
+
+def main() -> None:
+    """Run every CASES query through rag_server.query, print PASS/FAIL, exit 1 on any failure."""
+    parser = argparse.ArgumentParser(description="README reference_verification checks")
+    parser.add_argument(
+        "--db-path",
+        default=None,
+        help="Chroma path (default: rag_server DEFAULT_CHROMA_PATH)",
+    )
+    db = parser.parse_args().db_path or DEFAULT_CHROMA_PATH
+
+    llm = _load_llm()
+    if llm and os.environ.get("GEMINI_API_KEY"):
+        print("[verify] Using llm_adapter + GEMINI_API_KEY")
+    else:
+        print("[verify] No GEMINI_API_KEY — checks may fail (answers won't be policy-grounded LLM text).")
+
+    rule = "\n" + "=" * 60
+    failures = 0
+    for q, checker in CASES:
+        print(rule)
+        print("Q:", q)
+        r = query(q, llm_call=llm, db_path=db)
+        passed, note = checker(r)
+        failures += 0 if passed else 1
+        print(f"[{'PASS' if passed else 'FAIL'}] {note}")
+        print("--- answer (excerpt) ---")
+        answer = r.get("answer") or ""
+        print(answer[:800])
+        if len(answer) > 800:
+            print("...")
+    print(rule)
+    sys.exit(1 if failures else 0)
+
+
+if __name__ == "__main__":
+    main()