withmartian · tonyd3 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/offline/analysis/benchmark_dashboard.html b/offline/analysis/benchmark_dashboard.html
diff --git a/offline/analysis/benchmark_dashboard.json b/offline/analysis/benchmark_dashboard.json
diff --git a/offline/code_review_benchmark/step1_download_prs.py b/offline/code_review_benchmark/step1_download_prs.py
@@ -15,6 +15,12 @@
 
 # GitHub API allows ~30 concurrent requests, stay conservative
 MAX_WORKERS = 15
+IGNORED_COMMAND_COMMENTS = {  # matched exactly against lowercased, whitespace-collapsed text
+    "/propel review",
+    "baz review",
+    "bugbot review",
+    "@greptile",
+}  # entries must therefore be lowercase with single spaces
 
 
 def load_dotenv(filepath: str = ".env") -> None:
@@ -34,6 +40,14 @@ def load_dotenv(filepath: str = ".env") -> None:
                 os.environ.setdefault(key, value)
 
 
+def is_ignored_comment_body(body: str | None) -> bool:
+    """Report whether ``body`` is exactly one of the IGNORED_COMMAND_COMMENTS entries.
+
+    Whitespace runs are collapsed and the text is lowercased first; None/empty is never ignored.
+    """
+    return False if not body else " ".join(body.split()).lower() in IGNORED_COMMAND_COMMENTS
+
+
 def gh(args: list[str]) -> dict | list:
     """Run gh CLI command and return parsed JSON."""
     result = subprocess.run(
@@ -104,10 +118,13 @@ def fetch_review_comments(org: str, repo: str, pr: int) -> list[dict]:
     try:
         review_comments = gh(["api", f"/repos/{org}/{repo}/pulls/{pr}/comments"])
         for c in review_comments:
+            body = c.get("body")
+            if is_ignored_comment_body(body):
+                continue
             comments.append({
                 "path": c.get("path"),
                 "line": c.get("line") or c.get("original_line"),
-                "body": c.get("body"),
+                "body": body,
                 "created_at": c.get("created_at"),
             })
     except subprocess.CalledProcessError:
@@ -117,11 +134,12 @@ def fetch_review_comments(org: str, repo: str, pr: int) -> list[dict]:
     try:
         reviews = gh(["api", f"/repos/{org}/{repo}/pulls/{pr}/reviews"])
         for r in reviews:
-            if r.get("body"):
+            body = r.get("body")
+            if body and not is_ignored_comment_body(body):
                 comments.append({
                     "path": None,
                     "line": None,
-                    "body": r.get("body"),
+                    "body": body,
                     "created_at": r.get("submitted_at"),
                 })
     except subprocess.CalledProcessError:
@@ -131,10 +149,13 @@ def fetch_review_comments(org: str, repo: str, pr: int) -> list[dict]:
     try:
         issue_comments = gh(["api", f"/repos/{org}/{repo}/issues/{pr}/comments"])
         for c in issue_comments:
+            body = c.get("body")
+            if is_ignored_comment_body(body):
+                continue
             comments.append({
                 "path": None,
                 "line": None,
-                "body": c.get("body"),
+                "body": body,
                 "created_at": c.get("created_at"),
             })
     except subprocess.CalledProcessError:

diff --git a/offline/code_review_benchmark/step3_judge_comments.py b/offline/code_review_benchmark/step3_judge_comments.py
@@ -23,6 +23,12 @@
 BATCH_SIZE = 40
 LLM_CALL_TIMEOUT = 30  # seconds per individual LLM call
 REVIEW_TIMEOUT = 1800  # seconds per full review evaluation (30 min)
+IGNORED_COMMAND_COMMENTS = {  # matched exactly against lowercased, whitespace-collapsed text
+    "/propel review",
+    "baz review",
+    "bugbot review",
+    "@greptile",
+}  # entries must therefore be lowercase with single spaces
 
 
 JUDGE_PROMPT = """You are evaluating AI code review tools.
@@ -98,6 +104,14 @@ def get_model_dir() -> Path:
     return model_dir
 
 
+def is_ignored_candidate(candidate: str | None) -> bool:
+    """Tell whether ``candidate`` equals an IGNORED_COMMAND_COMMENTS entry once normalized
+    (whitespace runs collapsed to single spaces, then lowercased); None/"" gives False."""
+    if candidate:
+        return " ".join(candidate.split()).lower() in IGNORED_COMMAND_COMMENTS
+    return False
+
+
 class LLMJudge:
     def __init__(self, structured_output: bool = False):
         load_dotenv()
@@ -205,11 +219,11 @@ def get_candidates(review: dict, all_candidates: dict, golden_url: str) -> list[
     # Prefer model-specific candidates file
     if golden_url in all_candidates and tool in all_candidates[golden_url]:
         candidates = all_candidates[golden_url][tool]
-        return [c["text"] for c in candidates if c.get("text")]
+        return [c["text"] for c in candidates if c.get("text") and not is_ignored_candidate(c["text"])]
 
     # Fall back to raw comment bodies
     comments = review.get("review_comments", [])
-    return [c["body"] for c in comments if c.get("body")]
+    return [c["body"] for c in comments if c.get("body") and not is_ignored_candidate(c["body"])]
 
 
 async def evaluate_review(
@@ -218,6 +232,7 @@ async def evaluate_review(
     candidates: list[str],
 ) -> dict:
     """Evaluate candidates against golden comments. Returns precision and recall metrics."""
+    candidates = [candidate for candidate in candidates if not is_ignored_candidate(candidate)]
 
     if not golden_comments:
         return {