Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions offline/analysis/benchmark_dashboard.html

Large diffs are not rendered by default.

136 changes: 68 additions & 68 deletions offline/analysis/benchmark_dashboard.json

Large diffs are not rendered by default.

29 changes: 25 additions & 4 deletions offline/code_review_benchmark/step1_download_prs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@

# GitHub API allows ~30 concurrent requests, stay conservative
MAX_WORKERS = 15

# Comment bodies that only trigger a review tool and carry no review content.
# Bodies are compared against this table after whitespace is collapsed and the
# text is lower-cased, so every entry must be lower-case and single-spaced.
# A frozenset keeps the shared lookup table from being mutated at runtime.
IGNORED_COMMAND_COMMENTS = frozenset({
    "/propel review",
    "baz review",
    "bugbot review",
    "@greptile",
})


def load_dotenv(filepath: str = ".env") -> None:
Expand All @@ -34,6 +40,14 @@ def load_dotenv(filepath: str = ".env") -> None:
os.environ.setdefault(key, value)


def is_ignored_comment_body(body: str | None) -> bool:
"""True when the text is a non-review command that should not be evaluated."""
if not body:
return False
normalized = " ".join(body.strip().split()).lower()
return normalized in IGNORED_COMMAND_COMMENTS


def gh(args: list[str]) -> dict | list:
"""Run gh CLI command and return parsed JSON."""
result = subprocess.run(
Expand Down Expand Up @@ -104,10 +118,13 @@ def fetch_review_comments(org: str, repo: str, pr: int) -> list[dict]:
try:
review_comments = gh(["api", f"/repos/{org}/{repo}/pulls/{pr}/comments"])
for c in review_comments:
body = c.get("body")
if is_ignored_comment_body(body):
continue
comments.append({
"path": c.get("path"),
"line": c.get("line") or c.get("original_line"),
"body": c.get("body"),
"body": body,
"created_at": c.get("created_at"),
})
except subprocess.CalledProcessError:
Expand All @@ -117,11 +134,12 @@ def fetch_review_comments(org: str, repo: str, pr: int) -> list[dict]:
try:
reviews = gh(["api", f"/repos/{org}/{repo}/pulls/{pr}/reviews"])
for r in reviews:
if r.get("body"):
body = r.get("body")
if body and not is_ignored_comment_body(body):
comments.append({
"path": None,
"line": None,
"body": r.get("body"),
"body": body,
"created_at": r.get("submitted_at"),
})
except subprocess.CalledProcessError:
Expand All @@ -131,10 +149,13 @@ def fetch_review_comments(org: str, repo: str, pr: int) -> list[dict]:
try:
issue_comments = gh(["api", f"/repos/{org}/{repo}/issues/{pr}/comments"])
for c in issue_comments:
body = c.get("body")
if is_ignored_comment_body(body):
continue
comments.append({
"path": None,
"line": None,
"body": c.get("body"),
"body": body,
"created_at": c.get("created_at"),
})
except subprocess.CalledProcessError:
Expand Down
19 changes: 17 additions & 2 deletions offline/code_review_benchmark/step3_judge_comments.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
BATCH_SIZE = 40
LLM_CALL_TIMEOUT = 30  # seconds per individual LLM call
REVIEW_TIMEOUT = 1800  # seconds per full review evaluation (30 min)

# Candidate texts that only trigger a review tool and carry no review content.
# Candidates are compared against this table after whitespace is collapsed and
# the text is lower-cased, so every entry must be lower-case and single-spaced.
# A frozenset keeps the shared lookup table from being mutated at runtime.
IGNORED_COMMAND_COMMENTS = frozenset({
    "/propel review",
    "baz review",
    "bugbot review",
    "@greptile",
})


JUDGE_PROMPT = """You are evaluating AI code review tools.
Expand Down Expand Up @@ -98,6 +104,14 @@ def get_model_dir() -> Path:
return model_dir


def is_ignored_candidate(candidate: str | None) -> bool:
"""True when candidate text is non-review command chatter."""
if not candidate:
return False
normalized = " ".join(candidate.strip().split()).lower()
return normalized in IGNORED_COMMAND_COMMENTS


class LLMJudge:
def __init__(self, structured_output: bool = False):
load_dotenv()
Expand Down Expand Up @@ -205,11 +219,11 @@ def get_candidates(review: dict, all_candidates: dict, golden_url: str) -> list[
# Prefer model-specific candidates file
if golden_url in all_candidates and tool in all_candidates[golden_url]:
candidates = all_candidates[golden_url][tool]
return [c["text"] for c in candidates if c.get("text")]
return [c["text"] for c in candidates if c.get("text") and not is_ignored_candidate(c["text"])]

# Fall back to raw comment bodies
comments = review.get("review_comments", [])
return [c["body"] for c in comments if c.get("body")]
return [c["body"] for c in comments if c.get("body") and not is_ignored_candidate(c["body"])]


async def evaluate_review(
Expand All @@ -218,6 +232,7 @@ async def evaluate_review(
candidates: list[str],
) -> dict:
"""Evaluate candidates against golden comments. Returns precision and recall metrics."""
candidates = [candidate for candidate in candidates if not is_ignored_candidate(candidate)]

if not golden_comments:
return {
Expand Down
Loading