Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions online/api_service/src/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ fn record_matches(record: &PrRecord, snapshot: &Snapshot, params: &FilterParams)
}
}

// Exclude self-authored PRs (bot reviewing its own PR)
if params.exclude_self_authored && record.self_authored {
return false;
}

// Require non-empty reviews
if params.require_reviews && !record.has_reviews {
return false;
}

true
}

Expand Down
13 changes: 12 additions & 1 deletion online/api_service/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@ pub async fn load_from_postgres(database_url: &str) -> anyhow::Result<Snapshot>
la.recall,
p.bot_reviewed_at,
p.diff_lines,
p.pr_author,
c.github_username,
c.display_name,
pl.labels as pr_labels_json
pl.labels as pr_labels_json,
(p.reviews IS NOT NULL AND p.reviews != '[]') as has_reviews
FROM llm_analyses la
JOIN prs p ON la.pr_id = p.id
JOIN chatbots c ON la.chatbot_id = c.id
LEFT JOIN pr_labels pl ON pl.pr_id = la.pr_id AND pl.chatbot_id = la.chatbot_id
WHERE p.pr_merged = TRUE
ORDER BY p.bot_reviewed_at ASC NULLS FIRST
"#,
)
Expand Down Expand Up @@ -70,9 +73,11 @@ struct RawRow {
recall: Option<f32>,
bot_reviewed_at: Option<DateTime<Utc>>,
diff_lines: Option<i32>,
pr_author: Option<String>,
github_username: String,
display_name: Option<String>,
pr_labels_json: Option<String>,
has_reviews: Option<bool>,
}

#[derive(sqlx::FromRow)]
Expand Down Expand Up @@ -112,6 +117,10 @@ fn build_snapshot(rows: Vec<RawRow>, volume_rows: Vec<VolumeRawRow>, ignored_use
&mut languages,
);

let self_authored = row.pr_author.as_ref()
.map(|a| a.eq_ignore_ascii_case(&row.github_username))
.unwrap_or(false);

let record = PrRecord {
chatbot_idx,
bot_reviewed_at: row.bot_reviewed_at,
Expand All @@ -122,6 +131,8 @@ fn build_snapshot(rows: Vec<RawRow>, volume_rows: Vec<VolumeRawRow>, ignored_use
domain,
pr_type,
severity,
self_authored,
has_reviews: row.has_reviews.unwrap_or(false),
};

match row.bot_reviewed_at {
Expand Down
4 changes: 4 additions & 0 deletions online/api_service/src/handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pub struct MetricsQuery {
pub min_prs_per_day: Option<usize>,
pub min_total_prs: Option<usize>,
pub include_ignored: Option<bool>,
pub exclude_self_authored: Option<bool>,
pub require_reviews: Option<bool>,
}

fn parse_date(s: &str) -> Option<NaiveDate> {
Expand Down Expand Up @@ -70,6 +72,8 @@ fn to_filter_params(q: &MetricsQuery) -> FilterParams {
min_prs_per_day: q.min_prs_per_day.unwrap_or(0),
min_total_prs: q.min_total_prs.unwrap_or(0),
include_ignored: q.include_ignored.unwrap_or(false),
exclude_self_authored: q.exclude_self_authored.unwrap_or(false),
require_reviews: q.require_reviews.unwrap_or(false),
}
}

Expand Down
6 changes: 6 additions & 0 deletions online/api_service/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ pub struct PrRecord {
pub domain: Option<Domain>,
pub pr_type: Option<PrType>,
pub severity: Option<Severity>,
pub self_authored: bool,
pub has_reviews: bool,
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -144,6 +146,8 @@ pub struct FilterParams {
pub min_prs_per_day: usize,
pub min_total_prs: usize,
pub include_ignored: bool,
pub exclude_self_authored: bool,
pub require_reviews: bool,
}

impl Default for FilterParams {
Expand All @@ -162,6 +166,8 @@ impl Default for FilterParams {
min_prs_per_day: 0,
min_total_prs: 0,
include_ignored: false,
exclude_self_authored: false,
require_reviews: false,
}
}
}
Expand Down
7 changes: 6 additions & 1 deletion online/api_service/static/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ <h3>Filters</h3>
<div><input type="number" id="diff-min" value="0" min="0" step="50"></div>
<div><input type="number" id="diff-max" value="2000" min="0" step="50"></div>
</div>
<h3>Quality Filters</h3>
<label><input type="checkbox" id="exclude-self-authored"> Exclude self-authored</label>
<label><input type="checkbox" id="require-reviews"> Require reviews</label>
<h3>Label Filters</h3>
<label>Domain</label><div class="cb-group" id="f-domain"></div>
<label>Language</label><input type="text" class="search-input" placeholder="Type to filter..." oninput="filterCbs('f-language',this.value)"><div class="cb-group" id="f-language"></div>
Expand Down Expand Up @@ -126,7 +129,7 @@ <h2>Leaderboard</h2>
$('#end-date').value=d.end;
// Build initial color map from all chatbots
assignColors(o.chatbots);
for(const id of['start-date','end-date','beta','min-prs-total','min-prs','diff-min','diff-max','include-ignored'])
for(const id of['start-date','end-date','beta','min-prs-total','min-prs','diff-min','diff-max','exclude-self-authored','require-reviews','include-ignored'])
document.getElementById(id).addEventListener('change',refresh);
refresh();
}
Expand All @@ -144,6 +147,8 @@ <h2>Leaderboard</h2>
const dmin=$('#diff-min').value, dmax=$('#diff-max').value;
if(dmin&&dmin!=='0') p.set('diff_lines_min',dmin);
if(dmax) p.set('diff_lines_max',dmax);
if($('#exclude-self-authored').checked) p.set('exclude_self_authored','true');
if($('#require-reviews').checked) p.set('require_reviews','true');
const j=(cls,key)=>{const v=checked(cls);if(v.length)p.set(key,v.join(','));};
j('cb-bot','chatbot');j('cb-dom','domain');j('cb-lang','language');j('cb-prt','pr_type');j('cb-sev','severity');
if($('#include-ignored').checked) p.set('include_ignored','true');
Expand Down
21 changes: 20 additions & 1 deletion online/etl/dashboard/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@
# Pre-fetch analyses to extract label options
_all_analyses = get_analyses(DATABASE_URL, chatbot_id=chatbot_id)

# Quality filters
exclude_self = st.sidebar.checkbox("Exclude self-authored PRs", value=False)
require_reviews = st.sidebar.checkbox("Require non-empty reviews", value=False)

# Diff lines filter
diff_over_2k = st.sidebar.checkbox("More than 2k LOC", value=False)
diff_range = st.sidebar.slider(
Expand Down Expand Up @@ -125,7 +129,22 @@ def _diff_lines_ok(row) -> bool:
return diff_range[0] <= dl <= diff_range[1]


analyses = [a for a in _all_analyses if _label_matches(a) and _diff_lines_ok(a)]
def _author_ok(row) -> bool:
    """Return True when the row survives the self-authored-PR filter."""
    if not exclude_self:
        return True
    pr_author = (row.get("pr_author") or "").strip()
    bot_login = (row.get("github_username") or "").lower()
    # With the filter on, unknown-author rows are excluded (empty author
    # string), and a case-insensitive match against the bot login is excluded.
    return bool(pr_author) and pr_author.lower() != bot_login


def _reviews_ok(row) -> bool:
    """Return True when the row survives the non-empty-reviews filter."""
    # When the filter is off every row passes; otherwise defer to the
    # has_reviews flag computed by the SQL query (absent key counts as False).
    return True if not require_reviews else row.get("has_reviews", False)


analyses = [a for a in _all_analyses if _label_matches(a) and _diff_lines_ok(a) and _author_ok(a) and _reviews_ok(a)]

start_str = str(start_date) if start_date else None
end_str = str(end_date) if end_date else None
Expand Down
6 changes: 4 additions & 2 deletions online/etl/dashboard/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ def get_analyses(database_url: str, chatbot_id: int | None = None) -> list[dict[
conn = _get_sync_connection(database_url)
try:
base = """SELECT la.*, p.repo_name, p.pr_number, p.pr_url, p.pr_created_at,
p.bot_reviewed_at, p.diff_lines, c.github_username, c.display_name,
pl.labels as pr_labels_json
p.bot_reviewed_at, p.diff_lines, p.pr_author,
c.github_username, c.display_name,
pl.labels as pr_labels_json,
(p.reviews IS NOT NULL AND p.reviews != '[]') as has_reviews
FROM llm_analyses la
JOIN prs p ON la.pr_id = p.id
JOIN chatbots c ON la.chatbot_id = c.id
Expand Down
20 changes: 18 additions & 2 deletions online/etl/db/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

INSERT_PR = """
INSERT INTO prs (chatbot_id, repo_name, pr_number, pr_url, pr_title, pr_author,
pr_created_at, pr_merged, status, bq_events, bot_reviewed_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
pr_created_at, pr_merged, status, bq_events, bot_reviewed_at, repo_id)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
ON CONFLICT (chatbot_id, repo_name, pr_number) DO NOTHING
RETURNING id
"""
Expand Down Expand Up @@ -61,6 +61,7 @@
WHERE p.chatbot_id = $1
AND p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $2
"""
Expand All @@ -70,6 +71,7 @@
LEFT JOIN llm_analyses la ON la.pr_id = p.id AND la.chatbot_id = p.chatbot_id
WHERE p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $1
"""
Expand All @@ -80,6 +82,7 @@
WHERE p.chatbot_id = $1
AND p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
AND p.bot_reviewed_at >= $2
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $3
Expand All @@ -90,6 +93,7 @@
LEFT JOIN llm_analyses la ON la.pr_id = p.id AND la.chatbot_id = p.chatbot_id
WHERE p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
AND p.bot_reviewed_at >= $1
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $2
Expand Down Expand Up @@ -171,6 +175,18 @@
WHERE id = $5
"""

# Overwrite the stored author login for a single PR row.
UPDATE_PR_AUTHOR = """
UPDATE prs SET pr_author = $1 WHERE id = $2
"""

# Fold freshly-discovered BigQuery events into an existing PR row.
# COALESCE / CASE only fill fields that are currently missing, so values
# already enriched from the GitHub API are never overwritten here.
MERGE_PR_BQ_EVENTS = """
UPDATE prs SET bq_events = $1, pr_merged = COALESCE($2, pr_merged),
pr_title = CASE WHEN pr_title = '' OR pr_title IS NULL THEN $3 ELSE pr_title END,
pr_author = COALESCE(pr_author, $4),
pr_created_at = COALESCE(pr_created_at, $5)
WHERE id = $6
"""

# -- LLM analyses --------------------------------------------------------------

INSERT_LLM_ANALYSIS = """
Expand Down
71 changes: 68 additions & 3 deletions online/etl/db/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@
from db.connection import DBAdapter


def _merge_bq_events(existing_raw: str | list | None, new_events: list[dict]) -> list[dict]:
"""Merge new BQ events into existing ones, deduplicating by event_id."""
if existing_raw is None:
return new_events
old_events = json.loads(existing_raw) if isinstance(existing_raw, str) else existing_raw
seen_ids = {e.get("event_id") for e in old_events if e.get("event_id")}
unique_new = [e for e in new_events if e.get("event_id") not in seen_ids]
if not unique_new:
return old_events
merged = old_events + unique_new
merged.sort(key=lambda e: e.get("created_at", ""))
return merged


class PRRepository:
"""High-level async database operations."""

Expand Down Expand Up @@ -46,10 +60,15 @@ async def insert_pr(
status: str = "pending",
bq_events: list | None = None,
bot_reviewed_at: str | None = None,
repo_id: int | None = None,
) -> bool:
"""Insert a PR row (ON CONFLICT DO NOTHING for idempotency).
"""Insert a PR row, merging bq_events on conflict.

Returns True if the row was actually inserted, False if it already existed.
On conflict (same chatbot_id, repo_name, pr_number), appends new BQ
events to existing ones (deduplicated by event_id) and fills in any
missing metadata (pr_merged, pr_author, etc.).

Returns True if the row was newly inserted, False if it already existed.
"""
bq_json = json.dumps(bq_events) if bq_events is not None else None
row = await self.db.fetchone(
Expand All @@ -66,9 +85,52 @@ async def insert_pr(
status,
bq_json,
bot_reviewed_at,
repo_id,
),
)
return row is not None
if row is not None:
return True

# Conflict: merge new bq_events into existing row
if bq_events:
existing = await self.get_pr(chatbot_id, repo_name, pr_number)
if existing:
old_events = json.loads(existing["bq_events"]) if isinstance(existing.get("bq_events"), str) else (existing.get("bq_events") or [])
merged_events = _merge_bq_events(existing.get("bq_events"), bq_events)
from pipeline.discover import _extract_pr_metadata
meta = _extract_pr_metadata(merged_events)
await self.db.execute(
*self.db._translate_params(
q.MERGE_PR_BQ_EVENTS,
(
json.dumps(merged_events),
meta["pr_merged"],
meta["pr_title"],
meta["pr_author"],
meta["pr_created_at"],
existing["id"],
),
)
)
# New events arrived — reset to pending so PR gets
# re-enriched/assembled/analyzed with the updated timeline
if len(merged_events) > len(old_events):
await self.db.execute(
*self.db._translate_params(
"UPDATE prs SET status = 'pending', enrichment_step = NULL, "
"assembled = NULL, assembled_at = NULL WHERE id = $1",
(existing["id"],),
)
)
# Also set repo_id if we have it and existing doesn't
if repo_id and not existing.get("repo_id"):
await self.db.execute(
*self.db._translate_params(
"UPDATE prs SET repo_id = $1 WHERE id = $2",
(repo_id, existing["id"]),
)
)
return False

async def get_pr(self, chatbot_id: int, repo_name: str, pr_number: int) -> dict[str, Any] | None:
    """Look up one PR row by its (chatbot, repo, PR number) key; None when absent."""
    key = (chatbot_id, repo_name, pr_number)
    return await self.db.fetchone(q.GET_PR, key)
Expand Down Expand Up @@ -169,6 +231,9 @@ async def update_metadata(
) -> None:
await self.db.execute(q.UPDATE_PR_METADATA, (pr_title, pr_author, pr_created_at, pr_merged, pr_id))

async def update_pr_author(self, pr_id: int, pr_author: str) -> None:
    """Persist the author login for an existing PR row."""
    params = (pr_author, pr_id)
    await self.db.execute(q.UPDATE_PR_AUTHOR, params)

# -- LLM analyses ----------------------------------------------------------

async def insert_analysis(
Expand Down
4 changes: 4 additions & 0 deletions online/etl/db/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,15 @@

# Additive schema migrations, run on every startup. Postgres supports
# IF NOT EXISTS on ADD COLUMN, so re-running these is harmless.
MIGRATIONS = [
"ALTER TABLE prs ADD COLUMN IF NOT EXISTS diff_lines INTEGER",
"ALTER TABLE prs ADD COLUMN IF NOT EXISTS pr_api_raw TEXT",
"ALTER TABLE prs ADD COLUMN IF NOT EXISTS repo_id BIGINT",
]

# SQLite doesn't support IF NOT EXISTS on ALTER TABLE ADD COLUMN
# (column name, statement) pairs — presumably the runner checks the column's
# existence before executing; verify against the migration driver.
MIGRATIONS_SQLITE = [
("diff_lines", "ALTER TABLE prs ADD COLUMN diff_lines INTEGER"),
("pr_api_raw", "ALTER TABLE prs ADD COLUMN pr_api_raw TEXT"),
("repo_id", "ALTER TABLE prs ADD COLUMN repo_id BIGINT"),
]


Expand Down
Loading