Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions online/api_service/src/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ fn record_matches(record: &PrRecord, snapshot: &Snapshot, params: &FilterParams)
}
}

// Exclude self-authored PRs (bot reviewing its own PR)
if params.exclude_self_authored && record.self_authored {
return false;
}

// Require non-empty reviews
if params.require_reviews && !record.has_reviews {
return false;
}

true
}

Expand Down
13 changes: 12 additions & 1 deletion online/api_service/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@ pub async fn load_from_postgres(database_url: &str) -> anyhow::Result<Snapshot>
la.recall,
p.bot_reviewed_at,
p.diff_lines,
p.pr_author,
c.github_username,
c.display_name,
pl.labels as pr_labels_json
pl.labels as pr_labels_json,
(p.reviews IS NOT NULL AND p.reviews != '[]') as has_reviews
FROM llm_analyses la
JOIN prs p ON la.pr_id = p.id
JOIN chatbots c ON la.chatbot_id = c.id
LEFT JOIN pr_labels pl ON pl.pr_id = la.pr_id AND pl.chatbot_id = la.chatbot_id
WHERE p.pr_merged = TRUE
ORDER BY p.bot_reviewed_at ASC NULLS FIRST
"#,
)
Expand Down Expand Up @@ -70,9 +73,11 @@ struct RawRow {
recall: Option<f32>,
bot_reviewed_at: Option<DateTime<Utc>>,
diff_lines: Option<i32>,
pr_author: Option<String>,
github_username: String,
display_name: Option<String>,
pr_labels_json: Option<String>,
has_reviews: Option<bool>,
}

#[derive(sqlx::FromRow)]
Expand Down Expand Up @@ -112,6 +117,10 @@ fn build_snapshot(rows: Vec<RawRow>, volume_rows: Vec<VolumeRawRow>, ignored_use
&mut languages,
);

let self_authored = row.pr_author.as_ref()
.map(|a| a.eq_ignore_ascii_case(&row.github_username))
.unwrap_or(false);

let record = PrRecord {
chatbot_idx,
bot_reviewed_at: row.bot_reviewed_at,
Expand All @@ -122,6 +131,8 @@ fn build_snapshot(rows: Vec<RawRow>, volume_rows: Vec<VolumeRawRow>, ignored_use
domain,
pr_type,
severity,
self_authored,
has_reviews: row.has_reviews.unwrap_or(false),
};

match row.bot_reviewed_at {
Expand Down
4 changes: 4 additions & 0 deletions online/api_service/src/handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pub struct MetricsQuery {
pub min_prs_per_day: Option<usize>,
pub min_total_prs: Option<usize>,
pub include_ignored: Option<bool>,
pub exclude_self_authored: Option<bool>,
pub require_reviews: Option<bool>,
}

fn parse_date(s: &str) -> Option<NaiveDate> {
Expand Down Expand Up @@ -70,6 +72,8 @@ fn to_filter_params(q: &MetricsQuery) -> FilterParams {
min_prs_per_day: q.min_prs_per_day.unwrap_or(0),
min_total_prs: q.min_total_prs.unwrap_or(0),
include_ignored: q.include_ignored.unwrap_or(false),
exclude_self_authored: q.exclude_self_authored.unwrap_or(false),
require_reviews: q.require_reviews.unwrap_or(false),
}
}

Expand Down
6 changes: 6 additions & 0 deletions online/api_service/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ pub struct PrRecord {
pub domain: Option<Domain>,
pub pr_type: Option<PrType>,
pub severity: Option<Severity>,
pub self_authored: bool,
pub has_reviews: bool,
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -144,6 +146,8 @@ pub struct FilterParams {
pub min_prs_per_day: usize,
pub min_total_prs: usize,
pub include_ignored: bool,
pub exclude_self_authored: bool,
pub require_reviews: bool,
}

impl Default for FilterParams {
Expand All @@ -162,6 +166,8 @@ impl Default for FilterParams {
min_prs_per_day: 0,
min_total_prs: 0,
include_ignored: false,
exclude_self_authored: false,
require_reviews: false,
}
}
}
Expand Down
7 changes: 6 additions & 1 deletion online/api_service/static/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ <h3>Filters</h3>
<div><input type="number" id="diff-min" value="0" min="0" step="50"></div>
<div><input type="number" id="diff-max" value="2000" min="0" step="50"></div>
</div>
<h3>Quality Filters</h3>
<label><input type="checkbox" id="exclude-self-authored"> Exclude self-authored</label>
<label><input type="checkbox" id="require-reviews"> Require reviews</label>
<h3>Label Filters</h3>
<label>Domain</label><div class="cb-group" id="f-domain"></div>
<label>Language</label><input type="text" class="search-input" placeholder="Type to filter..." oninput="filterCbs('f-language',this.value)"><div class="cb-group" id="f-language"></div>
Expand Down Expand Up @@ -126,7 +129,7 @@ <h2>Leaderboard</h2>
$('#end-date').value=d.end;
// Build initial color map from all chatbots
assignColors(o.chatbots);
for(const id of['start-date','end-date','beta','min-prs-total','min-prs','diff-min','diff-max','include-ignored'])
for(const id of['start-date','end-date','beta','min-prs-total','min-prs','diff-min','diff-max','exclude-self-authored','require-reviews','include-ignored'])
document.getElementById(id).addEventListener('change',refresh);
refresh();
}
Expand All @@ -144,6 +147,8 @@ <h2>Leaderboard</h2>
const dmin=$('#diff-min').value, dmax=$('#diff-max').value;
if(dmin&&dmin!=='0') p.set('diff_lines_min',dmin);
if(dmax) p.set('diff_lines_max',dmax);
if($('#exclude-self-authored').checked) p.set('exclude_self_authored','true');
if($('#require-reviews').checked) p.set('require_reviews','true');
const j=(cls,key)=>{const v=checked(cls);if(v.length)p.set(key,v.join(','));};
j('cb-bot','chatbot');j('cb-dom','domain');j('cb-lang','language');j('cb-prt','pr_type');j('cb-sev','severity');
if($('#include-ignored').checked) p.set('include_ignored','true');
Expand Down
21 changes: 20 additions & 1 deletion online/etl/dashboard/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@
# Pre-fetch analyses to extract label options
_all_analyses = get_analyses(DATABASE_URL, chatbot_id=chatbot_id)

# Quality filters
exclude_self = st.sidebar.checkbox("Exclude self-authored PRs", value=False)
require_reviews = st.sidebar.checkbox("Require non-empty reviews", value=False)

# Diff lines filter
diff_over_2k = st.sidebar.checkbox("More than 2k LOC", value=False)
diff_range = st.sidebar.slider(
Expand Down Expand Up @@ -125,7 +129,22 @@ def _diff_lines_ok(row) -> bool:
return diff_range[0] <= dl <= diff_range[1]


analyses = [a for a in _all_analyses if _label_matches(a) and _diff_lines_ok(a)]
def _author_ok(row) -> bool:
    """Return True when the row survives the self-authored-PR filter."""
    if not exclude_self:
        return True
    pr_author = (row.get("pr_author") or "").strip()
    bot_login = (row.get("github_username") or "").lower()
    # With the filter on, unknown-author rows are excluded (empty author
    # string), and a case-insensitive match against the bot login is excluded.
    return bool(pr_author) and pr_author.lower() != bot_login


def _reviews_ok(row) -> bool:
    """Return True when the row survives the non-empty-reviews filter."""
    # When the filter is off every row passes; otherwise defer to the
    # has_reviews flag computed by the SQL query (absent key counts as False).
    return True if not require_reviews else row.get("has_reviews", False)


analyses = [a for a in _all_analyses if _label_matches(a) and _diff_lines_ok(a) and _author_ok(a) and _reviews_ok(a)]

start_str = str(start_date) if start_date else None
end_str = str(end_date) if end_date else None
Expand Down
6 changes: 4 additions & 2 deletions online/etl/dashboard/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ def get_analyses(database_url: str, chatbot_id: int | None = None) -> list[dict[
conn = _get_sync_connection(database_url)
try:
base = """SELECT la.*, p.repo_name, p.pr_number, p.pr_url, p.pr_created_at,
p.bot_reviewed_at, p.diff_lines, c.github_username, c.display_name,
pl.labels as pr_labels_json
p.bot_reviewed_at, p.diff_lines, p.pr_author,
c.github_username, c.display_name,
pl.labels as pr_labels_json,
(p.reviews IS NOT NULL AND p.reviews != '[]') as has_reviews
FROM llm_analyses la
JOIN prs p ON la.pr_id = p.id
JOIN chatbots c ON la.chatbot_id = c.id
Expand Down
20 changes: 18 additions & 2 deletions online/etl/db/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

INSERT_PR = """
INSERT INTO prs (chatbot_id, repo_name, pr_number, pr_url, pr_title, pr_author,
pr_created_at, pr_merged, status, bq_events, bot_reviewed_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
pr_created_at, pr_merged, status, bq_events, bot_reviewed_at, repo_id)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
ON CONFLICT (chatbot_id, repo_name, pr_number) DO NOTHING
RETURNING id
"""
Expand Down Expand Up @@ -61,6 +61,7 @@
WHERE p.chatbot_id = $1
AND p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $2
"""
Expand All @@ -70,6 +71,7 @@
LEFT JOIN llm_analyses la ON la.pr_id = p.id AND la.chatbot_id = p.chatbot_id
WHERE p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $1
"""
Expand All @@ -80,6 +82,7 @@
WHERE p.chatbot_id = $1
AND p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
AND p.bot_reviewed_at >= $2
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $3
Expand All @@ -90,6 +93,7 @@
LEFT JOIN llm_analyses la ON la.pr_id = p.id AND la.chatbot_id = p.chatbot_id
WHERE p.status = 'assembled'
AND la.id IS NULL
AND p.pr_merged = TRUE
AND p.bot_reviewed_at >= $1
ORDER BY p.bot_reviewed_at DESC NULLS LAST
LIMIT $2
Expand Down Expand Up @@ -171,6 +175,18 @@
WHERE id = $5
"""

# Overwrite the stored author login for a single PR row.
UPDATE_PR_AUTHOR = """
UPDATE prs SET pr_author = $1 WHERE id = $2
"""

# Fold freshly-discovered BigQuery events into an existing PR row.
# COALESCE / CASE only fill fields that are currently missing, so values
# already enriched from the GitHub API are never overwritten here.
MERGE_PR_BQ_EVENTS = """
UPDATE prs SET bq_events = $1, pr_merged = COALESCE($2, pr_merged),
pr_title = CASE WHEN pr_title = '' OR pr_title IS NULL THEN $3 ELSE pr_title END,
pr_author = COALESCE(pr_author, $4),
pr_created_at = COALESCE(pr_created_at, $5)
WHERE id = $6
"""

# -- LLM analyses --------------------------------------------------------------

INSERT_LLM_ANALYSIS = """
Expand Down
71 changes: 68 additions & 3 deletions online/etl/db/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@
from db.connection import DBAdapter


def _merge_bq_events(existing_raw: str | list | None, new_events: list[dict]) -> list[dict]:
"""Merge new BQ events into existing ones, deduplicating by event_id."""
if existing_raw is None:
return new_events
old_events = json.loads(existing_raw) if isinstance(existing_raw, str) else existing_raw
seen_ids = {e.get("event_id") for e in old_events if e.get("event_id")}
unique_new = [e for e in new_events if e.get("event_id") not in seen_ids]
if not unique_new:
return old_events
merged = old_events + unique_new
merged.sort(key=lambda e: e.get("created_at", ""))
return merged


class PRRepository:
"""High-level async database operations."""

Expand Down Expand Up @@ -46,10 +60,15 @@ async def insert_pr(
status: str = "pending",
bq_events: list | None = None,
bot_reviewed_at: str | None = None,
repo_id: int | None = None,
) -> bool:
"""Insert a PR row (ON CONFLICT DO NOTHING for idempotency).
"""Insert a PR row, merging bq_events on conflict.

Returns True if the row was actually inserted, False if it already existed.
On conflict (same chatbot_id, repo_name, pr_number), appends new BQ
events to existing ones (deduplicated by event_id) and fills in any
missing metadata (pr_merged, pr_author, etc.).

Returns True if the row was newly inserted, False if it already existed.
"""
bq_json = json.dumps(bq_events) if bq_events is not None else None
row = await self.db.fetchone(
Expand All @@ -66,9 +85,52 @@ async def insert_pr(
status,
bq_json,
bot_reviewed_at,
repo_id,
),
)
return row is not None
if row is not None:
return True

# Conflict: merge new bq_events into existing row
if bq_events:
existing = await self.get_pr(chatbot_id, repo_name, pr_number)
if existing:
old_events = json.loads(existing["bq_events"]) if isinstance(existing.get("bq_events"), str) else (existing.get("bq_events") or [])
merged_events = _merge_bq_events(existing.get("bq_events"), bq_events)
from pipeline.discover import _extract_pr_metadata
meta = _extract_pr_metadata(merged_events)
await self.db.execute(
*self.db._translate_params(
q.MERGE_PR_BQ_EVENTS,
(
json.dumps(merged_events),
meta["pr_merged"],
meta["pr_title"],
meta["pr_author"],
meta["pr_created_at"],
existing["id"],
),
)
)
# New events arrived — reset to pending so PR gets
# re-enriched/assembled/analyzed with the updated timeline
if len(merged_events) > len(old_events):
await self.db.execute(
*self.db._translate_params(
"UPDATE prs SET status = 'pending', enrichment_step = NULL, "
"assembled = NULL, assembled_at = NULL WHERE id = $1",
(existing["id"],),
)
)
# Also set repo_id if we have it and existing doesn't
if repo_id and not existing.get("repo_id"):
await self.db.execute(
*self.db._translate_params(
"UPDATE prs SET repo_id = $1 WHERE id = $2",
(repo_id, existing["id"]),
)
)
return False

async def get_pr(self, chatbot_id: int, repo_name: str, pr_number: int) -> dict[str, Any] | None:
    """Look up one PR row by its (chatbot, repo, PR number) key; None when absent."""
    key = (chatbot_id, repo_name, pr_number)
    return await self.db.fetchone(q.GET_PR, key)
Expand Down Expand Up @@ -169,6 +231,9 @@ async def update_metadata(
) -> None:
await self.db.execute(q.UPDATE_PR_METADATA, (pr_title, pr_author, pr_created_at, pr_merged, pr_id))

async def update_pr_author(self, pr_id: int, pr_author: str) -> None:
    """Persist the author login for an existing PR row."""
    params = (pr_author, pr_id)
    await self.db.execute(q.UPDATE_PR_AUTHOR, params)

# -- LLM analyses ----------------------------------------------------------

async def insert_analysis(
Expand Down
4 changes: 4 additions & 0 deletions online/etl/db/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,15 @@

# Additive schema migrations, run on every startup. Postgres supports
# IF NOT EXISTS on ADD COLUMN, so re-running these is harmless.
MIGRATIONS = [
"ALTER TABLE prs ADD COLUMN IF NOT EXISTS diff_lines INTEGER",
"ALTER TABLE prs ADD COLUMN IF NOT EXISTS pr_api_raw TEXT",
"ALTER TABLE prs ADD COLUMN IF NOT EXISTS repo_id BIGINT",
]

# SQLite doesn't support IF NOT EXISTS on ALTER TABLE ADD COLUMN
# (column name, statement) pairs — presumably the runner checks the column's
# existence before executing; verify against the migration driver.
MIGRATIONS_SQLITE = [
("diff_lines", "ALTER TABLE prs ADD COLUMN diff_lines INTEGER"),
("pr_api_raw", "ALTER TABLE prs ADD COLUMN pr_api_raw TEXT"),
("repo_id", "ALTER TABLE prs ADD COLUMN repo_id BIGINT"),
]


Expand Down
Loading