From 180adcdc9ffa8258d2edbbce2941a73778ba983e Mon Sep 17 00:00:00 2001 From: Antonio De Marinis Date: Fri, 3 Apr 2026 15:33:20 +0200 Subject: [PATCH] [recipes] Fix UUID cursor in fingerprint dedup backfill - Use created_at instead of id for cursor pagination - Fixes error when thoughts table has UUID primary key - Tested on real Open Brain DB with 1815 rows backfilled --- .../backfill-fingerprints.mjs | 22 +++++++++---------- .../delete-duplicates.mjs | 22 +++++++++---------- .../fingerprint-dedup-backfill/metadata.json | 2 +- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/recipes/fingerprint-dedup-backfill/backfill-fingerprints.mjs b/recipes/fingerprint-dedup-backfill/backfill-fingerprints.mjs index 1d39ab59..897408e6 100644 --- a/recipes/fingerprint-dedup-backfill/backfill-fingerprints.mjs +++ b/recipes/fingerprint-dedup-backfill/backfill-fingerprints.mjs @@ -94,14 +94,14 @@ function buildContentFingerprint(text) { // ── REST helpers ──────────────────────────────────────────────────────────── -async function fetchBatch(cursorId, batchSize) { +async function fetchBatch(cursorCreatedAt, batchSize) { const url = `${REST_BASE}/thoughts` + `?content_fingerprint=is.null` + - `&id=gt.${cursorId}` + - `&select=id,content` + + `&created_at=gt.${encodeURIComponent(cursorCreatedAt)}` + + `&select=id,content,created_at` + `&limit=${batchSize}` + - `&order=id.asc`; + `&order=created_at.asc`; const res = await fetch(url, { headers: HEADERS }); if (!res.ok) { const body = await res.text().catch(() => ""); @@ -162,7 +162,7 @@ function loadState() { return JSON.parse(fs.readFileSync(STATE_FILE, "utf8")); } catch { return { - cursorId: 0, + cursorCreatedAt: "1970-01-01T00:00:00Z", totalDone: 0, totalDuplicates: 0, totalErrors: 0, @@ -183,7 +183,7 @@ async function main() { const state = loadState(); console.log("=== Backfill content_fingerprint ==="); console.log( - `Resuming from cursor id=${state.cursorId} (${state.totalDone} already done)` + `Resuming from cursor created_at=${state.cursorCreatedAt} (${state.totalDone} already done)` ); console.log(`Batch size: ${BATCH_SIZE}`); console.log(); @@ -191,12 +191,12 @@ async function main() { while (true) { state.batches++; process.stdout.write( - `Batch ${state.batches}: fetching from id>${state.cursorId}… ` + `Batch ${state.batches}: fetching from created_at>${state.cursorCreatedAt}… ` ); let rows; try { - rows = await fetchBatch(state.cursorId, BATCH_SIZE); + rows = await fetchBatch(state.cursorCreatedAt, BATCH_SIZE); } catch (err) { console.error("\n Fetch error:", err.message, "— retrying in 5s…"); await new Promise((r) => setTimeout(r, 5000)); @@ -221,8 +221,8 @@ async function main() { state.totalDuplicates += duplicates; state.totalErrors += errors; - const maxId = rows[rows.length - 1].id; - state.cursorId = typeof maxId === "number" ? maxId : maxId; + const lastCreatedAt = rows[rows.length - 1].created_at; + state.cursorCreatedAt = lastCreatedAt; saveState(state); const dupeStr = @@ -231,7 +231,7 @@ async function main() { console.log( ` → ${done} patched${dupeStr}${errStr}. ` + `Total: ${state.totalDone} patched, ${state.totalDuplicates} duplicates, ${state.totalErrors} errors. ` + - `Cursor: ${state.cursorId}` + `Cursor: ${state.cursorCreatedAt}` ); await new Promise((r) => setTimeout(r, 150)); diff --git a/recipes/fingerprint-dedup-backfill/delete-duplicates.mjs b/recipes/fingerprint-dedup-backfill/delete-duplicates.mjs index 8c201ada..f874c6d8 100644 --- a/recipes/fingerprint-dedup-backfill/delete-duplicates.mjs +++ b/recipes/fingerprint-dedup-backfill/delete-duplicates.mjs @@ -93,14 +93,14 @@ function buildFingerprint(text) { // ── REST helpers ──────────────────────────────────────────────────────────── -async function fetchBatch(cursorId, batchSize) { +async function fetchBatch(cursorCreatedAt, batchSize) { const url = `${REST_BASE}/thoughts` + `?content_fingerprint=is.null` + - `&id=gt.${cursorId}` + - `&select=id,content` + + `&created_at=gt.${encodeURIComponent(cursorCreatedAt)}` + + `&select=id,content,created_at` + `&limit=${batchSize}` + - `&order=id.asc`; + `&order=created_at.asc`; const res = await fetch(url, { headers: HEADERS }); if (!res.ok) { const body = await res.text().catch(() => ""); @@ -173,7 +173,7 @@ function loadState() { return JSON.parse(fs.readFileSync(STATE_FILE, "utf8")); } catch { return { - cursorId: 0, + cursorCreatedAt: "1970-01-01T00:00:00Z", totalDeleted: 0, totalPatched: 0, totalWouldDelete: 0, @@ -205,19 +205,19 @@ async function main() { } console.log( - `Resuming from cursor id=${state.cursorId} (deleted: ${state.totalDeleted}, patched: ${state.totalPatched})` + `Resuming from cursor created_at=${state.cursorCreatedAt} (deleted: ${state.totalDeleted}, patched: ${state.totalPatched})` ); console.log(`Batch size: ${BATCH_SIZE}\n`); while (true) { state.batches++; process.stdout.write( - `Batch ${state.batches}: fetching from id>${state.cursorId}… ` + `Batch ${state.batches}: fetching from created_at>${state.cursorCreatedAt}… ` ); let rows; try { - rows = await fetchBatch(state.cursorId, BATCH_SIZE); + rows = await fetchBatch(state.cursorCreatedAt, BATCH_SIZE); } catch (err) { console.error("\n Fetch error:", err.message, "— retrying in 5s…"); await new Promise((r) => setTimeout(r, 5000)); @@ -303,14 +303,14 @@ async function main() { } // Advance cursor - const maxId = rows[rows.length - 1].id; - state.cursorId = maxId; + const lastCreatedAt = rows[rows.length - 1].created_at; + state.cursorCreatedAt = lastCreatedAt; saveState(state); console.log( ` Totals: deleted=${state.totalDeleted}, patched=${state.totalPatched}, ` + `would-delete=${state.totalWouldDelete}, errors=${state.totalErrors}. ` + - `Cursor: ${state.cursorId}` + `Cursor: ${state.cursorCreatedAt}` ); await new Promise((r) => setTimeout(r, 200)); diff --git a/recipes/fingerprint-dedup-backfill/metadata.json b/recipes/fingerprint-dedup-backfill/metadata.json index 83eb2344..c0a1cd90 100644 --- a/recipes/fingerprint-dedup-backfill/metadata.json +++ b/recipes/fingerprint-dedup-backfill/metadata.json @@ -16,5 +16,5 @@ "difficulty": "beginner", "estimated_time": "15 minutes", "created": "2026-03-22", - "updated": "2026-03-22" + "updated": "2026-04-03" }