Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions src/memory/SemanticMemory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,25 @@ export class SemanticMemory {

this.db = constructor(this.config.dbPath) as Database;

// Pre-load sqlite-vec so vec0 virtual tables are queryable during the probe
// below. Without this, an existing entity_embeddings (vec0) table — which is
// recreated asynchronously by initializeVectorSearch() the first time vector
// search runs — causes the probe to throw "no such module: vec0" and triggers
// false-positive corruption quarantine. The probe runs synchronously in open()
// before the async vector-search init has a chance to load the extension.
//
// Loading here uses sqlite-vec directly (not via EmbeddingProvider) because:
// 1. EmbeddingProvider may not be attached at open() time.
// 2. The probe must succeed regardless of whether vector search is wired up,
// since the on-disk DB schema doesn't know about that runtime choice.
//
// Failure is non-fatal: the probe loop below has its own missing-module guard
// that skips virtual tables whose extension isn't available.
try {
const vec = await import('sqlite-vec');
vec.load(this.db);
} catch { /* @silent-fallback-ok: sqlite-vec unavailable — probe will skip vec0 tables */ }

// Integrity check — auto-recover from corruption (JSONL is source of truth).
// Corrupt DBs are quarantined (renamed) not deleted, and a marker file is written
// so operators can notice the recovery after the fact.
Expand All @@ -589,9 +608,23 @@ export class SemanticMemory {
if (!this._needsRebuild) {
try {
const tables = this.db!.prepare(
"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'fts%' AND name NOT LIKE 'sqlite%'"
).all() as Array<{ name: string }>;
"SELECT name, sql FROM sqlite_master WHERE type='table' AND name NOT LIKE 'fts%' AND name NOT LIKE 'sqlite%'"
).all() as Array<{ name: string; sql: string | null }>;
for (const t of tables) {
// Virtual tables require their module to be loaded into the connection.
// If a virtual-table probe throws "no such module", the table isn't
// corrupt — its extension just isn't available (sqlite-vec missing,
// a custom module not yet registered, etc.). Treating that as
// corruption produces an infinite quarantine loop on every open.
if (t.sql && /^\s*CREATE\s+VIRTUAL\s+TABLE/i.test(t.sql)) {
try {
this.db!.prepare(`SELECT * FROM "${t.name}" LIMIT 100`).all();
} catch (vErr) {
if (/no such module/i.test((vErr as Error).message)) continue;
throw vErr;
}
continue;
}
this.db!.prepare(`SELECT * FROM "${t.name}" LIMIT 100`).all();
}
} catch (err) {
Expand Down
104 changes: 104 additions & 0 deletions tests/unit/semantic-memory-corruption-recovery.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { SemanticMemory } from '../../src/memory/SemanticMemory.js';
import { EmbeddingProvider } from '../../src/memory/EmbeddingProvider.js';

interface Setup {
dir: string;
Expand Down Expand Up @@ -310,3 +311,106 @@ describe('SemanticMemory corruption auto-recovery', () => {
expect(afterSecond.corruptFiles.length).toBe(beforeSecond.corruptFiles.length);
});
});

/**
* Virtual-table probe load-order tests.
*
* Contract:
* The secondary probe-read MUST NOT treat "no such module: <name>" on a
* virtual table as corruption. Virtual tables require their extension to be
* loaded into the connection before they're queryable, and the probe runs
* synchronously in open() before any deferred extension load could happen.
* Treating a missing-module error as corruption produces an infinite
* quarantine loop on every reopen (see the upstream gap report).
*
* Concretely for sqlite-vec: open() pre-loads the extension when available,
* so a healthy DB with an `entity_embeddings` (vec0) virtual table opens
* without quarantine.
*/
describe('SemanticMemory probe — virtual tables and load order', () => {
  let setup: Setup;
  beforeEach(() => { setup = makeSetup(); });
  afterEach(() => setup.cleanup());

  /**
   * Construct a SemanticMemory over `dbPath` using the one configuration every
   * test in this suite shares, so the literal is not repeated per test.
   */
  function newMemory(dbPath: string): SemanticMemory {
    return new SemanticMemory({
      dbPath,
      decayHalfLifeDays: 30,
      lessonDecayHalfLifeDays: 90,
      staleThreshold: 0.2,
    });
  }

  /**
   * Assert that `listCorruptArtifacts(dir)` reports neither corrupt files nor
   * marker files, i.e. that opening the DB did not trigger a quarantine.
   */
  function expectNoQuarantineArtifacts(dir: string): void {
    const { corruptFiles, markerFiles } = listCorruptArtifacts(dir);
    expect(corruptFiles).toEqual([]);
    expect(markerFiles).toEqual([]);
  }

  /**
   * Seed the DB the way SemanticMemory's hybrid-search path does: open through
   * SemanticMemory with an EmbeddingProvider attached, initialize vector search
   * (which loads sqlite-vec and creates the `entity_embeddings` vec0 virtual
   * table via VectorSearch.createTable), then close. The on-disk schema then
   * has a vec0 virtual table that requires the sqlite-vec module to query.
   */
  async function seedDbWithVec0VirtualTable(dbPath: string): Promise<void> {
    const memory = newMemory(dbPath);
    memory.setEmbeddingProvider(new EmbeddingProvider());
    await memory.open();
    try {
      await memory.initializeVectorSearch();
    } finally {
      // Always release the handle, even when vector-search init rejects, so the
      // afterEach cleanup is not left with an open connection on the DB file.
      memory.close();
    }
  }

  it('does not quarantine a healthy DB that contains a vec0 virtual table', async () => {
    await seedDbWithVec0VirtualTable(setup.dbPath);

    // Sanity: schema actually contains a CREATE VIRTUAL TABLE for entity_embeddings.
    const BetterSqlite3 = (await import('better-sqlite3')).default;
    const inspect = new BetterSqlite3(setup.dbPath, { readonly: true });
    let row: { name: string; sql: string } | undefined;
    try {
      row = inspect.prepare(
        "SELECT name, sql FROM sqlite_master WHERE name='entity_embeddings'"
      ).get() as { name: string; sql: string } | undefined;
    } finally {
      // Close the read-only inspection handle even if the query throws.
      inspect.close();
    }
    expect(row).toBeDefined();
    expect(row!.sql).toMatch(/CREATE\s+VIRTUAL\s+TABLE/i);

    // Reopen via SemanticMemory — must not throw and must not quarantine.
    const mem = newMemory(setup.dbPath);
    await expect(mem.open()).resolves.not.toThrow();
    mem.close();

    expectNoQuarantineArtifacts(setup.dir);
  });

  it('opening repeatedly does not accumulate quarantine artifacts (no probe-loop regression)', async () => {
    await seedDbWithVec0VirtualTable(setup.dbPath);

    // Three consecutive open/close cycles against the same on-disk DB.
    for (let i = 0; i < 3; i++) {
      const mem = newMemory(setup.dbPath);
      await mem.open();
      mem.close();
    }

    expectNoQuarantineArtifacts(setup.dir);
  });

  it('opens cleanly with no embedding provider attached (probe still survives the vec0 schema)', async () => {
    // The deferred-attachment shape: vector search may not be wired up on every
    // reopen, but the on-disk schema still contains the vec0 virtual table from
    // a previous run. The pre-load + per-table missing-module guard must cover
    // this path so we don't quarantine.
    await seedDbWithVec0VirtualTable(setup.dbPath);

    const mem = newMemory(setup.dbPath);
    // Intentionally no setEmbeddingProvider() — exercise the "vec0 table exists
    // on disk but caller hasn't asked for vector search" reopen path.
    await mem.open();
    mem.close();

    expectNoQuarantineArtifacts(setup.dir);
  });

  it('genuine probe-detected corruption still quarantines (regression guard for the virtual-table carve-out)', async () => {
    // Make sure the virtual-table skip didn't accidentally widen the probe's
    // tolerance for actual corruption in non-virtual tables.
    await writePartiallyCorruptDb(setup.dbPath);

    const mem = newMemory(setup.dbPath);
    await mem.open();
    mem.close();

    const { corruptFiles, markerFiles } = listCorruptArtifacts(setup.dir);
    expect(corruptFiles.length).toBeGreaterThanOrEqual(1);
    expect(markerFiles.length).toBe(1);
  });
});
Loading