
Commit 1d0b736: docs cleanup
1 parent 0cee9fc
9 files changed, 345 additions & 24 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 # This project
 markdown/
 db/
+nul
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

AGENTS.md

Lines changed: 2 additions & 2 deletions
@@ -12,12 +12,12 @@ Local documentation retrieval system using MCP (Model Context Protocol). Indexes
 HTML docs → pandoc → Markdown → build_index.py → SQLite DB → mcp_server.py → MCP tools
 ```
 
-**Key files:**
+Key files:
 - `build_index.py` — Indexes Markdown into SQLite (FTS5 + optional sqlite-vec embeddings)
 - `mcp_server.py` — MCP server exposing `search_docs`, `get_chunk`, `list_sources` tools
 - `scripts/convert_html.sh` — Batch HTML→Markdown conversion via pandoc
 
-**Database schema:**
+Database schema:
 - `chunks` — Document chunks with id, source, title, content, chunk_index
 - `chunks_fts` — FTS5 virtual table for keyword search
 - `chunks_vec` — sqlite-vec virtual table for embeddings (optional)

README.md

Lines changed: 21 additions & 2 deletions
@@ -2,6 +2,14 @@
 
 Make engineering documentation searchable by LLM coding assistants (Claude Code, Cursor, Codex CLI). Uses SQLite FTS5 for keyword search and vector embeddings for semantic search. Single file, no external services.
 
+## Why This Architecture
+
+Hybrid search gives you the best of both worlds. FTS5 handles exact matches—API names, error messages, symbols. Embeddings handle vocabulary mismatch—when someone searches "make grid finer near edges" instead of "mesh refinement."
+
+SQLite FTS5 + sqlite-vec keeps everything in one file. No vector database to operate, no Docker, no external services.
+
+What this replaces: grep (no ranking), Qdrant/Weaviate (operational overhead), local LLMs (slow, no accuracy benefit for retrieval).
+
 ## Requirements
 
 - Python 3.11+
@@ -72,14 +80,25 @@ For COMSOL-specific conversion, see [docs/comsol.md](docs/comsol.md). You can ad
 
 ## MCP Tools
 
-There are a few key MCP commands for the LLM.
-
 | Tool | Description |
 |------|-------------|
 | `search_docs` | Hybrid keyword + semantic search. Returns matching chunks with scores. |
 | `get_chunk` | Retrieve a specific chunk by ID. |
 | `list_sources` | List all indexed source files. |
 
+Example `search_docs` response:
+```json
+[
+  {
+    "chunk_id": "comsol_ref_mesh.24.80.md:0",
+    "source": "comsol_ref_mesh.24.80.md",
+    "title": "Mesh Refinement",
+    "content": "Use Refine to refine a mesh by splitting elements...",
+    "score": 0.032
+  }
+]
+```
+
 ## Publishing Databases
 
 You can publish database snapshots for ease of use using the following example command:
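For context on the FTS5 half of the hybrid design described in this commit, here is a minimal, self-contained sketch using Python's stdlib `sqlite3` (illustrative table and rows, not the project's real schema; assumes an SQLite build with FTS5 enabled, which ships with most Python distributions):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE chunks_fts USING fts5(content)")
conn.executemany(
    "INSERT INTO chunks_fts(content) VALUES (?)",
    [
        ("Use Refine to refine a mesh by splitting elements",),
        ("Boundary conditions for the AC/DC module",),
    ],
)
# bm25() is lower-is-better in FTS5, so negate it to rank best matches first
rows = conn.execute(
    "SELECT content, -bm25(chunks_fts) AS score "
    "FROM chunks_fts WHERE chunks_fts MATCH ? ORDER BY score DESC",
    ("mesh",),
).fetchall()
print(rows[0][0])
```

This is exactly the kind of query where keyword search shines: the literal token "mesh" is matched and ranked, with no embedding model involved.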

build_index.py

Lines changed: 2 additions & 1 deletion
@@ -400,7 +400,8 @@ def main():
     for i, md_path in enumerate(md_files):
         # Check if file has changed
         current_hash = file_hash(md_path)
-        relative_path = str(md_path.relative_to(args.source_dir))
+        # Normalize path separators for cross-platform consistency
+        relative_path = str(md_path.relative_to(args.source_dir)).replace("\\", "/")
 
         existing = conn.execute(
             "SELECT hash FROM sources WHERE path = ?", (relative_path,)

docs/comsol.md

Lines changed: 3 additions & 2 deletions
@@ -12,8 +12,9 @@ When installing COMSOL, select:
 
 COMSOL 6.4 HTML documentation default paths:
 
-- **Windows:** `C:\Program Files\COMSOL\COMSOL64\Multiphysics\doc\help\wtpwebapps\ROOT\doc\`
-- **Linux:** `/usr/local/comsol/multiphysics/doc/help/wtpwebapps/ROOT/doc`
+- Windows: `C:\Program Files\COMSOL\COMSOL64\Multiphysics\doc\help\wtpwebapps\ROOT\doc\`
+- macOS: `/Applications/COMSOL64/Multiphysics/doc/help/wtpwebapps/ROOT/doc/`
+- Linux: `/usr/local/comsol/multiphysics/doc/help/wtpwebapps/ROOT/doc/`
 
 The HTML files are spread across subdirectories (`comsol_ref_manual/`, `acdc_module/`, etc.).

docs/development.md

Lines changed: 118 additions & 4 deletions
@@ -38,10 +38,10 @@ CREATE TABLE sources (
 
 Documents are split using a header-aware algorithm:
 
-1. **Primary split:** Markdown headers (`##`, `###`, etc.)
-2. **Secondary split:** If a section exceeds `chunk_size`, split on paragraph boundaries
-3. **Tertiary split:** If still too large, split on sentence boundaries
-4. **Overlap:** Each chunk includes `chunk_overlap` characters from the previous chunk's end
+1. Primary split: Markdown headers (`##`, `###`, etc.)
+2. Secondary split: If a section exceeds `chunk_size`, split on paragraph boundaries
+3. Tertiary split: If still too large, split on sentence boundaries
+4. Overlap: Each chunk includes `chunk_overlap` characters from the previous chunk's end
 
 Section titles are preserved as metadata for better context in search results.
 
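The splitting algorithm documented in this hunk can be sketched roughly as follows. This is a hypothetical illustration of steps 2-4 for a single header section, not the project's actual chunker, and the sentence regex is a simplification:

```python
import re

def chunk_section(text: str, chunk_size: int = 1500, chunk_overlap: int = 200) -> list[str]:
    """Sketch: paragraph split, then sentence split, then pack with overlap."""
    if len(text) <= chunk_size:
        return [text]
    # Secondary split: paragraph boundaries
    parts = re.split(r"\n\n+", text)
    # Tertiary split: sentence boundaries for still-oversized paragraphs
    units: list[str] = []
    for p in parts:
        if len(p) > chunk_size:
            units.extend(re.split(r"(?<=[.!?])\s+", p))
        else:
            units.append(p)
    # Pack units, carrying chunk_overlap characters from the previous chunk's end
    chunks: list[str] = []
    current = ""
    for u in units:
        if current and len(current) + len(u) + 1 > chunk_size:
            chunks.append(current)
            current = current[-chunk_overlap:]
        current = (current + " " + u).strip()
    if current:
        chunks.append(current)
    return chunks
```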
@@ -112,3 +112,117 @@ For other formats:
 pandoc -f rst -t gfm input.rst -o output.md # Sphinx RST
 pandoc -f docx -t gfm input.docx -o output.md # Word docs
 ```
+
+## Running Tests
+
+```bash
+uv run pytest tests/ -v
+```
+
+Tests cover chunking, indexing, FTS triggers, RRF scoring, and search functionality. All tests use temporary databases and clean up after themselves.
+
+## Creating Skills for New Documentation
+
+Skills help LLMs know when to use your MCP server. Create a skill file for each documentation set.
+
+### Skill Format
+
+Both Claude Code and Codex use the [Agent Skills specification](https://agentskills.io/specification):
+
+```markdown
+---
+name: your-docs
+description: Search YOUR_PRODUCT documentation. Use when asked about [list key topics, features, common questions].
+---
+
+# Your Documentation Search
+
+Use the `search_docs` MCP tool to find documentation.
+
+## When to use
+
+- [List specific use cases]
+- [Topics this documentation covers]
+- [Types of questions it answers]
+
+## Prerequisites
+
+The MCP server must be configured:
+\`\`\`bash
+claude mcp add --transport stdio your-docs -- docs-mcp --db your-docs.db
+\`\`\`
+```
+
+### Skill Locations
+
+| IDE | User-level location |
+|-----|---------------------|
+| Claude Code | `~/.claude/skills/your-docs.md` |
+| Codex CLI | `~/.codex/skills/your-docs.md` |
+
+### Tips
+
+- Be specific in the description - include keywords users would mention
+- List concrete examples - helps the LLM match user queries to your skill
+- Update prerequisites - use the correct MCP add command for each IDE
+
+## Embedding Models
+
+The default model is `BAAI/bge-small-en-v1.5`. You can change it with `--embedding-model`.
+
+| Model | Dimensions | Size | Speed | Quality | Notes |
+|-------|------------|------|-------|---------|-------|
+| `BAAI/bge-small-en-v1.5` | 384 | 130MB | Fast | Good | Default, best balance |
+| `BAAI/bge-base-en-v1.5` | 768 | 440MB | Medium | Better | More accurate, 2x slower |
+| `BAAI/bge-large-en-v1.5` | 1024 | 1.3GB | Slow | Best | Diminishing returns for docs |
+| `all-MiniLM-L6-v2` | 384 | 90MB | Fastest | OK | Smaller, less accurate |
+
+### Recommendations
+
+- bge-small (default): Best for most use cases. Good accuracy, fast indexing.
+- bge-base: Use if search quality matters more than indexing time.
+- bge-large: Rarely needed. The accuracy gain over base is marginal for documentation.
+- MiniLM: Use if disk space or memory is constrained.
+
+### GPU Acceleration
+
+sentence-transformers auto-detects CUDA. On a GPU, even bge-large indexes quickly.
+
+```bash
+# Check if GPU is available
+python -c "import torch; print(torch.cuda.is_available())"
+```
+
+## Performance Tuning
+
+### Chunk Size
+
+| Setting | Effect |
+|---------|--------|
+| Smaller chunks (500-1000) | More precise matches, more chunks to search, larger database |
+| Larger chunks (2000-3000) | More context per result, fewer chunks, may include irrelevant content |
+| Default (1500) | Good balance for technical documentation |
+
+### Chunk Overlap
+
+| Setting | Effect |
+|---------|--------|
+| No overlap (0) | Smallest database, may miss matches at chunk boundaries |
+| Small overlap (100-200) | Default, catches most boundary cases |
+| Large overlap (300+) | Better boundary matching, larger database, more redundancy |
+
+### When to Use Each Search Mode
+
+| Mode | Best for | Speed |
+|------|----------|-------|
+| `keyword` | Exact terms, API names, error codes, CLI testing | Instant |
+| `semantic` | Natural language, vocabulary mismatch, conceptual queries | Slower (model load) |
+| `hybrid` | Production use, best overall results | Slower (model load) |
+
+### Indexing Performance
+
+- Without embeddings: ~1000 files/second
+- With embeddings (CPU): ~50 chunks/second
+- With embeddings (GPU): ~500 chunks/second
+
+For large documentation sets (10k+ files), use `--no-embeddings` first to verify conversion worked, then rebuild with embeddings.
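Back-of-envelope arithmetic for the throughput figures above, assuming roughly 5 chunks per file (an assumption for illustration, not a measured ratio):

```python
files = 10_000
chunks = files * 5            # assumption: ~5 chunks per file
cpu_rate, gpu_rate = 50, 500  # chunks/second, from the table above

cpu_minutes = chunks / cpu_rate / 60
gpu_minutes = chunks / gpu_rate / 60
print(round(cpu_minutes, 1), round(gpu_minutes, 1))  # → 16.7 1.7
```

At these rates a 10k-file set costs minutes with a GPU but closer to a quarter hour on CPU, which is why the no-embeddings dry run first is worthwhile.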

docs/troubleshooting.md

Lines changed: 39 additions & 0 deletions
@@ -46,3 +46,42 @@ The embedding model loads on first semantic search. This is a one-time cost per
 ```bash
 docs-mcp --db comsol.db --test "query" --mode keyword
 ```
+
+## Windows-specific issues
+
+### "Database is locked" or temp file errors
+
+SQLite WAL mode can cause file locking issues on Windows. The server handles this automatically, but if you see errors:
+
+1. Close any other programs accessing the database
+2. Delete `.db-wal` and `.db-shm` files if present
+3. Restart the MCP server
+
+### NPX commands fail with "Connection closed"
+
+On native Windows (not WSL), wrap NPX commands with `cmd /c`:
+
+```powershell
+# Instead of: npx -y some-package
+cmd /c npx -y some-package
+```
+
+### Path issues
+
+Always use forward slashes or escaped backslashes in config files:
+
+```json
+{
+  "args": ["--db", "C:/Users/name/docs-mcp/comsol.db"]
+}
+```
+
+Or use a bare database filename:
+
+```json
+{
+  "args": ["--db", "comsol.db"]
+}
+```
+
+(Database files in `%LOCALAPPDATA%\docs-mcp\` are found automatically.)
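The forward-slash advice exists because in JSON a backslash starts an escape sequence, so Windows paths must double every backslash. A quick check with a hypothetical path:

```python
import json

# Forward slashes need no escaping in JSON
ok = json.loads('{"args": ["--db", "C:/Users/name/docs-mcp/comsol.db"]}')

# Backslashes must be doubled inside the JSON text to parse as single ones
escaped = json.loads('{"args": ["--db", "C:\\\\Users\\\\name\\\\comsol.db"]}')
print(escaped["args"][1])
```

A single unescaped backslash (e.g. `"C:\Users\..."`) is simply invalid JSON and the config will fail to load.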

mcp_server.py

Lines changed: 68 additions & 13 deletions
@@ -26,6 +26,9 @@
 )
 logger = logging.getLogger(__name__)
 
+# Valid search modes for search_docs
+VALID_SEARCH_MODES = ("keyword", "semantic", "hybrid")
+
 
 def get_data_dir() -> Path:
     """Get the default data directory for database files."""
@@ -89,24 +92,60 @@ def get_embedding_model():
     return _embedding_model
 
 
+def sanitize_fts_query(query: str) -> str:
+    """
+    Sanitize a query string for safe use with FTS5 MATCH.
+
+    FTS5 has special syntax (AND, OR, NOT, quotes, parentheses, etc.)
+    that can cause errors or unexpected behavior. This wraps each word
+    in quotes to treat them as literals.
+    """
+    # Split on whitespace and filter empty tokens
+    tokens = query.split()
+    if not tokens:
+        return ""
+
+    # Escape double quotes within tokens and wrap each in quotes
+    # This treats each word as a literal phrase, joined by implicit AND
+    escaped = []
+    for token in tokens:
+        # Escape any existing double quotes
+        safe_token = token.replace('"', '""')
+        escaped.append(f'"{safe_token}"')
+
+    return " ".join(escaped)
+
+
 def search_fts(query: str, limit: int) -> list[tuple[str, float]]:
     """Full-text search using FTS5. Returns (chunk_id, score) pairs."""
+    # Handle empty or whitespace-only queries
+    if not query or not query.strip():
+        return []
+
     conn = get_connection()
 
-    # BM25 scoring (lower is better in FTS5, so we negate)
-    results = conn.execute(
-        """
-        SELECT c.id, -bm25(chunks_fts, 1, 10) as score
-        FROM chunks_fts
-        JOIN chunks c ON chunks_fts.rowid = c.rowid
-        WHERE chunks_fts MATCH ?
-        ORDER BY score DESC
-        LIMIT ?
-        """,
-        (query, limit),
-    ).fetchall()
+    # Sanitize query to prevent FTS5 syntax errors
+    safe_query = sanitize_fts_query(query)
+    if not safe_query:
+        return []
 
-    return results
+    try:
+        # BM25 scoring (lower is better in FTS5, so we negate)
+        results = conn.execute(
+            """
+            SELECT c.id, -bm25(chunks_fts, 1, 10) as score
+            FROM chunks_fts
+            JOIN chunks c ON chunks_fts.rowid = c.rowid
+            WHERE chunks_fts MATCH ?
+            ORDER BY score DESC
+            LIMIT ?
+            """,
+            (safe_query, limit),
+        ).fetchall()
+        return results
+    except sqlite3.OperationalError as e:
+        logger.warning(f"FTS5 search error: {e}")
+        return []
 
 
 def search_vec(query: str, limit: int) -> list[tuple[str, float]]:
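To see what the new sanitizer produces, here is the same logic from the diff as a standalone snippet, exercised on a query containing FTS5 operator words and a stray quote:

```python
def sanitize_fts_query(query: str) -> str:
    # Wrap each whitespace-separated token in double quotes so FTS5
    # treats it as a literal phrase; double any embedded quotes.
    tokens = query.split()
    if not tokens:
        return ""
    escaped = []
    for token in tokens:
        safe_token = token.replace('"', '""')
        escaped.append(f'"{safe_token}"')
    return " ".join(escaped)

print(sanitize_fts_query('mesh AND "refinement'))
```

The operator word `AND` comes out as the literal phrase `"AND"`, and the unbalanced quote is doubled instead of breaking MATCH parsing.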
@@ -147,6 +186,16 @@ def reciprocal_rank_fusion(
     Combine multiple ranked lists using Reciprocal Rank Fusion.
 
     RRF score = sum(1 / (k + rank_i)) for each list where item appears
+
+    Args:
+        results_lists: List of ranked result lists, each containing (id, score) tuples
+        k: Ranking constant that controls how much weight is given to lower-ranked items.
+           Default of 60 is from the original RRF paper (Cormack et al., 2009) and works
+           well in practice. Lower k gives more weight to top results; higher k makes
+           the ranking more uniform.
+
+    Returns:
+        Combined list of (id, rrf_score) tuples, sorted by score descending
     """
     scores: dict[str, float] = {}
 
@@ -204,7 +253,13 @@ def search_docs_impl(query: str, limit: int = 10, mode: str = "hybrid") -> list[
 
     Returns:
         List of matching chunks with scores
+
+    Raises:
+        ValueError: If mode is not one of the valid search modes
     """
+    if mode not in VALID_SEARCH_MODES:
+        raise ValueError(f"Invalid search mode '{mode}'. Must be one of: {VALID_SEARCH_MODES}")
+
     results_lists = []
 
     if mode in ("keyword", "hybrid"):
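The RRF formula documented in the docstring above can be sketched as a standalone function. The commit shows only the start of the real function body, so this is an illustration consistent with the formula, not the project's code:

```python
def reciprocal_rank_fusion(results_lists, k=60):
    # RRF: each list contributes 1 / (k + rank) per item, rank starting at 1;
    # items appearing in several lists accumulate score across them.
    scores: dict[str, float] = {}
    for results in results_lists:
        for rank, (chunk_id, _score) in enumerate(results, start=1):
            scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

fts_results = [("a", 5.0), ("b", 4.0)]   # keyword ranking
vec_results = [("b", 0.9), ("c", 0.8)]   # semantic ranking
fused = reciprocal_rank_fusion([fts_results, vec_results])
print([cid for cid, _ in fused])  # → ['b', 'a', 'c']
```

Note that only ranks matter: `b` wins because it appears in both lists, even though its raw scores are never directly comparable across the two retrievers. That rank-only property is why RRF needs no score normalization between BM25 and cosine distance.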
