diff --git a/cli/watch.go b/cli/watch.go index ccd47175..aceff00f 100644 --- a/cli/watch.go +++ b/cli/watch.go @@ -987,7 +987,8 @@ func watchProjectWithEventObserver(ctx context.Context, projectRoot string, emb } // Initialize scanner - scanner := indexer.NewScanner(projectRoot, ignoreMatcher) + scanner := indexer.NewScanner(projectRoot, ignoreMatcher). + WithCustomExtensions(cfg.Chunking.CustomExtensions) // Initialize chunker chunker := indexer.NewChunker(cfg.Chunking.Size, cfg.Chunking.Overlap) @@ -2752,7 +2753,9 @@ func initializeWorkspaceRuntime(ctx context.Context, ws *config.Workspace, proje return nil, nil, fmt.Errorf("failed to initialize ignore matcher: %w", err) } - scanner := indexer.NewScanner(project.Path, ignoreMatcher) + scanner := indexer.NewScanner(project.Path, ignoreMatcher). + WithCustomExtensions(ws.Chunking.CustomExtensions). + WithCustomExtensions(projectCfg.Chunking.CustomExtensions) chunker := indexer.NewChunker(projectCfg.Chunking.Size, projectCfg.Chunking.Overlap) processorRegistry := buildFrameworkRegistry(projectCfg) vectorStore := &projectPrefixStore{ diff --git a/config/config.go b/config/config.go index 2ca67996..c0733a42 100644 --- a/config/config.go +++ b/config/config.go @@ -203,6 +203,12 @@ type QdrantConfig struct { type ChunkingConfig struct { Size int `yaml:"size"` Overlap int `yaml:"overlap"` + // CustomExtensions extends the built-in SupportedExtensions list with + // additional file extensions to index (e.g. [".tengo", ".el"]). Each + // entry must include the leading dot and is matched case-insensitively. + // Binary detection (UTF-8 + null-byte check) still applies, so adding + // a binary extension here is safe. 
+ CustomExtensions []string `yaml:"custom_extensions,omitempty"` } func DefaultStoreForBackend(backend string) StoreConfig { diff --git a/config/workspace.go b/config/workspace.go index 4a919e9d..59fe19aa 100644 --- a/config/workspace.go +++ b/config/workspace.go @@ -25,6 +25,10 @@ type Workspace struct { Name string `yaml:"name"` Store StoreConfig `yaml:"store"` Embedder EmbedderConfig `yaml:"embedder"` + // Chunking provides workspace-level defaults. Currently only + // CustomExtensions is consumed; Size/Overlap are taken from the + // per-project config. + Chunking ChunkingConfig `yaml:"chunking,omitempty"` Projects []ProjectEntry `yaml:"projects"` } diff --git a/docs/src/content/docs/configuration.md b/docs/src/content/docs/configuration.md index 55e4b14e..3c0b2f6b 100644 --- a/docs/src/content/docs/configuration.md +++ b/docs/src/content/docs/configuration.md @@ -53,6 +53,13 @@ chunking: size: 512 # Overlap between chunks (for context continuity) overlap: 50 + # Optional: index extra extensions beyond the built-in supported set. + # Entries must include the leading dot, are matched case-insensitively, + # and still go through the binary-file and minified-file filters. + # Examples: ".tengo" (Tengo scripts), ".el" (Emacs Lisp), ".prisma". 
+ # custom_extensions: + # - .tengo + # - .prisma # File watching configuration watch: diff --git a/docs/src/content/docs/workspace.md b/docs/src/content/docs/workspace.md index 067bcc0a..7e0d4d50 100644 --- a/docs/src/content/docs/workspace.md +++ b/docs/src/content/docs/workspace.md @@ -278,6 +278,7 @@ When indexing via workspace, settings come from different sources: | `store` (backend, DSN) | **Workspace** | Shared store required | | `embedder` (provider, model) | **Workspace** | Ensures compatible vectors | | `chunking` (size, overlap) | **Project** | Can vary per project/language | +| `chunking.custom_extensions` | **Workspace** + **Project** | Both lists are merged when scanning a project | | `ignore` patterns | **Project** | Project-specific exclusions | | `external_gitignore` | **Project** | Project-specific gitignore | diff --git a/indexer/scanner.go b/indexer/scanner.go index 373a9bdc..790ff2e7 100644 --- a/indexer/scanner.go +++ b/indexer/scanner.go @@ -112,8 +112,9 @@ type FileMeta struct { } type Scanner struct { - root string - ignore *IgnoreMatcher + root string + ignore *IgnoreMatcher + extraExts map[string]bool } func NewScanner(root string, ignore *IgnoreMatcher) *Scanner { @@ -123,6 +124,31 @@ func NewScanner(root string, ignore *IgnoreMatcher) *Scanner { } } +// WithCustomExtensions adds the given extensions to the set the scanner will +// index, on top of SupportedExtensions, and returns the same scanner. Entries +// must include the leading dot (e.g. ".tengo"); they are trimmed and lowercased, +// and entries that are empty or lack the leading dot are silently dropped.
+func (s *Scanner) WithCustomExtensions(exts []string) *Scanner { + for _, raw := range exts { + ext := strings.ToLower(strings.TrimSpace(raw)) + if ext == "" || !strings.HasPrefix(ext, ".") { + continue + } + if s.extraExts == nil { + s.extraExts = make(map[string]bool) + } + s.extraExts[ext] = true + } + return s +} + +func (s *Scanner) isSupported(ext string) bool { + if SupportedExtensions[ext] { + return true + } + return s.extraExts[ext] +} + // ScanMetadata scans indexable files and returns only file metadata. // It avoids reading file contents and hash computation for a faster first pass. func (s *Scanner) ScanMetadata() ([]FileMeta, []string, error) { @@ -154,7 +180,7 @@ func (s *Scanner) ScanMetadata() ([]FileMeta, []string, error) { // Check extension ext := strings.ToLower(filepath.Ext(path)) - if !SupportedExtensions[ext] { + if !s.isSupported(ext) { return nil } @@ -216,7 +242,7 @@ func (s *Scanner) Scan() ([]FileInfo, []string, error) { // Check extension ext := strings.ToLower(filepath.Ext(path)) - if !SupportedExtensions[ext] { + if !s.isSupported(ext) { return nil } diff --git a/indexer/scanner_test.go b/indexer/scanner_test.go index 85b641bd..5feb388c 100644 --- a/indexer/scanner_test.go +++ b/indexer/scanner_test.go @@ -352,3 +352,141 @@ func TestScanner_ScanFile_SkipsMinified(t *testing.T) { t.Error("expected nil for minified file, got file info") } } + +func TestScanner_CustomExtensions(t *testing.T) { + tmpDir := t.TempDir() + + // File with a custom extension that's not in SupportedExtensions. + tengoFile := filepath.Join(tmpDir, "workflow.tengo") + if err := os.WriteFile(tengoFile, []byte(`fmt := import("fmt")` + "\n" + `fmt.println("hello")` + "\n"), 0644); err != nil { + t.Fatalf("failed to create tengo file: %v", err) + } + + // File with an unrecognized extension that should remain unsupported. 
+ unknownFile := filepath.Join(tmpDir, "data.xyz") + if err := os.WriteFile(unknownFile, []byte("ignored"), 0644); err != nil { + t.Fatalf("failed to create xyz file: %v", err) + } + + // File with a built-in supported extension to confirm coexistence. + goFile := filepath.Join(tmpDir, "main.go") + if err := os.WriteFile(goFile, []byte("package main"), 0644); err != nil { + t.Fatalf("failed to create go file: %v", err) + } + + ignoreMatcher, err := NewIgnoreMatcher(tmpDir, []string{}, "") + if err != nil { + t.Fatalf("failed to create ignore matcher: %v", err) + } + + // Without custom extensions: only main.go indexed. + scanner := NewScanner(tmpDir, ignoreMatcher) + files, _, err := scanner.Scan() + if err != nil { + t.Fatalf("scan failed: %v", err) + } + if len(files) != 1 || filepath.Base(files[0].Path) != "main.go" { + t.Errorf("baseline scan expected only main.go, got %+v", files) + } + + // With ".tengo" added: main.go + workflow.tengo (xyz still skipped). + scanner = NewScanner(tmpDir, ignoreMatcher). + WithCustomExtensions([]string{".tengo"}) + files, _, err = scanner.Scan() + if err != nil { + t.Fatalf("scan with custom ext failed: %v", err) + } + got := make(map[string]bool) + for _, f := range files { + got[filepath.Base(f.Path)] = true + } + if !got["main.go"] || !got["workflow.tengo"] { + t.Errorf("expected main.go + workflow.tengo, got %v", got) + } + if got["data.xyz"] { + t.Errorf("data.xyz should not have been indexed") + } +} + +func TestScanner_WithCustomExtensions_Normalizes(t *testing.T) { + tmpDir := t.TempDir() + + // A file whose extension is uppercase; it must still match the normalized entry. + upper := filepath.Join(tmpDir, "module.TENGO") + if err := os.WriteFile(upper, []byte("upper"), 0644); err != nil { + t.Fatalf("failed to create upper file: %v", err) + } + + ignoreMatcher, err := NewIgnoreMatcher(tmpDir, []string{}, "") + if err != nil { + t.Fatalf("failed to create ignore matcher: %v", err) + } + + scanner := NewScanner(tmpDir, ignoreMatcher). 
+ WithCustomExtensions([]string{" .TENGO "}) + files, _, err := scanner.Scan() + if err != nil { + t.Fatalf("scan failed: %v", err) + } + if len(files) != 1 { + t.Fatalf("expected 1 file, got %d: %+v", len(files), files) + } +} + +func TestScanner_WithCustomExtensions_RejectsInvalidEntries(t *testing.T) { + tmpDir := t.TempDir() + + // One file without an extension and one with an extension that is never registered. + if err := os.WriteFile(filepath.Join(tmpDir, "bad1"), []byte("x"), 0644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(tmpDir, "bad2.foo"), []byte("x"), 0644); err != nil { + t.Fatal(err) + } + + ignoreMatcher, err := NewIgnoreMatcher(tmpDir, []string{}, "") + if err != nil { + t.Fatal(err) + } + + // Empty, missing-dot and whitespace-only entries are silently dropped; ".." is kept. + scanner := NewScanner(tmpDir, ignoreMatcher). + WithCustomExtensions([]string{"", "foo", " ", ".."}) + files, _, err := scanner.Scan() + if err != nil { + t.Fatal(err) + } + // ".." has a leading dot, so it passes validation and is stored as-is. + // But filepath.Ext("bad2.foo") == ".foo", which we did NOT add. Should be 0 files. + if len(files) != 0 { + t.Errorf("expected no files indexed from invalid extensions, got %+v", files) + } +} + +func TestScanner_CustomExtensions_BinaryStillSkipped(t *testing.T) { + tmpDir := t.TempDir() + + // A "tengo" file that is actually binary (null bytes) — should be skipped + // even with the extension whitelisted. + binFile := filepath.Join(tmpDir, "blob.tengo") + if err := os.WriteFile(binFile, []byte{0x00, 0x01, 0x02, 0x03}, 0644); err != nil { + t.Fatal(err) + } + + ignoreMatcher, err := NewIgnoreMatcher(tmpDir, []string{}, "") + if err != nil { + t.Fatal(err) + } + + scanner := NewScanner(tmpDir, ignoreMatcher). + WithCustomExtensions([]string{".tengo"}) + + // Scan reads content + binary check; should drop the binary blob. 
+ files, _, err := scanner.Scan() + if err != nil { + t.Fatal(err) + } + if len(files) != 0 { + t.Errorf("expected binary .tengo to be skipped, got %+v", files) + } +}