Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cli/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -987,7 +987,8 @@ func watchProjectWithEventObserver(ctx context.Context, projectRoot string, emb
}

// Initialize scanner
scanner := indexer.NewScanner(projectRoot, ignoreMatcher)
scanner := indexer.NewScanner(projectRoot, ignoreMatcher).
WithCustomExtensions(cfg.Chunking.CustomExtensions)

// Initialize chunker
chunker := indexer.NewChunker(cfg.Chunking.Size, cfg.Chunking.Overlap)
Expand Down Expand Up @@ -2752,7 +2753,9 @@ func initializeWorkspaceRuntime(ctx context.Context, ws *config.Workspace, proje
return nil, nil, fmt.Errorf("failed to initialize ignore matcher: %w", err)
}

scanner := indexer.NewScanner(project.Path, ignoreMatcher)
scanner := indexer.NewScanner(project.Path, ignoreMatcher).
WithCustomExtensions(ws.Chunking.CustomExtensions).
WithCustomExtensions(projectCfg.Chunking.CustomExtensions)
chunker := indexer.NewChunker(projectCfg.Chunking.Size, projectCfg.Chunking.Overlap)
processorRegistry := buildFrameworkRegistry(projectCfg)
vectorStore := &projectPrefixStore{
Expand Down
6 changes: 6 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ type QdrantConfig struct {
type ChunkingConfig struct {
	// Size and Overlap are the two values cli/watch.go hands to
	// indexer.NewChunker, in that order. Their unit is not visible from
	// this struct — confirm against the chunker before documenting it.
	Size int `yaml:"size"`
	Overlap int `yaml:"overlap"`
	// CustomExtensions extends the built-in SupportedExtensions list with
	// additional file extensions to index (e.g. [".tengo", ".el"]).
	//
	// Each entry must include the leading dot. The scanner trims
	// surrounding whitespace and lowercases every entry, and it compares
	// against the lowercased file extension, so matching is
	// case-insensitive. Entries that are empty or lack the leading dot are
	// silently dropped rather than reported as errors.
	//
	// Binary detection (UTF-8 + null-byte check) still applies, so adding
	// a binary extension here is safe.
	CustomExtensions []string `yaml:"custom_extensions,omitempty"`
}

func DefaultStoreForBackend(backend string) StoreConfig {
Expand Down
4 changes: 4 additions & 0 deletions config/workspace.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ type Workspace struct {
Name string `yaml:"name"`
Store StoreConfig `yaml:"store"`
Embedder EmbedderConfig `yaml:"embedder"`
// Chunking provides workspace-level defaults. Currently only
// CustomExtensions is consumed; Size/Overlap are taken from the
// per-project config.
Chunking ChunkingConfig `yaml:"chunking,omitempty"`
Projects []ProjectEntry `yaml:"projects"`
}

Expand Down
7 changes: 7 additions & 0 deletions docs/src/content/docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ chunking:
size: 512
# Overlap between chunks (for context continuity)
overlap: 50
# Optional: index extra extensions beyond the built-in supported set.
# Entries must include the leading dot, are matched case-insensitively,
# and still go through the binary-file and minified-file filters.
# Examples: ".tengo" (Tengo scripts), ".el" (Emacs Lisp), ".prisma".
# custom_extensions:
# - .tengo
# - .prisma

# File watching configuration
watch:
Expand Down
1 change: 1 addition & 0 deletions docs/src/content/docs/workspace.md
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ When indexing via workspace, settings come from different sources:
| `store` (backend, DSN) | **Workspace** | Shared store required |
| `embedder` (provider, model) | **Workspace** | Ensures compatible vectors |
| `chunking` (size, overlap) | **Project** | Can vary per project/language |
| `chunking.custom_extensions` | **Workspace** + **Project** | Both lists are merged when scanning a project |
| `ignore` patterns | **Project** | Project-specific exclusions |
| `external_gitignore` | **Project** | Project-specific gitignore |

Expand Down
34 changes: 30 additions & 4 deletions indexer/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,9 @@ type FileMeta struct {
}

// Scanner carries the state used by the Scan and ScanMetadata methods.
type Scanner struct {
	root string
	ignore *IgnoreMatcher
	root string
	ignore *IgnoreMatcher
	// extraExts is the set of custom extensions (lowercased, leading dot
	// included) accepted in addition to SupportedExtensions. It stays nil
	// until WithCustomExtensions stores a valid entry; isSupported reads
	// it directly, which is safe on a nil map.
	extraExts map[string]bool
}

func NewScanner(root string, ignore *IgnoreMatcher) *Scanner {
Expand All @@ -123,6 +124,31 @@ func NewScanner(root string, ignore *IgnoreMatcher) *Scanner {
}
}

// WithCustomExtensions adds exts to the set of extensions the scanner indexes
// on top of SupportedExtensions and returns the same scanner so calls can be
// chained. Repeated calls accumulate; nothing is ever removed.
//
// Each entry is trimmed of surrounding whitespace and lowercased, mirroring
// the lowercased filepath.Ext value the scan loops look up. An entry is
// silently dropped when it can never name a real extension:
//   - it is empty or does not start with a dot (e.g. "", "tengo");
//   - it is a bare "." with no name after the dot;
//   - it contains a second dot (e.g. "..", ".tar.gz"), because filepath.Ext
//     only returns the suffix that starts at the final dot.
//
// The receiver is modified in place. The backing map is allocated lazily, so
// a scanner that never receives a valid entry keeps a nil extraExts.
func (s *Scanner) WithCustomExtensions(exts []string) *Scanner {
	for _, raw := range exts {
		ext := strings.ToLower(strings.TrimSpace(raw))
		// Need the dot plus at least one character after it.
		if len(ext) < 2 || ext[0] != '.' {
			continue
		}
		// A second dot can never appear in a filepath.Ext result.
		if strings.Contains(ext[1:], ".") {
			continue
		}
		if s.extraExts == nil {
			s.extraExts = make(map[string]bool)
		}
		s.extraExts[ext] = true
	}
	return s
}

// isSupported reports whether ext may be indexed by this scanner: it is either
// one of the built-in SupportedExtensions or a custom extension stored in
// extraExts. Callers pass the lowercased filepath.Ext of the file. A nil
// extraExts needs no guard because reading a nil map yields false.
func (s *Scanner) isSupported(ext string) bool {
	builtin, custom := SupportedExtensions[ext], s.extraExts[ext]
	return builtin || custom
}

// ScanMetadata scans indexable files and returns only file metadata.
// It avoids reading file contents and hash computation for a faster first pass.
func (s *Scanner) ScanMetadata() ([]FileMeta, []string, error) {
Expand Down Expand Up @@ -154,7 +180,7 @@ func (s *Scanner) ScanMetadata() ([]FileMeta, []string, error) {

// Check extension
ext := strings.ToLower(filepath.Ext(path))
if !SupportedExtensions[ext] {
if !s.isSupported(ext) {
return nil
}

Expand Down Expand Up @@ -216,7 +242,7 @@ func (s *Scanner) Scan() ([]FileInfo, []string, error) {

// Check extension
ext := strings.ToLower(filepath.Ext(path))
if !SupportedExtensions[ext] {
if !s.isSupported(ext) {
return nil
}

Expand Down
138 changes: 138 additions & 0 deletions indexer/scanner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,141 @@ func TestScanner_ScanFile_SkipsMinified(t *testing.T) {
t.Error("expected nil for minified file, got file info")
}
}

// TestScanner_CustomExtensions checks that Scan picks up files whose extension
// was registered through WithCustomExtensions, keeps indexing built-in
// extensions, and still skips extensions nobody registered.
func TestScanner_CustomExtensions(t *testing.T) {
	root := t.TempDir()

	// workflow.tengo: an extension the baseline scan below must not index.
	tengoSrc := []byte(`fmt := import("fmt")` + "\n" + `fmt.println("hello")` + "\n")
	err := os.WriteFile(filepath.Join(root, "workflow.tengo"), tengoSrc, 0644)
	if err != nil {
		t.Fatalf("failed to create tengo file: %v", err)
	}

	// data.xyz: never registered, so it must stay out of every scan.
	err = os.WriteFile(filepath.Join(root, "data.xyz"), []byte("ignored"), 0644)
	if err != nil {
		t.Fatalf("failed to create xyz file: %v", err)
	}

	// main.go: built-in extension, present in both scans.
	err = os.WriteFile(filepath.Join(root, "main.go"), []byte("package main"), 0644)
	if err != nil {
		t.Fatalf("failed to create go file: %v", err)
	}

	matcher, err := NewIgnoreMatcher(root, []string{}, "")
	if err != nil {
		t.Fatalf("failed to create ignore matcher: %v", err)
	}

	// Baseline: a plain scanner sees main.go and nothing else.
	baseline, _, err := NewScanner(root, matcher).Scan()
	if err != nil {
		t.Fatalf("scan failed: %v", err)
	}
	onlyMainGo := len(baseline) == 1 && filepath.Base(baseline[0].Path) == "main.go"
	if !onlyMainGo {
		t.Errorf("baseline scan expected only main.go, got %+v", baseline)
	}

	// After registering ".tengo" the scan must return both source files.
	withTengo, _, err := NewScanner(root, matcher).
		WithCustomExtensions([]string{".tengo"}).
		Scan()
	if err != nil {
		t.Fatalf("scan with custom ext failed: %v", err)
	}
	seen := make(map[string]bool)
	for i := range withTengo {
		seen[filepath.Base(withTengo[i].Path)] = true
	}
	if !(seen["main.go"] && seen["workflow.tengo"]) {
		t.Errorf("expected main.go + workflow.tengo, got %v", seen)
	}
	if seen["data.xyz"] {
		t.Errorf("data.xyz should not have been indexed")
	}
}

// TestScanner_WithCustomExtensions_Normalizes checks that a custom extension
// given with stray whitespace and upper-case letters (" .TENGO ") matches
// files regardless of the case used in the file name.
func TestScanner_WithCustomExtensions_Normalizes(t *testing.T) {
	tmpDir := t.TempDir()

	// One upper-case and one lower-case variant of the extension. The base
	// names differ so that case-insensitive file systems keep both files.
	fixtures := []struct{ name, body string }{
		{"module.TENGO", "upper"},
		{"script.tengo", "lower"},
	}
	for _, f := range fixtures {
		if err := os.WriteFile(filepath.Join(tmpDir, f.name), []byte(f.body), 0644); err != nil {
			t.Fatalf("failed to create %s: %v", f.name, err)
		}
	}

	ignoreMatcher, err := NewIgnoreMatcher(tmpDir, []string{}, "")
	if err != nil {
		t.Fatalf("failed to create ignore matcher: %v", err)
	}

	scanner := NewScanner(tmpDir, ignoreMatcher).
		WithCustomExtensions([]string{" .TENGO "})
	files, _, err := scanner.Scan()
	if err != nil {
		t.Fatalf("scan failed: %v", err)
	}
	// Both fixtures are the only files in tmpDir, so a count of two means
	// the upper-case and the lower-case file were each matched.
	if len(files) != len(fixtures) {
		t.Fatalf("expected %d files, got %d: %+v", len(fixtures), len(files), files)
	}
}

// TestScanner_WithCustomExtensions_RejectsInvalidEntries checks that malformed
// custom-extension entries never cause a file to be indexed.
func TestScanner_WithCustomExtensions_RejectsInvalidEntries(t *testing.T) {
	root := t.TempDir()

	// bad1 has no extension at all; bad2.foo has an extension that is only
	// ever offered below without its leading dot.
	for _, name := range []string{"bad1", "bad2.foo"} {
		if err := os.WriteFile(filepath.Join(root, name), []byte("x"), 0644); err != nil {
			t.Fatal(err)
		}
	}

	matcher, err := NewIgnoreMatcher(root, []string{}, "")
	if err != nil {
		t.Fatal(err)
	}

	// Entries under test:
	//   ""    empty
	//   "foo" missing the leading dot; must not be promoted to ".foo"
	//   " "   whitespace only, empty once trimmed
	//   ".."  starts with a dot but can never equal a filepath.Ext result
	invalid := []string{"", "foo", " ", ".."}
	indexed, _, err := NewScanner(root, matcher).WithCustomExtensions(invalid).Scan()
	if err != nil {
		t.Fatal(err)
	}

	// filepath.Ext("bad2.foo") is ".foo", which was never registered, and
	// bad1 has no extension, so nothing may be returned.
	if n := len(indexed); n != 0 {
		t.Errorf("expected no files indexed from invalid extensions, got %+v", indexed)
	}
}

// TestScanner_CustomExtensions_BinaryStillSkipped checks that registering a
// custom extension does not bypass Scan's binary filtering: a ".tengo" file
// made of non-text bytes must not be returned.
func TestScanner_CustomExtensions_BinaryStillSkipped(t *testing.T) {
	root := t.TempDir()

	// Registered extension, but the payload starts with a null byte.
	blob := []byte{0x00, 0x01, 0x02, 0x03}
	if err := os.WriteFile(filepath.Join(root, "blob.tengo"), blob, 0644); err != nil {
		t.Fatal(err)
	}

	matcher, err := NewIgnoreMatcher(root, []string{}, "")
	if err != nil {
		t.Fatal(err)
	}

	// Scan (unlike ScanMetadata) reads file contents, so the blob has to be
	// rejected here.
	indexed, _, err := NewScanner(root, matcher).
		WithCustomExtensions([]string{".tengo"}).
		Scan()
	if err != nil {
		t.Fatal(err)
	}
	if n := len(indexed); n != 0 {
		t.Errorf("expected binary .tengo to be skipped, got %+v", indexed)
	}
}