Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 128 additions & 64 deletions packages/leann-core/src/leann/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,14 @@ def create_parser(self) -> argparse.ArgumentParser:
)

# List command
subparsers.add_parser("list", help="List all indexes")
list_parser = subparsers.add_parser("list", help="List all indexes")
list_parser.add_argument(
"--max-depth",
type=int,
default=3,
help="Maximum directory depth to scan for indexes (default: 3). "
"Increase if your indexes are in deeply nested directories.",
)

# Remove command
remove_parser = subparsers.add_parser("remove", help="Remove an index")
Expand Down Expand Up @@ -404,6 +411,68 @@ def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool:
absolute_path = Path(str(file_path))
return gitignore_matches(absolute_path.as_posix())

def _find_meta_files_limited(
self, root: Path, max_depth: int = 3, pattern: str = "*.leann.meta.json"
):
"""Find meta files with limited depth to avoid scanning large directories.

Skips common large directories that shouldn't contain LEANN indexes.
"""
# Directories to skip - these are typically large and won't contain user indexes
skip_dirs = {
".git",
".svn",
".hg",
"node_modules",
"__pycache__",
".venv",
"venv",
".env",
"env",
".tox",
".nox",
".mypy_cache",
".pytest_cache",
".ruff_cache",
"dist",
"build",
".eggs",
"*.egg-info",
".cache",
".npm",
".yarn",
"vendor",
"Pods",
".gradle",
"target",
}

def should_skip(dir_name: str) -> bool:
"""Check if directory should be skipped."""
if dir_name in skip_dirs:
return True
# Skip hidden directories (except .leann which we want)
if dir_name.startswith(".") and dir_name != ".leann":
return True
return False

def search_dir(path: Path, current_depth: int):
"""Recursively search with depth limit."""
if current_depth > max_depth:
return

try:
for item in path.iterdir():
if item.is_file() and item.match(pattern):
yield item
elif item.is_dir() and not should_skip(item.name):
yield from search_dir(item, current_depth + 1)
except (PermissionError, OSError):
# Skip directories we can't read
pass

yield from search_dir(root, 0)

def _is_git_submodule(self, path: Path) -> bool:
"""Check if a path is a git submodule."""
try:
Expand All @@ -430,7 +499,13 @@ def _is_git_submodule(self, path: Path) -> bool:
# If anything goes wrong, assume it's not a submodule
return False

def list_indexes(self):
def list_indexes(self, max_depth: int = 3):
"""List all LEANN indexes across registered projects.

Args:
max_depth: Maximum directory depth to scan for app-format indexes.
Default is 3. Increase if indexes are in deeply nested directories.
"""
# Get all project directories with .leann
global_registry = Path.home() / ".leann" / "projects.json"
all_projects = []
Expand Down Expand Up @@ -475,7 +550,7 @@ def list_indexes(self):
print(" " + "─" * 45)

current_indexes = self._discover_indexes_in_project(
current_path, exclude_dirs=other_projects
current_path, exclude_dirs=other_projects, max_depth=max_depth
)
if current_indexes:
for idx in current_indexes:
Expand All @@ -494,7 +569,7 @@ def list_indexes(self):
print(" " + "─" * 45)

for project_path in other_projects:
project_indexes = self._discover_indexes_in_project(project_path)
project_indexes = self._discover_indexes_in_project(project_path, max_depth=max_depth)
if not project_indexes:
continue

Expand All @@ -518,9 +593,9 @@ def list_indexes(self):
projects_count = 0
for p in valid_projects:
if p == current_path:
discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects)
discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects, max_depth=max_depth)
else:
discovered = self._discover_indexes_in_project(p)
discovered = self._discover_indexes_in_project(p, max_depth=max_depth)
if len(discovered) > 0:
projects_count += 1
print(f"📊 Total: {total_indexes} indexes across {projects_count} projects")
Expand All @@ -540,13 +615,17 @@ def list_indexes(self):
print(" leann build my-docs --docs ./documents")

def _discover_indexes_in_project(
self, project_path: Path, exclude_dirs: Optional[list[Path]] = None
self, project_path: Path, exclude_dirs: Optional[list[Path]] = None, max_depth: int = 3
):
"""Discover all indexes in a project directory (both CLI and apps formats)

exclude_dirs: when provided, skip any APP-format index files that are
located under these directories. This prevents duplicates when the
current project is a parent directory of other registered projects.
Args:
project_path: The project directory to search.
exclude_dirs: When provided, skip any APP-format index files that are
located under these directories. This prevents duplicates when the
current project is a parent directory of other registered projects.
max_depth: Maximum directory depth to scan for app-format indexes.
Default is 3. Increase if indexes are in deeply nested directories.
"""
indexes = []
exclude_dirs = exclude_dirs or []
Expand Down Expand Up @@ -583,9 +662,10 @@ def _discover_indexes_in_project(
}
)

# 2. Apps format: *.leann.meta.json files anywhere in the project
# 2. Apps format: *.leann.meta.json files in the project
# Use limited-depth search to avoid scanning entire large directories
cli_indexes_dir = project_path / ".leann" / "indexes"
for meta_file in project_path.rglob("*.leann.meta.json"):
for meta_file in self._find_meta_files_limited(project_path, max_depth=max_depth):
if meta_file.is_file():
# Skip CLI-built indexes (which store meta under .leann/indexes/<name>/)
try:
Expand Down Expand Up @@ -696,59 +776,43 @@ def _find_all_matching_indexes(self, index_name: str):
# b) by the parent directory name (e.g., `new_txt`)
seen_app_meta = set()

# 2a) by file base
for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"):
if meta_file.is_file():
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": meta_file.parent.name,
"file_base": meta_file.name.replace(".leann.meta.json", ""),
}
)
# Use limited-depth search to avoid scanning large directories
for meta_file in self._find_meta_files_limited(project_path, max_depth=3):
if not meta_file.is_file():
continue

# 2b) by parent directory name
for meta_file in project_path.rglob("*.leann.meta.json"):
if meta_file.is_file() and meta_file.parent.name == index_name:
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": meta_file.parent.name,
"file_base": meta_file.name.replace(".leann.meta.json", ""),
}
)
except Exception:
pass

file_base = meta_file.name.replace(".leann.meta.json", "")
parent_name = meta_file.parent.name

# Check if this matches the requested index_name
# Match by file base or by parent directory name
if file_base != index_name and parent_name != index_name:
continue

is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": parent_name,
"file_base": file_base,
}
)

# Sort: current project first, then by project name
matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name))
Expand Down Expand Up @@ -1679,7 +1743,7 @@ async def run(self, args=None):
return

if args.command == "list":
self.list_indexes()
self.list_indexes(max_depth=args.max_depth)
elif args.command == "remove":
self.remove_index(args.index_name, args.force)
elif args.command == "build":
Expand Down
60 changes: 59 additions & 1 deletion packages/leann-core/src/leann/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,63 @@ def autodiscover_backends():
# print("INFO: Backend auto-discovery finished.")


def _has_app_indexes_limited(root: Path, max_depth: int = 3) -> bool:
"""Check if directory contains app-format indexes with limited depth search.

Skips common large directories that shouldn't contain LEANN indexes.
"""
skip_dirs = {
".git",
".svn",
".hg",
"node_modules",
"__pycache__",
".venv",
"venv",
".env",
"env",
".tox",
".nox",
".mypy_cache",
".pytest_cache",
".ruff_cache",
"dist",
"build",
".eggs",
".cache",
".npm",
".yarn",
"vendor",
"Pods",
".gradle",
"target",
}

def should_skip(dir_name: str) -> bool:
if dir_name in skip_dirs:
return True
if dir_name.startswith(".") and dir_name != ".leann":
return True
return False

def search_dir(path: Path, current_depth: int) -> bool:
if current_depth > max_depth:
return False

try:
for item in path.iterdir():
if item.is_file() and item.name.endswith(".leann.meta.json"):
return True
elif item.is_dir() and not should_skip(item.name):
if search_dir(item, current_depth + 1):
return True
except (PermissionError, OSError):
pass
return False

return search_dir(root, 0)


def register_project_directory(project_dir: Optional[Union[str, Path]] = None):
"""
Register a project directory in the global LEANN registry.
Expand All @@ -65,8 +122,9 @@ def register_project_directory(project_dir: Optional[Union[str, Path]] = None):

# Only register directories that have some kind of LEANN content
# Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format)
# Use limited-depth search to avoid scanning large directories like $HOME
has_cli_indexes = (project_dir / ".leann" / "indexes").exists()
has_app_indexes = any(project_dir.rglob("*.leann.meta.json"))
has_app_indexes = _has_app_indexes_limited(project_dir, max_depth=3)

if not (has_cli_indexes or has_app_indexes):
# Don't register if there are no LEANN indexes
Expand Down
Loading
Loading