From 740c3d38f4619b85b3a9f46292ec36ca86ed22f5 Mon Sep 17 00:00:00 2001 From: Cheelax Date: Sat, 5 Jul 2025 09:27:44 +0200 Subject: [PATCH 1/3] Enhance configuration options in README and CLI - Added examples for configuring processing limits in README.md. - Introduced new CLI options for max files, max total size, and max directory depth. - Updated environment variable support in config.py for various limits. - Modified ingestion functions to accept new parameters for file processing limits. - Enhanced limit checks in ingestion logic to utilize new configuration options. --- README.md | 55 ++++++++++++++++++++++++++++++ src/gitingest/cli.py | 29 +++++++++++++++- src/gitingest/config.py | 34 ++++++++++++++---- src/gitingest/entrypoint.py | 18 ++++++++++ src/gitingest/ingestion.py | 30 +++++++++------- src/gitingest/query_parser.py | 12 +++++++ src/gitingest/schemas/ingestion.py | 11 +++++- src/server/server_config.py | 27 ++++++++++++--- 8 files changed, 190 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index aab92204..cc246f22 100644 --- a/README.md +++ b/README.md @@ -135,12 +135,58 @@ By default, the digest is written to a text file (`digest.txt`) in your current - Use `--output/-o ` to write to a specific file. - Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools). +Configure processing limits: + +```bash +# Set higher limits for large repositories +gitingest https://github.com/torvalds/linux \ + --max-files 100000 \ + --max-total-size 2147483648 \ + --max-directory-depth 25 + +# Process only Python files up to 1MB each +gitingest /path/to/project \ + --include-pattern "*.py" \ + --max-size 1048576 \ + --max-files 1000 +``` + See more options and usage details with: ```bash gitingest --help ``` +### 🔧 Configuration via Environment Variables + +You can configure various limits and settings using environment variables. 
All configuration environment variables start with the `GITINGEST_` prefix: + +**File Processing Configuration:** +- `GITINGEST_MAX_FILE_SIZE` - Maximum size of a single file to process (default: 10485760 bytes, 10MB) +- `GITINGEST_MAX_FILES` - Maximum number of files to process (default: 10000) +- `GITINGEST_MAX_TOTAL_SIZE_BYTES` - Maximum size of output file (default: 524288000 bytes, 500MB) +- `GITINGEST_MAX_DIRECTORY_DEPTH` - Maximum depth of directory traversal (default: 20) +- `GITINGEST_DEFAULT_TIMEOUT` - Default operation timeout in seconds (default: 60) +- `GITINGEST_OUTPUT_FILE_NAME` - Default output filename (default: "digest.txt") +- `GITINGEST_TMP_BASE_PATH` - Base path for temporary files (default: system temp directory) + +**Server Configuration (for self-hosting):** +- `GITINGEST_MAX_DISPLAY_SIZE` - Maximum size of content to display in UI (default: 300000 bytes) +- `GITINGEST_DELETE_REPO_AFTER` - Repository cleanup timeout in seconds (default: 3600, 1 hour) +- `GITINGEST_MAX_FILE_SIZE_KB` - Maximum file size for UI slider in KB (default: 102400, 100MB) +- `GITINGEST_MAX_SLIDER_POSITION` - Maximum slider position in UI (default: 500) + +**Example usage:** + +```bash +# Configure for large scientific repositories +export GITINGEST_MAX_FILES=50000 +export GITINGEST_MAX_FILE_SIZE=20971520 # 20MB +export GITINGEST_MAX_TOTAL_SIZE_BYTES=1073741824 # 1GB + +gitingest https://github.com/some/large-repo +``` + ## 🐍 Python package usage ```python @@ -169,6 +215,15 @@ summary, tree, content = ingest("https://github.com/username/private-repo") # Include repository submodules summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True) + +# Configure limits programmatically +summary, tree, content = ingest( + "https://github.com/username/large-repo", + max_file_size=20 * 1024 * 1024, # 20MB per file + max_files=50000, # 50k files max + max_total_size_bytes=1024**3, # 1GB total + max_directory_depth=30 # 30 levels 
deep +) ``` By default, this won't write a file but can be enabled with the `output` argument. diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index e14ed681..23b4a2c7 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -9,13 +9,16 @@ import click from typing_extensions import Unpack -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME +from gitingest.config import MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MAX_DIRECTORY_DEPTH, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async class _CLIArgs(TypedDict): source: str max_size: int + max_files: int + max_total_size: int + max_directory_depth: int exclude_pattern: tuple[str, ...] include_pattern: tuple[str, ...] branch: str | None @@ -34,6 +37,24 @@ class _CLIArgs(TypedDict): show_default=True, help="Maximum file size to process in bytes", ) +@click.option( + "--max-files", + default=MAX_FILES, + show_default=True, + help="Maximum number of files to process", +) +@click.option( + "--max-total-size", + default=MAX_TOTAL_SIZE_BYTES, + show_default=True, + help="Maximum total size of all files in bytes", +) +@click.option( + "--max-directory-depth", + default=MAX_DIRECTORY_DEPTH, + show_default=True, + help="Maximum depth of directory traversal", +) @click.option("--exclude-pattern", "-e", multiple=True, help="Shell-style patterns to exclude.") @click.option( "--include-pattern", @@ -112,6 +133,9 @@ async def _async_main( source: str, *, max_size: int = MAX_FILE_SIZE, + max_files: int = MAX_FILES, + max_total_size: int = MAX_TOTAL_SIZE_BYTES, + max_directory_depth: int = MAX_DIRECTORY_DEPTH, exclude_pattern: tuple[str, ...] | None = None, include_pattern: tuple[str, ...] 
| None = None, branch: str | None = None, @@ -170,6 +194,9 @@ async def _async_main( summary, _, _ = await ingest_async( source, max_file_size=max_size, + max_files=max_files, + max_total_size_bytes=max_total_size, + max_directory_depth=max_directory_depth, include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 3d154684..94b4f921 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -1,14 +1,34 @@ """Configuration file for the project.""" +import os import tempfile from pathlib import Path -MAX_FILE_SIZE = 10 * 1024 * 1024 # Maximum size of a single file to process (10 MB) -MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10_000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB) -DEFAULT_TIMEOUT = 60 # seconds +# Helper function to get environment variables with type conversion +def _get_env_var(key: str, default, cast_func=None): + """Get environment variable with GITINGEST_ prefix and optional type casting.""" + env_key = f"GITINGEST_{key}" + value = os.environ.get(env_key) + + if value is None: + return default + + if cast_func: + try: + return cast_func(value) + except (ValueError, TypeError): + print(f"Warning: Invalid value for {env_key}: {value}. 
Using default: {default}") + return default + + return value -OUTPUT_FILE_NAME = "digest.txt" +# Configuration with environment variable support +MAX_FILE_SIZE = _get_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024, int) # Maximum size of a single file to process (10 MB) +MAX_DIRECTORY_DEPTH = _get_env_var("MAX_DIRECTORY_DEPTH", 20, int) # Maximum depth of directory traversal +MAX_FILES = _get_env_var("MAX_FILES", 10_000, int) # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = _get_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024, int) # Maximum size of output file (500 MB) +DEFAULT_TIMEOUT = _get_env_var("DEFAULT_TIMEOUT", 60, int) # seconds -TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" +OUTPUT_FILE_NAME = _get_env_var("OUTPUT_FILE_NAME", "digest.txt") + +TMP_BASE_PATH = Path(_get_env_var("TMP_BASE_PATH", tempfile.gettempdir())) / "gitingest" diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index f64dec08..cee1c05e 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -22,6 +22,9 @@ async def ingest_async( source: str, *, max_file_size: int = MAX_FILE_SIZE, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, include_patterns: str | set[str] | None = None, exclude_patterns: str | set[str] | None = None, branch: str | None = None, @@ -77,6 +80,9 @@ async def ingest_async( query: IngestionQuery = await parse_query( source=source, max_file_size=max_file_size, + max_files=max_files, + max_total_size_bytes=max_total_size_bytes, + max_directory_depth=max_directory_depth, from_web=False, include_patterns=include_patterns, ignore_patterns=exclude_patterns, @@ -101,6 +107,9 @@ def ingest( source: str, *, max_file_size: int = MAX_FILE_SIZE, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, include_patterns: str | set[str] | None = None, exclude_patterns: str | set[str] | None = 
None, branch: str | None = None, @@ -122,6 +131,12 @@ def ingest( The source to analyze, which can be a URL (for a Git repository) or a local directory path. max_file_size : int Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). + max_files : int | None + Maximum number of files to process. If ``None``, uses the default from config (default: 10,000). + max_total_size_bytes : int | None + Maximum total size of all files to process in bytes. If ``None``, uses the default from config (default: 500 MB). + max_directory_depth : int | None + Maximum depth of directory traversal. If ``None``, uses the default from config (default: 20). include_patterns : str | set[str] | None Pattern or set of patterns specifying which files to include. If ``None``, all files are included. exclude_patterns : str | set[str] | None @@ -159,6 +174,9 @@ def ingest( ingest_async( source=source, max_file_size=max_file_size, + max_files=max_files, + max_total_size_bytes=max_total_size_bytes, + max_directory_depth=max_directory_depth, include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 2990a875..abffa3b5 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -97,7 +97,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem Statistics tracking object for the total file count and size. 
""" - if limit_exceeded(stats, depth=node.depth): + if limit_exceeded(stats, depth=node.depth, query=query): return for sub_path in node.path.iterdir(): @@ -113,7 +113,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem if sub_path.stat().st_size > query.max_file_size: print(f"Skipping file {sub_path}: would exceed max file size limit") continue - _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) + _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path, query=query) elif sub_path.is_dir(): child_directory_node = FileSystemNode( name=sub_path.name, @@ -167,7 +167,7 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS parent_node.file_count += 1 -def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: +def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path, query: IngestionQuery) -> None: """Process a file in the file system. This function checks the file's size, increments the statistics, and reads its content. @@ -183,14 +183,16 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat Statistics tracking object for the total file count and size. local_path : Path The base path of the repository or directory being processed. + query : IngestionQuery + The query object containing the limit configurations. 
""" - if stats.total_files + 1 > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") + if stats.total_files + 1 > query.max_files: + print(f"Maximum file limit ({query.max_files}) reached") return file_size = path.stat().st_size - if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES: + if stats.total_size + file_size > query.max_total_size_bytes: print(f"Skipping file {path}: would exceed total size limit") return @@ -212,7 +214,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat parent_node.file_count += 1 -def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: +def limit_exceeded(stats: FileSystemStats, depth: int, query: IngestionQuery) -> bool: """Check if any of the traversal limits have been exceeded. This function checks if the current traversal has exceeded any of the configured limits: @@ -224,6 +226,8 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: Statistics tracking object for the total file count and size. depth : int The current depth of directory traversal. + query : IngestionQuery + The query object containing the limit configurations. Returns ------- @@ -231,16 +235,16 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: ``True`` if any limit has been exceeded, ``False`` otherwise. 
""" - if depth > MAX_DIRECTORY_DEPTH: - print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached") + if depth > query.max_directory_depth: + print(f"Maximum depth limit ({query.max_directory_depth}) reached") return True - if stats.total_files >= MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") + if stats.total_files >= query.max_files: + print(f"Maximum file limit ({query.max_files}) reached") return True # TODO: end recursion - if stats.total_size >= MAX_TOTAL_SIZE_BYTES: - print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached") + if stats.total_size >= query.max_total_size_bytes: + print(f"Maxumum total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB) reached") return True # TODO: end recursion return False diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index db9cb3cb..9b314e24 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -28,6 +28,9 @@ async def parse_query( source: str, *, max_file_size: int, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, from_web: bool, include_patterns: set[str] | str | None = None, ignore_patterns: set[str] | str | None = None, @@ -41,6 +44,12 @@ async def parse_query( The source URL or file path to parse. max_file_size : int The maximum file size in bytes to include. + max_files : int | None + The maximum number of files to process. If None, uses default from config. + max_total_size_bytes : int | None + The maximum total size of all files in bytes. If None, uses default from config. + max_directory_depth : int | None + The maximum depth of directory traversal. If None, uses default from config. from_web : bool Flag indicating whether the source is a web URL. 
include_patterns : set[str] | str | None @@ -89,6 +98,9 @@ async def parse_query( branch=query.branch, commit=query.commit, max_file_size=max_file_size, + max_files=max_files if max_files is not None else query.max_files, + max_total_size_bytes=max_total_size_bytes if max_total_size_bytes is not None else query.max_total_size_bytes, + max_directory_depth=max_directory_depth if max_directory_depth is not None else query.max_directory_depth, ignore_patterns=ignore_patterns_set, include_patterns=parsed_include, ) diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index c40e11d6..e9a69902 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, Field -from gitingest.config import MAX_FILE_SIZE +from gitingest.config import MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MAX_DIRECTORY_DEPTH @dataclass @@ -77,6 +77,12 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The tag of the repository. max_file_size : int The maximum file size to ingest (default: 10 MB). + max_files : int + The maximum number of files to process (default: 10,000). + max_total_size_bytes : int + The maximum total size of all files in bytes (default: 500 MB). + max_directory_depth : int + The maximum depth of directory traversal (default: 20). ignore_patterns : set[str] The patterns to ignore (default: ``set()``). 
include_patterns : set[str] | None @@ -98,6 +104,9 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes commit: str | None = None tag: str | None = None max_file_size: int = Field(default=MAX_FILE_SIZE) + max_files: int = Field(default=MAX_FILES) + max_total_size_bytes: int = Field(default=MAX_TOTAL_SIZE_BYTES) + max_directory_depth: int = Field(default=MAX_DIRECTORY_DEPTH) ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type include_patterns: set[str] | None = None include_submodules: bool = False diff --git a/src/server/server_config.py b/src/server/server_config.py index ffc9c7bc..4098c2a0 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -2,14 +2,33 @@ from __future__ import annotations +import os from fastapi.templating import Jinja2Templates -MAX_DISPLAY_SIZE: int = 300_000 -DELETE_REPO_AFTER: int = 60 * 60 # In seconds (1 hour) +# Helper function to get environment variables with type conversion +def _get_env_var(key: str, default, cast_func=None): + """Get environment variable with GITINGEST_ prefix and optional type casting.""" + env_key = f"GITINGEST_{key}" + value = os.environ.get(env_key) + + if value is None: + return default + + if cast_func: + try: + return cast_func(value) + except (ValueError, TypeError): + print(f"Warning: Invalid value for {env_key}: {value}. 
Using default: {default}") + return default + + return value + +MAX_DISPLAY_SIZE: int = _get_env_var("MAX_DISPLAY_SIZE", 300_000, int) +DELETE_REPO_AFTER: int = _get_env_var("DELETE_REPO_AFTER", 60 * 60, int) # In seconds (1 hour) # Slider configuration (if updated, update the logSliderToSize function in src/static/js/utils.js) -MAX_FILE_SIZE_KB: int = 100 * 1024 # 100 MB -MAX_SLIDER_POSITION: int = 500 # Maximum slider position +MAX_FILE_SIZE_KB: int = _get_env_var("MAX_FILE_SIZE_KB", 100 * 1024, int) # 100 MB +MAX_SLIDER_POSITION: int = _get_env_var("MAX_SLIDER_POSITION", 500, int) # Maximum slider position EXAMPLE_REPOS: list[dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, From 3bc4ed7049798fbe6b2f97accfc6c0dfe28a8392 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 5 Jul 2025 13:55:09 +0200 Subject: [PATCH 2/3] fix: deduplicate env-var helper, add `--tag` flag, and tidy docs/tests Refactor * Deduplicate `_get_env_var` by moving it to `utils/config_utils.py`. * Remove redundant `local_path` parameter from `_process_file`. Fix * Add missing `tag` parameter to `_async_main`, `main`, and `_CLIArgs`. * Introduce the missing `--tag` CLI flag. Docs and consistency * Update `README.md` for `markdownlint` compliance and other minor tweaks. * Add missing argument docs to `_async_main` docstring. * Re-order global variables in `config.py` for consistency. * Swap the order of `include_patterns` and `ignore_patterns` in `parse_query` and `ingest_async`. * Tidy docstrings for `_async_main`, `IngestionQuery`, `parse_query`, `ingest_async`, and `ingest`. Tests * Temporarily disable `[tool.ruff.lint.isort]` due to conflict with the `isort` pre-commit hook. * Add new arguments to `expected` in `test_parse_query_without_host`. * Run `pre-commit` hooks. 
--- README.md | 42 ++++++++-------- pyproject.toml | 6 +-- src/gitingest/cli.py | 24 ++++++--- src/gitingest/config.py | 32 +++--------- src/gitingest/entrypoint.py | 52 +++++++++++--------- src/gitingest/ingestion.py | 13 +++-- src/gitingest/query_parser.py | 18 +++---- src/gitingest/schemas/ingestion.py | 36 +++++++------- src/gitingest/utils/config_utils.py | 40 +++++++++++++++ src/server/query_processor.py | 2 +- src/server/server_config.py | 19 +------ tests/query_parser/test_git_host_agnostic.py | 3 ++ tests/query_parser/test_query_parser.py | 4 +- 13 files changed, 159 insertions(+), 132 deletions(-) create mode 100644 src/gitingest/utils/config_utils.py diff --git a/README.md b/README.md index cc246f22..a3263aa6 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ By default, the digest is written to a text file (`digest.txt`) in your current - Use `--output/-o ` to write to a specific file. - Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools). -Configure processing limits: +### 🔧 Configure processing limits ```bash # Set higher limits for large repositories @@ -157,32 +157,34 @@ See more options and usage details with: gitingest --help ``` -### 🔧 Configuration via Environment Variables +### Configuration via Environment Variables You can configure various limits and settings using environment variables. 
All configuration environment variables start with the `GITINGEST_` prefix: -**File Processing Configuration:** -- `GITINGEST_MAX_FILE_SIZE` - Maximum size of a single file to process (default: 10485760 bytes, 10MB) -- `GITINGEST_MAX_FILES` - Maximum number of files to process (default: 10000) -- `GITINGEST_MAX_TOTAL_SIZE_BYTES` - Maximum size of output file (default: 524288000 bytes, 500MB) -- `GITINGEST_MAX_DIRECTORY_DEPTH` - Maximum depth of directory traversal (default: 20) -- `GITINGEST_DEFAULT_TIMEOUT` - Default operation timeout in seconds (default: 60) -- `GITINGEST_OUTPUT_FILE_NAME` - Default output filename (default: "digest.txt") -- `GITINGEST_TMP_BASE_PATH` - Base path for temporary files (default: system temp directory) +#### File Processing Configuration -**Server Configuration (for self-hosting):** -- `GITINGEST_MAX_DISPLAY_SIZE` - Maximum size of content to display in UI (default: 300000 bytes) -- `GITINGEST_DELETE_REPO_AFTER` - Repository cleanup timeout in seconds (default: 3600, 1 hour) -- `GITINGEST_MAX_FILE_SIZE_KB` - Maximum file size for UI slider in KB (default: 102400, 100MB) -- `GITINGEST_MAX_SLIDER_POSITION` - Maximum slider position in UI (default: 500) +- `GITINGEST_MAX_FILE_SIZE` - Maximum size of a single file to process *(default: 10485760 bytes, 10 MB)* +- `GITINGEST_MAX_FILES` - Maximum number of files to process *(default: 10000)* +- `GITINGEST_MAX_TOTAL_SIZE_BYTES` - Maximum size of output file *(default: 524288000 bytes, 500 MB)* +- `GITINGEST_MAX_DIRECTORY_DEPTH` - Maximum depth of directory traversal *(default: 20)* +- `GITINGEST_DEFAULT_TIMEOUT` - Default operation timeout in seconds *(default: 60)* +- `GITINGEST_OUTPUT_FILE_NAME` - Default output filename *(default: "digest.txt")* +- `GITINGEST_TMP_BASE_PATH` - Base path for temporary files *(default: system temp directory)* -**Example usage:** +#### Server Configuration (for self-hosting) + +- `GITINGEST_MAX_DISPLAY_SIZE` - Maximum size of content to display in UI 
*(default: 300000 bytes)* +- `GITINGEST_DELETE_REPO_AFTER` - Repository cleanup timeout in seconds *(default: 3600, 1 hour)* +- `GITINGEST_MAX_FILE_SIZE_KB` - Maximum file size for UI slider in kB *(default: 102400, 100 MB)* +- `GITINGEST_MAX_SLIDER_POSITION` - Maximum slider position in UI *(default: 500)* + +#### Example usage ```bash # Configure for large scientific repositories export GITINGEST_MAX_FILES=50000 -export GITINGEST_MAX_FILE_SIZE=20971520 # 20MB -export GITINGEST_MAX_TOTAL_SIZE_BYTES=1073741824 # 1GB +export GITINGEST_MAX_FILE_SIZE=20971520 # 20 MB +export GITINGEST_MAX_TOTAL_SIZE_BYTES=1073741824 # 1 GB gitingest https://github.com/some/large-repo ``` @@ -219,9 +221,9 @@ summary, tree, content = ingest("https://github.com/username/repo-with-submodule # Configure limits programmatically summary, tree, content = ingest( "https://github.com/username/large-repo", - max_file_size=20 * 1024 * 1024, # 20MB per file + max_file_size=20 * 1024 * 1024, # 20 MB per file max_files=50000, # 50k files max - max_total_size_bytes=1024**3, # 1GB total + max_total_size_bytes=1024**3, # 1 GB total max_directory_depth=30 # 30 levels deep ) ``` diff --git a/pyproject.toml b/pyproject.toml index 641f9c30..bd8f3ea4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,9 +97,9 @@ per-file-ignores = { "tests/**/*.py" = ["S101"] } # Skip the "assert used" warni [tool.ruff.lint.pylint] max-returns = 10 -[tool.ruff.lint.isort] -order-by-type = true -case-sensitive = true +# [tool.ruff.lint.isort] +# order-by-type = true +# case-sensitive = true [tool.pycln] all = true diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 23b4a2c7..4a516feb 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -9,7 +9,7 @@ import click from typing_extensions import Unpack -from gitingest.config import MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MAX_DIRECTORY_DEPTH, OUTPUT_FILE_NAME +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILE_SIZE, MAX_FILES, 
MAX_TOTAL_SIZE_BYTES, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async @@ -22,6 +22,7 @@ class _CLIArgs(TypedDict): exclude_pattern: tuple[str, ...] include_pattern: tuple[str, ...] branch: str | None + tag: str | None include_gitignored: bool include_submodules: bool token: str | None @@ -63,6 +64,7 @@ class _CLIArgs(TypedDict): help="Shell-style patterns to include.", ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option("--tag", default=None, help="Tag to clone and ingest") @click.option( "--include-gitignored", is_flag=True, @@ -119,7 +121,7 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: $ gitingest --include-pattern "*.js" --exclude-pattern "node_modules/*" Private repositories: - $ gitingest https://github.com/user/private-repo -t ghp_token + $ gitingest https://github.com/user/private-repo --token ghp_token $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo Include submodules: @@ -139,6 +141,7 @@ async def _async_main( exclude_pattern: tuple[str, ...] | None = None, include_pattern: tuple[str, ...] | None = None, branch: str | None = None, + tag: str | None = None, include_gitignored: bool = False, include_submodules: bool = False, token: str | None = None, @@ -156,21 +159,29 @@ async def _async_main( A directory path or a Git repository URL. max_size : int Maximum file size in bytes to ingest (default: 10 MB). + max_files : int + Maximum number of files to ingest (default: 10,000). + max_total_size : int + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int + Maximum depth of directory traversal (default: 20). exclude_pattern : tuple[str, ...] | None Glob patterns for pruning the file set. include_pattern : tuple[str, ...] | None Glob patterns for including files in the output. branch : str | None - Git branch to ingest. If ``None``, the repository's default branch is used. + Git branch to clone and ingest (default: the default branch). 
+ tag : str | None + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool - If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). + If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - The path where the output file will be written (default: ``digest.txt`` in current directory). + The path where the output file is written (default: ``digest.txt`` in current directory). Use ``"-"`` to write to ``stdout``. Raises @@ -197,9 +208,10 @@ async def _async_main( max_files=max_files, max_total_size_bytes=max_total_size, max_directory_depth=max_directory_depth, - include_patterns=include_patterns, exclude_patterns=exclude_patterns, + include_patterns=include_patterns, branch=branch, + tag=tag, include_gitignored=include_gitignored, include_submodules=include_submodules, token=token, diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 94b4f921..298f597f 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -1,34 +1,16 @@ """Configuration file for the project.""" -import os import tempfile from pathlib import Path -# Helper function to get environment variables with type conversion -def _get_env_var(key: str, default, cast_func=None): - """Get environment variable with GITINGEST_ prefix and optional type casting.""" - env_key = f"GITINGEST_{key}" - value = os.environ.get(env_key) - - if value is None: - return default - - if cast_func: - try: - return cast_func(value) - except (ValueError, TypeError): - print(f"Warning: Invalid value for {env_key}: {value}. 
Using default: {default}") - return default - - return value +from gitingest.utils.config_utils import _get_env_var -# Configuration with environment variable support -MAX_FILE_SIZE = _get_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024, int) # Maximum size of a single file to process (10 MB) -MAX_DIRECTORY_DEPTH = _get_env_var("MAX_DIRECTORY_DEPTH", 20, int) # Maximum depth of directory traversal -MAX_FILES = _get_env_var("MAX_FILES", 10_000, int) # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = _get_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024, int) # Maximum size of output file (500 MB) -DEFAULT_TIMEOUT = _get_env_var("DEFAULT_TIMEOUT", 60, int) # seconds +MAX_FILE_SIZE = _get_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024, int) # Max file size to process in bytes (10 MB) +MAX_FILES = _get_env_var("MAX_FILES", 10_000, int) # Max number of files to process +MAX_TOTAL_SIZE_BYTES = _get_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024, int) # Max output file size (500 MB) +MAX_DIRECTORY_DEPTH = _get_env_var("MAX_DIRECTORY_DEPTH", 20, int) # Max depth of directory traversal -OUTPUT_FILE_NAME = _get_env_var("OUTPUT_FILE_NAME", "digest.txt") +DEFAULT_TIMEOUT = _get_env_var("DEFAULT_TIMEOUT", 60, int) # Default timeout for git operations in seconds +OUTPUT_FILE_NAME = _get_env_var("OUTPUT_FILE_NAME", "digest.txt") TMP_BASE_PATH = Path(_get_env_var("TMP_BASE_PATH", tempfile.gettempdir())) / "gitingest" diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index cee1c05e..2f77b387 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -25,8 +25,8 @@ async def ingest_async( max_files: int | None = None, max_total_size_bytes: int | None = None, max_directory_depth: int | None = None, - include_patterns: str | set[str] | None = None, exclude_patterns: str | set[str] | None = None, + include_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, @@ -43,17 
+43,23 @@ async def ingest_async( Parameters ---------- source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. + A directory path or a Git repository URL. max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). - include_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to include. If ``None``, all files are included. + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int | None + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int | None + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int | None + Maximum depth of directory traversal (default: 20). exclude_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. + Glob patterns for pruning the file set. + include_patterns : str | set[str] | None + Glob patterns for including files in the output. branch : str | None - The branch to clone and ingest (default: the default branch). + Git branch to clone and ingest (default: the default branch). tag : str | None - The tag to clone and ingest. If ``None``, no tag is used. + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool @@ -62,7 +68,7 @@ async def ingest_async( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - File path where the summary and content should be written. + File path where the summary and content are written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. 
@@ -84,8 +90,8 @@ async def ingest_async( max_total_size_bytes=max_total_size_bytes, max_directory_depth=max_directory_depth, from_web=False, - include_patterns=include_patterns, ignore_patterns=exclude_patterns, + include_patterns=include_patterns, token=token, ) @@ -110,8 +116,8 @@ def ingest( max_files: int | None = None, max_total_size_bytes: int | None = None, max_directory_depth: int | None = None, - include_patterns: str | set[str] | None = None, exclude_patterns: str | set[str] | None = None, + include_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, @@ -128,23 +134,23 @@ def ingest( Parameters ---------- source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. + A directory path or a Git repository URL. max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). + Maximum file size in bytes to ingest (default: 10 MB). max_files : int | None - Maximum number of files to process. If ``None``, uses the default from config (default: 10,000). + Maximum number of files to ingest (default: 10,000). max_total_size_bytes : int | None - Maximum total size of all files to process in bytes. If ``None``, uses the default from config (default: 500 MB). + Maximum total size of output file in bytes (default: 500 MB). max_directory_depth : int | None - Maximum depth of directory traversal. If ``None``, uses the default from config (default: 20). - include_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to include. If ``None``, all files are included. + Maximum depth of directory traversal (default: 20). exclude_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. + Glob patterns for pruning the file set. 
+ include_patterns : str | set[str] | None + Glob patterns for including files in the output. branch : str | None - The branch to clone and ingest (default: the default branch). + Git branch to clone and ingest (default: the default branch). tag : str | None - The tag to clone and ingest. If ``None``, no tag is used. + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool @@ -153,7 +159,7 @@ def ingest( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - File path where the summary and content should be written. + File path where the summary and content are written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. @@ -177,8 +183,8 @@ def ingest( max_files=max_files, max_total_size_bytes=max_total_size_bytes, max_directory_depth=max_directory_depth, - include_patterns=include_patterns, exclude_patterns=exclude_patterns, + include_patterns=include_patterns, branch=branch, tag=tag, include_gitignored=include_gitignored, diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index abffa3b5..50ff0ccf 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import TYPE_CHECKING -from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.output_formatter import format_node from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include @@ -113,7 +112,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem if sub_path.stat().st_size > query.max_file_size: print(f"Skipping file {sub_path}: would exceed max file size 
limit") continue - _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path, query=query) + _process_file(path=sub_path, parent_node=node, stats=stats, query=query) elif sub_path.is_dir(): child_directory_node = FileSystemNode( name=sub_path.name, @@ -167,7 +166,7 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS parent_node.file_count += 1 -def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path, query: IngestionQuery) -> None: +def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, query: IngestionQuery) -> None: """Process a file in the file system. This function checks the file's size, increments the statistics, and reads its content. @@ -181,8 +180,6 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat The dictionary to accumulate the results. stats : FileSystemStats Statistics tracking object for the total file count and size. - local_path : Path - The base path of the repository or directory being processed. query : IngestionQuery The query object containing the limit configurations. 
@@ -193,7 +190,9 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat file_size = path.stat().st_size if stats.total_size + file_size > query.max_total_size_bytes: - print(f"Skipping file {path}: would exceed total size limit") + print( + f"Skipping file {path}: would exceed total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB)", + ) return stats.total_files += 1 @@ -204,7 +203,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat type=FileSystemNodeType.FILE, size=file_size, file_count=1, - path_str=str(path.relative_to(local_path)), + path_str=str(path.relative_to(query.local_path)), path=path, depth=parent_node.depth + 1, ) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index fa914509..cab36e1a 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -31,8 +31,8 @@ async def parse_query( max_total_size_bytes: int | None = None, max_directory_depth: int | None = None, from_web: bool, - include_patterns: set[str] | str | None = None, ignore_patterns: set[str] | str | None = None, + include_patterns: set[str] | str | None = None, token: str | None = None, ) -> IngestionQuery: """Parse the input source to extract details for the query and process the include and ignore patterns. @@ -40,21 +40,21 @@ async def parse_query( Parameters ---------- source : str - The source URL or file path to parse. + A directory path or a Git repository URL. max_file_size : int - The maximum file size in bytes to include. + Maximum file size in bytes to ingest (default: 10 MB). max_files : int | None - The maximum number of files to process. If None, uses default from config. + Maximum number of files to ingest (default: 10,000). max_total_size_bytes : int | None - The maximum total size of all files in bytes. If None, uses default from config. + Maximum total size of output file in bytes (default: 500 MB). 
max_directory_depth : int | None - The maximum depth of directory traversal. If None, uses default from config. + Maximum depth of directory traversal (default: 20). from_web : bool Flag indicating whether the source is a web URL. - include_patterns : set[str] | str | None - Patterns to include. Can be a set of strings or a single string. ignore_patterns : set[str] | str | None - Patterns to ignore. Can be a set of strings or a single string. + Glob patterns to ignore. Can be a set of strings or a single string. + include_patterns : set[str] | str | None + Glob patterns to include. Can be a set of strings or a single string. token : str | None GitHub personal access token (PAT) for accessing private repositories. diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index e9a69902..79e2bd68 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, Field -from gitingest.config import MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MAX_DIRECTORY_DEPTH +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES @dataclass @@ -54,39 +54,39 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes Attributes ---------- user_name : str | None - The username or owner of the repository. + Username or owner of the repository. repo_name : str | None - The name of the repository. + Name of the repository. local_path : Path - The local path to the repository or file. + Local path to the repository or file. url : str | None - The URL of the repository. + URL of the repository. slug : str - The slug of the repository. + Slug of the repository. id : str - The ID of the repository. + ID of the repository. subpath : str - The subpath to the repository or file (default: ``"/"``). + Subpath to the repository or file (default: ``"/"``). type : str | None - The type of the repository or file. 
+ Type of the repository or file. branch : str | None - The branch of the repository. + Branch of the repository. commit : str | None - The commit of the repository. + Commit of the repository. tag: str | None - The tag of the repository. + Tag of the repository. max_file_size : int - The maximum file size to ingest (default: 10 MB). + Maximum file size in bytes to ingest (default: 10 MB). max_files : int - The maximum number of files to process (default: 10,000). + Maximum number of files to ingest (default: 10,000). max_total_size_bytes : int - The maximum total size of all files in bytes (default: 500 MB). + Maximum total size of output file in bytes (default: 500 MB). max_directory_depth : int - The maximum depth of directory traversal (default: 20). + Maximum depth of directory traversal (default: 20). ignore_patterns : set[str] - The patterns to ignore (default: ``set()``). + Patterns to ignore. include_patterns : set[str] | None - The patterns to include. + Patterns to include. include_submodules : bool Whether to include all Git submodules within the repository. (default: ``False``) diff --git a/src/gitingest/utils/config_utils.py b/src/gitingest/utils/config_utils.py new file mode 100644 index 00000000..a5ff563e --- /dev/null +++ b/src/gitingest/utils/config_utils.py @@ -0,0 +1,40 @@ +"""Configuration utilities.""" + +from __future__ import annotations + +import os +from typing import Callable + + +def _get_env_var(key: str, default: int | str, cast_func: Callable[[str], int | str] | None = None) -> int | str: + """Get environment variable with ``GITINGEST_`` prefix and optional type casting. + + Parameters + ---------- + key : str + The name of the environment variable. + default : int | str + The default value to return if the environment variable is not set. + cast_func : Callable[[str], int | str] | None + The function to cast the environment variable to the desired type. 
+ + Returns + ------- + int | str + The value of the environment variable, cast to the desired type if provided. + + """ + env_key = f"GITINGEST_{key}" + value = os.environ.get(env_key) + + if value is None: + return default + + if cast_func: + try: + return cast_func(value) + except (ValueError, TypeError): + print(f"Warning: Invalid value for {env_key}: {value}. Using default: {default}") + return default + + return value diff --git a/src/server/query_processor.py b/src/server/query_processor.py index c5a15e8e..7e2f938f 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -73,8 +73,8 @@ async def process_query( source=input_text, max_file_size=max_file_size, from_web=True, - include_patterns=include_patterns, ignore_patterns=exclude_patterns, + include_patterns=include_patterns, token=token, ) query.ensure_url() diff --git a/src/server/server_config.py b/src/server/server_config.py index 4098c2a0..fa3137ad 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -2,26 +2,9 @@ from __future__ import annotations -import os from fastapi.templating import Jinja2Templates -# Helper function to get environment variables with type conversion -def _get_env_var(key: str, default, cast_func=None): - """Get environment variable with GITINGEST_ prefix and optional type casting.""" - env_key = f"GITINGEST_{key}" - value = os.environ.get(env_key) - - if value is None: - return default - - if cast_func: - try: - return cast_func(value) - except (ValueError, TypeError): - print(f"Warning: Invalid value for {env_key}: {value}. 
Using default: {default}") - return default - - return value +from gitingest.utils.config_utils import _get_env_var MAX_DISPLAY_SIZE: int = _get_env_var("MAX_DISPLAY_SIZE", 300_000, int) DELETE_REPO_AFTER: int = _get_env_var("DELETE_REPO_AFTER", 60 * 60, int) # In seconds (1 hour) diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index d3d2542a..053c6543 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -67,6 +67,9 @@ async def test_parse_query_without_host( "tag": None, "commit": None, "max_file_size": 50, + "max_directory_depth": 20, + "max_files": 10_000, + "max_total_size_bytes": 500 * 1024 * 1024, "include_patterns": None, "include_submodules": False, } diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index f6033352..d16220d8 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -205,7 +205,7 @@ async def test_parse_query_empty_patterns() -> None: When ``parse_query`` is called, Then ``include_patterns`` becomes ``None`` and default ignore patterns apply. 
""" - query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, ignore_patterns="", include_patterns="") assert query.include_patterns is None assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @@ -223,8 +223,8 @@ async def test_parse_query_include_and_ignore_overlap() -> None: DEMO_URL, max_file_size=50, from_web=True, - include_patterns="*.py", ignore_patterns={"*.py", "*.txt"}, + include_patterns="*.py", ) assert query.include_patterns == {"*.py"} From 3a2b347176a5a40452abcddd39fb5032c04d9242 Mon Sep 17 00:00:00 2001 From: Cheelax Date: Mon, 7 Jul 2025 15:05:51 +0200 Subject: [PATCH 3/3] Refactor environment variable handling in configuration files - Introduced a new helper function `_get_int_env_var` to retrieve environment variables as integers with default fallback and error handling. - Updated `config.py` and `server_config.py` to use the new helper function for better clarity and maintainability. - Simplified `_get_env_var` in `config_utils.py` to focus solely on retrieving string values without type casting. 
--- src/gitingest/config.py | 18 +++++++++++++----- src/gitingest/utils/config_utils.py | 20 +++++--------------- src/server/server_config.py | 16 ++++++++++++---- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 298f597f..1b4a66ea 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -5,12 +5,20 @@ from gitingest.utils.config_utils import _get_env_var -MAX_FILE_SIZE = _get_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024, int) # Max file size to process in bytes (10 MB) -MAX_FILES = _get_env_var("MAX_FILES", 10_000, int) # Max number of files to process -MAX_TOTAL_SIZE_BYTES = _get_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024, int) # Max output file size (500 MB) -MAX_DIRECTORY_DEPTH = _get_env_var("MAX_DIRECTORY_DEPTH", 20, int) # Max depth of directory traversal +def _get_int_env_var(key: str, default: int) -> int: + """Get environment variable as integer with fallback to default.""" + try: + return int(_get_env_var(key, str(default))) + except ValueError: + print(f"Warning: Invalid value for GITINGEST_{key}. 
Using default: {default}") + return default -DEFAULT_TIMEOUT = _get_env_var("DEFAULT_TIMEOUT", 60, int) # Default timeout for git operations in seconds +MAX_FILE_SIZE = _get_int_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024) # Max file size to process in bytes (10 MB) +MAX_FILES = _get_int_env_var("MAX_FILES", 10_000) # Max number of files to process +MAX_TOTAL_SIZE_BYTES = _get_int_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024) # Max output file size (500 MB) +MAX_DIRECTORY_DEPTH = _get_int_env_var("MAX_DIRECTORY_DEPTH", 20) # Max depth of directory traversal + +DEFAULT_TIMEOUT = _get_int_env_var("DEFAULT_TIMEOUT", 60) # Default timeout for git operations in seconds OUTPUT_FILE_NAME = _get_env_var("OUTPUT_FILE_NAME", "digest.txt") TMP_BASE_PATH = Path(_get_env_var("TMP_BASE_PATH", tempfile.gettempdir())) / "gitingest" diff --git a/src/gitingest/utils/config_utils.py b/src/gitingest/utils/config_utils.py index a5ff563e..6c957de0 100644 --- a/src/gitingest/utils/config_utils.py +++ b/src/gitingest/utils/config_utils.py @@ -3,25 +3,22 @@ from __future__ import annotations import os -from typing import Callable -def _get_env_var(key: str, default: int | str, cast_func: Callable[[str], int | str] | None = None) -> int | str: - """Get environment variable with ``GITINGEST_`` prefix and optional type casting. +def _get_env_var(key: str, default: str) -> str: + """Get environment variable with ``GITINGEST_`` prefix. Parameters ---------- key : str The name of the environment variable. - default : int | str + default : str The default value to return if the environment variable is not set. - cast_func : Callable[[str], int | str] | None - The function to cast the environment variable to the desired type. Returns ------- - int | str - The value of the environment variable, cast to the desired type if provided. + str + The value of the environment variable as a string. 
""" env_key = f"GITINGEST_{key}" @@ -30,11 +27,4 @@ def _get_env_var(key: str, default: int | str, cast_func: Callable[[str], int | if value is None: return default - if cast_func: - try: - return cast_func(value) - except (ValueError, TypeError): - print(f"Warning: Invalid value for {env_key}: {value}. Using default: {default}") - return default - return value diff --git a/src/server/server_config.py b/src/server/server_config.py index fa3137ad..c9084429 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -6,12 +6,20 @@ from gitingest.utils.config_utils import _get_env_var -MAX_DISPLAY_SIZE: int = _get_env_var("MAX_DISPLAY_SIZE", 300_000, int) -DELETE_REPO_AFTER: int = _get_env_var("DELETE_REPO_AFTER", 60 * 60, int) # In seconds (1 hour) +def _get_int_env_var(key: str, default: int) -> int: + """Get environment variable as integer with fallback to default.""" + try: + return int(_get_env_var(key, str(default))) + except ValueError: + print(f"Warning: Invalid value for GITINGEST_{key}. Using default: {default}") + return default + +MAX_DISPLAY_SIZE: int = _get_int_env_var("MAX_DISPLAY_SIZE", 300_000) +DELETE_REPO_AFTER: int = _get_int_env_var("DELETE_REPO_AFTER", 60 * 60) # In seconds (1 hour) # Slider configuration (if updated, update the logSliderToSize function in src/static/js/utils.js) -MAX_FILE_SIZE_KB: int = _get_env_var("MAX_FILE_SIZE_KB", 100 * 1024, int) # 100 MB -MAX_SLIDER_POSITION: int = _get_env_var("MAX_SLIDER_POSITION", 500, int) # Maximum slider position +MAX_FILE_SIZE_KB: int = _get_int_env_var("MAX_FILE_SIZE_KB", 100 * 1024) # 100 MB +MAX_SLIDER_POSITION: int = _get_int_env_var("MAX_SLIDER_POSITION", 500) # Maximum slider position EXAMPLE_REPOS: list[dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"},