diff --git a/README.md b/README.md index 314f3a8a..6ec7a93c 100644 --- a/README.md +++ b/README.md @@ -135,12 +135,60 @@ By default, the digest is written to a text file (`digest.txt`) in your current - Use `--output/-o ` to write to a specific file. - Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools). +### 🔧 Configure processing limits + +```bash +# Set higher limits for large repositories +gitingest https://github.com/torvalds/linux \ + --max-files 100000 \ + --max-total-size 2147483648 \ + --max-directory-depth 25 + +# Process only Python files up to 1MB each +gitingest /path/to/project \ + --include-pattern "*.py" \ + --max-size 1048576 \ + --max-files 1000 +``` + See more options and usage details with: ```bash gitingest --help ``` +### Configuration via Environment Variables + +You can configure various limits and settings using environment variables. All configuration environment variables start with the `GITINGEST_` prefix: + +#### File Processing Configuration + +- `GITINGEST_MAX_FILE_SIZE` - Maximum size of a single file to process *(default: 10485760 bytes, 10 MB)* +- `GITINGEST_MAX_FILES` - Maximum number of files to process *(default: 10000)* +- `GITINGEST_MAX_TOTAL_SIZE_BYTES` - Maximum size of output file *(default: 524288000 bytes, 500 MB)* +- `GITINGEST_MAX_DIRECTORY_DEPTH` - Maximum depth of directory traversal *(default: 20)* +- `GITINGEST_DEFAULT_TIMEOUT` - Default operation timeout in seconds *(default: 60)* +- `GITINGEST_OUTPUT_FILE_NAME` - Default output filename *(default: "digest.txt")* +- `GITINGEST_TMP_BASE_PATH` - Base path for temporary files *(default: system temp directory)* + +#### Server Configuration (for self-hosting) + +- `GITINGEST_MAX_DISPLAY_SIZE` - Maximum size of content to display in UI *(default: 300000 bytes)* +- `GITINGEST_DELETE_REPO_AFTER` - Repository cleanup timeout in seconds *(default: 3600, 1 hour)* +- `GITINGEST_MAX_FILE_SIZE_KB` - Maximum file size for UI slider in kB 
*(default: 102400, 100 MB)* +- `GITINGEST_MAX_SLIDER_POSITION` - Maximum slider position in UI *(default: 500)* + +#### Example usage + +```bash +# Configure for large scientific repositories +export GITINGEST_MAX_FILES=50000 +export GITINGEST_MAX_FILE_SIZE=20971520 # 20 MB +export GITINGEST_MAX_TOTAL_SIZE_BYTES=1073741824 # 1 GB + +gitingest https://github.com/some/large-repo +``` + ## 🐍 Python package usage ```python @@ -169,6 +217,15 @@ summary, tree, content = ingest("https://github.com/username/private-repo") # Include repository submodules summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True) + +# Configure limits programmatically +summary, tree, content = ingest( + "https://github.com/username/large-repo", + max_file_size=20 * 1024 * 1024, # 20 MB per file + max_files=50000, # 50k files max + max_total_size_bytes=1024**3, # 1 GB total + max_directory_depth=30 # 30 levels deep +) ``` By default, this won't write a file but can be enabled with the `output` argument. 
diff --git a/pyproject.toml b/pyproject.toml index fb78abab..acac68ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,9 +98,9 @@ per-file-ignores = { "tests/**/*.py" = ["S101"] } # Skip the "assert used" warni [tool.ruff.lint.pylint] max-returns = 10 -[tool.ruff.lint.isort] -order-by-type = true -case-sensitive = true +# [tool.ruff.lint.isort] +# order-by-type = true +# case-sensitive = true [tool.pycln] all = true diff --git a/src/gitingest/__main__.py b/src/gitingest/__main__.py index e14ed681..4a516feb 100644 --- a/src/gitingest/__main__.py +++ b/src/gitingest/__main__.py @@ -9,16 +9,20 @@ import click from typing_extensions import Unpack -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async class _CLIArgs(TypedDict): source: str max_size: int + max_files: int + max_total_size: int + max_directory_depth: int exclude_pattern: tuple[str, ...] include_pattern: tuple[str, ...] 
branch: str | None + tag: str | None include_gitignored: bool include_submodules: bool token: str | None @@ -34,6 +38,24 @@ class _CLIArgs(TypedDict): show_default=True, help="Maximum file size to process in bytes", ) +@click.option( + "--max-files", + default=MAX_FILES, + show_default=True, + help="Maximum number of files to process", +) +@click.option( + "--max-total-size", + default=MAX_TOTAL_SIZE_BYTES, + show_default=True, + help="Maximum total size of all files in bytes", +) +@click.option( + "--max-directory-depth", + default=MAX_DIRECTORY_DEPTH, + show_default=True, + help="Maximum depth of directory traversal", +) @click.option("--exclude-pattern", "-e", multiple=True, help="Shell-style patterns to exclude.") @click.option( "--include-pattern", @@ -42,6 +64,7 @@ class _CLIArgs(TypedDict): help="Shell-style patterns to include.", ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option("--tag", default=None, help="Tag to clone and ingest") @click.option( "--include-gitignored", is_flag=True, @@ -98,7 +121,7 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: $ gitingest --include-pattern "*.js" --exclude-pattern "node_modules/*" Private repositories: - $ gitingest https://github.com/user/private-repo -t ghp_token + $ gitingest https://github.com/user/private-repo --token ghp_token $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo Include submodules: @@ -112,9 +135,13 @@ async def _async_main( source: str, *, max_size: int = MAX_FILE_SIZE, + max_files: int = MAX_FILES, + max_total_size: int = MAX_TOTAL_SIZE_BYTES, + max_directory_depth: int = MAX_DIRECTORY_DEPTH, exclude_pattern: tuple[str, ...] | None = None, include_pattern: tuple[str, ...] | None = None, branch: str | None = None, + tag: str | None = None, include_gitignored: bool = False, include_submodules: bool = False, token: str | None = None, @@ -132,21 +159,29 @@ async def _async_main( A directory path or a Git repository URL. 
max_size : int Maximum file size in bytes to ingest (default: 10 MB). + max_files : int + Maximum number of files to ingest (default: 10,000). + max_total_size : int + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int + Maximum depth of directory traversal (default: 20). exclude_pattern : tuple[str, ...] | None Glob patterns for pruning the file set. include_pattern : tuple[str, ...] | None Glob patterns for including files in the output. branch : str | None - Git branch to ingest. If ``None``, the repository's default branch is used. + Git branch to clone and ingest (default: the default branch). + tag : str | None + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool - If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). + If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - The path where the output file will be written (default: ``digest.txt`` in current directory). + The path where the output file is written (default: ``digest.txt`` in current directory). Use ``"-"`` to write to ``stdout``. 
Raises @@ -170,9 +205,13 @@ async def _async_main( summary, _, _ = await ingest_async( source, max_file_size=max_size, - include_patterns=include_patterns, + max_files=max_files, + max_total_size_bytes=max_total_size, + max_directory_depth=max_directory_depth, exclude_patterns=exclude_patterns, + include_patterns=include_patterns, branch=branch, + tag=tag, include_gitignored=include_gitignored, include_submodules=include_submodules, token=token, diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 3d154684..1b4a66ea 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -3,12 +3,22 @@ import tempfile from pathlib import Path -MAX_FILE_SIZE = 10 * 1024 * 1024 # Maximum size of a single file to process (10 MB) -MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10_000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB) -DEFAULT_TIMEOUT = 60 # seconds +from gitingest.utils.config_utils import _get_env_var -OUTPUT_FILE_NAME = "digest.txt" +def _get_int_env_var(key: str, default: int) -> int: + """Get environment variable as integer with fallback to default.""" + try: + return int(_get_env_var(key, str(default))) + except ValueError: + print(f"Warning: Invalid value for GITINGEST_{key}. 
Using default: {default}") + return default -TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" +MAX_FILE_SIZE = _get_int_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024) # Max file size to process in bytes (10 MB) +MAX_FILES = _get_int_env_var("MAX_FILES", 10_000) # Max number of files to process +MAX_TOTAL_SIZE_BYTES = _get_int_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024) # Max output file size (500 MB) +MAX_DIRECTORY_DEPTH = _get_int_env_var("MAX_DIRECTORY_DEPTH", 20) # Max depth of directory traversal + +DEFAULT_TIMEOUT = _get_int_env_var("DEFAULT_TIMEOUT", 60) # Default timeout for git operations in seconds + +OUTPUT_FILE_NAME = _get_env_var("OUTPUT_FILE_NAME", "digest.txt") +TMP_BASE_PATH = Path(_get_env_var("TMP_BASE_PATH", tempfile.gettempdir())) / "gitingest" diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index f64dec08..2f77b387 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -22,8 +22,11 @@ async def ingest_async( source: str, *, max_file_size: int = MAX_FILE_SIZE, - include_patterns: str | set[str] | None = None, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, exclude_patterns: str | set[str] | None = None, + include_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, @@ -40,17 +43,23 @@ async def ingest_async( Parameters ---------- source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. + A directory path or a Git repository URL. max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). - include_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to include. If ``None``, all files are included. + Maximum file size in bytes to ingest (default: 10 MB). 
+ max_files : int | None + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int | None + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int | None + Maximum depth of directory traversal (default: 20). exclude_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. + Glob patterns for pruning the file set. + include_patterns : str | set[str] | None + Glob patterns for including files in the output. branch : str | None - The branch to clone and ingest (default: the default branch). + Git branch to clone and ingest (default: the default branch). tag : str | None - The tag to clone and ingest. If ``None``, no tag is used. + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool @@ -59,7 +68,7 @@ async def ingest_async( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - File path where the summary and content should be written. + File path where the summary and content are written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. 
@@ -77,9 +86,12 @@ async def ingest_async( query: IngestionQuery = await parse_query( source=source, max_file_size=max_file_size, + max_files=max_files, + max_total_size_bytes=max_total_size_bytes, + max_directory_depth=max_directory_depth, from_web=False, - include_patterns=include_patterns, ignore_patterns=exclude_patterns, + include_patterns=include_patterns, token=token, ) @@ -101,8 +113,11 @@ def ingest( source: str, *, max_file_size: int = MAX_FILE_SIZE, - include_patterns: str | set[str] | None = None, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, exclude_patterns: str | set[str] | None = None, + include_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, @@ -119,17 +134,23 @@ def ingest( Parameters ---------- source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. + A directory path or a Git repository URL. max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). - include_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to include. If ``None``, all files are included. + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int | None + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int | None + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int | None + Maximum depth of directory traversal (default: 20). exclude_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. + Glob patterns for pruning the file set. + include_patterns : str | set[str] | None + Glob patterns for including files in the output. branch : str | None - The branch to clone and ingest (default: the default branch). 
+ Git branch to clone and ingest (default: the default branch). tag : str | None - The tag to clone and ingest. If ``None``, no tag is used. + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool @@ -138,7 +159,7 @@ def ingest( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - File path where the summary and content should be written. + File path where the summary and content are written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. @@ -159,8 +180,11 @@ def ingest( ingest_async( source=source, max_file_size=max_file_size, - include_patterns=include_patterns, + max_files=max_files, + max_total_size_bytes=max_total_size_bytes, + max_directory_depth=max_directory_depth, exclude_patterns=exclude_patterns, + include_patterns=include_patterns, branch=branch, tag=tag, include_gitignored=include_gitignored, diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 2990a875..50ff0ccf 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import TYPE_CHECKING -from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.output_formatter import format_node from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include @@ -97,7 +96,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem Statistics tracking object for the total file count and size. 
""" - if limit_exceeded(stats, depth=node.depth): + if limit_exceeded(stats, depth=node.depth, query=query): return for sub_path in node.path.iterdir(): @@ -113,7 +112,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem if sub_path.stat().st_size > query.max_file_size: print(f"Skipping file {sub_path}: would exceed max file size limit") continue - _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) + _process_file(path=sub_path, parent_node=node, stats=stats, query=query) elif sub_path.is_dir(): child_directory_node = FileSystemNode( name=sub_path.name, @@ -167,7 +166,7 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS parent_node.file_count += 1 -def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: +def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, query: IngestionQuery) -> None: """Process a file in the file system. This function checks the file's size, increments the statistics, and reads its content. @@ -181,17 +180,19 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat The dictionary to accumulate the results. stats : FileSystemStats Statistics tracking object for the total file count and size. - local_path : Path - The base path of the repository or directory being processed. + query : IngestionQuery + The query object containing the limit configurations. 
""" - if stats.total_files + 1 > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") + if stats.total_files + 1 > query.max_files: + print(f"Maximum file limit ({query.max_files}) reached") return file_size = path.stat().st_size - if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {path}: would exceed total size limit") + if stats.total_size + file_size > query.max_total_size_bytes: + print( + f"Skipping file {path}: would exceed total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB)", + ) return stats.total_files += 1 @@ -202,7 +203,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat type=FileSystemNodeType.FILE, size=file_size, file_count=1, - path_str=str(path.relative_to(local_path)), + path_str=str(path.relative_to(query.local_path)), path=path, depth=parent_node.depth + 1, ) @@ -212,7 +213,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat parent_node.file_count += 1 -def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: +def limit_exceeded(stats: FileSystemStats, depth: int, query: IngestionQuery) -> bool: """Check if any of the traversal limits have been exceeded. This function checks if the current traversal has exceeded any of the configured limits: @@ -224,6 +225,8 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: Statistics tracking object for the total file count and size. depth : int The current depth of directory traversal. + query : IngestionQuery + The query object containing the limit configurations. Returns ------- @@ -231,16 +234,16 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: ``True`` if any limit has been exceeded, ``False`` otherwise. 
""" - if depth > MAX_DIRECTORY_DEPTH: - print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached") + if depth > query.max_directory_depth: + print(f"Maximum depth limit ({query.max_directory_depth}) reached") return True - if stats.total_files >= MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") + if stats.total_files >= query.max_files: + print(f"Maximum file limit ({query.max_files}) reached") return True # TODO: end recursion - if stats.total_size >= MAX_TOTAL_SIZE_BYTES: - print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached") + if stats.total_size >= query.max_total_size_bytes: + print(f"Maxumum total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB) reached") return True # TODO: end recursion return False diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 5fabb226..cab36e1a 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -27,9 +27,12 @@ async def parse_query( source: str, *, max_file_size: int, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, from_web: bool, - include_patterns: set[str] | str | None = None, ignore_patterns: set[str] | str | None = None, + include_patterns: set[str] | str | None = None, token: str | None = None, ) -> IngestionQuery: """Parse the input source to extract details for the query and process the include and ignore patterns. @@ -37,15 +40,21 @@ async def parse_query( Parameters ---------- source : str - The source URL or file path to parse. + A directory path or a Git repository URL. max_file_size : int - The maximum file size in bytes to include. + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int | None + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int | None + Maximum total size of output file in bytes (default: 500 MB). 
+ max_directory_depth : int | None + Maximum depth of directory traversal (default: 20). from_web : bool Flag indicating whether the source is a web URL. - include_patterns : set[str] | str | None - Patterns to include. Can be a set of strings or a single string. ignore_patterns : set[str] | str | None - Patterns to ignore. Can be a set of strings or a single string. + Glob patterns to ignore. Can be a set of strings or a single string. + include_patterns : set[str] | str | None + Glob patterns to include. Can be a set of strings or a single string. token : str | None GitHub personal access token (PAT) for accessing private repositories. @@ -88,6 +97,9 @@ async def parse_query( branch=query.branch, commit=query.commit, max_file_size=max_file_size, + max_files=max_files if max_files is not None else query.max_files, + max_total_size_bytes=max_total_size_bytes if max_total_size_bytes is not None else query.max_total_size_bytes, + max_directory_depth=max_directory_depth if max_directory_depth is not None else query.max_directory_depth, ignore_patterns=ignore_patterns_set, include_patterns=parsed_include, ) diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index c40e11d6..79e2bd68 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, Field -from gitingest.config import MAX_FILE_SIZE +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES @dataclass @@ -54,33 +54,39 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes Attributes ---------- user_name : str | None - The username or owner of the repository. + Username or owner of the repository. repo_name : str | None - The name of the repository. + Name of the repository. local_path : Path - The local path to the repository or file. + Local path to the repository or file. url : str | None - The URL of the repository. 
+ URL of the repository. slug : str - The slug of the repository. + Slug of the repository. id : str - The ID of the repository. + ID of the repository. subpath : str - The subpath to the repository or file (default: ``"/"``). + Subpath to the repository or file (default: ``"/"``). type : str | None - The type of the repository or file. + Type of the repository or file. branch : str | None - The branch of the repository. + Branch of the repository. commit : str | None - The commit of the repository. + Commit of the repository. tag: str | None - The tag of the repository. + Tag of the repository. max_file_size : int - The maximum file size to ingest (default: 10 MB). + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int + Maximum depth of directory traversal (default: 20). ignore_patterns : set[str] - The patterns to ignore (default: ``set()``). + Patterns to ignore. include_patterns : set[str] | None - The patterns to include. + Patterns to include. include_submodules : bool Whether to include all Git submodules within the repository. 
(default: ``False``) @@ -98,6 +104,9 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes commit: str | None = None tag: str | None = None max_file_size: int = Field(default=MAX_FILE_SIZE) + max_files: int = Field(default=MAX_FILES) + max_total_size_bytes: int = Field(default=MAX_TOTAL_SIZE_BYTES) + max_directory_depth: int = Field(default=MAX_DIRECTORY_DEPTH) ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type include_patterns: set[str] | None = None include_submodules: bool = False diff --git a/src/gitingest/utils/config_utils.py b/src/gitingest/utils/config_utils.py new file mode 100644 index 00000000..6c957de0 --- /dev/null +++ b/src/gitingest/utils/config_utils.py @@ -0,0 +1,30 @@ +"""Configuration utilities.""" + +from __future__ import annotations + +import os + + +def _get_env_var(key: str, default: str) -> str: + """Get environment variable with ``GITINGEST_`` prefix. + + Parameters + ---------- + key : str + The name of the environment variable. + default : str + The default value to return if the environment variable is not set. + + Returns + ------- + str + The value of the environment variable as a string. 
+ + """ + env_key = f"GITINGEST_{key}" + value = os.environ.get(env_key) + + if value is None: + return default + + return value diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 8513426b..33f130e8 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -73,8 +73,8 @@ async def process_query( source=input_text, max_file_size=max_file_size, from_web=True, - include_patterns=include_patterns, ignore_patterns=exclude_patterns, + include_patterns=include_patterns, token=token, ) query.ensure_url() diff --git a/src/server/server_config.py b/src/server/server_config.py index 99ef5c91..80716761 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -4,12 +4,22 @@ from fastapi.templating import Jinja2Templates -MAX_DISPLAY_SIZE: int = 300_000 -DELETE_REPO_AFTER: int = 60 * 60 # In seconds (1 hour) +from gitingest.utils.config_utils import _get_env_var + +def _get_int_env_var(key: str, default: int) -> int: + """Get environment variable as integer with fallback to default.""" + try: + return int(_get_env_var(key, str(default))) + except ValueError: + print(f"Warning: Invalid value for GITINGEST_{key}. 
Using default: {default}") + return default + +MAX_DISPLAY_SIZE: int = _get_int_env_var("MAX_DISPLAY_SIZE", 300_000) +DELETE_REPO_AFTER: int = _get_int_env_var("DELETE_REPO_AFTER", 60 * 60) # In seconds (1 hour) # Slider configuration (if updated, update the logSliderToSize function in src/static/js/utils.js) -MAX_FILE_SIZE_KB: int = 100 * 1024 # 100 MB -MAX_SLIDER_POSITION: int = 500 # Maximum slider position +MAX_FILE_SIZE_KB: int = _get_int_env_var("MAX_FILE_SIZE_KB", 100 * 1024) # 100 MB +MAX_SLIDER_POSITION: int = _get_int_env_var("MAX_SLIDER_POSITION", 500) # Maximum slider position EXAMPLE_REPOS: list[dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/coderamp-labs/gitingest"}, diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index d3d2542a..053c6543 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -67,6 +67,9 @@ async def test_parse_query_without_host( "tag": None, "commit": None, "max_file_size": 50, + "max_directory_depth": 20, + "max_files": 10_000, + "max_total_size_bytes": 500 * 1024 * 1024, "include_patterns": None, "include_submodules": False, } diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index f6033352..d16220d8 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -205,7 +205,7 @@ async def test_parse_query_empty_patterns() -> None: When ``parse_query`` is called, Then ``include_patterns`` becomes ``None`` and default ignore patterns apply. 
""" - query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, ignore_patterns="", include_patterns="") assert query.include_patterns is None assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @@ -223,8 +223,8 @@ async def test_parse_query_include_and_ignore_overlap() -> None: DEMO_URL, max_file_size=50, from_web=True, - include_patterns="*.py", ignore_patterns={"*.py", "*.txt"}, + include_patterns="*.py", ) assert query.include_patterns == {"*.py"}