diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0cc77b6b..6faabd0f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -90,3 +90,17 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
 13. **Iterate** on any review feedback—update your branch and repeat **6 – 11** as needed.
     *(Optional) Invite a maintainer to your branch for easier collaboration.*
+
+---
+
+## CSS & build artefacts
+
+- **Do not commit `src/static/css/site.css`.** The CI pipeline runs `npm run build:css` during the container/image build, so the artefact is produced automatically.
+- When developing locally, you may run the build yourself (see step 9) to preview the styles.
+
+## Dependency Management
+
+When you add a new import from an external package, add it to both `requirements.txt` and `pyproject.toml` (if applicable) so that every environment and CI/CD pipeline installs the correct dependencies.
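+
+For example, a sketch for a hypothetical package `some-package` (a placeholder, not a real dependency of this project):
+
+```text
+# requirements.txt
+some-package>=1.0.0
+```
+
+```toml
+# pyproject.toml, inside the [project] dependencies array
+"some-package>=1.0.0",
+```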
""" + if full_path.startswith("api/"): + raise HTTPException(status_code=405, detail="Method Not Allowed") return templates.TemplateResponse( "git.jinja", { diff --git a/src/server/routers/index.py b/src/server/routers/index.py index af4abd51..57379cae 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -1,12 +1,66 @@ """Module defining the FastAPI router for the home page of the application.""" -from fastapi import APIRouter, Request +from fastapi import APIRouter, Depends, Request, Form, HTTPException + from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates +from autotiktokenizer import AutoTikTokenizer +import tiktoken +from typing import Optional +from gitingest.utils.compat_typing import Annotated +from server.models import QueryForm +from server.query_processor import process_query from server.server_config import EXAMPLE_REPOS, templates +from server.server_utils import limiter +from pydantic import BaseModel, Field + router = APIRouter() +templates = Jinja2Templates(directory="server/templates") + +SUPPORTED_MODELS = { + 'GPT-2 (OpenAI)': 'openai-community/gpt2', + 'GPT-3 (OpenAI)': 'openai-community/gpt2', + 'GPT-3.5 (OpenAI)': 'openai-community/gpt2', + 'GPT-3.5-turbo (OpenAI)': 'openai-community/gpt2', + 'GPT-4 (OpenAI)': 'openai-community/gpt2', + 'Claude (approximate, uses GPT-2)': 'openai-community/gpt2', + 'Gemini (approximate, uses T5)': 't5-base', + 'Llama-2 (Meta)': 'meta-llama/Llama-2-7b-hf', + 'Llama-3 (Meta)': 'meta-llama/Meta-Llama-3-8B', + 'Mistral-7B (MistralAI)': 'mistralai/Mistral-7B-v0.1', + 'Mixtral-8x7B (MistralAI)': 'mistralai/Mixtral-8x7B-v0.1', + 'Phi-3-mini (Microsoft)': 'microsoft/phi-3-mini-4k-instruct', + 'Gemma-2B (Google)': 'google/gemma-2b', + 'Qwen2-7B (Alibaba)': 'Qwen/Qwen2-7B', + 'Yi-34B (01.AI)': '01-ai/Yi-34B-Chat', + 'Falcon-7B (TII)': 'tiiuae/falcon-7b', + 'MPT-7B (MosaicML)': 'mosaicml/mpt-7b', + 'Baichuan-7B (Baichuan)': 'baichuan-inc/Baichuan-7B', + 'XLM-RoBERTa-base (Facebook)': 'xlm-roberta-base', + 'RoBERTa-base (Facebook)': 'roberta-base', + 'DistilBERT-base-uncased': 'distilbert-base-uncased', + 'GPT-Neo-1.3B (EleutherAI)': 'EleutherAI/gpt-neo-1.3B', + 'GPT-J-6B (EleutherAI)': 'EleutherAI/gpt-j-6B', + 'GPT-Bloom-560m (BigScience)': 'bigscience/bloom-560m', + 'BERT-base-uncased': 'bert-base-uncased', + 'T5-base': 't5-base', +} +# Note: Gemini and Claude use approximate tokenizers (T5 and GPT-2, respectively) as no official public tokenizers exist for these models. + +def get_tokenizer(model_id): + return AutoTikTokenizer.from_pretrained(model_id) + +def count_tokens(input_text, model_id): + if model_id == 'openai-community/gpt2': + # Use tiktoken for OpenAI models + enc = tiktoken.encoding_for_model("gpt-3.5-turbo") + return len(enc.encode(input_text)) + else: + tokenizer = AutoTikTokenizer.from_pretrained(model_id) + return len(tokenizer.encode(input_text)) @router.get("/", response_class=HTMLResponse, include_in_schema=False) async def home(request: Request) -> HTMLResponse: @@ -35,3 +89,123 @@ async def home(request: Request) -> HTMLResponse: "default_max_file_size": 243, }, ) + + +@router.post("/", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def index_post(request: Request, form: Annotated[QueryForm, Depends(QueryForm.as_form)]) -> HTMLResponse: + """Process the form submission with user input for query parameters. + + This endpoint handles POST requests from the home page form. 
 
 
 @router.get("/", response_class=HTMLResponse, include_in_schema=False)
 async def home(request: Request) -> HTMLResponse:
@@ -35,3 +89,123 @@ async def home(request: Request) -> HTMLResponse:
             "default_max_file_size": 243,
         },
     )
+
+
+@router.post("/", response_class=HTMLResponse)
+@limiter.limit("10/minute")
+async def index_post(request: Request, form: Annotated[QueryForm, Depends(QueryForm.as_form)]) -> HTMLResponse:
+    """Process the form submission with user input for query parameters.
+
+    This endpoint handles POST requests from the home page form. It processes the
+    user-submitted input (e.g., text, file size, pattern type) and invokes the
+    ``process_query`` function to handle the query logic, returning the result as
+    an HTML response.
+
+    Parameters
+    ----------
+    request : Request
+        The incoming request object, which provides context for rendering the response.
+    form : Annotated[QueryForm, Depends(QueryForm.as_form)]
+        The form data submitted by the user.
+
+    Returns
+    -------
+    HTMLResponse
+        An HTML response containing the results of processing the form input and
+        query logic, which will be rendered and returned to the user.
+
+    """
+    return await process_query(
+        request,
+        input_text=form.input_text,
+        slider_position=form.max_file_size,
+        pattern_type=form.pattern_type,
+        pattern=form.pattern,
+        is_index=True,
+        token=form.token or None,
+    )
+
+
+class TokenCountRequest(BaseModel):
+    """Request body for the token-count API."""
+
+    input_text: str = Field(..., description="The text to count tokens for")
+    model_id: str = Field(default="openai-community/gpt2", description="The model ID to use for tokenization")
+
+
+class TokenCountResponse(BaseModel):
+    """Response body for the token-count API."""
+
+    token_count: int = Field(..., description="Number of tokens in the input text")
+    model_id: str = Field(..., description="Model ID used for tokenization")
+    character_count: int = Field(..., description="Number of characters in the input text")
+
+
+@router.post("/api/tokencount", response_model=TokenCountResponse)
+async def api_token_count(payload: TokenCountRequest) -> TokenCountResponse:
+    """Count tokens in the provided text using the specified model's tokenizer.
+
+    Accepts a JSON body; the HTML form at ``/tokencount`` is handled separately below.
+    """
+    text = payload.input_text
+    model = payload.model_id
+
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="Input text cannot be empty")
+
+    if model not in SUPPORTED_MODELS.values():
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported model ID. Must be one of: {', '.join(SUPPORTED_MODELS.values())}",
+        )
+
+    try:
+        token_count = count_tokens(text, model)
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+    return TokenCountResponse(token_count=token_count, model_id=model, character_count=len(text))
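+
+# Example request (sketch; assumes a local dev server on port 8000, and token
+# counts vary with the tokenizer version):
+#   curl -X POST http://localhost:8000/api/tokencount \
+#        -H "Content-Type: application/json" \
+#        -d '{"input_text": "Hello world!", "model_id": "openai-community/gpt2"}'
+#   => {"token_count": 3, "model_id": "openai-community/gpt2", "character_count": 12}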
+
+
+@router.get("/tokencount", response_class=HTMLResponse)
+async def tokencount_ui(request: Request) -> HTMLResponse:
+    """Render the Token Estimator page."""
+    return templates.TemplateResponse(
+        "tokencount.jinja",
+        {
+            "request": request,
+            "supported_models": SUPPORTED_MODELS,
+            "input_text": "",
+            "model_id": "openai-community/gpt2",
+            "result": None,
+            "error": None,
+        },
+    )
+
+
+@router.post("/tokencount", response_class=HTMLResponse)
+async def tokencount_post(
+    request: Request,
+    input_text: str = Form(...),
+    model_id: str = Form("openai-community/gpt2"),
+) -> HTMLResponse:
+    """Handle the Token Estimator form submission and re-render the page."""
+    error = None
+    result = None
+    if not input_text.strip():
+        error = "Input text cannot be empty."
+    elif model_id not in SUPPORTED_MODELS.values():
+        error = f"Unsupported model ID. Must be one of: {', '.join(SUPPORTED_MODELS.values())}"
+    else:
+        try:
+            result = {
+                "token_count": count_tokens(input_text, model_id),
+                "model_id": model_id,
+                "character_count": len(input_text),
+            }
+        except Exception as exc:
+            error = str(exc)
+    return templates.TemplateResponse(
+        "tokencount.jinja",
+        {
+            "request": request,
+            "supported_models": SUPPORTED_MODELS,
+            "input_text": input_text,
+            "model_id": model_id,
+            "result": result,
+            "error": error,
+        },
+    )
diff --git a/src/server/templates/components/footer.jinja b/src/server/templates/components/footer.jinja
index 9784dfeb..ee065590 100644
--- a/src/server/templates/components/footer.jinja
+++ b/src/server/templates/components/footer.jinja
@@ -4,12 +4,38 @@
         {# Left column — Chrome + PyPI #}
             {{ footer_icon_link('https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood', 'icons/chrome.svg', 'Chrome Extension') }}
             {{ footer_icon_link('https://pypi.org/project/gitingest', 'icons/python.svg', 'Python Package') }}
+            {{ footer_icon_link('/tokencount', 'icons/tokens.svg', 'Token Estimator') }}
         {# Right column - Discord #}
diff --git a/src/server/templates/tokencount.jinja b/src/server/templates/tokencount.jinja
new file mode 100644
index 00000000..46a77f81
--- /dev/null
+++ b/src/server/templates/tokencount.jinja
@@ -0,0 +1,40 @@
+{% extends "base.jinja" %}
+{% block title %}Token Estimator{% endblock %}
+{% block content %}
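+    {# Context: supported_models (dict), input_text (str), model_id (str), result (dict | none), error (str | none) #}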
+    <div class="max-w-2xl mx-auto py-8">
+        <h1 class="text-2xl font-bold mb-4">Token Estimator</h1>
+        <form method="post" action="/tokencount" class="space-y-4">
+            <div>
+                <label for="input_text">Text</label>
+                <textarea id="input_text" name="input_text" rows="8" class="w-full border rounded p-2">{{ input_text }}</textarea>
+            </div>
+            <div>
+                <label for="model_id">Model</label>
+                <select id="model_id" name="model_id" class="w-full border rounded p-2">
+                    {% for name, id in supported_models.items() %}
+                        <option value="{{ id }}" {% if id == model_id %}selected{% endif %}>{{ name }}</option>
+                    {% endfor %}
+                </select>
+            </div>
+            <button type="submit" class="border rounded px-4 py-2">Estimate tokens</button>
+        </form>
+        {% if result %}
+            <div class="mt-6">
+                <h2 class="font-bold">Result</h2>
+                <p>Token count: {{ result.token_count }}</p>
+                <p>Character count: {{ result.character_count }}</p>
+                <p>Model: {{ result.model_id }}</p>
+            </div>
+        {% endif %}
+        {% if error %}
+            <div class="mt-6 text-red-600">Error: {{ error }}</div>
+        {% endif %}
+    </div>
+{% endblock %}
diff --git a/tests/test_server.py b/tests/test_server.py
new file mode 100644
index 00000000..4b48848c
--- /dev/null
+++ b/tests/test_server.py
@@ -0,0 +1,23 @@
+"""Tests for the token-count API endpoint."""
+
+from fastapi.testclient import TestClient
+
+from src.server.main import app
+
+client = TestClient(app, base_url="http://localhost")
+
+
+def test_tokencount_valid():
+    """A valid JSON payload returns a positive token count."""
+    response = client.post(
+        "/api/tokencount",
+        json={"input_text": "Hello world!", "model_id": "openai-community/gpt2"},
+    )
+    assert response.status_code == 200, response.content
+    data = response.json()
+    assert isinstance(data["token_count"], int)
+    assert data["token_count"] > 0
+
+
+def test_tokencount_missing_input():
+    """A payload without input_text fails request validation."""
+    response = client.post("/api/tokencount", json={"model_id": "openai-community/gpt2"})
+    assert response.status_code == 422, response.content
+    assert "detail" in response.json()
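+
+# To run just this module (assumes pytest and the new tokenizer deps are installed):
+#   pytest tests/test_server.py -q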