From 17297c62a27308edfb9b61cd1135ed0b4e11484c Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 29 May 2026 07:35:13 +0800 Subject: [PATCH 1/4] feat: add LLM auto context compaction (3-layer pipeline) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement server-side context compaction triggered when prompt tokens exceed a configurable threshold (default 90% of max_ctx): - Layer 1: Edit compaction (strip thinking blocks, truncate/dedupe tool outputs) — CPU only, <1ms - Layer 2: Self-summarization via internal generate() pass — condenses older conversation history into a concise summary - Layer 3: Hard truncation — progressive tail-keeping as last resort New CLI flags: --compaction, --compaction-threshold, --compaction-max-tokens, --compaction-keep-recent API: context_management parameter in Responses API allows per-request threshold override. Response includes usage.compacted_tokens_saved. Includes integration test harness (harness/test_compaction.py) and research documentation in docs/. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/llm-context-compaction.md | 985 ++++++++++++++++++++++++++++++ docs/small-model-compression.md | 362 +++++++++++ harness/test_compaction.py | 197 ++++++ server/CMakeLists.txt | 2 + server/src/server/compaction.cpp | 309 ++++++++++ server/src/server/compaction.h | 44 ++ server/src/server/http_server.cpp | 211 +++++-- server/src/server/http_server.h | 13 + server/src/server/server_main.cpp | 21 + 9 files changed, 2105 insertions(+), 39 deletions(-) create mode 100644 docs/llm-context-compaction.md create mode 100644 docs/small-model-compression.md create mode 100644 harness/test_compaction.py create mode 100644 server/src/server/compaction.cpp create mode 100644 server/src/server/compaction.h diff --git a/docs/llm-context-compaction.md b/docs/llm-context-compaction.md new file mode 100644 index 00000000..d4a799e6 --- /dev/null +++ b/docs/llm-context-compaction.md @@ -0,0 +1,985 @@ +# LLM Automatic Context Compaction: Research & Implementation Guide + +## Executive Summary + +LLM Auto Context Compaction is the industry's response to **Context Bloat** and **Context Rot** — the accumulation of massive intermediate tokens (tool outputs, reasoning chains, redundant history) that exhausts model context limits, drives up costs quadratically, increases latency, and degrades reasoning quality. This report analyzes how context compaction works across OpenAI, Anthropic, academic research, and major frameworks, then provides a detailed **implementation blueprint for the lucebox-hub inference server** — a C++/CUDA inference engine that currently has **no compaction** and simply returns HTTP 400 when context overflows[^1]. + +The key finding: **no major inference engine implements server-side self-summarization today**[^2]. All existing compaction happens either client-side (frameworks like LangChain, Inspect AI) or at the API provider level (OpenAI Responses API, Anthropic Beta). 
Implementing compaction directly in lucebox-hub would make it the **first open-source inference server with native self-summarization**. + +--- + +## Table of Contents + +1. [Architectural Taxonomy](#1-architectural-taxonomy) +2. [OpenAI's Implementation (Responses API)](#2-openais-implementation-responses-api) +3. [Anthropic's Implementation](#3-anthropics-implementation) +4. [Academic Advances](#4-academic-advances) +5. [Framework Implementations](#5-framework-implementations) +6. [Inference Engine Landscape](#6-inference-engine-landscape) +7. [lucebox-hub: Current State](#7-lucebox-hub-current-state) +8. [Implementation Blueprint for lucebox-hub](#8-implementation-blueprint-for-lucebox-hub) +9. [Comparative Trade-offs](#9-comparative-trade-offs) +10. [Architectural Recommendations](#10-architectural-recommendations) + +--- + +## 1. Architectural Taxonomy + +Modern context compaction splits into three core methodologies: + +``` +[Full Conversation Window (Messages, Tools, Code, Errors)] + │ + ▼ + (Crosses Threshold: e.g., 90% of Window) + ┌─────────────┴──────────────┐ + ▼ ▼ +┌─────────────────┐ ┌────────────────────────────────┐ +│ Pruning / Edit │ │ Semantic Compaction │ +│ Compaction │ └───────────────┬────────────────┘ +└────────┬────────┘ │ + │ ┌────────────────┴───────────────┐ + │ ▼ ▼ + │ ┌──────────────┐ ┌──────────────┐ + │ │ Text-Based │ │ Provider- │ + │ │Summarization │ │ Native Embed │ + ▼ └──────┬───────┘ └──────┬───────┘ +[Drop Tool Output, │ │ + Extended Thinking] ▼ ▼ + ┌────────────────────────────────────────────────────┐ + │ [Compacted Context: Small, High-Density Footprint]│ + └────────────────────────────────────────────────────┘ +``` + +### A. Provider-Native Token-Level Compression + +OpenAI and Anthropic both offer **server-side compaction** that delegates optimization to the infrastructure provider. 
The server monitors tokens in-stream, and when a threshold is crossed, executes an inline compaction pass producing an opaque encrypted state blob[^3][^4]. + +### B. Dynamic & Incremental Text Summarization + +Frameworks like LangChain and Inspect AI use a secondary LLM call to periodically compress older conversation text into high-density summaries. The key pattern is **incremental construction**: `Existing Summary + New Chunk → Updated Summary`[^5][^6]. + +### C. Edit & Loss-Aware Pruning + +Rather than rewriting text, edit compaction selectively alters prompt structure: stripping thinking blocks, truncating tool outputs, or using perplexity-guided token removal[^7][^8]. + +--- + +## 2. OpenAI's Implementation (Responses API) + +### 2.1 In-Stream Server-Side Compaction + +**Endpoint:** `POST /v1/responses` + +Developers specify a `context_management` configuration block: + +```json +{ + "model": "gpt-5.5", + "context_management": [ + { "type": "compaction", "compact_threshold": 100000 } + ], + "input": [ + { "role": "system", "content": "You are an autonomous engineering agent." }, + { "role": "user", "content": "Analyze the log files and debug the memory leak." } + ] +} +``` + +When token usage crosses the `compact_threshold`, the backend produces an encrypted `ResponseCompactionItem`[^9]: + +```python +class ResponseCompactionItem(BaseModel): + id: str # Unique ID + encrypted_content: str # Opaque compressed state + type: Literal["compaction"] # Always "compaction" + created_by: Optional[str] # Actor identifier +``` + +**Response during compaction event:** + +```json +{ + "id": "resp_92kL8xPqZ2a1", + "output": [ + { + "id": "item_comp_7x8y9z", + "type": "compaction", + "encrypted_content": "eyJhcmNoaXZl..." + }, + { + "id": "item_msg_1a2b3c", + "type": "message", + "role": "assistant", + "content": "I have completed the log analysis..." 
+ } + ], + "usage": { + "prompt_tokens": 125000, + "completion_tokens": 450, + "compacted_tokens_saved": 98000 + } +} +``` + +### 2.2 Standalone Compact Endpoint + +**Endpoint:** `POST /v1/responses/compact` + +For explicit, deterministic state minimization[^10]: + +```python +# Python SDK +compacted = client.responses.compact( + model="gpt-5", + previous_response_id="resp_abc123", # Or pass full input array + service_tier="flex", # 50% discount for latency-insensitive +) +# Returns CompactedResponse with object="response.compaction" +``` + +**Response schema:** + +```python +class CompactedResponse(BaseModel): + id: str + created_at: int + object: Literal["response.compaction"] + output: List[ResponseOutputItem] # User messages + compaction item + usage: ResponseUsage # Token accounting +``` + +### 2.3 State Chaining Mechanics + +**Stateless Array Chaining:** + +```json +{ + "input": [ + { "type": "compaction", "encrypted_content": "..." }, + { "role": "user", "content": "Great, draft a hotfix for db.py." } + ] +} +``` + +**Stateful ID Chaining:** + +```json +{ + "model": "gpt-5.5", + "previous_response_id": "resp_92kL8xPqZ2a1", + "input": [{ "role": "user", "content": "Great, draft a hotfix." 
}] +} +``` + +### 2.4 SDK Details + +**Python SDK signature:** + +```python +client.responses.create( + model="gpt-5.2-codex", + input=conversation, + store=False, + context_management=[{"type": "compaction", "compact_threshold": 100000}], +) +``` + +**TypeScript/Node.js SDK:** + +```typescript +const compactedResponse = await client.responses.compact({ + model: 'gpt-5.4', + previous_response_id: 'resp_abc123', + service_tier: 'flex', +}); +``` + +### 2.5 Feature Timeline + +| Date | SDK Version | Change | +|------|------------|--------| +| Dec 4, 2025 | ~2.9.x | `/responses/compact` endpoint introduced[^11] | +| Dec 10, 2025 | v2.10.0 | `model` parameter made required | +| Jan 9, 2026 | ~2.12.x | `completed_at` property added | +| May 13, 2026 | v2.37.0 | `service_tier` parameter added[^12] | + +### 2.6 Key Notes + +- `context_management` type currently only supports `"compaction"` +- `truncation: "auto"` is a simpler alternative (drops items from beginning, no summary) +- `service_tier: "flex"` gives 50% discount for latency-insensitive compaction +- The `encrypted_content` field is opaque — cannot be read or audited by humans +- ZDR (Zero Data Retention) compatible via encrypted compaction items + +--- + +## 3. Anthropic's Implementation + +### 3.1 Server-Side Compaction API (Beta, Jan 2026) + +**Beta header:** `compact-2026-01-12` + +```python +class BetaCompact20260112EditParam(TypedDict, total=False): + type: Required[Literal["compact_20260112"]] + instructions: Optional[str] # Custom prompt (REPLACES default) + pause_after_compaction: bool # Return early with stop_reason:"compaction" + trigger: Optional[BetaInputTokensTriggerParam] # Default: 150,000 tokens (min 50K) +``` + +**Compaction block (round-tripped in subsequent requests):** + +```python +class BetaCompactionBlockParam(TypedDict, total=False): + type: Required[Literal["compaction"]] + cache_control: Optional[BetaCacheControlEphemeralParam] # Cacheable! 
+ content: Optional[str] # Human-readable summary + encrypted_content: Optional[str] # Opaque metadata +``` + +**Key differences from OpenAI:** + +| Feature | Anthropic | OpenAI | +|---------|-----------|--------| +| Summary readable? | ✅ `content` field is human-readable | ❌ Fully opaque blob | +| Pause after compaction? | ✅ `stop_reason: "compaction"` | ❌ No | +| Custom summary prompt? | ✅ Completely replaces default | Limited | +| Use cheaper model? | ❌ Always same model | N/A | +| Cache on compaction block? | ✅ `cache_control` supported | N/A | + +### 3.2 Context Editing (Fine-Grained Control) + +Beta header: `context-management-2025-06-27` + +| Strategy | What It Does | +|----------|-------------| +| `clear_tool_uses_20250919` | Clears oldest tool results, keeps last N (default 3) | +| `clear_thinking_20251015` | Manages `thinking` blocks per model defaults | + +Key parameters: +- `keep: 3` — preserve most recent 3 tool interactions +- `clear_at_least` — minimum tokens to clear (avoids cache invalidation if not worthwhile) +- `exclude_tools` — tools whose results are never cleared + +### 3.3 Claude Code's `/compact` Slash Command + +Claude Code's default summary structure: + +``` +1. Task Overview (core request, success criteria) +2. Current State (completed work, files modified) +3. Important Discoveries (constraints, decisions, errors) +4. Next Steps (specific actions, blockers, priorities) +5. Context to Preserve (user preferences, domain details) +``` + +When context approaches limits, Claude Code **automatically compacts** using this structure[^14]. Persistent rules belong in `CLAUDE.md` because they're re-injected on every request. 
+ +### 3.4 Prompt Caching Interaction + +- Add `cache_control` at end of system prompt to cache independently of compaction +- Compaction blocks can be cached too (add `cache_control` to the block) +- Tool result clearing **invalidates** cached prefixes → use `clear_at_least` threshold +- Keeping thinking blocks → preserves cache; clearing → invalidates at that point + +--- + +## 4. Academic Advances + +### 4.1 Active Context Compression / "Focus Framework" (Jan 2026) + +**Paper:** "Active Context Compression: Autonomous Memory Management in LLM Agents" +**arXiv:** [2601.07190](https://arxiv.org/abs/2601.07190)[^15] + +The agent **autonomously decides** when to consolidate learnings into a persistent "Knowledge Block" and prunes raw history — inspired by slime mold exploration strategies. + +| Metric | Result | +|--------|--------| +| Token reduction | 22.7% average, up to **57%** | +| Accuracy preservation | 100% (60% → 60% on SWE-bench Lite) | +| Autonomous compressions/task | 6.0 average | + +### 4.2 Semantic-Anchor Compression / SAC (ICLR 2026) + +**Paper:** "Autoencoding-Free Context Compression for LLMs via Contextual Semantic Anchors" +**arXiv:** [2510.08907](https://arxiv.org/abs/2510.08907)[^16] +**GitHub:** [lx-Meteors/SAC](https://github.com/lx-Meteors/SAC) + +SAC selects **anchor tokens directly from the original context** (no learned compression tokens), modifies bidirectional attention so anchors aggregate surrounding KV representations: + +``` +Traditional: [COMP_1, COMP_2, ...COMP_N] ← trained autoencoder +SAC: [token_A*, token_B*, ...token_K*] ← selected anchors with modified attention +``` + +Consistently outperforms existing methods across compression ratios (Llama-3.2-1B/3B, Llama-3.1-8B). 
+ +### 4.3 IC-Former (EMNLP 2024) + +**Paper:** "In-Context Former: Lightning-fast Compressing Context for Large Language Model" +**arXiv:** [2406.13618](https://arxiv.org/abs/2406.13618)[^17] +**GitHub:** [wonderful9462/IC-Former](https://github.com/wonderful9462/IC-Former) + +A lightweight **cross-attention encoder** (~630M, 9% of target LLM) with learnable "digest tokens" compresses context in **O(n) time**: + +| Metric | Result | +|--------|--------| +| Speed improvement | **68–112× faster** than baseline | +| Performance preserved | >90% of downstream accuracy | +| FLOP reduction | 1/32 of self-attention baseline | + +### 4.4 ACON Framework (Microsoft, Oct 2025) + +**Paper:** "ACON: Optimizing Context Compression for Long-horizon LLM Agents" +**arXiv:** [2510.00615](https://arxiv.org/abs/2510.00615)[^18] +**GitHub:** [microsoft/acon](https://github.com/microsoft/acon) + +Gradient-free pipeline: when compressed context causes task failure, an evaluator LLM analyzes what was lost and **refines the compression guideline in natural language**. 
+ +| Metric | Result | +|--------|--------| +| Memory reduction | 26–54% (peak tokens) | +| Accuracy preserved | >95% when distilled | +| Small-LM improvement | +46% for smaller agents | + +### 4.5 SWE-Pruner (Bytedance, Jan 2026) + +**arXiv:** [2601.16746](https://arxiv.org/abs/2601.16746)[^19] +**GitHub:** [Ayanami1314/swe-pruner](https://github.com/Ayanami1314/swe-pruner) + +Task-aware pruning for coding agents using a **0.6B neural skimmer** (Qwen3-Reranker-0.6B): + +| Metric | Result | +|--------|--------| +| Token reduction (SWE-Bench) | 23–54% while **improving** success rates | +| Compression (LongCodeQA) | **14.84× compression** with minimal impact | +| Cost savings | ~40% on Claude API tokens | +| Training F1 | 0.78 | + +### 4.6 LLMLingua Series (Microsoft, 2023–2024) + +**GitHub:** [microsoft/LLMLingua](https://github.com/microsoft/LLMLingua) + +| Paper | Venue | Model | Key Metric | +|-------|-------|-------|-----------| +| LLMLingua | EMNLP 2023 | GPT-2 Small (124M) | Up to **20x compression** | +| LongLLMLingua | ACL 2024 | — | +21.4% RAG with 1/4 tokens | +| LLMLingua-2 | ACL 2024 | XLM-RoBERTa-L (561M) | **3–6x faster** than v1 | + +### 4.7 Approach Taxonomy + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Context Compression Approach Taxonomy │ +├─────────────────┬───────────────────────────────────────────┤ +│ TRAINING-BASED │ SAC (anchor tokens + bidirectional attn) │ +│ Model Changes │ IC-Former (cross-attn digest tokens) │ +│ │ LLMLingua-2 (BERT classifier distilled) │ +├─────────────────┼───────────────────────────────────────────┤ +│ GUIDELINE-BASED │ ACON (gradient-free NL guideline optim.) 
│ +│ No model change │ SWE-Pruner (goal → neural skimmer) │ +│ │ LLMLingua v1 (perplexity scoring, no tune) │ +├─────────────────┼───────────────────────────────────────────┤ +│ AGENT-NATIVE │ Focus/ACC (self-regulating agent tools) │ +│ Autonomous │ │ +├─────────────────┼───────────────────────────────────────────┤ +│ KV-CACHE LEVEL │ SnapKV (attention head eviction) │ +│ Inference Opt. │ PFlash (speculative prefill — lucebox) │ +└─────────────────┴───────────────────────────────────────────┘ +``` + +--- + +## 5. Framework Implementations + +### 5.1 Inspect AI — Most Sophisticated (5 Strategies) + +[UKGovernmentBEIS/inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai)[^5] + +```python +class CompactionStrategy(abc.ABC): + def __init__(self, *, threshold: int | float = 0.9, memory: bool = True): + # threshold: float [0,1] = % of context window; int = absolute token count + + @abc.abstractmethod + async def compact(self, model, messages, tools): + ... # Returns (compacted_input, optional_summary) +``` + +**5 strategies:** + +| Strategy | Approach | What's Preserved | +|----------|----------|-----------------| +| `CompactionAuto` | Try Native → fallback Summary | Depends on sub-strategy | +| `CompactionNative` | Provider API (OpenAI/Anthropic) | Opaque server-side | +| `CompactionSummary` | LLM summarization (incremental) | System + input + summary | +| `CompactionEdit` | Strip thinking + truncate tools | Structure + last N results | +| `CompactionTrim` | Keep fraction of messages | System + recent % | + +**Key patterns:** +- Three-tier token counting (baseline reuse → delta → full count) +- Memory pre-warning at 90% of threshold +- Retry loop (up to 3 iterations if first pass insufficient) +- Incremental summarization (only summarize since last summary) + +**Integration:** + +```python +from inspect_ai.agent import react +from inspect_ai.model import CompactionAuto, CompactionSummary, CompactionEdit + +react(tools=[bash(), text_editor()], 
compaction=CompactionAuto()) +react(tools=[bash()], compaction=CompactionSummary(threshold=0.8)) +react(tools=[bash()], compaction=CompactionEdit(keep_tool_uses=3)) +``` + +### 5.2 OpenAI Agents SDK + +[openai/openai-agents-python](https://github.com/openai/openai-agents-python)[^21] + +```python +class OpenAIResponsesCompactionSession: + DEFAULT_COMPACTION_THRESHOLD = 10 # candidate items (not tokens!) + + def __init__(self, underlying_session, model="gpt-4.1", + compaction_mode: Literal["previous_response_id", "input", "auto"] = "auto", + should_trigger_compaction: Callable | None = None): + ... + + async def run_compaction(self): + compacted = await self.client.responses.compact(**kwargs) + await self._replace_underlying_session_items(compacted.output) # atomic with rollback +``` + +Trigger: fires when ≥10 "candidate items" (assistant messages, tool calls, reasoning) exist. + +### 5.3 LangChain v1 — Middleware Pattern + +```python +class SummarizationMiddleware(AgentMiddleware): + def __init__(self, model, trigger=("fraction", 0.8), + keep=("messages", 20), summary_prompt=DEFAULT_SUMMARY_PROMPT): + ... +``` + +**Classic memory classes (deprecated since 0.3.1):** +- `ConversationSummaryMemory` — rolling summary every turn +- `ConversationTokenBufferMemory` — FIFO truncation with token limit +- `ConversationSummaryBufferMemory` — hybrid: recent raw + older summarized + +### 5.4 AutoGen — Pure Truncation + +Three context classes: +- `BufferedChatCompletionContext` — N-message sliding window +- `TokenLimitedChatCompletionContext` — drops from **middle** (unique!) 
+- `HeadAndTailChatCompletionContext` — preserve first + last + +### 5.5 Semantic Kernel (C#) + +```csharp +// Count-based truncation with hysteresis +new ChatHistoryTruncationReducer(targetCount: 10, thresholdCount: 5) + +// LLM summarization with incremental detection +new ChatHistorySummarizationReducer(chatService, targetCount: 2, thresholdCount: 4) +``` + +### 5.6 Cross-Framework Comparison + +| Framework | Approach | Threshold | Tool Handling | Native API | +|-----------|----------|-----------|---------------|------------| +| **Inspect AI** | 5 strategies | % of ctx or absolute | `keep_tool_uses=N` + placeholder | ✅ | +| **OpenAI Agents SDK** | Server-side opaque | ≥10 candidate items | Excluded from candidates | ✅ | +| **LangChain v1** | LLM summarization | Fraction/tokens/messages | Pairs kept together | ❌ | +| **AutoGen** | Pure truncation | Count or tokens | Function pair protection | ❌ | +| **Semantic Kernel** | Truncation or LLM summary | Count + hysteresis | Function pair protection | ❌ | + +--- + +## 6. Inference Engine Landscape + +### No Server Implements Self-Summarization + +**Critical finding:** No major inference engine (vLLM, llama.cpp, SGLang, TGI) implements server-side self-summarization[^2]. 
+ +| Engine | Strategy | Mechanism | +|--------|----------|-----------| +| **llama.cpp** | Ring-buffer shift (`--context-shift`) | Drop middle tokens, shift KV positions | +| **vLLM** | Preemption + full re-prefill | Free all KV blocks, recompute entirely | +| **SGLang** | Radix tree LRU eviction | Cross-request prefix sharing | +| **All others** | Hard rejection (HTTP 400) | No action taken | + +### llama.cpp's Context-Shift + +When enabled, keeps `n_keep` tokens from front (system prompt), discards `n_discard` from middle, shifts remaining positions[^23]: + +```cpp +common_context_seq_rm (ctx_tgt, slot.id, head_p, head_c); +common_context_seq_add(ctx_tgt, slot.id, head_c, head_c + n_match, kv_shift); +``` + +### The Proxy Pattern (agentguard) + +[Roboter-Schlafen-Nicht/agentguard](https://github.com/Roboter-Schlafen-Nicht/agentguard)[^24] — 3-phase compaction: + +1. **Rule-based truncation**: Stub old tool results, deduplicate file reads +2. **LLM summarization**: Call a separate small model (e.g., Qwen2.5-coder:3b) +3. **Hard cap**: Drop oldest atomic message groups until under budget + +--- + +## 7. lucebox-hub: Current State + +### 7.1 Architecture Overview + +lucebox-hub is a **C++17/CUDA inference server** implementing DFlash speculative decoding + DDTree verification for 3–5× speedup on Qwen3.5/3.6-27B[^1]. 
It serves three API formats: + +``` +Clients (Claude Code, Codex, Open WebUI) + │ + ▼ +┌─────────────────────────────────────────┐ +│ lucebox-hub HTTP Server │ +│ ┌───────────┬──────────┬────────────┐ │ +│ │/v1/chat/ │/v1/ │/v1/ │ │ +│ │completions│messages │responses │ │ +│ └───────────┴──────────┴────────────┘ │ +│ │ (client threads) │ +│ ▼ │ +│ ┌────────────────────────────────────┐ │ +│ │ Single Worker Thread │ │ +│ │ [PFlash] → [Prefix Cache] → GPU │ │ +│ └────────────────────────────────────┘ │ +│ │ │ +│ ┌──────┴───────────────────────────┐ │ +│ │ Prefix Cache (2-tier LRU) │ │ +│ │ Disk Cache (.dkv files) │ │ +│ │ Tool Memory (LRU, 50K entries) │ │ +│ └───────────────────────────────────┘ │ +└─────────────────────────────────────────┘ +``` + +### 7.2 Current Context Overflow Handling + +**ZERO compaction exists.** The only overflow handling[^25]: + +```cpp +// server/src/server/http_server.cpp:1027-1030 +if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + send_error(fd, 400, "prompt + max_tokens exceeds context window"); + return true; +} +``` + +### 7.3 Existing Infrastructure Relevant to Compaction + +| Component | File | Relevance | +|-----------|------|-----------| +| PFlash speculative prefill | `flashprefill.h` | Existing "compression" (structural, not semantic) | +| Prefix Cache (2-tier LRU) | `prefix_cache.h` | Cache invalidation after compaction | +| Tool Memory | `tool_memory.h` | LRU of tool call text for replay | +| Thinking Budget | `http_server.h:64-88` | Precedent for token budget control | +| Token Counting | `/v1/messages/count_tokens` | Pre-flight token measurement | +| Chat Template Rendering | `chat_template.cpp` | Message → token string pipeline | + +### 7.4 Threading Model + +``` +Main Thread: accept() loop → spawn client threads +Client Threads (detached): parse HTTP → route_request() → block on job.cv +Worker Thread (single): dequeue → [pflash] → [prefix cache] → generate() → stream back +``` + +**Critical 
constraint:** Only the worker thread calls `backend_.generate()`. Compaction must run in the worker thread[^26]. + +--- + +## 8. Implementation Blueprint for lucebox-hub + +### 8.1 Layered Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Incoming Request │ +└───────────────────────────┬─────────────────────────────┘ + ▼ + ┌─────────────────────────────┐ + │ Token Count Check │ + │ prompt_tokens + max_output │ + └──────────────┬──────────────┘ + │ + ┌──────────────┴──────────────┐ + │ Under threshold? │ + │ YES → Normal Generation │ + │ NO ↓ │ + └──────────────┬──────────────┘ + ▼ + ┌───────────────────────────────────────────────┐ + │ Layer 1: Edit Compaction (CPU-only, <1ms) │ + │ • Strip blocks from old turns │ + │ • Truncate old tool results to placeholder │ + │ • Deduplicate repeated file reads │ + └────────────────────────┬──────────────────────┘ + │ Still over? + ▼ + ┌───────────────────────────────────────────────┐ + │ Layer 2: Self-Summarization (GPU, 5-30s) │ + │ • Internal generate() pass with summary prompt │ + │ • Replace old turns with [CONTEXT SUMMARY] │ + │ • Preserve system prompt + recent 30% turns │ + └────────────────────────┬──────────────────────┘ + │ Still over? + ▼ + ┌───────────────────────────────────────────────┐ + │ Layer 3: Hard Truncation (CPU, last resort) │ + │ • Keep system + last N messages only │ + │ • Drop everything else │ + └────────────────────────┬──────────────────────┘ + │ Still over? 
+ ▼ + ┌──────────────────┐ + │ HTTP 400 Error │ + └──────────────────┘ +``` + +### 8.2 Configuration Additions + +**`ServerConfig` additions (`http_server.h`):** + +```cpp +// Context compaction configuration +bool compaction_enabled = false; +float compaction_threshold = 0.9f; // trigger at 90% of max_ctx +int compaction_max_tokens = 2048; // max tokens for summary output +float compaction_keep_recent = 0.3f; // keep last 30% of turns verbatim +bool compaction_strip_thinking = true; // Layer 1: strip old <think> blocks +int compaction_keep_tool_uses = 3; // Layer 1: keep last N tool results +std::string compaction_prompt; // Custom summarization prompt +``` + +**CLI flags (`server_main.cpp`):** + +``` +--compaction Enable auto context compaction +--compaction-threshold Trigger ratio (default 0.9) +--compaction-max-tokens Max summary length (default 2048) +--compaction-keep-recent Recent turn ratio to preserve (default 0.3) +``` + +### 8.3 Layer 1: Edit Compaction (CPU-Only) + +Inspired by Inspect AI's `CompactionEdit`[^7]: + +```cpp +// New file: server/src/server/compaction.h +struct CompactionResult { + bool applied = false; + std::vector compacted_messages; + int tokens_saved = 0; +}; + +CompactionResult edit_compact( + const std::vector& messages, + const ServerConfig& config) { + + CompactionResult result; + result.compacted_messages = messages; + + // Phase 1: Strip thinking blocks from all but last turn + for (int i = 0; i < (int)result.compacted_messages.size() - 1; i++) { + auto& msg = result.compacted_messages[i]; + if (msg.role == "assistant") { + strip_thinking_blocks(msg.content); // Remove <think>...</think> 
+ } + } + + // Phase 2: Truncate old tool results (keep last N) + int tool_count = 0; + for (int i = result.compacted_messages.size() - 1; i >= 0; i--) { + if (is_tool_result(result.compacted_messages[i])) { + tool_count++; + if (tool_count > config.compaction_keep_tool_uses) { + result.compacted_messages[i].content = "(Tool result removed)"; + } + } + } + + // Phase 3: Deduplicate repeated file reads + // Keep only latest read of each file path + std::unordered_set seen_paths; + for (int i = result.compacted_messages.size() - 1; i >= 0; i--) { + auto path = extract_file_read_path(result.compacted_messages[i]); + if (!path.empty()) { + if (seen_paths.count(path)) { + result.compacted_messages[i].content = + "[dedup: previously read " + path + "]"; + } + seen_paths.insert(path); + } + } + + result.applied = true; + return result; +} +``` + +### 8.4 Layer 2: Self-Summarization + +The novel pattern — the server uses its own loaded model to summarize older turns: + +```cpp +CompactionResult summarize_compact( + const std::vector& messages, + const ServerConfig& config, + Tokenizer& tokenizer, + ModelBackend& backend) { + + // 1. Split messages: keep recent N% verbatim + int keep_from = messages.size() * (1.0f - config.compaction_keep_recent); + std::vector old_msgs(messages.begin(), messages.begin() + keep_from); + std::vector recent_msgs(messages.begin() + keep_from, messages.end()); + + // 2. Construct summarization prompt + std::vector summary_request; + summary_request.push_back({"system", + "Summarize the following conversation concisely. " + "Preserve: key decisions, file paths, current task state, error messages. " + "Do not reproduce code verbatim. Keep under 500 words."}); + summary_request.push_back({"user", serialize_messages(old_msgs)}); + + // 3. Render + tokenize summary request + std::string rendered = render_chat_template( + summary_request, chat_format_, true, false, ""); + std::vector prompt_tokens = tokenizer.encode(rendered); + + // 4. 
Generate summary (internal inference pass) + GenerateRequest sum_req; + sum_req.prompt = prompt_tokens; + sum_req.n_gen = config.compaction_max_tokens; + sum_req.sampler = {.temp = 0.0f}; // greedy for determinism + + DaemonIO sum_io; + sum_io.stream_fd = -1; + std::vector output_tokens; + sum_io.on_token = [&](int32_t tok) -> bool { + output_tokens.push_back(tok); + return true; + }; + backend.generate(sum_req, sum_io); + + // 5. Decode summary text + std::string summary_text = tokenizer.decode(output_tokens); + + // 6. Rebuild message array + CompactionResult result; + result.compacted_messages.push_back(messages[0]); // Preserve system prompt + result.compacted_messages.push_back( + {"assistant", "[CONTEXT SUMMARY]\n\n" + summary_text}); + result.compacted_messages.insert( + result.compacted_messages.end(), recent_msgs.begin(), recent_msgs.end()); + result.applied = true; + return result; +} +``` + +### 8.5 Integration Point: Worker Thread + +```cpp +// In worker_loop() at http_server.cpp, after job dequeue, before generation: +void worker_loop() { + while (running_) { + auto job = dequeue(); + auto& req = job.request; + + // === COMPACTION INSERTION POINT === + if (req.compaction_needed && config_.compaction_enabled) { + // Layer 1: Edit compaction (CPU-only, fast) + auto edit_result = edit_compact(req.chat_messages, config_); + std::string rendered = render_chat_template( + edit_result.compacted_messages, ...); + req.prompt_tokens = tokenizer_.encode(rendered); + + // Check if Layer 1 was sufficient + if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + // Layer 2: Self-summarization (requires GPU inference) + auto sum_result = summarize_compact( + edit_result.compacted_messages, config_, tokenizer_, backend_); + rendered = render_chat_template( + sum_result.compacted_messages, ...); + req.prompt_tokens = tokenizer_.encode(rendered); + } + + // Final check — Layer 3: Hard truncation + if ((int)req.prompt_tokens.size() + req.max_output > 
config_.max_ctx) { + hard_truncate(req, config_); + } + + req.compaction_applied = true; + } + + // ... normal generation continues ... + } +} +``` + +### 8.6 Prefix Cache Implications + +After compaction, the prefix hash changes: + +```cpp +// After compaction, only attempt prefix match on system prompt portion +if (req.compaction_applied) { + int sys_tokens = req.system_prompt_token_count; + auto cache_hit = prefix_cache_.lookup( + std::vector(req.prompt_tokens.begin(), + req.prompt_tokens.begin() + sys_tokens)); +} +``` + +| Scenario | KV Cache Impact | +|----------|----------------| +| No compaction (current) | Full prefix hash valid | +| Edit only (strip thinking) | Partial miss after stripped regions | +| Summarization | Full miss except system prompt prefix | +| Hard truncation | System prompt still hits | + +### 8.7 API Response: Compaction Signal + +For the Responses API, signal compaction in the response: + +```json +{ + "id": "resp_abc123", + "object": "response", + "output": [ + { + "type": "compaction_state", + "data": "" + }, + { + "type": "message", + "role": "assistant", + "content": "..." + } + ], + "usage": { + "prompt_tokens": 15000, + "completion_tokens": 450, + "compacted_tokens_saved": 85000 + } +} +``` + +### 8.8 OpenAI-Compatible `context_management` Parameter + +```cpp +// In route_request(), parse context_management from body: +if (body.contains("context_management")) { + for (auto& cm : body["context_management"]) { + if (cm["type"] == "compaction") { + req.compaction_threshold = cm.value("compact_threshold", + (int)(config_.max_ctx * 0.9)); + } + } +} +``` + +### 8.9 Streaming Compaction Notification + +``` +data: {"type":"compaction","status":"started","original_tokens":125000}\n\n +... (compaction runs) ... +data: {"type":"compaction","status":"completed","saved_tokens":98000}\n\n +data: {"type":"message","content":"..."}\n\n +``` + +--- + +## 9. 
Comparative Trade-offs + +| Strategy | Latency | GPU Mem | Quality | Stateless | lucebox-hub Fit | +|----------|:-------:|:-------:|:-------:|:---------:|:---------------:| +| **Hard 400** (current) | None | None | N/A | ✅ | Current | +| **Edit compaction** (L1) | <1ms | None | Medium | ✅ | **Excellent** | +| **Self-summarization** (L2) | 5–30s | Same | Best | ✅ | **Good** | +| **Ring-buffer shift** | <1ms | None | Low | ❌ | Poor | +| **Sidecar 0.6B model** | 1–5s | +1GB | Good | ✅ | Good | +| **Provider-native** | N/A | N/A | Best | ✅ | N/A (IS provider) | + +### Key Risks for Self-Summarization + +1. **Head-of-line blocking**: The single worker thread is busy with compaction → no other requests are served until it finishes. Mitigation: cap `compaction_max_tokens` aggressively (512–1024). +2. **Latency spike**: Users see 15–30s instead of 5s. Mitigation: SSE `compaction_started` event. +3. **Quality**: Q4-quantized 27B summarizing 64K coding session may lose critical details. Mitigation: preserve last 30–50% verbatim. +4. **Cache pollution**: Summary tokens evict existing LRU entries. Mitigation: separate cache namespace. + +--- + +## 10. Architectural Recommendations + +### Recommendation 1: Adopt Layered Mitigation + +| Layer | Action | Cost | When | +|-------|--------|------|------| +| L1 | Strip thinking + truncate old tools | CPU, <1ms | Always at 80% | +| L2 | Self-summarize with own model | GPU, 5–30s | When L1 insufficient (90%) | +| L3 | Hard truncation (keep system + last N) | CPU | Emergency fallback | + +### Recommendation 2: Support OpenAI `context_management` API + +Parse the standard parameter in `/v1/responses` requests for drop-in compatibility with OpenAI Agents SDK. + +### Recommendation 3: Streaming Compaction Notification + +Emit SSE event immediately so clients know compaction is in progress. + +### Recommendation 4: Incremental Construction + +Mark summary messages with metadata. 
On next compaction, only summarize content after the last summary — avoid re-summarizing already-summarized content. + +### Recommendation 5: Preserve System Prompt for Cache Hits + +Never modify the system prompt during compaction. This preserves prefix-cache hits for the highest-value cache entry. + +### Recommendation 6: Configurable via Model Cards + +```json +{ + "model": "qwen3.5-27b", + "compaction": { + "enabled": true, + "threshold": 0.85, + "max_summary_tokens": 1024, + "keep_recent_ratio": 0.4 + } +} +``` + +--- + +## Footnotes + +[^1]: `server/src/server/http_server.cpp:1027-1030` — Hard 400 error on context overflow +[^2]: Research finding: no inference engine (vLLM, llama.cpp, SGLang) implements self-summarization +[^3]: [openai/openai-python — ResponseCompactionItem](https://github.com/openai/openai-python/blob/main/src/openai/types/responses/response_compaction_item.py) +[^4]: [anthropics/anthropic-sdk-python — BetaCompact20260112EditParam](https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/types/beta/beta_compact_20260112_edit_param.py) +[^5]: [UKGovernmentBEIS/inspect_ai — CompactionSummary](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/model/_compaction/summary.py) +[^6]: [langchain-ai/langchain — SummarizationMiddleware](https://github.com/langchain-ai/langchain/blob/master/libs/langchain_v1/langchain/agents/middleware/summarization.py) +[^7]: [UKGovernmentBEIS/inspect_ai — CompactionEdit](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/model/_compaction/edit.py) +[^8]: [microsoft/LLMLingua](https://github.com/microsoft/LLMLingua) +[^9]: [openai/openai-python — response_create_params.py](https://github.com/openai/openai-python/blob/main/src/openai/types/responses/response_create_params.py) +[^10]: [openai/openai-python — response_compact_params.py](https://github.com/openai/openai-python/blob/main/src/openai/types/responses/response_compact_params.py) +[^11]: 
[openai/openai-python commit 1039d56](https://github.com/openai/openai-python/commit/1039d5637779e035263019a687b562d3ab5d2c1a) +[^12]: [openai/openai-python commit 625827c](https://github.com/openai/openai-python/commit/625827c5509ece3c40e5002be37a9bd9d91b5374) +[^14]: [anthropics/anthropic-sdk-python — _beta_compaction_control.py](https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/lib/tools/_beta_compaction_control.py) +[^15]: [arXiv:2601.07190](https://arxiv.org/abs/2601.07190) — Active Context Compression +[^16]: [arXiv:2510.08907](https://arxiv.org/abs/2510.08907) — SAC (ICLR 2026) +[^17]: [arXiv:2406.13618](https://arxiv.org/abs/2406.13618) — IC-Former (EMNLP 2024) +[^18]: [arXiv:2510.00615](https://arxiv.org/abs/2510.00615) — ACON Framework +[^19]: [arXiv:2601.16746](https://arxiv.org/abs/2601.16746) — SWE-Pruner +[^21]: [openai/openai-agents-python — compaction session](https://github.com/openai/openai-agents-python/blob/main/src/agents/memory/openai_responses_compaction_session.py) +[^23]: [ggml-org/llama.cpp — server-context.cpp](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/server-context.cpp) +[^24]: [Roboter-Schlafen-Nicht/agentguard](https://github.com/Roboter-Schlafen-Nicht/agentguard) +[^25]: `server/src/server/http_server.cpp:1027-1030` in lucebox-hub +[^26]: `server/src/server/http_server.h:288-302` — Single worker thread architecture diff --git a/docs/small-model-compression.md b/docs/small-model-compression.md new file mode 100644 index 00000000..15e718ae --- /dev/null +++ b/docs/small-model-compression.md @@ -0,0 +1,362 @@ +# Small Model Context Compression: Research Report + +## Executive Summary + +Multiple peer-reviewed papers **definitively prove** that 0.6B–0.8B models can do context compression effectively. 
The critical insight is that **architecture matters more than scale** in the sub-1B range: a 561M bidirectional encoder (XLM-RoBERTa) outperforms a 7B causal LM (LLaMA) for token-level compression tasks. For lucebox-hub, which already loads Qwen3.5-0.8B as a draft model, the same model could double as a context compressor with minimal additional VRAM cost. + +--- + +## 1. Complete Evidence: Sub-1B Models for Compression + +### 1.1 SWE-Pruner (ByteDance, 2026) — 0.6B + +**Paper:** [arXiv:2601.16746](https://arxiv.org/abs/2601.16746) +**GitHub:** [Ayanami1314/swe-pruner](https://github.com/Ayanami1314/swe-pruner) +**HuggingFace:** [ayanami-kitasan/code-pruner](https://huggingface.co/ayanami-kitasan/code-pruner) + +**Architecture:** +- Base: `Qwen/Qwen3-Reranker-0.6B` (0.6B parameters) +- Head: CRF compression head with multi-layer fusion +- Bottleneck dim: 256, 1 fusion layer, 8 attention heads, dropout=0.4 +- Output: Binary line-level keep/prune decision per code line + +**Training:** +- Dataset: 61K Python code samples (GitHub → dedup → query generation → line-level labeling via LLM) +- Loss: Focal loss (auto-alpha) + score regression loss (λ=0.05) +- Hardware: 8×A100-80GB, ~4 hours, 3 epochs, lr=1e-4, AdamW + +**Performance:** + +| Benchmark | Metric | Result | +|-----------|--------|--------| +| SWE-Bench Verified | Token reduction | **23–54%** | +| SWE-Bench Verified | Task success | Maintained or **improved** | +| LongCodeQA | Compression | Up to **14.84×** | +| Training | F1 score | **0.78** | +| Claude Sonnet 4.5 | Cost savings | **~40%** | + +**Why 0.6B works:** The task is framed as *reranking* (binary classification), not generation. Qwen3-Reranker is designed for relevance scoring — compression is just line-level binary classification with a goal hint. 
+ +--- + +### 1.2 LLMLingua-2 (Microsoft, ACL 2024) — 561M / 178M + +**Paper:** [arXiv:2403.12968](https://arxiv.org/abs/2403.12968) +**GitHub:** [microsoft/LLMLingua](https://github.com/microsoft/LLMLingua) +**HuggingFace:** [microsoft/llmlingua-2-xlm-roberta-large-meetingbank](https://huggingface.co/microsoft/llmlingua-2-xlm-roberta-large-meetingbank) + +**Two published models:** + +| Model | Architecture | Size | +|-------|-------------|------| +| `llmlingua-2-xlm-roberta-large-meetingbank` | XLM-RoBERTa Large | **~561M** | +| `llmlingua-2-bert-base-multilingual-cased-meetingbank` | mBERT | **~178M** | + +**Mechanism:** Token binary classification (keep/drop) via bidirectional encoder. Each token gets P(preserve) score; tokens below threshold are removed. + +**Training:** Data distillation — GPT-4 generates compressed texts, then binary keep/drop labels are aligned back to original tokens. + +**Performance vs LLMLingua v1 (which uses 7B LLaMA):** +- **3×–6× faster** compression speed +- **1.6×–2.9×** end-to-end latency reduction +- Better out-of-domain generalization +- Faithful (extractive only — no hallucination) + +**Critical insight from paper:** +> "Information entropy [unidirectional] may be a suboptimal compression metric: it only leverages unidirectional context and may fail to capture all essential information... We use a Transformer encoder as the base architecture to capture all essential information from the **full bidirectional context**." + +**Verdict:** 561M bidirectional **outperforms** 7B unidirectional for this task. 
+ +--- + +### 1.3 IC-Former (EMNLP 2024) — ~630M + +**Paper:** [arXiv:2406.13618](https://arxiv.org/abs/2406.13618) +**GitHub:** [wonderful9462/IC-Former](https://github.com/wonderful9462/IC-Former) +**HuggingFace:** [wonderful9462/IC-Former](https://huggingface.co/wonderful9462/IC-Former) + +**Architecture:** +- Separate lightweight cross-attention module — NOT the LLM itself +- Size: **~9% of target LLM** (with 7B target → ~630M) +- Components: N cross-attention layers + M learnable "digest token" embeddings +- Mechanism: Digest tokens attend over context embeddings via cross-attention +- Complexity: O(kn) — linear in context length + +**Performance:** + +| Metric | Value | +|--------|-------| +| FLOPs vs baseline | **1/32** | +| Speed improvement | **68–112× faster** | +| Performance retained | **>90% of baseline** | +| Compression ratio | **4×** (soft prompt output) | + +**Paper claim:** "It is lightweight and efficient, with a parameter size that is **9% of the target LLM**... requires only 1/32 of the floating-point operations during compression." 
+ +--- + +### 1.4 RECOMP (EMNLP 2023) — 110M / 770M + +**Paper:** [arXiv:2310.04408](https://arxiv.org/abs/2310.04408) +**GitHub:** [carriex/recomp](https://github.com/carriex/recomp) +**HuggingFace:** [fangyuan/nq_extractive_compressor](https://huggingface.co/fangyuan/nq_extractive_compressor) + +**Two compressors:** + +| Compressor | Architecture | Size | Training | +|-----------|-------------|------|----------| +| Extractive | Dual-encoder (Contriever) | **110M** | Contrastive (sentence helpfulness) | +| Abstractive | T5-large (seq2seq) | **~770M** | Supervised (GPT-3.5 summaries) | + +**Performance (base LM = Flan-UL2 20B):** + +| Dataset | Method | Compression | EM Drop | +|---------|--------|:-----------:|:-------:| +| NQ | RECOMP Extractive | **~6% tokens** | -2.8 | +| NQ | RECOMP Abstractive | **~5% tokens** | -2.4 | +| TQA | RECOMP Abstractive | **~5% tokens** | -3.7 | + +**Key finding:** Compressors trained for one LM **transfer** to other LMs — the 110M compressor can serve any black-box target. + +--- + +### 1.5 Selective Context (EMNLP 2023) — 124M (GPT-2) + +**Paper:** [arXiv:2310.06201](https://arxiv.org/abs/2310.06201) +**GitHub:** [liyucheng09/Selective_Context](https://github.com/liyucheng09/Selective_Context) + +**Architecture:** Uses GPT-2 Small (124M) for self-information scoring. No training needed. + +```python +class SelectiveContext: + def __init__(self, model_type='gpt2', lang='en'): + self.model = GPT2LMHeadModel.from_pretrained('gpt2') # 124M params +``` + +**Mechanism:** Self-information (surprisal) = −log₂P(token | context). Low self-information = redundant = pruned. 
+ +**Performance:** +- **50% context reduction** → 36% memory reduction, 32% inference time reduction +- Only −0.023 BERTScore drop, −0.038 faithfulness drop +- Training-free (zero-shot application of off-the-shelf GPT-2) + +--- + +### 1.6 LLMLingua v1 (EMNLP 2023) — 124M default + +**Paper:** [arXiv:2310.05736](https://arxiv.org/abs/2310.05736) +**GitHub:** [microsoft/LLMLingua](https://github.com/microsoft/LLMLingua) + +**Default compressor:** GPT-2 Small (124M). Also supports phi-2 (2.7B) and LLaMA-7B. + +**Performance (GSM8K, target = GPT-3.5-Turbo):** + +| Method | Compression | EM | +|--------|:-----------:|:---:| +| Full context | 1× | 78.85 | +| LLMLingua (7B) 5× | 5× | **79.08** | +| LLMLingua (7B) 14× | 14× | 77.41 | +| LLMLingua (7B) 20× | 20× | 77.33 | + +**Critical GPT-2 Small ablation:** + +| Compressor | EM (5×) | EM (14×) | EM (20×) | +|------------|:-------:|:--------:|:--------:| +| Alpaca-7B | 79.08 | 77.41 | 77.33 | +| **GPT2-Alpaca (124M)** | **77.02** | **76.42** | **76.27** | +| **Δ** | **−2.06** | **−0.99** | **−1.06** | + +**Key finding:** A **60× smaller** compressor (124M vs 7B) loses only **~1–2 EM points**. The iterative algorithm compensates for the weaker perplexity signal. + +--- + +### 1.7 Additional HuggingFace Models + +| Model | Params | Type | Purpose | +|-------|--------|------|---------| +| `gravitee-io/very-small-prompt-compression` | **60.5M** | T5-small | Short prompt compression | +| `dotslashderek/flan-t5-small-prompt-compression` | **77M** | FLAN-T5-small | Prompt compression | +| `princeton-nlp/AutoCompressor-1.3b-30k` | 1.3B | OPT-1.3B | Summary vectors (30K ctx) | + +--- + +## 2. Compressor Size vs. 
Quality: The Evidence + +### 2.1 Direct Comparison Table + +| Size | Model | Task | Quality vs Full Context | +|------|-------|------|:-----------------------:| +| **110M** (Contriever) | RECOMP extractive | QA | ~6% tokens, -2.8 EM | +| **124M** (GPT-2) | Selective Context | General | 50% reduction, -0.023 BERTScore | +| **124M** (GPT-2) | LLMLingua v1 | CoT/QA | ~1-2 EM loss vs 7B compressor | +| **178M** (mBERT) | LLMLingua-2 small | Task-agnostic | Good in-domain, moderate OOD | +| **561M** (XLM-RoBERTa-L) | LLMLingua-2 large | Task-agnostic | **Best sub-1B performance** | +| **600M** (Qwen3-Reranker) | SWE-Pruner | Code agents | **F1=0.78, no accuracy loss** | +| **~630M** (IC-Former) | IC-Former | General LLM | **>90% baseline, 68-112× faster** | +| 7B (LLaMA) | LLMLingua v1 | CoT/RAG | Best PPL scoring, but slower | +| 7B (LLaMA) | AutoCompressor | Long docs | Best soft-prompt quality | + +### 2.2 Diminishing Returns Curve + +``` +Quality + ^ + | ●——●——● (XLM-RoBERTa 561M ≈ LLaMA-7B for discriminative tasks) + | ● (Contriever 110M already near-ceiling for extractive) + | ● + | ● + +--+--+----+--------+---> Model Size + GPT-2 BERT RoBERTa-L LLaMA-7B GPT-4 + 124M 178M 561M 7B 100B+ +``` + +**Key finding: Diminishing returns hit quickly.** For discriminative compression (token classification, sentence selection), 500–600M is essentially at the performance ceiling. For generative compression (abstractive summarization), larger models do significantly better. 
+ +### 2.3 Why Architecture > Scale Below 1B + +From LLMLingua-2 paper: + +| Property | Causal LM (GPT-2, LLaMA) | Bidirectional Encoder (BERT, RoBERTa) | +|----------|:-------------------------:|:-------------------------------------:| +| Context | Left-to-right only | Full bidirectional | +| Task fit | PPL scoring (indirect) | Binary classification (direct) | +| Speed | Autoregressive (slow) | Single pass (fast) | +| 561M quality | Good | **Outperforms 7B causal** | + +The bidirectional encoder sees the full token context (left AND right), making importance scoring fundamentally more informed. A 561M encoder captures more relevant signal than a 7B decoder looking only leftward. + +--- + +## 3. Minimum Viable Compressor Size + +### By Compression Type + +| Compression Type | Minimum Viable | Sweet Spot | Notes | +|-----------------|:--------------:|:----------:|-------| +| Perplexity-based token filter | ~120M (GPT-2) | 7B | 60× smaller → only 2pts loss | +| Binary token classification | ~178M (mBERT) | ~560M (XLM-RoBERTa-L) | Bidirectional architecture key | +| Sentence extraction / reranking | ~110M (Contriever) | ~110M | Gains from size plateau fast | +| Code-aware line pruning | ~600M (Qwen3-Reranker) | ~600M | Task-specific fine-tuning critical | +| Abstractive summarization | ~770M (T5-large) | ~3B | T5-small too lossy | +| Soft vector compression | 1.3B+ | 7B | Must = target model | + +### Practical Floor + +**~100–200M is the absolute minimum** for reasonable compression. Below that, models lack sufficient world knowledge to judge which content is informationally critical. + +**~500–600M is the practical sweet spot** for discriminative (keep/drop) compression with no loss in downstream task accuracy. + +--- + +## 4. Relevance to lucebox-hub + +### 4.1 Existing 0.8B Draft Model + +lucebox-hub already loads **Qwen3.5-0.8B** as the speculative decoding draft model. 
This model: +- Is already in GPU memory +- Has the same tokenizer as the target 27B model +- Is fast at inference (the whole point of speculative decoding) + +### 4.2 Three Integration Options + +#### Option A: Perplexity-Based Scoring (Zero Training) + +Use the 0.8B draft model as a **Selective Context** scorer: + +```cpp +// Score each token's self-information using the draft model +// Low self-information tokens are redundant → prune them +float score_token_importance(const std::vector& context, int pos) { + float logprob = draft_model_.forward_single(context, pos); + return -logprob; // self-information = -log P(token | context) +} +``` + +- **Training needed:** None +- **Quality:** Comparable to GPT-2 baseline (~124M), should be better at 0.8B +- **Latency:** One forward pass through 0.8B model over the context +- **Token savings:** 20–50% depending on threshold + +#### Option B: Fine-Tuned Binary Classifier (Like SWE-Pruner) + +Add a CRF head to the 0.8B model for line-level or token-level keep/prune: + +```cpp +// Fine-tuned model outputs binary decision per token/line +struct PruningDecision { + std::vector keep_mask; // true = keep, false = prune +}; +PruningDecision classify_tokens(const std::vector& prompt_tokens) { + auto hidden_states = draft_model_.forward(prompt_tokens); + return crf_head_.decode(hidden_states); +} +``` + +- **Training needed:** ~4 hours on 8×A100 (per SWE-Pruner) +- **Quality:** F1 ~0.78 for code (SWE-Pruner benchmark) +- **Latency:** One forward pass + CRF decode +- **Token savings:** 23–54% + +#### Option C: Bidirectional Encoder Sidecar + +Load a separate **XLM-RoBERTa-Large (561M)** or fine-tuned **BERT (178M)** as a dedicated compressor: + +```cpp +// Separate compressor model loaded alongside main model +class TokenCompressor { + BertModel encoder_; // 561M XLM-RoBERTa-Large + LinearHead classifier_; // Binary keep/drop +public: + std::vector classify(const std::string& text); +}; +``` + +- **Training needed:** GPT-4 
distillation (LLMLingua-2 approach) +- **Quality:** Best-in-class for sub-1B discriminative compression +- **Latency:** Single forward pass, very fast +- **Extra VRAM:** ~1.1GB (FP16) or ~600MB (INT8) + +### 4.3 Recommendation for lucebox-hub + +**Start with Option A** (zero training, immediate value): +- Use the existing 0.8B draft model for perplexity scoring +- Apply Selective Context algorithm (drop low self-information tokens) +- Expected: 20–50% token reduction with minimal quality loss +- Zero additional VRAM, zero training, implementable in days + +**Graduate to Option B** if coding-specific compression needed: +- Fine-tune a CRF head on the 0.8B draft model +- Use SWE-Pruner's approach: line-level binary classification +- Expected: 23–54% reduction with potential accuracy improvement +- Requires ~4 hours training + separate head weights + +--- + +## 5. Key Papers & Links + +| Paper | Size | Year | arXiv | GitHub | +|-------|:----:|:----:|-------|--------| +| SWE-Pruner | 0.6B | 2026 | [2601.16746](https://arxiv.org/abs/2601.16746) | [Ayanami1314/swe-pruner](https://github.com/Ayanami1314/swe-pruner) | +| LLMLingua-2 | 561M | 2024 | [2403.12968](https://arxiv.org/abs/2403.12968) | [microsoft/LLMLingua](https://github.com/microsoft/LLMLingua) | +| IC-Former | ~630M | 2024 | [2406.13618](https://arxiv.org/abs/2406.13618) | [wonderful9462/IC-Former](https://github.com/wonderful9462/IC-Former) | +| RECOMP | 110M | 2023 | [2310.04408](https://arxiv.org/abs/2310.04408) | [carriex/recomp](https://github.com/carriex/recomp) | +| Selective Context | 124M | 2023 | [2310.06201](https://arxiv.org/abs/2310.06201) | [liyucheng09/Selective_Context](https://github.com/liyucheng09/Selective_Context) | +| LLMLingua | 124M | 2023 | [2310.05736](https://arxiv.org/abs/2310.05736) | [microsoft/LLMLingua](https://github.com/microsoft/LLMLingua) | +| AutoCompressor | 1.3B+ | 2023 | [2305.14788](https://arxiv.org/abs/2305.14788) | 
[princeton-nlp/AutoCompressors](https://github.com/princeton-nlp/AutoCompressors) | +| Gisting | 7B | 2023 | [2304.08467](https://arxiv.org/abs/2304.08467) | — | +| ICAE | 7B+LoRA | 2024 | [2307.06945](https://arxiv.org/abs/2307.06945) | [getao/icae](https://github.com/getao/icae) | +| ACON | 14B | 2025 | [2510.00615](https://arxiv.org/abs/2510.00615) | [microsoft/acon](https://github.com/microsoft/acon) | + +--- + +## 6. Gaps and Open Questions + +1. **No unified compressor-size ablation study exists.** Nobody has published "0.5B vs 0.8B vs 1B vs 3B vs 7B on the same task" — this is a genuine research gap. + +2. **ACON does NOT test sub-1B distillation.** Their "smaller" means 14B vs GPT-4.1, not sub-1B. + +3. **SWE-Pruner notes room for improvement:** "Scaling to 2M training examples did not improve results much — a larger base model (e.g., Qwen3-Reranker-8B) may help more." The 0.6B has headroom. + +4. **No paper tests Qwen3.5-0.8B specifically** as a compression model. Because it is larger than both GPT-2 (124M) and Qwen3-Reranker-0.6B, and comes from the same model family as the latter, it is plausibly at least as capable as either — but this is an extrapolation, not a measured result. + +5. **Abstractive summarization at 0.8B is untested.** T5-small (77M) is too lossy for abstractive; T5-large (770M) works. A 0.8B instruction-tuned model (like Qwen3.5-0.8B) could potentially do basic summarization, but no paper confirms this for context compression specifically. 
def make_tool_payload(repeats: int, size: int, path: str) -> list[dict]:
    """Build a Responses-API ``input`` list that is large enough to trigger compaction.

    The conversation is: one developer instruction, one user message,
    ``repeats`` identical tool results, one assistant message that carries a
    ``<think>...</think>`` reasoning block, and a final user message.

    Args:
        repeats: Number of tool-result messages to emit (``tool_0`` .. ``tool_{repeats-1}``).
        size: Length of the filler ``content`` string inside every tool result.
        path: Value of the ``path`` field inside every tool result. All tool
            results share it, so they look like repeated reads of one file.

    Returns:
        The message list, ``repeats + 4`` items long.
    """
    # Serialise once: every tool message carries the same JSON blob.
    tool_blob = json.dumps({"path": path, "content": "X" * size})

    items: list[dict] = [
        {"role": "developer", "content": "Answer with the single word OK."},
        {"role": "user", "content": "Use prior tool results and answer tersely."},
    ]
    items.extend(
        {"role": "tool", "content": tool_blob, "tool_call_id": f"tool_{idx}"}
        for idx in range(repeats)
    )
    # The reasoning must be wrapped in explicit <think> tags; without them the
    # message is plain text and contains nothing a thinking-block stripper
    # could recognise.
    items.append(
        {"role": "assistant", "content": "<think>hidden reasoning</think>Ready."}
    )
    items.append({"role": "user", "content": "Final answer only."})
    return items
this harness.", file=sys.stderr) + return 2 + if not args.server_bin.exists(): + print(f"Server binary not found: {args.server_bin}", file=sys.stderr) + return 2 + + args.server_log.parent.mkdir(parents=True, exist_ok=True) + base_url = f"http://{args.host}:{args.port}" + cmd = build_command(args) + + print("[compaction-test] starting server:", " ".join(shlex.quote(part) for part in cmd)) + with args.server_log.open("w", encoding="utf-8") as log_file: + proc = subprocess.Popen( + cmd, + cwd=REPO_ROOT, + stdout=log_file, + stderr=subprocess.STDOUT, + text=True, + ) + try: + wait_for_health(base_url) + + short_resp = http_post_json( + f"{base_url}/v1/responses", + { + "model": args.model_name, + "stream": False, + "max_output_tokens": args.max_output_tokens, + "input": [ + {"role": "developer", "content": "Answer with OK."}, + {"role": "user", "content": "Say OK."}, + ], + }, + ) + assert_no_compaction(short_resp) + print("[compaction-test] short request passed") + + long_resp = http_post_json( + f"{base_url}/v1/responses", + { + "model": args.model_name, + "stream": False, + "max_output_tokens": args.max_output_tokens, + "input": make_tool_payload(repeats=10, size=1000, path="/repo/a.cpp"), + }, + ) + assert_compaction(long_resp, "long request") + print("[compaction-test] long request passed") + + override_resp = http_post_json( + f"{base_url}/v1/responses", + { + "model": args.model_name, + "stream": False, + "max_output_tokens": args.max_output_tokens, + "context_management": [ + {"type": "compaction", "compact_threshold": 128} + ], + "input": make_tool_payload(repeats=4, size=700, path="/repo/b.cpp"), + }, + ) + assert_compaction(override_resp, "context_management override") + print("[compaction-test] override request passed") + return 0 + finally: + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=20) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=20) + + +if __name__ == "__main__": + sys.exit(main()) diff --git 
/// Returns `text` with leading and trailing whitespace removed.
///
/// Only space, '\n', '\r' and '\t' are treated as whitespace. An empty or
/// all-whitespace input yields an empty string. Interior whitespace is kept.
std::string trim(std::string text) {
    static constexpr char kWhitespace[] = " \n\r\t";

    const size_t last = text.find_last_not_of(kWhitespace);
    if (last == std::string::npos) {
        // Nothing but whitespace (or nothing at all).
        text.clear();
        return text;
    }

    // Drop the tail first so the head erase below moves fewer characters.
    text.erase(last + 1);
    // One bulk erase: removing the first character repeatedly would shift the
    // whole remainder on every step and be quadratic in the padding length.
    text.erase(0, text.find_first_not_of(kWhitespace));
    return text;
}
/// Extracts the double-quoted string value that follows `key` in `text`.
///
/// This is a lightweight scan, not a JSON parser. Only the first occurrence
/// of `key` is considered. The key must be followed (optional whitespace
/// aside) by ':' and then by a string literal; anything else (a number, an
/// object, an array, ...) yields an empty result instead of picking up some
/// unrelated string further along in the text.
///
/// @param text Text to scan, e.g. the JSON body of a tool result.
/// @param key  Key to look for, including its own quotes (e.g. "\"path\"").
/// @return The raw characters between the value's quotes; backslash escapes
///         are skipped over while scanning but are not decoded. Empty when
///         the key is missing, the value is not a string, the string is
///         empty, or the closing quote is missing.
std::string quoted_value_after(const std::string & text, const std::string & key) {
    static constexpr char kWhitespace[] = " \t\r\n";

    size_t pos = text.find(key);
    if (pos == std::string::npos) return {};

    // The key must be directly followed by a colon.
    pos = text.find_first_not_of(kWhitespace, pos + key.size());
    if (pos == std::string::npos || text[pos] != ':') return {};

    // ...and the colon by the opening quote of a string value.
    pos = text.find_first_not_of(kWhitespace, pos + 1);
    if (pos == std::string::npos || text[pos] != '"') return {};

    // Find the closing quote, stepping over backslash-escaped characters so
    // that an embedded \" does not end the value early.
    size_t end = pos + 1;
    while (end < text.size() && text[end] != '"') {
        end += (text[end] == '\\') ? 2 : 1;
    }
    if (end >= text.size() || end == pos + 1) return {};

    return text.substr(pos + 1, end - pos - 1);
}
/// Removes every `<think>...</think>` span from `text`.
///
/// - If `text` contains no opening tag it is returned unchanged (not trimmed).
/// - Otherwise all spans are removed and the remainder is trimmed of leading
///   and trailing whitespace (space, '\n', '\r', '\t').
/// - An opening tag without a matching closing tag swallows everything up to
///   the end of the text.
std::string strip_thinking_blocks(const std::string & text) {
    static constexpr char kOpenTag[] = "<think>";
    static constexpr char kCloseTag[] = "</think>";
    static constexpr char kWhitespace[] = " \n\r\t";
    // Derive the skip distances from the literals so they cannot drift apart.
    constexpr size_t kOpenLen = sizeof(kOpenTag) - 1;
    constexpr size_t kCloseLen = sizeof(kCloseTag) - 1;

    size_t open = text.find(kOpenTag);
    if (open == std::string::npos) return text;

    std::string out;
    out.reserve(text.size());

    size_t pos = 0;
    while (open != std::string::npos) {
        out.append(text, pos, open - pos);  // visible text before the block
        const size_t close = text.find(kCloseTag, open + kOpenLen);
        if (close == std::string::npos) {
            // Unterminated block: the rest of the text is reasoning.
            pos = text.size();
            break;
        }
        pos = close + kCloseLen;
        open = text.find(kOpenTag, pos);
    }
    out.append(text, pos, std::string::npos);  // visible text after the last block

    // Trim what is left; removing a block usually leaves stray newlines.
    const size_t first = out.find_first_not_of(kWhitespace);
    if (first == std::string::npos) return {};
    const size_t last = out.find_last_not_of(kWhitespace);
    return out.substr(first, last - first + 1);
}
std::string::npos : end - 5); + } + + return {}; +} + +CompactionResult edit_compact(const std::vector & messages, + int keep_tool_uses, + bool strip_thinking) { + CompactionResult result; + result.pre_compaction_tokens = message_chars(messages); + result.compacted_messages = messages; + + if (messages.empty()) return result; + + const int tool_limit = std::max(0, keep_tool_uses); + int last_assistant = -1; + for (int i = 0; i < (int)messages.size(); ++i) { + if (messages[i].role == "assistant") last_assistant = i; + } + + bool changed = false; + int seen_tool_results = 0; + std::unordered_set kept_paths; + + for (int i = (int)result.compacted_messages.size() - 1; i >= 0; --i) { + auto & msg = result.compacted_messages[(size_t)i]; + + if (strip_thinking && msg.role == "assistant" && i != last_assistant) { + changed |= replace_content(msg, strip_thinking_blocks(msg.content)); + } + + if (!is_tool_result(msg)) continue; + + ++seen_tool_results; + const std::string path = extract_file_read_path(msg); + if (!path.empty()) { + auto inserted = kept_paths.insert(path); + if (!inserted.second) { + changed |= replace_content( + msg, + std::string("[Earlier tool result omitted during context compaction; a newer read of ") + + path + " is kept.]"); + continue; + } + } + + if (seen_tool_results > tool_limit) { + changed |= replace_content( + msg, + path.empty() + ? 
std::string("[Tool result omitted during context compaction.]") + : std::string("[Tool result omitted during context compaction for ") + + path + ".]"); + continue; + } + + changed |= replace_content(msg, truncate_tool_content(msg.content)); + } + + result.applied = changed; + if (changed) { + result.tokens_saved = std::max(0, result.pre_compaction_tokens - message_chars(result.compacted_messages)); + } + return result; +} + +CompactionResult summarize_compact(const std::vector & messages, + float keep_recent_ratio, + int max_summary_tokens, + const std::string & compaction_prompt, + void * backend_ptr, + void * tokenizer_ptr, + int chat_format) { + CompactionResult result; + result.pre_compaction_tokens = message_chars(messages); + result.compacted_messages = messages; + + auto * backend = static_cast(backend_ptr); + auto * tokenizer = static_cast(tokenizer_ptr); + if (!backend || !tokenizer || messages.size() < 3) { + return result; + } + + const size_t system_prefix = leading_system_count(messages); + const size_t non_system = messages.size() - system_prefix; + if (non_system < 3) return result; + + const float clamped_ratio = std::max(0.05f, std::min(0.95f, keep_recent_ratio)); + const size_t keep_recent = std::max(1, (size_t)std::ceil((double)non_system * clamped_ratio)); + if (keep_recent >= non_system) return result; + + const size_t summary_end = messages.size() - keep_recent; + if (summary_end <= system_prefix) return result; + + std::vector older(messages.begin() + system_prefix, + messages.begin() + summary_end); + std::vector recent(messages.begin() + summary_end, + messages.end()); + if (older.empty()) return result; + + const std::string prompt_text = compaction_prompt.empty() + ? 
default_compaction_prompt() + : compaction_prompt; + + std::vector summary_request = { + ChatMessage{"system", prompt_text, ""}, + ChatMessage{"user", std::string("Summarize this earlier conversation history for future continuation:\n\n") + + serialize_messages(older), ""} + }; + + const std::string rendered = render_chat_template( + summary_request, + static_cast(chat_format), + true, + false, + ""); + + GenerateRequest gen_req; + gen_req.prompt = tokenizer->encode(rendered); + gen_req.n_gen = std::max(64, max_summary_tokens); + gen_req.sampler.temp = 0.0f; + gen_req.do_sample = false; + gen_req.stream = false; + + DaemonIO io; + auto gen_result = backend->generate(gen_req, io); + if (!gen_result.ok || gen_result.tokens.empty()) { + return result; + } + + std::string summary = trim(strip_thinking_blocks(tokenizer->decode(gen_result.tokens))); + if (summary.empty()) { + return result; + } + + std::vector compacted; + compacted.reserve(system_prefix + 1 + recent.size()); + compacted.insert(compacted.end(), messages.begin(), messages.begin() + system_prefix); + compacted.push_back(ChatMessage{"system", std::string("Conversation summary:\n") + summary, ""}); + compacted.insert(compacted.end(), recent.begin(), recent.end()); + + const int compacted_chars = message_chars(compacted); + if (compacted_chars >= result.pre_compaction_tokens) { + return result; + } + + result.applied = true; + result.compacted_messages = std::move(compacted); + result.tokens_saved = result.pre_compaction_tokens - compacted_chars; + return result; +} + +CompactionResult hard_truncate(const std::vector & messages, + int max_messages_to_keep) { + CompactionResult result; + result.pre_compaction_tokens = message_chars(messages); + result.compacted_messages = messages; + + const size_t system_prefix = leading_system_count(messages); + const int keep = std::max(0, max_messages_to_keep); + const size_t non_system = messages.size() - system_prefix; + if ((size_t)keep >= non_system) { + return result; 
+ } + + const size_t tail_start = messages.size() - (size_t)keep; + std::vector compacted; + compacted.reserve(system_prefix + (size_t)keep); + compacted.insert(compacted.end(), messages.begin(), messages.begin() + system_prefix); + compacted.insert(compacted.end(), messages.begin() + std::max(system_prefix, tail_start), messages.end()); + + result.applied = true; + result.compacted_messages = std::move(compacted); + result.tokens_saved = std::max(0, result.pre_compaction_tokens - message_chars(result.compacted_messages)); + return result; +} + +} // namespace dflash::common diff --git a/server/src/server/compaction.h b/server/src/server/compaction.h new file mode 100644 index 00000000..0f1968e9 --- /dev/null +++ b/server/src/server/compaction.h @@ -0,0 +1,44 @@ +#pragma once + +#include "chat_template.h" + +#include +#include +#include + +namespace dflash::common { + +class Tokenizer; +struct ModelBackend; +struct ServerConfig; + +struct CompactionResult { + bool applied = false; + std::vector compacted_messages; + int tokens_saved = 0; + int pre_compaction_tokens = 0; +}; + +CompactionResult edit_compact( + const std::vector & messages, + int keep_tool_uses, + bool strip_thinking); + +CompactionResult summarize_compact( + const std::vector & messages, + float keep_recent_ratio, + int max_summary_tokens, + const std::string & compaction_prompt, + void * backend_ptr, + void * tokenizer_ptr, + int chat_format); + +CompactionResult hard_truncate( + const std::vector & messages, + int max_messages_to_keep); + +std::string strip_thinking_blocks(const std::string & text); +bool is_tool_result(const ChatMessage & msg); +std::string extract_file_read_path(const ChatMessage & msg); + +} // namespace dflash::common diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index a89309dd..033db470 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -4,10 +4,12 @@ // job queue, worker thread with SSE streaming and 
disconnect detection. #include "http_server.h" +#include "compaction.h" #include "sse_emitter.h" #include "tool_hint.h" #include +#include #include #include #include @@ -77,6 +79,32 @@ static size_t json_array_size(const json & value) { return value.is_array() ? value.size() : 0; } +static std::string render_request_messages(const ServerConfig & config, + const Tokenizer & tokenizer, + ChatFormat chat_format, + const std::vector & messages, + bool enable_thinking, + const std::string & tools_json) { + if (!config.chat_template_src.empty()) { + const std::string & bos_str = (tokenizer.bos_id() >= 0) + ? tokenizer.raw_token(tokenizer.bos_id()) + : std::string(); + const std::string & eos_str = (tokenizer.eos_id() >= 0) + ? tokenizer.raw_token(tokenizer.eos_id()) + : std::string(); + return render_chat_template_jinja( + config.chat_template_src, + messages, + bos_str, + eos_str, + /*add_generation_prompt=*/true, + enable_thinking, + tools_json); + } + + return render_chat_template(messages, chat_format, true, enable_thinking, tools_json); +} + // Build the /props response body. // // Non-static so unit tests can call it directly (declared in http_server.h). @@ -756,6 +784,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { ParsedRequest req; std::string err; + std::vector chat_msgs; try { json body = json::parse(hr.body); @@ -882,8 +911,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { } // Render messages to text and tokenize. - std::vector chat_msgs = - normalize_chat_messages(req.messages, req.format, tool_memory_); + chat_msgs = normalize_chat_messages(req.messages, req.format, tool_memory_); // Determine thinking mode BEFORE rendering so the template can inject // the \n\n\n\n block when thinking is disabled. 
@@ -1003,6 +1031,15 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { // Bandit: parse session_id from extra_body (opt-in adaptive keep_ratio) req.session_id = parse_session_id_from_body(body); + if (req.format == ApiFormat::RESPONSES && body.contains("context_management") + && body["context_management"].is_array()) { + for (auto & cm : body["context_management"]) { + if (cm.is_object() && cm.value("type", "") == "compaction") { + req.compaction_threshold_override = cm.value("compact_threshold", 0); + } + } + } + // Serialize tools JSON for template injection. std::string tools_json; if (req.tools.is_array() && !req.tools.empty()) { @@ -1010,39 +1047,13 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { } std::string rendered; - if (!config_.chat_template_src.empty()) { - // Jinja path: caller supplied a chat template file via - // --chat-template-file. Override the hardcoded QWEN3/LAGUNA - // renderer. Used for tool-using agents that need the Anthropic - // tool_use envelope (e.g. froggeric Qwen3.6 template). - // - // Special tokens like <|im_start|> / <|im_end|> are stored - // verbatim in the GGUF vocab — use raw_token() to skip the - // GPT-2 byte decode (otherwise <0xC4><0x91> nonsense appears). - const std::string & bos_str = (tokenizer_.bos_id() >= 0) - ? tokenizer_.raw_token(tokenizer_.bos_id()) - : std::string(); - const std::string & eos_str = (tokenizer_.eos_id() >= 0) - ? 
tokenizer_.raw_token(tokenizer_.eos_id()) - : std::string(); - try { - rendered = render_chat_template_jinja( - config_.chat_template_src, - chat_msgs, - bos_str, - eos_str, - /*add_generation_prompt=*/true, - enable_thinking, - tools_json); - } catch (const std::exception & e) { - send_error(fd, 500, - std::string("chat template (jinja) render failed: ") + e.what()); - return true; - } - } else { - rendered = render_chat_template(chat_msgs, chat_format_, - true, enable_thinking, - tools_json); + try { + rendered = render_request_messages( + config_, tokenizer_, chat_format_, chat_msgs, enable_thinking, tools_json); + } catch (const std::exception & e) { + send_error(fd, 500, + std::string("chat template render failed: ") + e.what()); + return true; } req.prompt_tokens = tokenizer_.encode(rendered); @@ -1059,8 +1070,17 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { return true; // handled (with error) } - // Check context length. - if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + // Check context length / compaction threshold. + const int total_request_tokens = (int)req.prompt_tokens.size() + req.max_output; + const int compaction_trigger = req.compaction_threshold_override > 0 + ? 
req.compaction_threshold_override + : std::max(1, (int)std::ceil((double)config_.max_ctx * config_.compaction_threshold)); + if (config_.compaction_enabled + && total_request_tokens >= std::min(config_.max_ctx, compaction_trigger)) { + req.compaction_needed = true; + req.pre_compaction_tokens = (int)req.prompt_tokens.size(); + req.chat_messages_copy = chat_msgs; + } else if (total_request_tokens > config_.max_ctx) { send_error(fd, 400, "prompt + max_tokens exceeds context window"); return true; } @@ -1108,8 +1128,9 @@ void HttpServer::worker_loop() { if (!job) break; // stopping int fd = job->fd; - const auto & req = job->req; + auto & req = job->req; auto started_at = std::chrono::steady_clock::now(); + bool stream_headers_sent = false; auto finish_job = [&]() { std::lock_guard lk(job->mu); @@ -1118,7 +1139,7 @@ void HttpServer::worker_loop() { }; auto fail_request = [&](int status, const std::string & message) { std::fprintf(stderr, "[server] request failed: %s\n", message.c_str()); - if (req.stream) { + if (req.stream && stream_headers_sent) { json err = {{"error", {{"message", message}, {"type", "server_error"}}}}; const std::string chunk = "data: " + err.dump() + "\n\n"; send_all(fd, chunk.data(), chunk.size()); @@ -1130,6 +1151,105 @@ void HttpServer::worker_loop() { finish_job(); }; + std::string tools_json; + if (req.tools.is_array() && !req.tools.empty()) { + tools_json = req.tools.dump(); + } + auto render_compacted_prompt = [&](const std::vector & messages, + std::vector & tokens_out) { + try { + const std::string rendered = render_request_messages( + config_, tokenizer_, chat_format_, messages, req.thinking_enabled, tools_json); + tokens_out = tokenizer_.encode(rendered); + return true; + } catch (const std::exception & e) { + fail_request(500, std::string("chat template render failed: ") + e.what()); + return false; + } + }; + + if (req.compaction_needed && config_.compaction_enabled && !req.chat_messages_copy.empty()) { + std::vector current_messages 
= req.chat_messages_copy; + bool render_failed = false; + + auto edit_result = edit_compact( + current_messages, + config_.compaction_keep_tool_uses, + config_.compaction_strip_thinking); + if (edit_result.applied) { + current_messages = std::move(edit_result.compacted_messages); + if (!render_compacted_prompt(current_messages, req.prompt_tokens)) { + render_failed = true; + } + } + if (render_failed) continue; + + if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + auto sum_result = summarize_compact( + current_messages, + config_.compaction_keep_recent, + config_.compaction_max_tokens, + config_.compaction_prompt, + &backend_, + &tokenizer_, + (int)chat_format_); + if (sum_result.applied) { + current_messages = std::move(sum_result.compacted_messages); + if (!render_compacted_prompt(current_messages, req.prompt_tokens)) { + render_failed = true; + } + } + } + if (render_failed) continue; + + if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + size_t system_prefix = 0; + while (system_prefix < current_messages.size() + && current_messages[system_prefix].role == "system") { + ++system_prefix; + } + const int non_system = (int)current_messages.size() - (int)system_prefix; + bool truncated_to_fit = false; + for (int keep = std::max(0, non_system - 1); keep >= 0; --keep) { + auto trunc_result = hard_truncate(current_messages, keep); + const auto & candidate_messages = trunc_result.applied + ? 
trunc_result.compacted_messages + : current_messages; + std::vector candidate_tokens; + if (!render_compacted_prompt(candidate_messages, candidate_tokens)) { + render_failed = true; + break; + } + if ((int)candidate_tokens.size() + req.max_output <= config_.max_ctx) { + current_messages = candidate_messages; + req.prompt_tokens = std::move(candidate_tokens); + truncated_to_fit = true; + break; + } + } + if (render_failed) continue; + if (!truncated_to_fit && (int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + fail_request(400, "context too long even after compaction"); + continue; + } + } + + const int saved = req.pre_compaction_tokens - (int)req.prompt_tokens.size(); + req.compaction_applied = saved > 0; + if (req.compaction_applied) { + std::fprintf(stderr, + "[server] compaction applied: %d -> %d tokens (saved %d)\n", + req.pre_compaction_tokens, + (int)req.prompt_tokens.size(), + saved); + } + } + + if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + fail_request(400, "context too long even after compaction"); + continue; + } + std::fprintf(stderr, "[server] chat START %s format=%s stream=%s prompt_tokens=%zu " "max_tokens=%d tools=%zu\n", @@ -1147,6 +1267,7 @@ void HttpServer::worker_loop() { finish_job(); continue; } + stream_headers_sent = true; } // Create SSE emitter for streaming state machine. @@ -1664,6 +1785,9 @@ void HttpServer::worker_loop() { fci < 0 ? emitted : fci; const int content_tokens_emitted = fci < 0 ? 0 : emitted - fci; + const int compacted_tokens_saved = req.compaction_applied + ? 
std::max(0, req.pre_compaction_tokens - (int)req.prompt_tokens.size()) + : 0; json resp; switch (req.format) { @@ -1755,6 +1879,9 @@ void HttpServer::worker_loop() { {"timings", build_timings_json(gen_timings, total_completion_tokens)}, {"accept_rate", result.accept_rate} }; + if (compacted_tokens_saved > 0) { + chat_usage["compacted_tokens_saved"] = compacted_tokens_saved; + } resp = { {"id", req.response_id}, {"object", "chat.completion"}, @@ -1819,6 +1946,9 @@ void HttpServer::worker_loop() { {"timings", build_timings_json(gen_timings, total_completion_tokens)}, {"accept_rate", result.accept_rate} }; + if (compacted_tokens_saved > 0) { + anth_usage["compacted_tokens_saved"] = compacted_tokens_saved; + } resp = { {"id", req.response_id}, {"type", "message"}, {"role", "assistant"}, {"model", req.model}, @@ -1855,6 +1985,9 @@ void HttpServer::worker_loop() { {"timings", build_timings_json(gen_timings, total_completion_tokens)}, {"accept_rate", result.accept_rate} }; + if (compacted_tokens_saved > 0) { + resp_usage["compacted_tokens_saved"] = compacted_tokens_saved; + } resp = { {"id", req.response_id}, {"object", "response"}, {"status", "completed"}, {"model", req.model}, diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 999eb5d9..2cfde4fc 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -163,6 +163,14 @@ struct ServerConfig { // the Anthropic tool_use envelope, e.g. froggeric Qwen3.6 template. 
std::string chat_template_src; // literal Jinja source (loaded from file) std::string chat_template_path; // path it was loaded from (logged at startup) + + bool compaction_enabled = false; + float compaction_threshold = 0.9f; + int compaction_max_tokens = 1024; + float compaction_keep_recent = 0.3f; + bool compaction_strip_thinking = true; + int compaction_keep_tool_uses = 3; + std::string compaction_prompt; }; // ─── Parsed request ───────────────────────────────────────────────────── @@ -200,6 +208,11 @@ struct ParsedRequest { std::vector stop_sequences; // Bandit: per-session adaptive keep_ratio opt-in std::string session_id; + bool compaction_needed = false; + bool compaction_applied = false; + int pre_compaction_tokens = 0; + int compaction_threshold_override = 0; + std::vector chat_messages_copy; }; // Build the /props response body. Exposed (non-static) so unit tests diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 0f31739e..6d316111 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -226,6 +226,13 @@ static void print_usage(const char * prog) { " Overrides the hardcoded Qwen3/Laguna\n" " renderer. 
Empty or missing falls back\n" " to the hardcoded template.\n" + "\n" + "Context compaction:\n" + " --compaction Enable auto context compaction\n" + " --compaction-threshold Trigger ratio of max_ctx (default: 0.9)\n" + " --compaction-max-tokens Max tokens for self-summary (default: 1024)\n" + " --compaction-keep-recent Fraction of recent turns kept verbatim\n" + " during summarization (default: 0.3)\n" "\n", prog); } @@ -421,6 +428,14 @@ int main(int argc, char ** argv) { sconfig.chat_template_path = path; std::fprintf(stderr, "[server] loaded chat template from %s (%ld bytes)\n", path, n); } + } else if (std::strcmp(argv[i], "--compaction") == 0) { + sconfig.compaction_enabled = true; + } else if (std::strcmp(argv[i], "--compaction-threshold") == 0 && i + 1 < argc) { + sconfig.compaction_threshold = std::stof(argv[++i]); + } else if (std::strcmp(argv[i], "--compaction-max-tokens") == 0 && i + 1 < argc) { + sconfig.compaction_max_tokens = std::atoi(argv[++i]); + } else if (std::strcmp(argv[i], "--compaction-keep-recent") == 0 && i + 1 < argc) { + sconfig.compaction_keep_recent = std::stof(argv[++i]); } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) { sconfig.disk_cache_dir = argv[++i]; } else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) { @@ -758,6 +773,12 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ ddtree_budget = %d\n", bargs.ddtree_budget); std::fprintf(stderr, "[server] │ prefix_cache = %d slots\n", sconfig.prefix_cache_cap); std::fprintf(stderr, "[server] │ cors = %s\n", sconfig.enable_cors ? "ON" : "off"); + std::fprintf(stderr, "[server] │ compaction = %s\n", sconfig.compaction_enabled ? 
"ON" : "off"); + if (sconfig.compaction_enabled) { + std::fprintf(stderr, "[server] │ compact_thresh = %.3f\n", sconfig.compaction_threshold); + std::fprintf(stderr, "[server] │ compact_recent = %.3f\n", sconfig.compaction_keep_recent); + std::fprintf(stderr, "[server] │ compact_summary = %d\n", sconfig.compaction_max_tokens); + } std::fprintf(stderr, "[server] │ cache_type_k = %s\n", #ifdef GGML_USE_HIP cache_type_k.empty() ? "q4_0 (default, HIP)" : cache_type_k.c_str()); From 48fedbc9520f054ab3d7e8ebe1399be2992e7229 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 29 May 2026 07:40:47 +0800 Subject: [PATCH 2/4] feat: use prefill-drafter (Qwen3-0.6B) as compaction backend When --prefill-drafter is present and --compaction is enabled, create a dedicated Qwen3Backend from the drafter GGUF for Layer 2 summarization. This avoids tying up the main target model (27B+) for summary generation and is much faster (~0.6B inference for short summaries). The compaction backend shares the drafter_tokenizer already loaded for pflash. If the backend fails to initialize, falls back gracefully to using the main model for summarization. Also adds --prefill-drafter flag support to harness/test_compaction.py. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- harness/test_compaction.py | 3 +++ server/src/server/http_server.cpp | 14 +++++++++--- server/src/server/http_server.h | 9 ++++++++ server/src/server/server_main.cpp | 36 +++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/harness/test_compaction.py b/harness/test_compaction.py index 013f0007..c61e42b5 100644 --- a/harness/test_compaction.py +++ b/harness/test_compaction.py @@ -99,6 +99,8 @@ def build_command(args: argparse.Namespace) -> list[str]: ] if args.draft: cmd += ["--draft", args.draft] + if args.prefill_drafter: + cmd += ["--prefill-drafter", args.prefill_drafter] if args.extra_server_args: cmd += shlex.split(args.extra_server_args) return cmd @@ -109,6 +111,7 @@ def main() -> int: parser.add_argument("--server-bin", type=Path, default=DEFAULT_BIN) parser.add_argument("--model", default=os.getenv("TARGET") or os.getenv("MODEL_PATH")) parser.add_argument("--draft", default=os.getenv("DRAFT")) + parser.add_argument("--prefill-drafter", default=os.getenv("PREFILL_DRAFTER")) parser.add_argument("--host", default="127.0.0.1") parser.add_argument("--port", type=int, default=int(os.getenv("COMPACTION_TEST_PORT", "18081"))) parser.add_argument("--model-name", default=os.getenv("MODEL_ID", "luce-dflash")) diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 033db470..71c87ddd 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1185,14 +1185,22 @@ void HttpServer::worker_loop() { if (render_failed) continue; if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + // Use dedicated compaction backend (small model) if available, + // otherwise fall back to the main target model. + ModelBackend * sum_backend = compaction_backend_ ? compaction_backend_ : &backend_; + Tokenizer * sum_tokenizer = compaction_tokenizer_ ? 
compaction_tokenizer_ : &tokenizer_; + // Compaction backend (Qwen3-0.6B) always uses QWEN3 chat format. + int sum_chat_format = compaction_backend_ + ? (int)ChatFormat::QWEN3 + : (int)chat_format_; auto sum_result = summarize_compact( current_messages, config_.compaction_keep_recent, config_.compaction_max_tokens, config_.compaction_prompt, - &backend_, - &tokenizer_, - (int)chat_format_); + sum_backend, + sum_tokenizer, + sum_chat_format); if (sum_result.applied) { current_messages = std::move(sum_result.compacted_messages); if (!render_compacted_prompt(current_messages, req.prompt_tokens)) { diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 2cfde4fc..e637af39 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -236,6 +236,13 @@ class HttpServer { // Set the optional pflash drafter tokenizer. void set_drafter_tokenizer(Tokenizer * tok) { drafter_tokenizer_ = tok; } + // Set a dedicated compaction backend (e.g. Qwen3-0.6B from --prefill-drafter). + // When set, Layer 2 summarization uses this small model instead of the main target. + void set_compaction_backend(ModelBackend * b, Tokenizer * tok) { + compaction_backend_ = b; + compaction_tokenizer_ = tok; + } + // Set the chat template format (detected from model arch). 
void set_chat_format(ChatFormat fmt) { chat_format_ = fmt; } @@ -290,6 +297,8 @@ class HttpServer { ModelBackend & backend_; Tokenizer & tokenizer_; Tokenizer * drafter_tokenizer_ = nullptr; // pflash drafter (optional) + ModelBackend * compaction_backend_ = nullptr; // dedicated compaction backend (optional) + Tokenizer * compaction_tokenizer_ = nullptr; // tokenizer for compaction backend ServerConfig config_; ChatFormat chat_format_; PFlashDrafterIpcClient pflash_remote_; diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 6d316111..cc24d844 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -911,6 +911,39 @@ int main(int argc, char ** argv) { server.set_drafter_tokenizer(&drafter_tokenizer); } + // When --prefill-drafter is present and compaction is enabled, use the + // drafter (Qwen3-0.6B) as a dedicated compaction backend for Layer 2 + // summarization. This avoids tying up the main target model for summary + // generation and is much faster (~0.6B vs 27B+). + std::unique_ptr compaction_backend; + if (sconfig.compaction_enabled && !sconfig.pflash_drafter_path.empty()) { + std::fprintf(stderr, "[server] creating compaction backend from %s\n", + sconfig.pflash_drafter_path.c_str()); + BackendArgs cbargs; + cbargs.model_path = sconfig.pflash_drafter_path.c_str(); + cbargs.device.gpu = sconfig.pflash_drafter_gpu; + cbargs.device.max_ctx = 4096; // compaction summaries are short + cbargs.stream_fd = -1; + cbargs.chunk = 512; + compaction_backend = create_backend(cbargs); + if (compaction_backend) { + // Use the drafter_tokenizer (already loaded for pflash) or load one. 
+ if (!pflash_enabled) { + if (!drafter_tokenizer.load_from_gguf(sconfig.pflash_drafter_path.c_str())) { + std::fprintf(stderr, "[server] compaction backend tokenizer load failed\n"); + compaction_backend.reset(); + } + } + if (compaction_backend) { + server.set_compaction_backend(compaction_backend.get(), &drafter_tokenizer); + std::fprintf(stderr, "[server] compaction backend ready (drafter model)\n"); + } + } else { + std::fprintf(stderr, "[server] compaction backend creation failed, " + "falling back to main model for summarization\n"); + } + } + // Lazy-draft: park decode draft at startup to free VRAM (~3.3 GB). if (sconfig.lazy_draft && bargs.draft_path) { backend->park("draft"); @@ -919,6 +952,9 @@ int main(int argc, char ** argv) { int ret = server.run(); // Cleanup. + if (compaction_backend) { + compaction_backend->shutdown(); + } backend->shutdown(); return ret; } From ccc9a6ffa89a83baebe7e1352d8981a68c4a17fa Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 29 May 2026 07:48:10 +0800 Subject: [PATCH 3/4] refactor: enable compaction by default, remove --compaction flag Compaction is now always enabled server-side. Triggering is driven by client HTTP request body (context_management parameter). Added --no-compaction to explicitly disable if needed. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- harness/test_compaction.py | 1 - server/src/server/http_server.h | 2 +- server/src/server/server_main.cpp | 8 ++++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/harness/test_compaction.py b/harness/test_compaction.py index c61e42b5..b17823ad 100644 --- a/harness/test_compaction.py +++ b/harness/test_compaction.py @@ -93,7 +93,6 @@ def build_command(args: argparse.Namespace) -> list[str]: str(args.max_ctx), "--max-tokens", str(args.max_output_tokens), - "--compaction", "--compaction-threshold", "0.5", ] diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index e637af39..7783948f 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -164,7 +164,7 @@ struct ServerConfig { std::string chat_template_src; // literal Jinja source (loaded from file) std::string chat_template_path; // path it was loaded from (logged at startup) - bool compaction_enabled = false; + bool compaction_enabled = true; float compaction_threshold = 0.9f; int compaction_max_tokens = 1024; float compaction_keep_recent = 0.3f; diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index cc24d844..97e2c50f 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -227,8 +227,8 @@ static void print_usage(const char * prog) { " renderer. 
Empty or missing falls back\n" " to the hardcoded template.\n" "\n" - "Context compaction:\n" - " --compaction Enable auto context compaction\n" + "Context compaction (enabled by default, triggered by client request):\n" + " --no-compaction Disable auto context compaction\n" " --compaction-threshold Trigger ratio of max_ctx (default: 0.9)\n" " --compaction-max-tokens Max tokens for self-summary (default: 1024)\n" " --compaction-keep-recent Fraction of recent turns kept verbatim\n" @@ -428,8 +428,8 @@ int main(int argc, char ** argv) { sconfig.chat_template_path = path; std::fprintf(stderr, "[server] loaded chat template from %s (%ld bytes)\n", path, n); } - } else if (std::strcmp(argv[i], "--compaction") == 0) { - sconfig.compaction_enabled = true; + } else if (std::strcmp(argv[i], "--no-compaction") == 0) { + sconfig.compaction_enabled = false; } else if (std::strcmp(argv[i], "--compaction-threshold") == 0 && i + 1 < argc) { sconfig.compaction_threshold = std::stof(argv[++i]); } else if (std::strcmp(argv[i], "--compaction-max-tokens") == 0 && i + 1 < argc) { From 8dc04705548b90d6f245a481f7f28290339c1dc8 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 29 May 2026 08:10:46 +0800 Subject: [PATCH 4/4] simplify: derive compaction_max_tokens from max_ctx, remove CLI flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary output length is always max_ctx/10 clamped to [256, 2048]. No reason for it to be independently configurable — it should always scale with context size. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- server/src/server/http_server.h | 2 +- server/src/server/server_main.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 7783948f..7baf172a 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -166,7 +166,7 @@ struct ServerConfig { bool compaction_enabled = true; float compaction_threshold = 0.9f; - int compaction_max_tokens = 1024; + int compaction_max_tokens = 0; // resolved at startup: max_ctx / 10, clamped [256,2048] float compaction_keep_recent = 0.3f; bool compaction_strip_thinking = true; int compaction_keep_tool_uses = 3; diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 97e2c50f..30f28717 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -230,7 +230,6 @@ static void print_usage(const char * prog) { "Context compaction (enabled by default, triggered by client request):\n" " --no-compaction Disable auto context compaction\n" " --compaction-threshold Trigger ratio of max_ctx (default: 0.9)\n" - " --compaction-max-tokens Max tokens for self-summary (default: 1024)\n" " --compaction-keep-recent Fraction of recent turns kept verbatim\n" " during summarization (default: 0.3)\n" "\n", prog); @@ -432,8 +431,6 @@ int main(int argc, char ** argv) { sconfig.compaction_enabled = false; } else if (std::strcmp(argv[i], "--compaction-threshold") == 0 && i + 1 < argc) { sconfig.compaction_threshold = std::stof(argv[++i]); - } else if (std::strcmp(argv[i], "--compaction-max-tokens") == 0 && i + 1 < argc) { - sconfig.compaction_max_tokens = std::atoi(argv[++i]); } else if (std::strcmp(argv[i], "--compaction-keep-recent") == 0 && i + 1 < argc) { sconfig.compaction_keep_recent = std::stof(argv[++i]); } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) { @@ -773,11 +770,16 @@ 
int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ ddtree_budget = %d\n", bargs.ddtree_budget); std::fprintf(stderr, "[server] │ prefix_cache = %d slots\n", sconfig.prefix_cache_cap); std::fprintf(stderr, "[server] │ cors = %s\n", sconfig.enable_cors ? "ON" : "off"); + // Resolve compaction_max_tokens from max_ctx if not explicitly set. + if (sconfig.compaction_max_tokens <= 0) { + sconfig.compaction_max_tokens = std::max(256, std::min(2048, sconfig.max_ctx / 10)); + } + std::fprintf(stderr, "[server] │ compaction = %s\n", sconfig.compaction_enabled ? "ON" : "off"); if (sconfig.compaction_enabled) { std::fprintf(stderr, "[server] │ compact_thresh = %.3f\n", sconfig.compaction_threshold); std::fprintf(stderr, "[server] │ compact_recent = %.3f\n", sconfig.compaction_keep_recent); - std::fprintf(stderr, "[server] │ compact_summary = %d\n", sconfig.compaction_max_tokens); + std::fprintf(stderr, "[server] │ compact_summary = %d (max_ctx=%d)\n", sconfig.compaction_max_tokens, sconfig.max_ctx); } std::fprintf(stderr, "[server] │ cache_type_k = %s\n", #ifdef GGML_USE_HIP