From 3beb18e92d73bc2259cded67372ce8ec263ffd32 Mon Sep 17 00:00:00 2001 From: allenliang2022 Date: Sun, 21 Jun 2026 20:02:11 +0800 Subject: [PATCH] fix(context): broaden streaming stale-compressor guard to any model-window mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The live-usage snapshot's #3256 default-only guard only corrected the compressor's cached context_length when it exactly equalled the config global cap (model.context_length). A compressor left holding a *different* model's window — e.g. a claude-opus-4.8 (1M) session whose compressor was seeded/last-updated with claude-opus-4.5's 168k — does not satisfy that == check, so the stale 168k passed straight through to every 'done' event. Symptom: refresh (GET /api/session hydration) shows the correct 1M, but sending a message reverts the indicator to 168k, and the auto-compress marker fires far too early. Broaden the guard: always resolve the real per-model window for the agent's CURRENT model and surface it whenever it differs from the compressor's cached value. Resolution reuses the SAME helper hydration uses (routes._context_length_lookup_inputs_for_model + get_model_context_length) so the streaming/SSE path and GET /api/session land on the IDENTICAL value, honoring nested per-model config overrides (model.<provider>.models.<model>.context_length) and custom-provider keys. Reusing the helper (instead of hand-reading the flat top-level model.context_length, which is None under nested config) avoids a new 'refresh 1M / send-a-message 936k' mismatch. The per-stream _real_ctx_cache (resolve at most once per stream, not per metering tick) is preserved. Backend-only, no frontend changes. Verified end-to-end on a real opus-4.8/copilot turn: indicator 168.0k -> 1.0M and no longer reverts on subsequent messages. 
--- api/streaming.py | 110 +++++++++++++++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 37 deletions(-) diff --git a/api/streaming.py b/api/streaming.py index adddf30ab0..da9977d975 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -5721,50 +5721,86 @@ def _live_usage_snapshot(): _cc = getattr(_agent, 'context_compressor', None) if _cc: _cc_cl_u = getattr(_cc, 'context_length', 0) or 0 - # Default-only guard (#3256): the agent-side compressor is - # built in agent_init with the global model.context_length - # applied unconditionally — for non-default models that - # value is the stale global cap (e.g. 232K). Drop it here - # so the live usage payload doesn't surface the wrong cap. - # PERF: resolve the real per-model cap at most once per - # stream (cached in _real_ctx_cache). This snapshot runs on - # every metering tick; doing the config read + metadata - # lookup per tick froze non-default-model streams. + # Stale-compressor self-heal (#3256, broadened): the + # agent-side compressor caches a context_length from the + # model it was *built/last-updated* with. After an in-place + # model switch (or when agent_init seeded it with the global + # model.context_length cap), that cached value can be the + # WRONG model's window — e.g. a session on claude-opus-4.8 + # (1M / 936k prompt on Copilot) whose compressor still holds + # claude-opus-4.5's 168k. The original guard only corrected + # the narrow case where the cached value equalled the config + # cap exactly; a leftover *other-model* value (168k) slipped + # straight through to the live usage payload. Broaden it: + # ALWAYS resolve the real per-model window for the agent's + # CURRENT model and, when that differs from the cached value, + # surface the real one. Frontend hydration (GET /api/session) + # already does this; this aligns the streaming path with it + # so "refresh shows 1M, send-a-message drops to 168k" can't + # happen. 
+ # PERF: resolve at most once per stream (cached in + # _real_ctx_cache). This snapshot runs on every metering + # tick; doing the config read + metadata lookup per tick + # froze non-default-model streams. if _real_ctx_cache[0] is None: - _resolved_real = 0 # 0 = guard not applicable / failed + _resolved_real = 0 # 0 = no correction / lookup failed try: - from api.config import get_config as _gc_u - _cfg_u = _gc_u() - _mcfg_u = _cfg_u.get('model', {}) if isinstance(_cfg_u, dict) else {} - if isinstance(_mcfg_u, dict): - _def_u = str(_mcfg_u.get('default') or '').strip() - _raw_u = _mcfg_u.get('context_length') + _sm_u = str(getattr(_agent, 'model', '') or '').strip() + _prov_u = str(getattr(_agent, 'provider', '') or '').strip() + _base_u = str(getattr(_agent, 'base_url', '') or '').strip() + _key_u = getattr(_agent, 'api_key', '') or '' + if _sm_u: + # Resolve the real window through the SAME helper + # hydration uses (routes._context_length_lookup_inputs_for_model + # + get_model_context_length). This honors the + # nested per-model config override + # (model..models..context_length, + # e.g. claude-opus-4.8 -> 1,000,000) and custom- + # provider keys, so the streaming/SSE path and the + # GET /api/session path land on the IDENTICAL value. + # Reusing the helper (instead of hand-reading the + # flat top-level model.context_length, which is + # None here) is what prevents a new mismatch like + # "refresh shows 1M, send-a-message shows 936k". try: - _cl_u = int(_raw_u) if _raw_u is not None else 0 - except (TypeError, ValueError): - _cl_u = 0 - _sm_u = str(getattr(_agent, 'model', '') or '').strip() - from api.routes import _model_matches_configured_default as _mmcd_u - if ( - _cl_u > 0 - and _cc_cl_u == _cl_u - and _def_u - and _sm_u - and not _mmcd_u(_sm_u, _def_u, getattr(_agent, 'provider', '') or '') - ): - # Recompute from real per-model metadata. 
+ from api.routes import ( + _context_length_lookup_inputs_for_model as _cli_u, + ) + from api.config import get_config as _gc_u + from agent.model_metadata import get_model_context_length as _g_u + _cfg_u = _gc_u() + _lk_u = _cli_u( + _sm_u, + _prov_u, + base_url=_base_u, + api_key=_key_u, + cfg=_cfg_u if isinstance(_cfg_u, dict) else {}, + ) + _real_u = _g_u( + _sm_u, + _lk_u.base_url, + api_key=_lk_u.api_key, + config_context_length=_lk_u.config_context_length, + provider=_lk_u.provider or _prov_u or '', + custom_providers=_lk_u.custom_providers, + ) or 0 + # Only treat it as a correction when the real + # window is valid AND disagrees with the + # compressor's cached value. Equal => nothing + # to fix, leave the fast path untouched. + if _real_u and _real_u != _cc_cl_u: + _resolved_real = _real_u + except TypeError: + # Older hermes-agent: legacy 2-arg form. try: - from agent.model_metadata import get_model_context_length as _g_u - _real_u = _g_u( - _sm_u, - getattr(_agent, 'base_url', '') or '', - config_context_length=None, - provider=getattr(_agent, 'provider', '') or '', - ) or 0 - if _real_u: + from agent.model_metadata import get_model_context_length as _g2_u + _real_u = _g2_u(_sm_u, _base_u) or 0 + if _real_u and _real_u != _cc_cl_u: _resolved_real = _real_u except Exception: pass + except Exception: + pass except Exception: _resolved_real = 0 _real_ctx_cache[0] = _resolved_real