feat: capture thinking text via MITM dual-call merge
The LS makes TWO separate Google API calls for thinking models:

  Call 1: response + thinking token count (no thinking text)
  Call 2: thinking summary text (no thinking tokens)

Each call hits a different StreamingAccumulator, so we:

  1. Capture response_text in StreamingAccumulator (non-thinking parts).
  2. In MitmStore::record_usage, detect when Call 2 arrives for a cascade that already has thinking tokens from Call 1.
  3. Merge Call 2's response_text as thinking_text on Call 1's usage.

Also injects includeThoughts into Google API requests via MITM modify to ensure thinking text is available in SSE responses.
This commit is contained in:
@@ -26,6 +26,9 @@ pub struct ApiUsage {
|
||||
/// Captured from Google SSE parts with `thought: true` or Anthropic thinking blocks.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub thinking_text: Option<String>,
|
||||
/// The response text captured from SSE parts (for merge detection).
|
||||
#[serde(skip)]
|
||||
pub response_text: Option<String>,
|
||||
/// Google-specific: response output tokens (non-thinking portion)
|
||||
pub response_output_tokens: u64,
|
||||
|
||||
@@ -122,10 +125,36 @@ impl MitmStore {
|
||||
}
|
||||
}
|
||||
|
||||
// Store latest usage for the cascade (if we can identify it)
|
||||
// Store latest usage for the cascade (if we can identify it).
|
||||
//
|
||||
// Merge logic for v1internal thinking summaries:
|
||||
// The LS makes TWO Google API calls per thinking request:
|
||||
// Call 1: response + thinking token count (thinking_output_tokens > 0, no thinking text)
|
||||
// Call 2: thinking summary text (thinking_output_tokens == 0, response_text has the summary)
|
||||
//
|
||||
// When Call 2 arrives, we merge its response_text as thinking_text into Call 1's usage.
|
||||
let key = cascade_id.map(|s| s.to_string()).unwrap_or_else(|| "_latest".to_string());
|
||||
let mut latest = self.latest_usage.write().await;
|
||||
latest.insert(key, usage);
|
||||
|
||||
if let Some(existing) = latest.get_mut(&key) {
|
||||
if existing.thinking_output_tokens > 0
|
||||
&& existing.thinking_text.is_none()
|
||||
&& usage.thinking_output_tokens == 0
|
||||
&& usage.response_text.is_some()
|
||||
{
|
||||
// Call 2: thinking summary — merge into existing Call 1 usage
|
||||
existing.thinking_text = usage.response_text;
|
||||
debug!(
|
||||
thinking_text_len = existing.thinking_text.as_ref().map_or(0, |t| t.len()),
|
||||
"MITM: merged thinking summary text into existing usage"
|
||||
);
|
||||
} else {
|
||||
// Normal case: replace existing usage
|
||||
latest.insert(key, usage);
|
||||
}
|
||||
} else {
|
||||
latest.insert(key, usage);
|
||||
}
|
||||
|
||||
// Evict old entries to prevent unbounded memory growth
|
||||
const MAX_ENTRIES: usize = 500;
|
||||
|
||||
Reference in New Issue
Block a user