diff --git a/src/mitm/intercept.rs b/src/mitm/intercept.rs index cd94941..3447319 100644 --- a/src/mitm/intercept.rs +++ b/src/mitm/intercept.rs @@ -59,6 +59,9 @@ pub struct StreamingAccumulator { pub thinking_tokens: u64, /// Accumulated thinking/reasoning text from the model. pub thinking_text: String, + /// Accumulated response text (non-thinking parts). + /// Used to identify "thinking summary" calls in the v1internal API. + pub response_text: String, pub model: Option, pub stop_reason: Option, pub is_complete: bool, @@ -83,16 +86,24 @@ impl StreamingAccumulator { if let Some(model) = response["modelVersion"].as_str() { self.model = Some(model.to_string()); } - // Extract thinking text from parts with thought: true if let Some(candidates) = response.get("candidates").and_then(|c| c.as_array()) { for candidate in candidates { if let Some(parts) = candidate["content"]["parts"].as_array() { for part in parts { + // Public Gemini API: explicit thought flag if part["thought"].as_bool() == Some(true) { if let Some(text) = part["text"].as_str() { self.thinking_text.push_str(text); } } + // Capture non-thinking response text (skip thoughtSignature parts) + else if part.get("thoughtSignature").is_none() { + if let Some(text) = part["text"].as_str() { + if !text.is_empty() { + self.response_text.push_str(text); + } + } + } } } // Check for completion @@ -172,6 +183,11 @@ impl StreamingAccumulator { } else { Some(self.thinking_text) }; + let response_text = if self.response_text.is_empty() { + None + } else { + Some(self.response_text) + }; ApiUsage { input_tokens: self.input_tokens, output_tokens: self.output_tokens, @@ -179,6 +195,7 @@ impl StreamingAccumulator { cache_read_input_tokens: self.cache_read_input_tokens, thinking_output_tokens: self.thinking_tokens, thinking_text, + response_text, response_output_tokens: 0, model: self.model, stop_reason: self.stop_reason, @@ -203,6 +220,7 @@ fn extract_usage_from_message(msg: &Value) -> Option { cache_read_input_tokens: usage["cache_read_input_tokens"].as_u64().unwrap_or(0), thinking_output_tokens: 0, thinking_text: None, + response_text: None, response_output_tokens: 0, model: msg["model"].as_str().map(|s| s.to_string()), stop_reason: msg["stop_reason"].as_str().map(|s| s.to_string()), diff --git a/src/mitm/modify.rs b/src/mitm/modify.rs index 6195b31..f9dce3a 100644 --- a/src/mitm/modify.rs +++ b/src/mitm/modify.rs @@ -152,6 +152,47 @@ pub fn modify_request(body: &[u8]) -> Option> { } } + // ── 4. Inject includeThoughts to capture thinking text ─────────────── + // Without this flag, Google only reports thinking token counts + // but doesn't send the thinking text in SSE parts. + { + // Ensure request.generationConfig.thinkingConfig.includeThoughts = true + let request = json.get_mut("request").and_then(|v| v.as_object_mut()); + if let Some(req) = request { + let gen_config = req + .entry("generationConfig") + .or_insert_with(|| serde_json::json!({})); + if let Some(gc) = gen_config.as_object_mut() { + let thinking_config = gc + .entry("thinkingConfig") + .or_insert_with(|| serde_json::json!({})); + if let Some(tc) = thinking_config.as_object_mut() { + if !tc.contains_key("includeThoughts") { + tc.insert("includeThoughts".to_string(), Value::Bool(true)); + changes.push("inject includeThoughts".to_string()); + } + } + } + } else { + // Not wrapped in request — try top-level (public API format) + let gen_config = json.as_object_mut().and_then(|o| { + Some(o.entry("generationConfig") + .or_insert_with(|| serde_json::json!({}))) + }); + if let Some(gc) = gen_config.and_then(|v| v.as_object_mut()) { + let thinking_config = gc + .entry("thinkingConfig") + .or_insert_with(|| serde_json::json!({})); + if let Some(tc) = thinking_config.as_object_mut() { + if !tc.contains_key("includeThoughts") { + tc.insert("includeThoughts".to_string(), Value::Bool(true)); + changes.push("inject includeThoughts (top-level)".to_string()); + } + } + } + } + } + if changes.is_empty() { return None; // Nothing modified } diff --git a/src/mitm/proto.rs b/src/mitm/proto.rs index 5d016ad..8561a59 100644 --- a/src/mitm/proto.rs +++ b/src/mitm/proto.rs @@ -80,6 +80,7 @@ impl GrpcUsage { output_tokens: self.output_tokens, thinking_output_tokens: self.thinking_output_tokens, thinking_text: None, // gRPC proto doesn't carry thinking text + response_text: None, response_output_tokens: self.response_output_tokens, cache_creation_input_tokens: self.cache_write_tokens, cache_read_input_tokens: self.cache_read_tokens, diff --git a/src/mitm/store.rs b/src/mitm/store.rs index 71931d3..69176ed 100644 --- a/src/mitm/store.rs +++ b/src/mitm/store.rs @@ -26,6 +26,9 @@ pub struct ApiUsage { /// Captured from Google SSE parts with `thought: true` or Anthropic thinking blocks. #[serde(skip_serializing_if = "Option::is_none")] pub thinking_text: Option, + /// The response text captured from SSE parts (for merge detection). + #[serde(skip)] + pub response_text: Option, /// Google-specific: response output tokens (non-thinking portion) pub response_output_tokens: u64, @@ -122,10 +125,36 @@ impl MitmStore { } } - // Store latest usage for the cascade (if we can identify it) + // Store latest usage for the cascade (if we can identify it). + // + // Merge logic for v1internal thinking summaries: + // The LS makes TWO Google API calls per thinking request: + // Call 1: response + thinking token count (thinking_output_tokens > 0, no thinking text) + // Call 2: thinking summary text (thinking_output_tokens == 0, response_text has the summary) + // + // When Call 2 arrives, we merge its response_text as thinking_text into Call 1's usage. let key = cascade_id.map(|s| s.to_string()).unwrap_or_else(|| "_latest".to_string()); let mut latest = self.latest_usage.write().await; - latest.insert(key, usage); + + if let Some(existing) = latest.get_mut(&key) { + if existing.thinking_output_tokens > 0 + && existing.thinking_text.is_none() + && usage.thinking_output_tokens == 0 + && usage.response_text.is_some() + { + // Call 2: thinking summary — merge into existing Call 1 usage + existing.thinking_text = usage.response_text; + debug!( + thinking_text_len = existing.thinking_text.as_ref().map_or(0, |t| t.len()), + "MITM: merged thinking summary text into existing usage" + ); + } else { + // Normal case: replace existing usage + latest.insert(key, usage); + } + } else { + latest.insert(key, usage); + } // Evict old entries to prevent unbounded memory growth const MAX_ENTRIES: usize = 500;