feat: capture thinking text via MITM dual-call merge

The LS makes TWO separate Google API calls for thinking models: Call 1: response + thinking token count (no thinking text) Call 2: thinking summary text (no thinking tokens) Each hits a different StreamingAccumulator, so we: 1. Capture response_text in StreamingAccumulator (non-thinking parts) 2. In MitmStore::record_usage, detect when Call 2 arrives for a cascade that already has thinking tokens from Call 1 3. Merge Call 2's response_text as thinking_text on Call 1's usage Also injects includeThoughts into Google API requests via MITM modify to ensure thinking text is available in SSE responses.
2026-02-14 19:49:15 -06:00
parent 905d55beb5
commit 34b9553484
4 changed files with 92 additions and 3 deletions
--- a/src/mitm/intercept.rs
+++ b/src/mitm/intercept.rs
@@ -59,6 +59,9 @@ pub struct StreamingAccumulator {
    pub thinking_tokens: u64,
    /// Accumulated thinking/reasoning text from the model.
    pub thinking_text: String,
    /// Accumulated response text (non-thinking parts).
    /// Used to identify "thinking summary" calls in the v1internal API.
    pub response_text: String,
    pub model: Option<String>,
    pub stop_reason: Option<String>,
    pub is_complete: bool,
@@ -83,16 +86,24 @@ impl StreamingAccumulator {
            if let Some(model) = response["modelVersion"].as_str() {
                self.model = Some(model.to_string());
            }
            // Extract thinking text from parts with thought: true
            if let Some(candidates) = response.get("candidates").and_then(|c| c.as_array()) {
                for candidate in candidates {
                    if let Some(parts) = candidate["content"]["parts"].as_array() {
                        for part in parts {
                            // Public Gemini API: explicit thought flag
                            if part["thought"].as_bool() == Some(true) {
                                if let Some(text) = part["text"].as_str() {
                                    self.thinking_text.push_str(text);
                                }
                            }
                            // Capture non-thinking response text (skip thoughtSignature parts)
                            else if part.get("thoughtSignature").is_none() {
                                if let Some(text) = part["text"].as_str() {
                                    if !text.is_empty() {
                                        self.response_text.push_str(text);
                                    }
                                }
                            }
                        }
                    }
                    // Check for completion
@@ -172,6 +183,11 @@ impl StreamingAccumulator {
        } else {
            Some(self.thinking_text)
        };
        let response_text = if self.response_text.is_empty() {
            None
        } else {
            Some(self.response_text)
        };
        ApiUsage {
            input_tokens: self.input_tokens,
            output_tokens: self.output_tokens,
@@ -179,6 +195,7 @@ impl StreamingAccumulator {
            cache_read_input_tokens: self.cache_read_input_tokens,
            thinking_output_tokens: self.thinking_tokens,
            thinking_text,
            response_text,
            response_output_tokens: 0,
            model: self.model,
            stop_reason: self.stop_reason,
@@ -203,6 +220,7 @@ fn extract_usage_from_message(msg: &Value) -> Option<ApiUsage> {
        cache_read_input_tokens: usage["cache_read_input_tokens"].as_u64().unwrap_or(0),
        thinking_output_tokens: 0,
        thinking_text: None,
        response_text: None,
        response_output_tokens: 0,
        model: msg["model"].as_str().map(|s| s.to_string()),
        stop_reason: msg["stop_reason"].as_str().map(|s| s.to_string()),
--- a/src/mitm/modify.rs
+++ b/src/mitm/modify.rs
@@ -152,6 +152,47 @@ pub fn modify_request(body: &[u8]) -> Option<Vec<u8>> {
        }
    }
    // ── 4. Inject includeThoughts to capture thinking text ───────────────
    // Without this flag, Google only reports thinking token counts
    // but doesn't send the thinking text in SSE parts.
    {
        // Ensure request.generationConfig.thinkingConfig.includeThoughts = true
        let request = json.get_mut("request").and_then(|v| v.as_object_mut());
        if let Some(req) = request {
            let gen_config = req
                .entry("generationConfig")
                .or_insert_with(|| serde_json::json!({}));
            if let Some(gc) = gen_config.as_object_mut() {
                let thinking_config = gc
                    .entry("thinkingConfig")
                    .or_insert_with(|| serde_json::json!({}));
                if let Some(tc) = thinking_config.as_object_mut() {
                    if !tc.contains_key("includeThoughts") {
                        tc.insert("includeThoughts".to_string(), Value::Bool(true));
                        changes.push("inject includeThoughts".to_string());
                    }
                }
            }
        } else {
            // Not wrapped in request — try top-level (public API format)
            let gen_config = json.as_object_mut().and_then(|o| {
                Some(o.entry("generationConfig")
                    .or_insert_with(|| serde_json::json!({})))
            });
            if let Some(gc) = gen_config.and_then(|v| v.as_object_mut()) {
                let thinking_config = gc
                    .entry("thinkingConfig")
                    .or_insert_with(|| serde_json::json!({}));
                if let Some(tc) = thinking_config.as_object_mut() {
                    if !tc.contains_key("includeThoughts") {
                        tc.insert("includeThoughts".to_string(), Value::Bool(true));
                        changes.push("inject includeThoughts (top-level)".to_string());
                    }
                }
            }
        }
    }
    if changes.is_empty() {
        return None; // Nothing modified
    }
--- a/src/mitm/proto.rs
+++ b/src/mitm/proto.rs
@@ -80,6 +80,7 @@ impl GrpcUsage {
            output_tokens: self.output_tokens,
            thinking_output_tokens: self.thinking_output_tokens,
            thinking_text: None, // gRPC proto doesn't carry thinking text
            response_text: None,
            response_output_tokens: self.response_output_tokens,
            cache_creation_input_tokens: self.cache_write_tokens,
            cache_read_input_tokens: self.cache_read_tokens,
--- a/src/mitm/store.rs
+++ b/src/mitm/store.rs
@@ -26,6 +26,9 @@ pub struct ApiUsage {
    /// Captured from Google SSE parts with `thought: true` or Anthropic thinking blocks.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking_text: Option<String>,
    /// The response text captured from SSE parts (for merge detection).
    #[serde(skip)]
    pub response_text: Option<String>,
    /// Google-specific: response output tokens (non-thinking portion)
    pub response_output_tokens: u64,
@@ -122,10 +125,36 @@ impl MitmStore {
            }
        }
-        // Store latest usage for the cascade (if we can identify it)
+        // Store latest usage for the cascade (if we can identify it).
        //
        // Merge logic for v1internal thinking summaries:
        // The LS makes TWO Google API calls per thinking request:
        //   Call 1: response + thinking token count (thinking_output_tokens > 0, no thinking text)
        //   Call 2: thinking summary text (thinking_output_tokens == 0, response_text has the summary)
        //
        // When Call 2 arrives, we merge its response_text as thinking_text into Call 1's usage.
        let key = cascade_id.map(|s| s.to_string()).unwrap_or_else(|| "_latest".to_string());
        let mut latest = self.latest_usage.write().await;
-        latest.insert(key, usage);
+
        if let Some(existing) = latest.get_mut(&key) {
            if existing.thinking_output_tokens > 0
                && existing.thinking_text.is_none()
                && usage.thinking_output_tokens == 0
                && usage.response_text.is_some()
            {
                // Call 2: thinking summary — merge into existing Call 1 usage
                existing.thinking_text = usage.response_text;
                debug!(
                    thinking_text_len = existing.thinking_text.as_ref().map_or(0, |t| t.len()),
                    "MITM: merged thinking summary text into existing usage"
                );
            } else {
                // Normal case: replace existing usage
                latest.insert(key, usage);
            }
        } else {
            latest.insert(key, usage);
        }
        // Evict old entries to prevent unbounded memory growth
        const MAX_ENTRIES: usize = 500;