feat: capture thinking text via MITM dual-call merge

The LS makes TWO separate Google API calls for thinking models:

- Call 1: response + thinking token count (no thinking text)
- Call 2: thinking summary text (no thinking tokens)

Each call hits a different StreamingAccumulator, so we:

1. Capture response_text in StreamingAccumulator (non-thinking parts).
2. In MitmStore::record_usage, detect when Call 2 arrives for a cascade that already has thinking tokens from Call 1.
3. Merge Call 2's response_text as thinking_text on Call 1's usage.

Also injects includeThoughts into Google API requests via MITM modify to ensure thinking text is available in SSE responses.
2026-02-14 19:49:15 -06:00
parent 905d55beb5
commit 34b9553484
4 changed files with 92 additions and 3 deletions
--- a/src/mitm/intercept.rs
+++ b/src/mitm/intercept.rs
@@ -59,6 +59,9 @@ pub struct StreamingAccumulator {
    pub thinking_tokens: u64,
    /// Accumulated thinking/reasoning text from the model.
    pub thinking_text: String,
+    /// Accumulated response text (non-thinking parts).
+    /// Used to identify "thinking summary" calls in the v1internal API.
+    pub response_text: String,
    pub model: Option<String>,
    pub stop_reason: Option<String>,
    pub is_complete: bool,
@@ -83,16 +86,24 @@ impl StreamingAccumulator {
            if let Some(model) = response["modelVersion"].as_str() {
                self.model = Some(model.to_string());
            }
-            // Extract thinking text from parts with thought: true
            if let Some(candidates) = response.get("candidates").and_then(|c| c.as_array()) {
                for candidate in candidates {
                    if let Some(parts) = candidate["content"]["parts"].as_array() {
                        for part in parts {
+                            // Public Gemini API: explicit thought flag
                            if part["thought"].as_bool() == Some(true) {
                                if let Some(text) = part["text"].as_str() {
                                    self.thinking_text.push_str(text);
                                }
                            }
+                            // Capture non-thinking response text (skip thoughtSignature parts)
+                            else if part.get("thoughtSignature").is_none() {
+                                if let Some(text) = part["text"].as_str() {
+                                    if !text.is_empty() {
+                                        self.response_text.push_str(text);
+                                    }
+                                }
+                            }
                        }
                    }
                    // Check for completion
@@ -172,6 +183,11 @@ impl StreamingAccumulator {
        } else {
            Some(self.thinking_text)
        };
+        let response_text = if self.response_text.is_empty() {
+            None
+        } else {
+            Some(self.response_text)
+        };
        ApiUsage {
            input_tokens: self.input_tokens,
            output_tokens: self.output_tokens,
@@ -179,6 +195,7 @@ impl StreamingAccumulator {
            cache_read_input_tokens: self.cache_read_input_tokens,
            thinking_output_tokens: self.thinking_tokens,
            thinking_text,
+            response_text,
            response_output_tokens: 0,
            model: self.model,
            stop_reason: self.stop_reason,
@@ -203,6 +220,7 @@ fn extract_usage_from_message(msg: &Value) -> Option<ApiUsage> {
        cache_read_input_tokens: usage["cache_read_input_tokens"].as_u64().unwrap_or(0),
        thinking_output_tokens: 0,
        thinking_text: None,
+        response_text: None,
        response_output_tokens: 0,
        model: msg["model"].as_str().map(|s| s.to_string()),
        stop_reason: msg["stop_reason"].as_str().map(|s| s.to_string()),