feat: capture thinking text via MITM dual-call merge

The LS makes TWO separate Google API calls for thinking models. Call 1: response + thinking token count (no thinking text). Call 2: thinking summary text (no thinking tokens). Each call hits a different StreamingAccumulator, so we: (1) capture response_text in StreamingAccumulator (non-thinking parts); (2) in MitmStore::record_usage, detect when Call 2 arrives for a cascade that already has thinking tokens from Call 1; (3) merge Call 2's response_text as thinking_text on Call 1's usage. Also injects includeThoughts into Google API requests via MITM modify to ensure thinking text is available in SSE responses.
2026-02-14 19:49:15 -06:00
parent 905d55beb5
commit 34b9553484
4 changed files with 92 additions and 3 deletions
--- a/src/mitm/store.rs
+++ b/src/mitm/store.rs
@@ -26,6 +26,9 @@ pub struct ApiUsage {
    /// Captured from Google SSE parts with `thought: true` or Anthropic thinking blocks.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking_text: Option<String>,
+    /// The response text captured from SSE parts (for merge detection).
+    #[serde(skip)]
+    pub response_text: Option<String>,
    /// Google-specific: response output tokens (non-thinking portion)
    pub response_output_tokens: u64,

@@ -122,10 +125,36 @@ impl MitmStore {
            }
        }

-        // Store latest usage for the cascade (if we can identify it)
+        // Store latest usage for the cascade (if we can identify it).
+        //
+        // Merge logic for v1internal thinking summaries:
+        // The LS makes TWO Google API calls per thinking request:
+        //   Call 1: response + thinking token count (thinking_output_tokens > 0, no thinking text)
+        //   Call 2: thinking summary text (thinking_output_tokens == 0, response_text has the summary)
+        //
+        // When Call 2 arrives, we merge its response_text as thinking_text into Call 1's usage.
        let key = cascade_id.map(|s| s.to_string()).unwrap_or_else(|| "_latest".to_string());
        let mut latest = self.latest_usage.write().await;
-        latest.insert(key, usage);
+
+        if let Some(existing) = latest.get_mut(&key) {
+            if existing.thinking_output_tokens > 0
+                && existing.thinking_text.is_none()
+                && usage.thinking_output_tokens == 0
+                && usage.response_text.is_some()
+            {
+                // Call 2: thinking summary — merge into existing Call 1 usage
+                existing.thinking_text = usage.response_text;
+                debug!(
+                    thinking_text_len = existing.thinking_text.as_ref().map_or(0, |t| t.len()),
+                    "MITM: merged thinking summary text into existing usage"
+                );
+            } else {
+                // Normal case: replace existing usage
+                latest.insert(key, usage);
+            }
+        } else {
+            latest.insert(key, usage);
+        }

        // Evict old entries to prevent unbounded memory growth
        const MAX_ENTRIES: usize = 500;