feat: inject images via MITM layer instead of relying on LS

The LS silently ignores the 'images' field from our
SendUserCascadeMessageRequest proto — it never forwards image data
to Google's API.

New approach: store the image in MitmStore, then the MITM request
modifier injects it as 'inlineData' directly into the last user
message's parts array in the Google API JSON request.

Flow:
  Client → Proxy (decode base64, re-encode as standard base64) → MitmStore.set_pending_image()
  LS → Google API → MITM intercepts → inject inlineData part
  → Google receives image + text together

This works for all three API endpoints (responses, completions,
gemini).
This commit is contained in:
Nikketryhard
2026-02-15 17:57:32 -06:00
parent 0a33c1b706
commit 89bea030cc
7 changed files with 108 additions and 2 deletions

View File

@@ -303,6 +303,16 @@ pub(crate) async fn handle_completions(
// Send message on primary cascade // Send message on primary cascade
state.mitm_store.set_active_cascade(&cascade_id).await; state.mitm_store.set_active_cascade(&cascade_id).await;
// Store image for MITM injection (LS doesn't forward images to Google API)
if let Some(ref img) = image {
use base64::Engine;
state.mitm_store.set_pending_image(
crate::mitm::store::PendingImage {
base64_data: base64::engine::general_purpose::STANDARD.encode(&img.data),
mime_type: img.mime_type.clone(),
}
).await;
}
match state match state
.backend .backend
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref()) .send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())

View File

@@ -274,6 +274,16 @@ pub(crate) async fn handle_gemini(
// Send message // Send message
state.mitm_store.set_active_cascade(&cascade_id).await; state.mitm_store.set_active_cascade(&cascade_id).await;
// Store image for MITM injection (LS doesn't forward images to Google API)
if let Some(ref img) = image {
use base64::Engine;
state.mitm_store.set_pending_image(
crate::mitm::store::PendingImage {
base64_data: base64::engine::general_purpose::STANDARD.encode(&img.data),
mime_type: img.mime_type.clone(),
}
).await;
}
match state match state
.backend .backend
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref()) .send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())

View File

@@ -352,6 +352,16 @@ pub(crate) async fn handle_responses(
// Send message // Send message
state.mitm_store.set_active_cascade(&cascade_id).await; state.mitm_store.set_active_cascade(&cascade_id).await;
// Store image for MITM injection (LS doesn't forward images to Google API)
if let Some(ref img) = image {
use base64::Engine;
state.mitm_store.set_pending_image(
crate::mitm::store::PendingImage {
base64_data: base64::engine::general_purpose::STANDARD.encode(&img.data),
mime_type: img.mime_type.clone(),
}
).await;
}
match state match state
.backend .backend
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref()) .send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())

View File

@@ -366,6 +366,12 @@ impl Backend {
return Err("No OAuth token available".to_string()); return Err("No OAuth token available".to_string());
} }
let proto = crate::proto::build_request_with_image(cascade_id, text, &token, model_enum, image); let proto = crate::proto::build_request_with_image(cascade_id, text, &token, model_enum, image);
if image.is_some() {
tracing::info!(
proto_size = proto.len(),
"SendUserCascadeMessage proto built with image"
);
}
self.call_proto("SendUserCascadeMessage", proto).await self.call_proto("SendUserCascadeMessage", proto).await
} }

View File

@@ -8,7 +8,7 @@ use regex::Regex;
use serde_json::Value; use serde_json::Value;
use tracing::info; use tracing::info;
use super::store::{CapturedFunctionCall, PendingToolResult}; use super::store::{CapturedFunctionCall, PendingImage, PendingToolResult};
/// Strip ALL tool definitions. /// Strip ALL tool definitions.
/// Must be true: with tools present, the LS enters full agentic mode /// Must be true: with tools present, the LS enters full agentic mode
@@ -28,6 +28,8 @@ pub struct ToolContext {
pub last_calls: Vec<CapturedFunctionCall>, pub last_calls: Vec<CapturedFunctionCall>,
/// Client-specified generation parameters (temperature, top_p, etc.). /// Client-specified generation parameters (temperature, top_p, etc.).
pub generation_params: Option<super::store::GenerationParams>, pub generation_params: Option<super::store::GenerationParams>,
/// Pending image to inject as inlineData in the user message.
pub pending_image: Option<PendingImage>,
} }
/// Modify a streamGenerateContent request body in-place. /// Modify a streamGenerateContent request body in-place.
@@ -451,6 +453,44 @@ pub fn modify_request(body: &[u8], tool_ctx: Option<&ToolContext>) -> Option<Vec
} }
} }
// ── 7. Inject pending image as inlineData ────────────────────────────
// The LS doesn't forward images from our SendUserCascadeMessage proto to
// Google's API, so we inject them here at the MITM layer.
if let Some(ref ctx) = tool_ctx {
if let Some(ref img) = ctx.pending_image {
if let Some(contents) = json
.pointer_mut("/request/contents")
.and_then(|v| v.as_array_mut())
{
// Find the last user-role message and add inlineData to its parts
let mut injected = false;
for msg in contents.iter_mut().rev() {
let is_user = msg["role"].as_str() == Some("user");
if is_user {
if let Some(parts) = msg.get_mut("parts").and_then(|v| v.as_array_mut()) {
parts.push(serde_json::json!({
"inlineData": {
"mimeType": img.mime_type,
"data": img.base64_data
}
}));
injected = true;
changes.push(format!(
"inject image ({}; {} bytes base64)",
img.mime_type,
img.base64_data.len()
));
break;
}
}
}
if !injected {
tracing::warn!("MITM: pending image but no user message found to inject into");
}
}
}
}
if changes.is_empty() { if changes.is_empty() {
return None; // Nothing modified return None; // Nothing modified
} }

View File

@@ -562,14 +562,16 @@ async fn handle_http_over_tls(
let pending_results = store.take_tool_results().await; let pending_results = store.take_tool_results().await;
let last_calls = store.get_last_function_calls().await; let last_calls = store.get_last_function_calls().await;
let generation_params = store.get_generation_params().await; let generation_params = store.get_generation_params().await;
let pending_image = store.take_pending_image().await;
let tool_ctx = if tools.is_some() || !pending_results.is_empty() || generation_params.is_some() { let tool_ctx = if tools.is_some() || !pending_results.is_empty() || generation_params.is_some() || pending_image.is_some() {
Some(super::modify::ToolContext { Some(super::modify::ToolContext {
tools, tools,
tool_config, tool_config,
pending_results, pending_results,
last_calls, last_calls,
generation_params, generation_params,
pending_image,
}) })
} else { } else {
None None

View File

@@ -60,6 +60,17 @@ pub struct PendingToolResult {
pub result: serde_json::Value, pub result: serde_json::Value,
} }
/// Image payload parked in the store until the MITM layer can splice it
/// into an outgoing Google API request.
///
/// This exists because the LS does not forward images supplied through our
/// SendUserCascadeMessage proto, so they are injected at the MITM layer
/// instead.
#[derive(Clone, Debug)]
pub struct PendingImage {
    /// Image contents as a bare base64 string (no prefix).
    pub base64_data: String,
    /// MIME type of the image, e.g. "image/png".
    pub mime_type: String,
}
/// Client-specified generation parameters for MITM injection. /// Client-specified generation parameters for MITM injection.
/// Set by API handlers, consumed by the MITM modify layer. /// Set by API handlers, consumed by the MITM modify layer.
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
@@ -137,6 +148,10 @@ pub struct MitmStore {
// ── Grounding metadata capture ────────────────────────────────────── // ── Grounding metadata capture ──────────────────────────────────────
/// Captured grounding metadata from Google API responses (search results). /// Captured grounding metadata from Google API responses (search results).
captured_grounding: Arc<RwLock<Option<serde_json::Value>>>, captured_grounding: Arc<RwLock<Option<serde_json::Value>>>,
// ── Pending image for MITM injection ─────────────────────────────────
/// Image to inject into the next Google API request via MITM.
pending_image: Arc<RwLock<Option<PendingImage>>>,
} }
/// Aggregate statistics across all intercepted traffic. /// Aggregate statistics across all intercepted traffic.
@@ -181,6 +196,7 @@ impl MitmStore {
response_complete: Arc::new(AtomicBool::new(false)), response_complete: Arc::new(AtomicBool::new(false)),
generation_params: Arc::new(RwLock::new(None)), generation_params: Arc::new(RwLock::new(None)),
captured_grounding: Arc::new(RwLock::new(None)), captured_grounding: Arc::new(RwLock::new(None)),
pending_image: Arc::new(RwLock::new(None)),
} }
} }
@@ -506,4 +522,16 @@ impl MitmStore {
pub async fn peek_grounding(&self) -> Option<serde_json::Value> { pub async fn peek_grounding(&self) -> Option<serde_json::Value> {
self.captured_grounding.read().await.clone() self.captured_grounding.read().await.clone()
} }
// ── Pending image for MITM injection ─────────────────────────────────
/// Park `image` in the store so it can be injected via MITM.
///
/// Takes the write lock on the pending-image slot and overwrites whatever
/// was stored there before, so only the most recently set image is kept.
pub async fn set_pending_image(&self, image: PendingImage) {
    let mut slot = self.pending_image.write().await;
    *slot = Some(image);
}
/// Remove and return the pending image, if one is stored.
///
/// Takes the write lock on the pending-image slot and leaves it empty, so a
/// second call returns `None` until another image is set.
pub async fn take_pending_image(&self) -> Option<PendingImage> {
    let mut slot = self.pending_image.write().await;
    slot.take()
}
} }