feat: inject images via MITM layer instead of relying on LS

The LS silently ignores the 'images' field from our
SendUserCascadeMessageRequest proto — it never forwards image data
to Google's API.

New approach: store the image in MitmStore, then the MITM request
modifier injects it as 'inlineData' directly into the last user
message's parts array in the Google API JSON request.

Flow:
  Client → Proxy (decode base64, re-encode for storage) → MitmStore.set_pending_image()
  LS → Google API → MITM intercepts → inject inlineData part
  → Google receives image + text together

This works for all three API endpoints (responses, completions,
gemini).
This commit is contained in:
Nikketryhard
2026-02-15 17:57:32 -06:00
parent 0a33c1b706
commit 89bea030cc
7 changed files with 108 additions and 2 deletions

View File

@@ -303,6 +303,16 @@ pub(crate) async fn handle_completions(
// Send message on primary cascade
state.mitm_store.set_active_cascade(&cascade_id).await;
// The LS never forwards image data to Google's API, so hand the image to the
// MITM store; it is injected into the outgoing request at the MITM layer.
if let Some(img) = image.as_ref() {
    use base64::Engine;
    let base64_data = base64::engine::general_purpose::STANDARD.encode(&img.data);
    let pending = crate::mitm::store::PendingImage {
        base64_data,
        mime_type: img.mime_type.clone(),
    };
    state.mitm_store.set_pending_image(pending).await;
}
match state
.backend
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())

View File

@@ -274,6 +274,16 @@ pub(crate) async fn handle_gemini(
// Send message
state.mitm_store.set_active_cascade(&cascade_id).await;
// The LS never forwards image data to Google's API, so hand the image to the
// MITM store; it is injected into the outgoing request at the MITM layer.
if let Some(img) = image.as_ref() {
    use base64::Engine;
    let base64_data = base64::engine::general_purpose::STANDARD.encode(&img.data);
    let pending = crate::mitm::store::PendingImage {
        base64_data,
        mime_type: img.mime_type.clone(),
    };
    state.mitm_store.set_pending_image(pending).await;
}
match state
.backend
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())

View File

@@ -352,6 +352,16 @@ pub(crate) async fn handle_responses(
// Send message
state.mitm_store.set_active_cascade(&cascade_id).await;
// The LS never forwards image data to Google's API, so hand the image to the
// MITM store; it is injected into the outgoing request at the MITM layer.
if let Some(img) = image.as_ref() {
    use base64::Engine;
    let base64_data = base64::engine::general_purpose::STANDARD.encode(&img.data);
    let pending = crate::mitm::store::PendingImage {
        base64_data,
        mime_type: img.mime_type.clone(),
    };
    state.mitm_store.set_pending_image(pending).await;
}
match state
.backend
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())

View File

@@ -366,6 +366,12 @@ impl Backend {
return Err("No OAuth token available".to_string());
}
let proto = crate::proto::build_request_with_image(cascade_id, text, &token, model_enum, image);
// Diagnostic only: when an image was supplied, record the byte length of the
// built SendUserCascadeMessage proto. Nothing is logged for text-only sends.
if image.is_some() {
tracing::info!(
proto_size = proto.len(),
"SendUserCascadeMessage proto built with image"
);
}
self.call_proto("SendUserCascadeMessage", proto).await
}

View File

@@ -8,7 +8,7 @@ use regex::Regex;
use serde_json::Value;
use tracing::info;
use super::store::{CapturedFunctionCall, PendingToolResult};
use super::store::{CapturedFunctionCall, PendingImage, PendingToolResult};
/// Strip ALL tool definitions.
/// Must be true: with tools present, the LS enters full agentic mode
@@ -28,6 +28,8 @@ pub struct ToolContext {
pub last_calls: Vec<CapturedFunctionCall>,
/// Client-specified generation parameters (temperature, top_p, etc.).
pub generation_params: Option<super::store::GenerationParams>,
/// Pending image to inject as inlineData in the user message.
pub pending_image: Option<PendingImage>,
}
/// Modify a streamGenerateContent request body in-place.
@@ -451,6 +453,44 @@ pub fn modify_request(body: &[u8], tool_ctx: Option<&ToolContext>) -> Option<Vec
}
}
// ── 7. Inject pending image as inlineData ────────────────────────────
// The LS doesn't forward images from our SendUserCascadeMessage proto to
// Google's API, so we inject them here at the MITM layer: the image is
// appended as an extra `inlineData` part on the most recent user message.
if let Some(img) = tool_ctx.and_then(|ctx| ctx.pending_image.as_ref()) {
    match json
        .pointer_mut("/request/contents")
        .and_then(|v| v.as_array_mut())
    {
        Some(contents) => {
            // Walk the conversation newest-first and pick the first user-role
            // message that actually has a `parts` array; user messages without
            // one are skipped in favour of older ones.
            let target_parts = contents
                .iter_mut()
                .rev()
                .filter(|msg| msg.get("role").and_then(|r| r.as_str()) == Some("user"))
                .find_map(|msg| msg.get_mut("parts").and_then(|v| v.as_array_mut()));

            match target_parts {
                Some(parts) => {
                    parts.push(serde_json::json!({
                        "inlineData": {
                            "mimeType": img.mime_type,
                            "data": img.base64_data
                        }
                    }));
                    // Recording a change makes the caller emit the modified body.
                    changes.push(format!(
                        "inject image ({}; {} bytes base64)",
                        img.mime_type,
                        img.base64_data.len()
                    ));
                }
                None => {
                    tracing::warn!("MITM: pending image but no user message found to inject into");
                }
            }
        }
        None => {
            // Previously this case was silent: the image was simply lost with
            // no trace in the logs. Surface it so dropped images can be diagnosed.
            tracing::warn!("MITM: pending image but request has no contents array to inject into");
        }
    }
}
if changes.is_empty() {
return None; // Nothing modified
}

View File

@@ -562,14 +562,16 @@ async fn handle_http_over_tls(
let pending_results = store.take_tool_results().await;
let last_calls = store.get_last_function_calls().await;
let generation_params = store.get_generation_params().await;
let pending_image = store.take_pending_image().await;
let tool_ctx = if tools.is_some() || !pending_results.is_empty() || generation_params.is_some() {
let tool_ctx = if tools.is_some() || !pending_results.is_empty() || generation_params.is_some() || pending_image.is_some() {
Some(super::modify::ToolContext {
tools,
tool_config,
pending_results,
last_calls,
generation_params,
pending_image,
})
} else {
None

View File

@@ -60,6 +60,17 @@ pub struct PendingToolResult {
pub result: serde_json::Value,
}
/// A pending image to inject via MITM into the Google API request.
/// The LS doesn't forward images from our SendUserCascadeMessage proto,
/// so we inject them directly at the MITM layer.
///
/// API handlers store one with `MitmStore::set_pending_image`; it is removed
/// from the store again by `MitmStore::take_pending_image`.
#[derive(Debug, Clone)]
pub struct PendingImage {
/// Base64-encoded image data (no prefix).
/// Emitted verbatim as the `data` field of the injected `inlineData` part.
pub base64_data: String,
/// MIME type, e.g. "image/png".
/// Emitted as the `mimeType` field of the injected `inlineData` part.
pub mime_type: String,
}
/// Client-specified generation parameters for MITM injection.
/// Set by API handlers, consumed by the MITM modify layer.
#[derive(Debug, Clone, Default)]
@@ -137,6 +148,10 @@ pub struct MitmStore {
// ── Grounding metadata capture ──────────────────────────────────────
/// Captured grounding metadata from Google API responses (search results).
captured_grounding: Arc<RwLock<Option<serde_json::Value>>>,
// ── Pending image for MITM injection ─────────────────────────────────
/// Image to inject into the next Google API request via MITM.
pending_image: Arc<RwLock<Option<PendingImage>>>,
}
/// Aggregate statistics across all intercepted traffic.
@@ -181,6 +196,7 @@ impl MitmStore {
response_complete: Arc::new(AtomicBool::new(false)),
generation_params: Arc::new(RwLock::new(None)),
captured_grounding: Arc::new(RwLock::new(None)),
pending_image: Arc::new(RwLock::new(None)),
}
}
@@ -506,4 +522,16 @@ impl MitmStore {
pub async fn peek_grounding(&self) -> Option<serde_json::Value> {
    // Read-only access: the stored metadata is cloned out and left in place.
    let guard = self.captured_grounding.read().await;
    (*guard).clone()
}
// ── Pending image for MITM injection ─────────────────────────────────
/// Put `image` into the pending-image slot for MITM injection, replacing
/// whatever image was stored there before.
pub async fn set_pending_image(&self, image: PendingImage) {
    let mut slot = self.pending_image.write().await;
    *slot = Some(image);
}
/// Remove and return the pending image, leaving the slot empty.
/// Returns `None` when no image is waiting.
pub async fn take_pending_image(&self) -> Option<PendingImage> {
    let mut slot = self.pending_image.write().await;
    std::mem::take(&mut *slot)
}
}