diff --git a/src/api/completions.rs b/src/api/completions.rs
index 4a242b4..14d369e 100644
--- a/src/api/completions.rs
+++ b/src/api/completions.rs
@@ -82,10 +82,16 @@ fn google_to_openai_finish_reason(stop_reason: Option<&str>) -> &'static str {
 /// Builds the full conversation context including all messages (system, user,
 /// assistant, tool) so the model has complete history — matching how OpenAI
 /// sends the entire messages array to the model.
-fn extract_chat_input(messages: &[CompletionMessage]) -> String {
-    // Always build the full conversation — we used to only take the last user
-    // message which broke multi-turn conversations via the messages array.
-    build_conversation_with_tools(messages)
+///
+/// Returns the conversation text together with the first image found in the
+/// content of the most recent `user` message, if any.
+fn extract_chat_input(messages: &[CompletionMessage]) -> (String, Option<crate::proto::ImageData>) {
+    // Extract image from last user message content array
+    let image = messages.iter().rev()
+        .find(|m| m.role == "user")
+        .and_then(|m| super::util::extract_first_image(&m.content));
+    // Always build the full conversation
+    (build_conversation_with_tools(messages), image)
 }
 
 /// Extract text content from a message's content field (string or array).
@@ -257,7 +260,7 @@ pub(crate) async fn handle_completions(
         );
     }
 
-    let user_text = extract_chat_input(&body.messages);
+    let (user_text, image) = extract_chat_input(&body.messages);
     if user_text.is_empty() {
         return err_response(
             StatusCode::BAD_REQUEST,
@@ -302,7 +305,7 @@ pub(crate) async fn handle_completions(
     state.mitm_store.set_active_cascade(&cascade_id).await;
     match state
         .backend
-        .send_message(&cascade_id, &user_text, model.model_enum)
+        .send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
         .await
     {
         Ok((200, _)) => {
@@ -361,7 +364,7 @@ pub(crate) async fn handle_completions(
             match state.backend.create_cascade().await {
                 Ok(cid) => {
                     // Send the same message on each extra cascade
-                    match state.backend.send_message(&cid, &user_text, model.model_enum).await {
+                    match state.backend.send_message_with_image(&cid, &user_text, model.model_enum, image.as_ref()).await {
                         Ok((200, _)) => {
                             let bg = Arc::clone(&state.backend);
                             let cid2 = cid.clone();
diff --git a/src/api/gemini.rs b/src/api/gemini.rs
index ba33783..a418b78 100644
--- a/src/api/gemini.rs
+++ b/src/api/gemini.rs
@@ -135,11 +135,12 @@ pub(crate) async fn handle_gemini(
         );
     }
 
-    // Extract user text
+    // Extract user text and optional image
+    let mut image: Option<crate::proto::ImageData> = None;
     let user_text = match &body.input {
         serde_json::Value::String(s) => s.clone(),
         serde_json::Value::Array(arr) => {
-            // Support array input: can be strings or {text: "..."} objects
+            // Support array input: strings, {text: "..."}, or {inlineData: {mimeType, data}}
             let mut parts: Vec<String> = Vec::new();
             for item in arr {
                 match item {
@@ -148,6 +149,20 @@ pub(crate) async fn handle_gemini(
                         if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
                             parts.push(text.to_string());
                         }
+                        // Capture only the first image. Gemini-native inlineData takes
+                        // precedence; an OpenAI-style image part is the fallback.
+                        if image.is_none() {
+                            image = obj
+                                .get("inlineData")
+                                .and_then(|inline| {
+                                    let mime = inline["mimeType"].as_str()?;
+                                    let b64 = inline["data"].as_str()?;
+                                    super::util::parse_data_uri(&format!(
+                                        "data:{mime};base64,{b64}"
+                                    ))
+                                })
+                                .or_else(|| super::util::extract_image_from_content(item));
+                        }
                     }
                     _ => {}
                 }
@@ -256,7 +271,7 @@ pub(crate) async fn handle_gemini(
     state.mitm_store.set_active_cascade(&cascade_id).await;
     match state
         .backend
-        .send_message(&cascade_id, &user_text, model.model_enum)
+        .send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
         .await
     {
         Ok((200, _)) => {
diff --git a/src/api/responses.rs b/src/api/responses.rs
index 03c6ce7..7fe8f90 100644
--- a/src/api/responses.rs
+++ b/src/api/responses.rs
@@ -30,9 +30,14 @@ struct ToolResultInput {
 }
 
 /// Extract user text from Responses API `input` field.
-/// Also extracts any function_call_output items for tool result handling.
-fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>) -> (String, Vec<ToolResultInput>) {
+/// Also extracts any function_call_output items for tool result handling,
+/// and the first inline image (base64 data URI) if present.
+fn extract_responses_input(
+    input: &serde_json::Value,
+    instructions: Option<&str>,
+) -> (String, Vec<ToolResultInput>, Option<crate::proto::ImageData>) {
     let mut tool_results: Vec<ToolResultInput> = Vec::new();
+    let mut image: Option<crate::proto::ImageData> = None;
 
     let user_text = match input {
         serde_json::Value::String(s) => s.clone(),
@@ -50,6 +55,10 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
                         });
                     }
                 }
+                // Extract first image from top-level input items
+                if image.is_none() {
+                    image = super::util::extract_image_from_content(item);
+                }
             }
 
             // If we have tool results but no text, generate a follow-up prompt
@@ -76,20 +85,26 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
                 .iter()
                 .rev()
                 .find(|item| item["role"].as_str() == Some("user"))
-                .and_then(|item| match &item["content"] {
-                    serde_json::Value::String(s) => Some(s.clone()),
-                    serde_json::Value::Array(parts) => Some(
-                        parts
-                            .iter()
-                            .filter(|p| {
-                                let t = p["type"].as_str().unwrap_or("");
-                                t == "input_text" || t == "text"
-                            })
-                            .filter_map(|p| p["text"].as_str())
-                            .collect::<Vec<_>>()
-                            .join(" "),
-                    ),
-                    _ => None,
+                .and_then(|item| {
+                    // Also scan content array for images
+                    if image.is_none() {
+                        image = super::util::extract_first_image(&item["content"]);
+                    }
+                    match &item["content"] {
+                        serde_json::Value::String(s) => Some(s.clone()),
+                        serde_json::Value::Array(parts) => Some(
+                            parts
+                                .iter()
+                                .filter(|p| {
+                                    let t = p["type"].as_str().unwrap_or("");
+                                    t == "input_text" || t == "text"
+                                })
+                                .filter_map(|p| p["text"].as_str())
+                                .collect::<Vec<_>>()
+                                .join(" "),
+                        ),
+                        _ => None,
+                    }
                 })
                 .unwrap_or_default()
         }
@@ -102,7 +117,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
         _ => user_text,
     };
 
-    (final_text, tool_results)
+    (final_text, tool_results, image)
 }
 
 /// Extract conversation/session ID from Responses API `conversation` field.
@@ -199,7 +214,7 @@ pub(crate) async fn handle_responses(
         );
     }
 
-    let (user_text, tool_results) = extract_responses_input(&body.input, body.instructions.as_deref());
+    let (user_text, tool_results, image) = extract_responses_input(&body.input, body.instructions.as_deref());
 
     // Handle tool result submission (function_call_output in input)
     let is_tool_result_turn = !tool_results.is_empty();
@@ -339,7 +354,7 @@ pub(crate) async fn handle_responses(
     state.mitm_store.set_active_cascade(&cascade_id).await;
     match state
         .backend
-        .send_message(&cascade_id, &user_text, model.model_enum)
+        .send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
         .await
     {
         Ok((200, _)) => {
diff --git a/src/api/util.rs b/src/api/util.rs
index 393e327..2f4bee6 100644
--- a/src/api/util.rs
+++ b/src/api/util.rs
@@ -4,9 +4,12 @@ use axum::{
     http::StatusCode,
     response::{sse::Event, IntoResponse, Json},
 };
+use base64::Engine;
 use std::time::{SystemTime, UNIX_EPOCH};
+use tracing::warn;
 
 use super::types::{ErrorDetail, ErrorResponse};
+use crate::proto::ImageData;
 
 pub(crate) fn err_response(
     status: StatusCode,
@@ -34,3 +37,55 @@ pub(crate) fn responses_sse_event(event_type: &str, data: serde_json::Value) ->
         .event(event_type)
         .data(serde_json::to_string(&data).unwrap())
 }
+
+// ─── Image extraction ────────────────────────────────────────────────────────
+
+/// Parse a base64 data URI like `data:image/png;base64,iVBOR...` into ImageData.
+/// Plain (non-`data:`) URLs yield `None` — only inline base64 payloads are supported.
+pub(crate) fn parse_data_uri(url: &str) -> Option<ImageData> {
+    // data:<mime>[;param=value];base64,<payload>
+    let rest = url.strip_prefix("data:")?;
+    let (header, b64) = rest.split_once(";base64,")?;
+    // Keep only the bare MIME type; parameters such as `;charset=utf-8` are dropped.
+    let mime_type = header.split_once(';').map_or(header, |(mime, _)| mime).trim().to_string();
+    match base64::engine::general_purpose::STANDARD.decode(b64) {
+        Ok(data) => {
+            tracing::info!(mime = %mime_type, size = data.len(), "Decoded inline image");
+            Some(ImageData { mime_type, data })
+        }
+        Err(e) => {
+            warn!(error = %e, "Failed to decode base64 image data");
+            None
+        }
+    }
+}
+
+/// Extract an image from an OpenAI content array item.
+///
+/// Supports:
+/// - Chat Completions: `{"type": "image_url", "image_url": {"url": "data:..."}}`
+/// - Responses API: `{"type": "input_image", "image_url": "data:..."}` or
+///   `{"type": "input_image", "url": "data:..."}`
+pub(crate) fn extract_image_from_content(item: &serde_json::Value) -> Option<ImageData> {
+    let item_type = item["type"].as_str().unwrap_or("");
+
+    match item_type {
+        // OpenAI Chat Completions format
+        "image_url" => {
+            let url = item["image_url"]["url"].as_str()?;
+            parse_data_uri(url)
+        }
+        // OpenAI Responses API format
+        "input_image" => {
+            let url = item["image_url"].as_str()
+                .or_else(|| item["url"].as_str())?;
+            parse_data_uri(url)
+        }
+        _ => None,
+    }
+}
+
+/// Extract the first image from a content array (Value::Array of content parts).
+pub(crate) fn extract_first_image(content: &serde_json::Value) -> Option<ImageData> {
+    content.as_array()?.iter().find_map(extract_image_from_content)
+}
diff --git a/src/backend.rs b/src/backend.rs
index 0be341d..dfcadc3 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -343,17 +343,32 @@ impl Backend {
     }
 
     /// SendUserCascadeMessage with binary protobuf body.
+    ///
+    /// Text-only convenience wrapper: forwards to `send_message_with_image` with no image.
+    // Every call site visible in this change now uses the image-aware variant.
+    #[allow(dead_code)]
     pub async fn send_message(
         &self,
         cascade_id: &str,
         text: &str,
         model_enum: u32,
+    ) -> Result<(u16, Vec<u8>), String> {
+        self.send_message_with_image(cascade_id, text, model_enum, None).await
+    }
+
+    /// SendUserCascadeMessage with optional image attachment.
+    pub async fn send_message_with_image(
+        &self,
+        cascade_id: &str,
+        text: &str,
+        model_enum: u32,
+        image: Option<&crate::proto::ImageData>,
     ) -> Result<(u16, Vec<u8>), String> {
         let token = self.oauth_token().await;
         if token.is_empty() {
             return Err("No OAuth token available".to_string());
         }
-        let proto = crate::proto::build_request(cascade_id, text, &token, model_enum);
+        let proto = crate::proto::build_request_with_image(cascade_id, text, &token, model_enum, image);
         self.call_proto("SendUserCascadeMessage", proto).await
     }
 
diff --git a/src/proto.rs b/src/proto.rs
index ee56c31..fcfdc48 100644
--- a/src/proto.rs
+++ b/src/proto.rs
@@ -116,19 +116,51 @@ pub fn build_init_metadata(
 ///
 /// Field layout:
 ///   1: cascade_id (string)
-///   2: { 1: text } (message)
+///   2: ChatMessage { 1: text, 6: Blob { 1: mime_type, 2: data } }
 ///   3: metadata { 1: client_name, 3: oauth_token, 4: "en", 7: version, 12: client_name }
 ///   5: PlannerConfig { 1: inner_config, 7: { 1: 1 } }
 ///      inner_config contains: f2 (conv mode), f13 (tool config), f15 (model), f21 (ephemeral), f32 (knowledge)
 ///  11: conversation_history = true
+// Text-only wrapper; it appears to have no remaining caller after this change, hence the allow.
+#[allow(dead_code)]
 pub fn build_request(cascade_id: &str, text: &str, oauth_token: &str, model_enum: u32) -> Vec<u8> {
+    build_request_with_image(cascade_id, text, oauth_token, model_enum, None)
+}
+
+/// Image data to embed in the ChatMessage protobuf.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ImageData {
+    /// MIME type, e.g. "image/png", "image/jpeg", "image/webp", "image/gif"
+    pub mime_type: String,
+    /// Raw image bytes (NOT base64 — already decoded)
+    pub data: Vec<u8>,
+}
+
+/// Build `SendUserCascadeMessageRequest` with optional image attachment.
+///
+/// When `image` is Some, the ChatMessage includes a Blob field (field 6)
+/// alongside the text (field 1). This matches how the real Antigravity
+/// webview sends images: `ChatMessage { text, blob: { mime_type, data } }`.
+pub fn build_request_with_image(
+    cascade_id: &str,
+    text: &str,
+    oauth_token: &str,
+    model_enum: u32,
+    image: Option<&ImageData>,
+) -> Vec<u8> {
     let mut msg = Vec::with_capacity(256);
 
     // Field 1: cascade_id
     msg.extend(proto_string(1, cascade_id.as_bytes()));
 
-    // Field 2: { field 1: text }
-    msg.extend(proto_message(2, &proto_string(1, text.as_bytes())));
+    // Field 2: ChatMessage { f1: text, f6?: Blob { f1: mime_type, f2: data } }
+    let mut chat_msg = proto_string(1, text.as_bytes());
+    if let Some(img) = image {
+        let mut blob = proto_string(1, img.mime_type.as_bytes());
+        blob.extend(proto_string(2, &img.data));
+        chat_msg.extend(proto_message(6, &blob));
+    }
+    msg.extend(proto_message(2, &chat_msg));
 
     // Field 3: Metadata (Auth + Client ID)
     let mut meta = Vec::new();