feat: add image support across all endpoints (responses, completions, gemini)
This commit is contained in:
@@ -82,10 +82,13 @@ fn google_to_openai_finish_reason(stop_reason: Option<&str>) -> &'static str {
|
||||
/// Builds the full conversation context including all messages (system, user,
|
||||
/// assistant, tool) so the model has complete history — matching how OpenAI
|
||||
/// sends the entire messages array to the model.
|
||||
fn extract_chat_input(messages: &[CompletionMessage]) -> String {
|
||||
// Always build the full conversation — we used to only take the last user
|
||||
// message which broke multi-turn conversations via the messages array.
|
||||
build_conversation_with_tools(messages)
|
||||
fn extract_chat_input(messages: &[CompletionMessage]) -> (String, Option<crate::proto::ImageData>) {
|
||||
// Extract image from last user message content array
|
||||
let image = messages.iter().rev()
|
||||
.find(|m| m.role == "user")
|
||||
.and_then(|m| super::util::extract_first_image(&m.content));
|
||||
// Always build the full conversation
|
||||
(build_conversation_with_tools(messages), image)
|
||||
}
|
||||
|
||||
/// Extract text content from a message's content field (string or array).
|
||||
@@ -257,7 +260,7 @@ pub(crate) async fn handle_completions(
|
||||
);
|
||||
}
|
||||
|
||||
let user_text = extract_chat_input(&body.messages);
|
||||
let (user_text, image) = extract_chat_input(&body.messages);
|
||||
if user_text.is_empty() {
|
||||
return err_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
@@ -302,7 +305,7 @@ pub(crate) async fn handle_completions(
|
||||
state.mitm_store.set_active_cascade(&cascade_id).await;
|
||||
match state
|
||||
.backend
|
||||
.send_message(&cascade_id, &user_text, model.model_enum)
|
||||
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok((200, _)) => {
|
||||
@@ -361,7 +364,7 @@ pub(crate) async fn handle_completions(
|
||||
match state.backend.create_cascade().await {
|
||||
Ok(cid) => {
|
||||
// Send the same message on each extra cascade
|
||||
match state.backend.send_message(&cid, &user_text, model.model_enum).await {
|
||||
match state.backend.send_message_with_image(&cid, &user_text, model.model_enum, image.as_ref()).await {
|
||||
Ok((200, _)) => {
|
||||
let bg = Arc::clone(&state.backend);
|
||||
let cid2 = cid.clone();
|
||||
|
||||
@@ -135,11 +135,12 @@ pub(crate) async fn handle_gemini(
|
||||
);
|
||||
}
|
||||
|
||||
// Extract user text
|
||||
// Extract user text and optional image
|
||||
let mut image: Option<crate::proto::ImageData> = None;
|
||||
let user_text = match &body.input {
|
||||
serde_json::Value::String(s) => s.clone(),
|
||||
serde_json::Value::Array(arr) => {
|
||||
// Support array input: can be strings or {text: "..."} objects
|
||||
// Support array input: strings, {text: "..."}, or {inlineData: {mimeType, data}}
|
||||
let mut parts: Vec<String> = Vec::new();
|
||||
for item in arr {
|
||||
match item {
|
||||
@@ -148,6 +149,25 @@ pub(crate) async fn handle_gemini(
|
||||
if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
|
||||
parts.push(text.to_string());
|
||||
}
|
||||
// Gemini-native inlineData format
|
||||
if image.is_none() {
|
||||
if let Some(inline) = obj.get("inlineData") {
|
||||
if let (Some(mime), Some(b64)) = (
|
||||
inline["mimeType"].as_str(),
|
||||
inline["data"].as_str(),
|
||||
) {
|
||||
if let Some(img) = super::util::parse_data_uri(
|
||||
&format!("data:{mime};base64,{b64}")
|
||||
) {
|
||||
image = Some(img);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Also support OpenAI-style image_url in Gemini input
|
||||
if let Some(img) = super::util::extract_image_from_content(item) {
|
||||
image = Some(img);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -256,7 +276,7 @@ pub(crate) async fn handle_gemini(
|
||||
state.mitm_store.set_active_cascade(&cascade_id).await;
|
||||
match state
|
||||
.backend
|
||||
.send_message(&cascade_id, &user_text, model.model_enum)
|
||||
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok((200, _)) => {
|
||||
|
||||
@@ -30,9 +30,14 @@ struct ToolResultInput {
|
||||
}
|
||||
|
||||
/// Extract user text from Responses API `input` field.
|
||||
/// Also extracts any function_call_output items for tool result handling.
|
||||
fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>) -> (String, Vec<ToolResultInput>) {
|
||||
/// Also extracts any function_call_output items for tool result handling,
|
||||
/// and the first inline image (base64 data URI) if present.
|
||||
fn extract_responses_input(
|
||||
input: &serde_json::Value,
|
||||
instructions: Option<&str>,
|
||||
) -> (String, Vec<ToolResultInput>, Option<crate::proto::ImageData>) {
|
||||
let mut tool_results: Vec<ToolResultInput> = Vec::new();
|
||||
let mut image: Option<crate::proto::ImageData> = None;
|
||||
|
||||
let user_text = match input {
|
||||
serde_json::Value::String(s) => s.clone(),
|
||||
@@ -50,6 +55,10 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
});
|
||||
}
|
||||
}
|
||||
// Extract first image from top-level input items
|
||||
if image.is_none() {
|
||||
image = super::util::extract_image_from_content(item);
|
||||
}
|
||||
}
|
||||
|
||||
// If we have tool results but no text, generate a follow-up prompt
|
||||
@@ -76,7 +85,12 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|item| item["role"].as_str() == Some("user"))
|
||||
.and_then(|item| match &item["content"] {
|
||||
.and_then(|item| {
|
||||
// Also scan content array for images
|
||||
if image.is_none() {
|
||||
image = super::util::extract_first_image(&item["content"]);
|
||||
}
|
||||
match &item["content"] {
|
||||
serde_json::Value::String(s) => Some(s.clone()),
|
||||
serde_json::Value::Array(parts) => Some(
|
||||
parts
|
||||
@@ -90,6 +104,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
.join(" "),
|
||||
),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
@@ -102,7 +117,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
_ => user_text,
|
||||
};
|
||||
|
||||
(final_text, tool_results)
|
||||
(final_text, tool_results, image)
|
||||
}
|
||||
|
||||
/// Extract conversation/session ID from Responses API `conversation` field.
|
||||
@@ -199,7 +214,7 @@ pub(crate) async fn handle_responses(
|
||||
);
|
||||
}
|
||||
|
||||
let (user_text, tool_results) = extract_responses_input(&body.input, body.instructions.as_deref());
|
||||
let (user_text, tool_results, image) = extract_responses_input(&body.input, body.instructions.as_deref());
|
||||
|
||||
// Handle tool result submission (function_call_output in input)
|
||||
let is_tool_result_turn = !tool_results.is_empty();
|
||||
@@ -339,7 +354,7 @@ pub(crate) async fn handle_responses(
|
||||
state.mitm_store.set_active_cascade(&cascade_id).await;
|
||||
match state
|
||||
.backend
|
||||
.send_message(&cascade_id, &user_text, model.model_enum)
|
||||
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok((200, _)) => {
|
||||
|
||||
@@ -4,9 +4,12 @@ use axum::{
|
||||
http::StatusCode,
|
||||
response::{sse::Event, IntoResponse, Json},
|
||||
};
|
||||
use base64::Engine;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
|
||||
use super::types::{ErrorDetail, ErrorResponse};
|
||||
use crate::proto::ImageData;
|
||||
|
||||
pub(crate) fn err_response(
|
||||
status: StatusCode,
|
||||
@@ -34,3 +37,55 @@ pub(crate) fn responses_sse_event(event_type: &str, data: serde_json::Value) ->
|
||||
.event(event_type)
|
||||
.data(serde_json::to_string(&data).unwrap())
|
||||
}
|
||||
|
||||
// ─── Image extraction ────────────────────────────────────────────────────────
|
||||
|
||||
/// Parse a base64 data URI like `data:image/png;base64,iVBOR...` into ImageData.
|
||||
/// Also accepts plain URLs (returns None — we only support inline base64).
|
||||
pub(crate) fn parse_data_uri(url: &str) -> Option<ImageData> {
|
||||
// data:image/png;base64,<data>
|
||||
let rest = url.strip_prefix("data:")?;
|
||||
let (header, b64) = rest.split_once(";base64,")?;
|
||||
let mime_type = header.to_string();
|
||||
|
||||
match base64::engine::general_purpose::STANDARD.decode(b64) {
|
||||
Ok(data) => {
|
||||
tracing::info!(mime = %mime_type, size = data.len(), "Decoded inline image");
|
||||
Some(ImageData { mime_type, data })
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(error = %e, "Failed to decode base64 image data");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract an image from an OpenAI content array item.
|
||||
///
|
||||
/// Supports:
|
||||
/// - Chat Completions: `{"type": "image_url", "image_url": {"url": "data:..."}}`
|
||||
/// - Responses API: `{"type": "input_image", "image_url": "data:..."}` or
|
||||
/// `{"type": "input_image", "url": "data:..."}`
|
||||
pub(crate) fn extract_image_from_content(item: &serde_json::Value) -> Option<ImageData> {
|
||||
let item_type = item["type"].as_str().unwrap_or("");
|
||||
|
||||
match item_type {
|
||||
// OpenAI Chat Completions format
|
||||
"image_url" => {
|
||||
let url = item["image_url"]["url"].as_str()?;
|
||||
parse_data_uri(url)
|
||||
}
|
||||
// OpenAI Responses API format
|
||||
"input_image" => {
|
||||
let url = item["image_url"].as_str()
|
||||
.or_else(|| item["url"].as_str())?;
|
||||
parse_data_uri(url)
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the first image from a content array (Value::Array of content parts).
|
||||
pub(crate) fn extract_first_image(content: &serde_json::Value) -> Option<ImageData> {
|
||||
content.as_array()?.iter().find_map(extract_image_from_content)
|
||||
}
|
||||
|
||||
@@ -343,17 +343,29 @@ impl Backend {
|
||||
}
|
||||
|
||||
/// SendUserCascadeMessage with binary protobuf body.
|
||||
#[allow(dead_code)]
|
||||
pub async fn send_message(
|
||||
&self,
|
||||
cascade_id: &str,
|
||||
text: &str,
|
||||
model_enum: u32,
|
||||
) -> Result<(u16, Vec<u8>), String> {
|
||||
self.send_message_with_image(cascade_id, text, model_enum, None).await
|
||||
}
|
||||
|
||||
/// SendUserCascadeMessage with optional image attachment.
|
||||
pub async fn send_message_with_image(
|
||||
&self,
|
||||
cascade_id: &str,
|
||||
text: &str,
|
||||
model_enum: u32,
|
||||
image: Option<&crate::proto::ImageData>,
|
||||
) -> Result<(u16, Vec<u8>), String> {
|
||||
let token = self.oauth_token().await;
|
||||
if token.is_empty() {
|
||||
return Err("No OAuth token available".to_string());
|
||||
}
|
||||
let proto = crate::proto::build_request(cascade_id, text, &token, model_enum);
|
||||
let proto = crate::proto::build_request_with_image(cascade_id, text, &token, model_enum, image);
|
||||
self.call_proto("SendUserCascadeMessage", proto).await
|
||||
}
|
||||
|
||||
|
||||
36
src/proto.rs
36
src/proto.rs
@@ -116,19 +116,49 @@ pub fn build_init_metadata(
|
||||
///
|
||||
/// Field layout:
|
||||
/// 1: cascade_id (string)
|
||||
/// 2: { 1: text } (message)
|
||||
/// 2: ChatMessage { 1: text, 6: Blob { 1: mime_type, 2: data } }
|
||||
/// 3: metadata { 1: client_name, 3: oauth_token, 4: "en", 7: version, 12: client_name }
|
||||
/// 5: PlannerConfig { 1: inner_config, 7: { 1: 1 } }
|
||||
/// inner_config contains: f2 (conv mode), f13 (tool config), f15 (model), f21 (ephemeral), f32 (knowledge)
|
||||
/// 11: conversation_history = true
|
||||
#[allow(dead_code)]
|
||||
pub fn build_request(cascade_id: &str, text: &str, oauth_token: &str, model_enum: u32) -> Vec<u8> {
|
||||
build_request_with_image(cascade_id, text, oauth_token, model_enum, None)
|
||||
}
|
||||
|
||||
/// Image data to embed in the ChatMessage protobuf.
///
/// `Debug` is implemented by hand so that logging an `ImageData` prints the
/// payload size rather than every raw byte of the image.
#[derive(Clone, PartialEq, Eq)]
pub struct ImageData {
    /// MIME type, e.g. "image/png", "image/jpeg", "image/webp", "image/gif"
    pub mime_type: String,
    /// Raw image bytes (NOT base64 — already decoded)
    pub data: Vec<u8>,
}

impl std::fmt::Debug for ImageData {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageData")
            .field("mime_type", &self.mime_type)
            .field("data_len", &self.data.len())
            .finish()
    }
}
|
||||
|
||||
/// Build `SendUserCascadeMessageRequest` with optional image attachment.
|
||||
///
|
||||
/// When `image` is Some, the ChatMessage includes a Blob field (field 6)
|
||||
/// alongside the text (field 1). This matches how the real Antigravity
|
||||
/// webview sends images: `ChatMessage { text, blob: { mime_type, data } }`.
|
||||
pub fn build_request_with_image(
|
||||
cascade_id: &str,
|
||||
text: &str,
|
||||
oauth_token: &str,
|
||||
model_enum: u32,
|
||||
image: Option<&ImageData>,
|
||||
) -> Vec<u8> {
|
||||
let mut msg = Vec::with_capacity(256);
|
||||
|
||||
// Field 1: cascade_id
|
||||
msg.extend(proto_string(1, cascade_id.as_bytes()));
|
||||
|
||||
// Field 2: { field 1: text }
|
||||
msg.extend(proto_message(2, &proto_string(1, text.as_bytes())));
|
||||
// Field 2: ChatMessage { f1: text, f6?: Blob { f1: mime_type, f2: data } }
|
||||
let mut chat_msg = proto_string(1, text.as_bytes());
|
||||
if let Some(img) = image {
|
||||
let mut blob = proto_string(1, img.mime_type.as_bytes());
|
||||
blob.extend(proto_string(2, &img.data));
|
||||
chat_msg.extend(proto_message(6, &blob));
|
||||
}
|
||||
msg.extend(proto_message(2, &chat_msg));
|
||||
|
||||
// Field 3: Metadata (Auth + Client ID)
|
||||
let mut meta = Vec::new();
|
||||
|
||||
Reference in New Issue
Block a user