feat: add image support across all endpoints (responses, completions, gemini)

This commit is contained in:
Nikketryhard
2026-02-15 17:25:33 -06:00
parent ca9f808ee3
commit 976c44fdd4
6 changed files with 168 additions and 33 deletions

View File

@@ -82,10 +82,13 @@ fn google_to_openai_finish_reason(stop_reason: Option<&str>) -> &'static str {
/// Builds the full conversation context including all messages (system, user,
/// assistant, tool) so the model has complete history — matching how OpenAI
/// sends the entire messages array to the model.
fn extract_chat_input(messages: &[CompletionMessage]) -> String {
// Always build the full conversation — we used to only take the last user
// message which broke multi-turn conversations via the messages array.
build_conversation_with_tools(messages)
fn extract_chat_input(messages: &[CompletionMessage]) -> (String, Option<crate::proto::ImageData>) {
// Extract image from last user message content array
let image = messages.iter().rev()
.find(|m| m.role == "user")
.and_then(|m| super::util::extract_first_image(&m.content));
// Always build the full conversation
(build_conversation_with_tools(messages), image)
}
/// Extract text content from a message's content field (string or array).
@@ -257,7 +260,7 @@ pub(crate) async fn handle_completions(
);
}
let user_text = extract_chat_input(&body.messages);
let (user_text, image) = extract_chat_input(&body.messages);
if user_text.is_empty() {
return err_response(
StatusCode::BAD_REQUEST,
@@ -302,7 +305,7 @@ pub(crate) async fn handle_completions(
state.mitm_store.set_active_cascade(&cascade_id).await;
match state
.backend
.send_message(&cascade_id, &user_text, model.model_enum)
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
.await
{
Ok((200, _)) => {
@@ -361,7 +364,7 @@ pub(crate) async fn handle_completions(
match state.backend.create_cascade().await {
Ok(cid) => {
// Send the same message on each extra cascade
match state.backend.send_message(&cid, &user_text, model.model_enum).await {
match state.backend.send_message_with_image(&cid, &user_text, model.model_enum, image.as_ref()).await {
Ok((200, _)) => {
let bg = Arc::clone(&state.backend);
let cid2 = cid.clone();

View File

@@ -135,11 +135,12 @@ pub(crate) async fn handle_gemini(
);
}
// Extract user text
// Extract user text and optional image
let mut image: Option<crate::proto::ImageData> = None;
let user_text = match &body.input {
serde_json::Value::String(s) => s.clone(),
serde_json::Value::Array(arr) => {
// Support array input: can be strings or {text: "..."} objects
// Support array input: strings, {text: "..."}, or {inlineData: {mimeType, data}}
let mut parts: Vec<String> = Vec::new();
for item in arr {
match item {
@@ -148,6 +149,25 @@ pub(crate) async fn handle_gemini(
if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
parts.push(text.to_string());
}
// Gemini-native inlineData format
if image.is_none() {
if let Some(inline) = obj.get("inlineData") {
if let (Some(mime), Some(b64)) = (
inline["mimeType"].as_str(),
inline["data"].as_str(),
) {
if let Some(img) = super::util::parse_data_uri(
&format!("data:{mime};base64,{b64}")
) {
image = Some(img);
}
}
}
// Also support OpenAI-style image_url in Gemini input
if let Some(img) = super::util::extract_image_from_content(item) {
image = Some(img);
}
}
}
_ => {}
}
@@ -256,7 +276,7 @@ pub(crate) async fn handle_gemini(
state.mitm_store.set_active_cascade(&cascade_id).await;
match state
.backend
.send_message(&cascade_id, &user_text, model.model_enum)
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
.await
{
Ok((200, _)) => {

View File

@@ -30,9 +30,14 @@ struct ToolResultInput {
}
/// Extract user text from Responses API `input` field.
/// Also extracts any function_call_output items for tool result handling.
fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>) -> (String, Vec<ToolResultInput>) {
/// Also extracts any function_call_output items for tool result handling,
/// and the first inline image (base64 data URI) if present.
fn extract_responses_input(
input: &serde_json::Value,
instructions: Option<&str>,
) -> (String, Vec<ToolResultInput>, Option<crate::proto::ImageData>) {
let mut tool_results: Vec<ToolResultInput> = Vec::new();
let mut image: Option<crate::proto::ImageData> = None;
let user_text = match input {
serde_json::Value::String(s) => s.clone(),
@@ -50,6 +55,10 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
});
}
}
// Extract first image from top-level input items
if image.is_none() {
image = super::util::extract_image_from_content(item);
}
}
// If we have tool results but no text, generate a follow-up prompt
@@ -76,7 +85,12 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
.iter()
.rev()
.find(|item| item["role"].as_str() == Some("user"))
.and_then(|item| match &item["content"] {
.and_then(|item| {
// Also scan content array for images
if image.is_none() {
image = super::util::extract_first_image(&item["content"]);
}
match &item["content"] {
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Array(parts) => Some(
parts
@@ -90,6 +104,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
.join(" "),
),
_ => None,
}
})
.unwrap_or_default()
}
@@ -102,7 +117,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
_ => user_text,
};
(final_text, tool_results)
(final_text, tool_results, image)
}
/// Extract conversation/session ID from Responses API `conversation` field.
@@ -199,7 +214,7 @@ pub(crate) async fn handle_responses(
);
}
let (user_text, tool_results) = extract_responses_input(&body.input, body.instructions.as_deref());
let (user_text, tool_results, image) = extract_responses_input(&body.input, body.instructions.as_deref());
// Handle tool result submission (function_call_output in input)
let is_tool_result_turn = !tool_results.is_empty();
@@ -339,7 +354,7 @@ pub(crate) async fn handle_responses(
state.mitm_store.set_active_cascade(&cascade_id).await;
match state
.backend
.send_message(&cascade_id, &user_text, model.model_enum)
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
.await
{
Ok((200, _)) => {

View File

@@ -4,9 +4,12 @@ use axum::{
http::StatusCode,
response::{sse::Event, IntoResponse, Json},
};
use base64::Engine;
use std::time::{SystemTime, UNIX_EPOCH};
use tracing::warn;
use super::types::{ErrorDetail, ErrorResponse};
use crate::proto::ImageData;
pub(crate) fn err_response(
status: StatusCode,
@@ -34,3 +37,55 @@ pub(crate) fn responses_sse_event(event_type: &str, data: serde_json::Value) ->
.event(event_type)
.data(serde_json::to_string(&data).unwrap())
}
// ─── Image extraction ────────────────────────────────────────────────────────
/// Parse an inline base64 data URI such as `data:image/png;base64,iVBOR...`
/// into [`ImageData`].
///
/// Returns `None` when:
/// - `url` does not start with `data:` (e.g. a plain `https://` URL — only
///   inline base64 images are handled here),
/// - the URI has no `;base64,` marker, or
/// - the payload is not valid standard-alphabet base64 (a warning is logged).
///
/// Media-type parameters are dropped, so
/// `data:image/png;charset=utf-8;base64,...` yields the MIME type `image/png`.
/// Whitespace surrounding the payload is ignored.
pub(crate) fn parse_data_uri(url: &str) -> Option<ImageData> {
    // Shape: data:<type/subtype>[;param=value...];base64,<payload>
    let rest = url.strip_prefix("data:")?;
    let (header, b64) = rest.split_once(";base64,")?;

    // Keep only the bare `type/subtype`; any `;param=value` suffix would
    // otherwise end up inside the stored MIME type.
    let mime_type = header
        .split_once(';')
        .map_or(header, |(media_type, _params)| media_type)
        .trim()
        .to_string();

    // Trim so a stray trailing newline/space does not reject a valid image.
    match base64::engine::general_purpose::STANDARD.decode(b64.trim()) {
        Ok(data) => {
            tracing::info!(mime = %mime_type, size = data.len(), "Decoded inline image");
            Some(ImageData { mime_type, data })
        }
        Err(e) => {
            warn!(error = %e, "Failed to decode base64 image data");
            None
        }
    }
}
/// Pull an inline image out of a single OpenAI-style content part.
///
/// Recognised shapes (selected by the part's `"type"` field):
/// - Chat Completions: `{"type": "image_url", "image_url": {"url": "data:..."}}`
/// - Responses API: `{"type": "input_image", "image_url": "data:..."}` or
///   `{"type": "input_image", "url": "data:..."}`
///
/// Returns `None` for any other `"type"`, when the expected URL field is
/// missing or not a string, or when `parse_data_uri` rejects the URL.
pub(crate) fn extract_image_from_content(item: &serde_json::Value) -> Option<ImageData> {
    let kind = item.get("type").and_then(serde_json::Value::as_str)?;

    let data_uri = match kind {
        // Chat Completions nests the URL one level down.
        "image_url" => item.get("image_url")?.get("url")?.as_str()?,
        // Responses API puts the URL directly on the part; `image_url` wins
        // over `url` when both are strings.
        "input_image" => item
            .get("image_url")
            .and_then(serde_json::Value::as_str)
            .or_else(|| item.get("url").and_then(serde_json::Value::as_str))?,
        _ => return None,
    };

    parse_data_uri(data_uri)
}
/// Return the first content part in `content` that yields an image.
///
/// `content` is expected to be a JSON array of content parts; any other JSON
/// value (string, object, null, ...) produces `None`.
pub(crate) fn extract_first_image(content: &serde_json::Value) -> Option<ImageData> {
    match content {
        serde_json::Value::Array(parts) => parts.iter().find_map(extract_image_from_content),
        _ => None,
    }
}

View File

@@ -343,17 +343,29 @@ impl Backend {
}
/// SendUserCascadeMessage with binary protobuf body.
#[allow(dead_code)]
pub async fn send_message(
&self,
cascade_id: &str,
text: &str,
model_enum: u32,
) -> Result<(u16, Vec<u8>), String> {
self.send_message_with_image(cascade_id, text, model_enum, None).await
}
/// SendUserCascadeMessage with optional image attachment.
pub async fn send_message_with_image(
&self,
cascade_id: &str,
text: &str,
model_enum: u32,
image: Option<&crate::proto::ImageData>,
) -> Result<(u16, Vec<u8>), String> {
let token = self.oauth_token().await;
if token.is_empty() {
return Err("No OAuth token available".to_string());
}
let proto = crate::proto::build_request(cascade_id, text, &token, model_enum);
let proto = crate::proto::build_request_with_image(cascade_id, text, &token, model_enum, image);
self.call_proto("SendUserCascadeMessage", proto).await
}

View File

@@ -116,19 +116,49 @@ pub fn build_init_metadata(
///
/// Field layout:
/// 1: cascade_id (string)
/// 2: { 1: text } (message)
/// 2: ChatMessage { 1: text, 6: Blob { 1: mime_type, 2: data } }
/// 3: metadata { 1: client_name, 3: oauth_token, 4: "en", 7: version, 12: client_name }
/// 5: PlannerConfig { 1: inner_config, 7: { 1: 1 } }
/// inner_config contains: f2 (conv mode), f13 (tool config), f15 (model), f21 (ephemeral), f32 (knowledge)
/// 11: conversation_history = true
#[allow(dead_code)]
pub fn build_request(cascade_id: &str, text: &str, oauth_token: &str, model_enum: u32) -> Vec<u8> {
build_request_with_image(cascade_id, text, oauth_token, model_enum, None)
}
/// Image data to embed in the ChatMessage protobuf.
///
/// Implements `Clone`/`PartialEq`/`Eq` so values can be copied and compared,
/// and a compact `Debug` that reports the payload size instead of dumping
/// every byte.
#[derive(Clone, PartialEq, Eq)]
pub struct ImageData {
    /// MIME type, e.g. "image/png", "image/jpeg", "image/webp", "image/gif"
    pub mime_type: String,
    /// Raw image bytes (NOT base64 — already decoded)
    pub data: Vec<u8>,
}

impl std::fmt::Debug for ImageData {
    /// Formats as `ImageData { mime_type: "...", data_len: N }`; the raw bytes
    /// are deliberately omitted because image payloads can be very large.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageData")
            .field("mime_type", &self.mime_type)
            .field("data_len", &self.data.len())
            .finish()
    }
}
/// Build `SendUserCascadeMessageRequest` with optional image attachment.
///
/// When `image` is Some, the ChatMessage includes a Blob field (field 6)
/// alongside the text (field 1). This matches how the real Antigravity
/// webview sends images: `ChatMessage { text, blob: { mime_type, data } }`.
pub fn build_request_with_image(
cascade_id: &str,
text: &str,
oauth_token: &str,
model_enum: u32,
image: Option<&ImageData>,
) -> Vec<u8> {
let mut msg = Vec::with_capacity(256);
// Field 1: cascade_id
msg.extend(proto_string(1, cascade_id.as_bytes()));
// Field 2: { field 1: text }
msg.extend(proto_message(2, &proto_string(1, text.as_bytes())));
// Field 2: ChatMessage { f1: text, f6?: Blob { f1: mime_type, f2: data } }
let mut chat_msg = proto_string(1, text.as_bytes());
if let Some(img) = image {
let mut blob = proto_string(1, img.mime_type.as_bytes());
blob.extend(proto_string(2, &img.data));
chat_msg.extend(proto_message(6, &blob));
}
msg.extend(proto_message(2, &chat_msg));
// Field 3: Metadata (Auth + Client ID)
let mut meta = Vec::new();