feat: add image support across all endpoints (responses, completions, gemini)

This commit is contained in:
Nikketryhard
2026-02-15 17:25:33 -06:00
parent ca9f808ee3
commit 976c44fdd4
6 changed files with 168 additions and 33 deletions

View File

@@ -30,9 +30,14 @@ struct ToolResultInput {
}
/// Extract user text from Responses API `input` field.
/// Also extracts any function_call_output items for tool result handling.
fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>) -> (String, Vec<ToolResultInput>) {
/// Also extracts any function_call_output items for tool result handling,
/// and the first inline image (base64 data URI) if present.
fn extract_responses_input(
input: &serde_json::Value,
instructions: Option<&str>,
) -> (String, Vec<ToolResultInput>, Option<crate::proto::ImageData>) {
let mut tool_results: Vec<ToolResultInput> = Vec::new();
let mut image: Option<crate::proto::ImageData> = None;
let user_text = match input {
serde_json::Value::String(s) => s.clone(),
@@ -50,6 +55,10 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
});
}
}
// Extract first image from top-level input items
if image.is_none() {
image = super::util::extract_image_from_content(item);
}
}
// If we have tool results but no text, generate a follow-up prompt
@@ -76,20 +85,26 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
.iter()
.rev()
.find(|item| item["role"].as_str() == Some("user"))
.and_then(|item| match &item["content"] {
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Array(parts) => Some(
parts
.iter()
.filter(|p| {
let t = p["type"].as_str().unwrap_or("");
t == "input_text" || t == "text"
})
.filter_map(|p| p["text"].as_str())
.collect::<Vec<_>>()
.join(" "),
),
_ => None,
.and_then(|item| {
// Also scan content array for images
if image.is_none() {
image = super::util::extract_first_image(&item["content"]);
}
match &item["content"] {
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Array(parts) => Some(
parts
.iter()
.filter(|p| {
let t = p["type"].as_str().unwrap_or("");
t == "input_text" || t == "text"
})
.filter_map(|p| p["text"].as_str())
.collect::<Vec<_>>()
.join(" "),
),
_ => None,
}
})
.unwrap_or_default()
}
@@ -102,7 +117,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
_ => user_text,
};
(final_text, tool_results)
(final_text, tool_results, image)
}
/// Extract conversation/session ID from Responses API `conversation` field.
@@ -199,7 +214,7 @@ pub(crate) async fn handle_responses(
);
}
let (user_text, tool_results) = extract_responses_input(&body.input, body.instructions.as_deref());
let (user_text, tool_results, image) = extract_responses_input(&body.input, body.instructions.as_deref());
// Handle tool result submission (function_call_output in input)
let is_tool_result_turn = !tool_results.is_empty();
@@ -339,7 +354,7 @@ pub(crate) async fn handle_responses(
state.mitm_store.set_active_cascade(&cascade_id).await;
match state
.backend
.send_message(&cascade_id, &user_text, model.model_enum)
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
.await
{
Ok((200, _)) => {