feat: add image support across all endpoints (responses, completions, gemini)
This commit is contained in:
@@ -30,9 +30,14 @@ struct ToolResultInput {
|
||||
}
|
||||
|
||||
/// Extract user text from Responses API `input` field.
|
||||
/// Also extracts any function_call_output items for tool result handling.
|
||||
fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>) -> (String, Vec<ToolResultInput>) {
|
||||
/// Also extracts any function_call_output items for tool result handling,
|
||||
/// and the first inline image (base64 data URI) if present.
|
||||
fn extract_responses_input(
|
||||
input: &serde_json::Value,
|
||||
instructions: Option<&str>,
|
||||
) -> (String, Vec<ToolResultInput>, Option<crate::proto::ImageData>) {
|
||||
let mut tool_results: Vec<ToolResultInput> = Vec::new();
|
||||
let mut image: Option<crate::proto::ImageData> = None;
|
||||
|
||||
let user_text = match input {
|
||||
serde_json::Value::String(s) => s.clone(),
|
||||
@@ -50,6 +55,10 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
});
|
||||
}
|
||||
}
|
||||
// Extract first image from top-level input items
|
||||
if image.is_none() {
|
||||
image = super::util::extract_image_from_content(item);
|
||||
}
|
||||
}
|
||||
|
||||
// If we have tool results but no text, generate a follow-up prompt
|
||||
@@ -76,20 +85,26 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|item| item["role"].as_str() == Some("user"))
|
||||
.and_then(|item| match &item["content"] {
|
||||
serde_json::Value::String(s) => Some(s.clone()),
|
||||
serde_json::Value::Array(parts) => Some(
|
||||
parts
|
||||
.iter()
|
||||
.filter(|p| {
|
||||
let t = p["type"].as_str().unwrap_or("");
|
||||
t == "input_text" || t == "text"
|
||||
})
|
||||
.filter_map(|p| p["text"].as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
),
|
||||
_ => None,
|
||||
.and_then(|item| {
|
||||
// If no image was found yet, take the first one from this user message's content
|
||||
if image.is_none() {
|
||||
image = super::util::extract_first_image(&item["content"]);
|
||||
}
|
||||
match &item["content"] {
|
||||
serde_json::Value::String(s) => Some(s.clone()),
|
||||
serde_json::Value::Array(parts) => Some(
|
||||
parts
|
||||
.iter()
|
||||
.filter(|p| {
|
||||
let t = p["type"].as_str().unwrap_or("");
|
||||
t == "input_text" || t == "text"
|
||||
})
|
||||
.filter_map(|p| p["text"].as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
@@ -102,7 +117,7 @@ fn extract_responses_input(input: &serde_json::Value, instructions: Option<&str>
|
||||
_ => user_text,
|
||||
};
|
||||
|
||||
(final_text, tool_results)
|
||||
(final_text, tool_results, image)
|
||||
}
|
||||
|
||||
/// Extract conversation/session ID from Responses API `conversation` field.
|
||||
@@ -199,7 +214,7 @@ pub(crate) async fn handle_responses(
|
||||
);
|
||||
}
|
||||
|
||||
let (user_text, tool_results) = extract_responses_input(&body.input, body.instructions.as_deref());
|
||||
let (user_text, tool_results, image) = extract_responses_input(&body.input, body.instructions.as_deref());
|
||||
|
||||
// Handle tool result submission (function_call_output in input)
|
||||
let is_tool_result_turn = !tool_results.is_empty();
|
||||
@@ -339,7 +354,7 @@ pub(crate) async fn handle_responses(
|
||||
state.mitm_store.set_active_cascade(&cascade_id).await;
|
||||
match state
|
||||
.backend
|
||||
.send_message(&cascade_id, &user_text, model.model_enum)
|
||||
.send_message_with_image(&cascade_id, &user_text, model.model_enum, image.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok((200, _)) => {
|
||||
|
||||
Reference in New Issue
Block a user