fix: gemini route, usage capture, search timeout, and trace finalization

- Add missing /v1/gemini POST route and handler
- Capture MitmEvent::Usage in gemini sync/streaming handlers
- Add retry counter (max 3) to search handler to prevent hang
- Add trace finalization at all gemini_sync channel exit points
- Fix UpstreamError trace outcome label
- Add timeout trace with error recording
- Dispatch Usage before ResponseComplete in SSE flush
This commit is contained in:
Nikketryhard
2026-02-18 01:31:18 -06:00
parent 48674f65da
commit 28d3296c87
11 changed files with 1480 additions and 221 deletions

View File

@@ -435,21 +435,33 @@ pub(crate) async fn handle_completions(
.map(|r| r.calls.clone())
.unwrap_or_default();
// Build event channel for streaming
let has_custom_tools = tools.is_some();
let (mitm_rx, event_tx) = if has_custom_tools && body.stream {
let (tx, rx) = tokio::sync::mpsc::channel(64);
(Some(rx), Some(tx))
} else {
(None, None)
};
// Build event channel — always created for MITM response path
let (tx, rx) = tokio::sync::mpsc::channel(64);
let (mitm_rx, event_tx) = (Some(rx), tx);
// Build pending tool results from latest round
let pending_tool_results = tool_rounds.last()
.map(|r| r.results.clone())
.unwrap_or_default();
// Register all per-request state atomically
// Start debug trace
let trace = state.trace.start(&cascade_id, "POST /v1/chat/completions", model_name, body.stream);
if let Some(ref t) = trace {
t.set_client_request(crate::trace::ClientRequestSummary {
message_count: body.messages.len(),
tool_count: body.tools.as_ref().map_or(0, |t| t.len()),
tool_round_count: tool_rounds.len(),
user_text_len: user_text.len(),
user_text_preview: user_text.chars().take(200).collect(),
system_prompt: body.messages.iter().any(|m| m.role == "system"),
has_image: image.is_some(),
}).await;
// Start turn 0
t.start_turn().await;
}
let mitm_gate = std::sync::Arc::new(tokio::sync::Notify::new());
let mitm_gate_clone = mitm_gate.clone();
state.mitm_store.register_request(crate::mitm::store::RequestContext {
cascade_id: cascade_id.clone(),
pending_user_text: user_text.clone(),
@@ -463,6 +475,9 @@ pub(crate) async fn handle_completions(
last_function_calls,
call_id_to_name,
created_at: std::time::Instant::now(),
gate: mitm_gate_clone,
trace_handle: trace.clone(),
trace_turn: 0,
}).await;
// Send REAL user text to LS
@@ -480,6 +495,7 @@ pub(crate) async fn handle_completions(
}
Ok((status, _)) => {
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace { t.record_error(format!("Backend returned {status}")).await; t.finish("backend_error").await; }
return err_response(
StatusCode::BAD_GATEWAY,
format!("Backend returned {status}"),
@@ -488,6 +504,7 @@ pub(crate) async fn handle_completions(
}
Err(e) => {
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace { t.record_error(format!("Send failed: {e}")).await; t.finish("send_error").await; }
return err_response(
StatusCode::BAD_GATEWAY,
format!("Send failed: {e}"),
@@ -496,6 +513,34 @@ pub(crate) async fn handle_completions(
}
}
// Wait for MITM gate: 5s → 502 if MITM enabled
let gate_start = std::time::Instant::now();
let gate_matched = tokio::time::timeout(
std::time::Duration::from_secs(5),
mitm_gate.notified(),
).await;
let gate_wait_ms = gate_start.elapsed().as_millis() as u64;
if gate_matched.is_err() {
if state.mitm_enabled {
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace {
t.record_error("MITM gate timeout (5s)".to_string()).await;
t.finish("mitm_timeout").await;
}
return err_response(
StatusCode::BAD_GATEWAY,
"MITM proxy did not match request within 5s".to_string(),
"mitm_timeout",
);
}
warn!(cascade = %cascade_id, "MITM gate timeout (--no-mitm mode)");
} else {
debug!(cascade = %cascade_id, gate_wait_ms, "MITM gate signaled — request matched");
if let Some(ref t) = trace {
t.record_mitm_match(0, gate_wait_ms).await;
}
}
let completion_id = format!(
"chatcmpl-{}",
uuid::Uuid::new_v4().to_string().replace('-', "")
@@ -515,6 +560,7 @@ pub(crate) async fn handle_completions(
body.timeout,
include_usage,
mitm_rx,
trace,
)
.await
} else if n <= 1 {
@@ -524,6 +570,7 @@ pub(crate) async fn handle_completions(
model_name.to_string(),
cascade_id,
body.timeout,
trace,
)
.await
} else {
@@ -653,6 +700,7 @@ async fn chat_completions_stream(
timeout: u64,
include_usage: bool,
mitm_rx: Option<tokio::sync::mpsc::Receiver<crate::mitm::store::MitmEvent>>,
trace: Option<crate::trace::TraceHandle>,
) -> axum::response::Response {
let stream = async_stream::stream! {
let start = std::time::Instant::now();
@@ -774,6 +822,21 @@ async fn chat_completions_stream(
}
yield Ok(Event::default().data("[DONE]"));
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace {
let (ipt, opt, crt2, tht) = if let Some(ref u) = last_usage {
(u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens)
} else { (0, 0, 0, 0) };
t.record_response(0, crate::trace::ResponseSummary {
text_len: 0, thinking_len: 0, text_preview: String::new(),
finish_reason: Some("tool_calls".to_string()),
function_calls: calls.iter().map(|fc| crate::trace::FunctionCallSummary {
name: fc.name.clone(), args_preview: serde_json::to_string(&fc.args).unwrap_or_default().chars().take(200).collect(),
}).collect(),
grounding: false,
}).await;
t.set_usage(crate::trace::TrackedUsage { input_tokens: ipt, output_tokens: opt, thinking_tokens: tht, cache_read: crt2 }).await;
t.finish("tool_call").await;
}
return;
}
MitmEvent::ResponseComplete => {
@@ -802,6 +865,19 @@ async fn chat_completions_stream(
}
yield Ok(Event::default().data("[DONE]"));
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace {
let (ipt, opt, crt2, tht) = if let Some(ref u) = mitm {
(u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens)
} else { (0, 0, 0, 0) };
t.record_response(0, crate::trace::ResponseSummary {
text_len: acc_text.len(), thinking_len: acc_thinking.len(),
text_preview: acc_text.chars().take(200).collect(),
finish_reason: Some("stop".to_string()),
function_calls: Vec::new(), grounding: false,
}).await;
t.set_usage(crate::trace::TrackedUsage { input_tokens: ipt, output_tokens: opt, thinking_tokens: tht, cache_read: crt2 }).await;
t.finish("completed").await;
}
return;
} else if !acc_thinking.is_empty() && !did_unblock_ls {
// Thinking-only response — LS needs follow-up API calls.
@@ -844,6 +920,19 @@ async fn chat_completions_stream(
}
yield Ok(Event::default().data("[DONE]"));
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace {
let (ipt, opt, crt2, tht) = if let Some(ref u) = mitm {
(u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens)
} else { (0, 0, 0, 0) };
t.record_response(0, crate::trace::ResponseSummary {
text_len: 0, thinking_len: acc_thinking.len(),
text_preview: String::new(),
finish_reason: Some("stop".to_string()),
function_calls: Vec::new(), grounding: false,
}).await;
t.set_usage(crate::trace::TrackedUsage { input_tokens: ipt, output_tokens: opt, thinking_tokens: tht, cache_read: crt2 }).await;
t.finish("thinking_timeout").await;
}
return;
}
// Don't break — wait for more channel events
@@ -860,6 +949,14 @@ async fn chat_completions_stream(
)));
yield Ok(Event::default().data("[DONE]"));
state.mitm_store.remove_request(&cascade_id).await;
if let Some(ref t) = trace {
t.record_response(0, crate::trace::ResponseSummary {
text_len: 0, thinking_len: 0, text_preview: String::new(),
finish_reason: Some("stop".to_string()),
function_calls: Vec::new(), grounding: false,
}).await;
t.finish("empty_response").await;
}
return;
}
continue 'channel_loop;
@@ -900,6 +997,15 @@ async fn chat_completions_stream(
)));
}
yield Ok(Event::default().data("[DONE]"));
if let Some(ref t) = trace {
t.record_response(0, crate::trace::ResponseSummary {
text_len: last_text.len(), thinking_len: last_thinking_len,
text_preview: last_text.chars().take(200).collect(),
finish_reason: Some("stop".to_string()),
function_calls: Vec::new(), grounding: false,
}).await;
t.finish("channel_closed").await;
}
return;
} else {
// ── Fallback: LS steps (no MITM capture active) ──
@@ -1046,6 +1152,7 @@ async fn chat_completions_sync(
model_name: String,
cascade_id: String,
timeout: u64,
trace: Option<crate::trace::TraceHandle>,
) -> axum::response::Response {
let result = poll_for_response(&state, &cascade_id, timeout).await;
if let Some(ref err) = result.upstream_error {
@@ -1084,6 +1191,27 @@ async fn chat_completions_sync(
message["reasoning_content"] = serde_json::json!(thinking);
}
// Record trace data
if let Some(ref t) = trace {
t.record_response(0, crate::trace::ResponseSummary {
text_len: result.text.len(),
thinking_len: result.thinking.as_ref().map_or(0, |s| s.len()),
text_preview: result.text.chars().take(200).collect(),
finish_reason: Some(finish_reason.to_string()),
function_calls: Vec::new(),
grounding: false,
}).await;
if prompt_tokens > 0 || completion_tokens > 0 {
t.set_usage(crate::trace::TrackedUsage {
input_tokens: prompt_tokens,
output_tokens: completion_tokens,
thinking_tokens: thinking_tokens,
cache_read: cached_tokens,
}).await;
}
t.finish("completed").await;
}
Json(serde_json::json!({
"id": completion_id,
"object": "chat.completion",