feat: forward generation params via MITM + add usageMetadata to Gemini

- Add GenerationParams struct to MitmStore for temperature, top_p,
  top_k, max_output_tokens, stop_sequences, frequency/presence_penalty
- MITM modify_request injects params into request.generationConfig
- All 3 endpoints (Completions, Responses, Gemini) store client params
- Add usageMetadata to Gemini sync responses (promptTokenCount,
  candidatesTokenCount, totalTokenCount, thoughtsTokenCount)
- Add generation param fields to GeminiRequest (temperature, topP, etc.)
- Completions stream_options.include_usage emits final usage chunk
- Completions includes reasoning_tokens in completion_tokens_details
- Update endpoint gap analysis doc (all high-priority gaps resolved)
This commit is contained in:
Nikketryhard
2026-02-15 14:23:05 -06:00
parent 735c3e357d
commit b1bd57ab5e
9 changed files with 1216 additions and 46 deletions

View File

@@ -242,6 +242,25 @@ pub(crate) async fn handle_responses(
state.mitm_store.set_tool_config(gemini_config).await;
}
// Store generation parameters for MITM injection
{
use crate::mitm::store::GenerationParams;
let gp = GenerationParams {
temperature: body.temperature,
top_p: body.top_p,
top_k: None,
max_output_tokens: body.max_output_tokens,
stop_sequences: None,
frequency_penalty: None,
presence_penalty: None,
};
if gp.temperature.is_some() || gp.top_p.is_some() || gp.max_output_tokens.is_some() {
state.mitm_store.set_generation_params(gp).await;
} else {
state.mitm_store.clear_generation_params().await;
}
}
let response_id = format!(
"resp_{}",
uuid::Uuid::new_v4().to_string().replace('-', "")
@@ -1003,19 +1022,21 @@ async fn handle_responses_stream(
}
};
let mut thinking_started = false;
let mut thinking_done = false;
let mut last_thinking_len: usize = 0;
while start.elapsed().as_secs() < timeout {
if let Ok((status, data)) = state.backend.get_steps(&cascade_id).await {
if status == 200 {
if let Some(steps) = data["steps"].as_array() {
// Check for thinking content (appears before response text)
if !thinking_emitted {
if let Some(tc) = extract_thinking_content(steps) {
thinking_text = Some(tc.clone());
thinking_emitted = true;
// ── Phase 1: Stream thinking deltas progressively ──
if let Some(tc) = extract_thinking_content(steps) {
if !thinking_started {
// First time we see thinking — emit structure events
thinking_started = true;
// Emit full reasoning event sequence at output_index 0
yield Ok(responses_sse_event(
"response.output_item.added",
serde_json::json!({
@@ -1040,6 +1061,14 @@ async fn handle_responses_stream(
"part": { "type": "summary_text", "text": "" },
}),
));
}
// Emit delta if thinking text has grown
if tc.len() > last_thinking_len {
let delta = &tc[last_thinking_len..];
last_thinking_len = tc.len();
thinking_text = Some(tc.clone());
yield Ok(responses_sse_event(
"response.reasoning_summary_text.delta",
serde_json::json!({
@@ -1048,9 +1077,22 @@ async fn handle_responses_stream(
"item_id": &reasoning_id,
"output_index": 0,
"summary_index": 0,
"delta": &tc,
"delta": delta,
}),
));
}
}
// ── Phase 2: Stream text deltas ──
let text = extract_response_text(steps);
let msg_output_index: u32 = if thinking_started { 1 } else { 0 };
if !text.is_empty() && text != last_text {
// Finalize thinking when response text first appears
if thinking_started && !thinking_done {
thinking_done = true;
let final_thinking = thinking_text.clone().unwrap_or_default();
yield Ok(responses_sse_event(
"response.reasoning_summary_text.done",
serde_json::json!({
@@ -1059,7 +1101,7 @@ async fn handle_responses_stream(
"item_id": &reasoning_id,
"output_index": 0,
"summary_index": 0,
"text": &tc,
"text": &final_thinking,
}),
));
yield Ok(responses_sse_event(
@@ -1070,7 +1112,7 @@ async fn handle_responses_stream(
"item_id": &reasoning_id,
"output_index": 0,
"summary_index": 0,
"part": { "type": "summary_text", "text": &tc },
"part": { "type": "summary_text", "text": &final_thinking },
}),
));
yield Ok(responses_sse_event(
@@ -1084,19 +1126,13 @@ async fn handle_responses_stream(
"type": "reasoning",
"summary": [{
"type": "summary_text",
"text": &tc,
"text": &final_thinking,
}],
},
}),
));
}
}
// ── Phase 2: Stream text deltas ──
let text = extract_response_text(steps);
let msg_output_index: u32 = if thinking_emitted { 1 } else { 0 };
if !text.is_empty() && text != last_text {
// Emit message output_item.added on first text
if !message_started {
message_started = true;
@@ -1153,7 +1189,7 @@ async fn handle_responses_stream(
if is_response_done(steps) && !last_text.is_empty() {
debug!("Response done, text length={}", last_text.len());
let mu = extract_model_usage(steps);
let msg_idx: u32 = if thinking_emitted { 1 } else { 0 };
let msg_idx: u32 = if thinking_started { 1 } else { 0 };
let (usage, mitm_thinking) = usage_from_poll(&state.mitm_store, &cascade_id, &mu, &params.user_text, &last_text).await;
let ts = extract_thinking_signature(steps);
// Use already-captured thinking, or MITM thinking, or LS thinking
@@ -1179,7 +1215,7 @@ async fn handle_responses_stream(
if run_status.contains("IDLE") && !last_text.is_empty() {
debug!("Trajectory IDLE, text length={}", last_text.len());
let mu = extract_model_usage(steps);
let msg_idx: u32 = if thinking_emitted { 1 } else { 0 };
let msg_idx: u32 = if thinking_started { 1 } else { 0 };
let (usage, mitm_thinking) = usage_from_poll(&state.mitm_store, &cascade_id, &mu, &params.user_text, &last_text).await;
let ts = extract_thinking_signature(steps);
let tc = thinking_text.clone()