From ca9f808ee3065c90ae0faefefa11988a269569a0 Mon Sep 17 00:00:00 2001 From: Nikketryhard Date: Sun, 15 Feb 2026 17:08:53 -0600 Subject: [PATCH] feat: completions API improvements, gemini endpoint, response types --- GEMINI.md | 85 ++++- docs/endpoint-gap-analysis.md | 538 ++++++----------------------- src/api/completions.rs | 614 +++++++++++++++++++--------------- src/api/gemini.rs | 46 ++- src/api/responses.rs | 85 ++++- src/api/search.rs | 288 ++++++++++++++++ src/api/types.rs | 103 +++++- src/mitm/proxy.rs | 14 +- 8 files changed, 1031 insertions(+), 742 deletions(-) create mode 100644 src/api/search.rs diff --git a/GEMINI.md b/GEMINI.md index 9c5de2a..cd237d9 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -47,17 +47,18 @@ sudo ./scripts/mitm-redirect.sh status # check current state ## Endpoints -| Method | Path | Description | -| -------- | ---------------------- | ----------------------------------------------------------- | -| `POST` | `/v1/responses` | **Responses API** (primary) — supports `stream: true/false` | -| `POST` | `/v1/chat/completions` | Chat Completions API (OpenAI compat shim) | -| `GET` | `/v1/models` | List available models | -| `GET` | `/v1/sessions` | List active sessions | -| `DELETE` | `/v1/sessions/:id` | Delete a session | -| `POST` | `/v1/token` | Set OAuth token at runtime | -| `GET` | `/v1/usage` | MITM-intercepted token usage stats | -| `GET` | `/v1/quota` | LS quota — credits, per-model rate limits, reset timers | -| `GET` | `/health` | Health check | +| Method | Path | Description | +| ---------- | ---------------------- | ----------------------------------------------------------- | +| `POST` | `/v1/responses` | **Responses API** (primary) — supports `stream: true/false` | +| `POST` | `/v1/chat/completions` | Chat Completions API (OpenAI compat shim) | +| `GET/POST` | `/v1/search` | **Web Search** — Google Search grounding, returns results | +| `GET` | `/v1/models` | List available models | +| `GET` | `/v1/sessions` | List active sessions | +| `DELETE` | `/v1/sessions/:id` | Delete a session | +| `POST` | `/v1/token` | Set OAuth token at runtime | +| `GET` | `/v1/usage` | MITM-intercepted token usage stats | +| `GET` | `/v1/quota` | LS quota — credits, per-model rate limits, reset timers | +| `GET` | `/health` | Health check | ## Available Models @@ -116,8 +117,8 @@ curl -s http://localhost:8741/v1/responses \ }' | jq . # Follow-up in same cascade: -curl -s http://localhost:8741/v1/responses \ - -H "Content-Type: application/json" \ +curl -s http://localhost:8741/v1/responses \\ + -H "Content-Type: application/json" \\ -d '{ "model": "gemini-3-flash", "input": "Now multiply that by 10", @@ -126,6 +127,64 @@ curl -s http://localhost:8741/v1/responses \ }' | jq . ``` +## Web Search + +The proxy supports Google Search grounding in two ways: + +### 1. Dedicated Search Endpoint (`/v1/search`) + +Returns structured search results with citations: + +```bash +# Quick GET search +curl -s 'http://localhost:8741/v1/search?q=latest+rust+news' | jq . + +# Full POST search with options +curl -s http://localhost:8741/v1/search \\ + -H "Content-Type: application/json" \\ + -d '{ + "query": "latest Rust programming news", + "model": "gemini-3-flash", + "timeout": 30 + }' | jq . +``` + +Response includes `summary`, `results[]` (title + URL), `citations[]`, and raw `grounding_metadata`. + +### 2. Inline Grounding (on any endpoint) + +Enable Google Search grounding on regular requests: + +```bash +# Completions API +curl -s http://localhost:8741/v1/chat/completions \\ + -H "Content-Type: application/json" \\ + -d '{ + "model": "gemini-3-flash", + "messages": [{"role": "user", "content": "What happened in tech today?"}], + "web_search": true + }' | jq . + +# Responses API (OpenAI-style tool) +curl -s http://localhost:8741/v1/responses \\ + -H "Content-Type: application/json" \\ + -d '{ + "model": "gemini-3-flash", + "input": "What happened in tech today?", + "tools": [{"type": "web_search_preview"}], + "stream": false + }' | jq . + +# Gemini API +curl -s http://localhost:8741/v1/gemini \\ + -H "Content-Type: application/json" \\ + -d '{ + "model": "gemini-3-flash", + "message": "What happened in tech today?", + "google_search": true + }' | jq . +``` + ## Authentication The proxy needs an OAuth token. Three ways to provide it: diff --git a/docs/endpoint-gap-analysis.md b/docs/endpoint-gap-analysis.md index dd1928f..adff5b1 100644 --- a/docs/endpoint-gap-analysis.md +++ b/docs/endpoint-gap-analysis.md @@ -1,464 +1,128 @@ # Endpoint Gap Analysis -> **Generated:** 2026-02-15 (updated) -> **Proxy Version:** 3.1.0 -> **Scope:** All three API endpoints vs official OpenAI / Gemini specifications +> **Updated:** 2026-02-15 +> **Sources:** [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create), [OpenAI Responses API](https://platform.openai.com/docs/api-reference/responses), [Gemini Thinking Mode](https://ai.google.dev/gemini-api/docs/thinking-mode), proxy source code +> **Method:** Full source audit cross-referenced against context7 OpenAI API docs --- -## Table of Contents +## What's Implemented -- [Endpoint Overview](#endpoint-overview) -- [Feature Parity Matrix](#feature-parity-matrix) -- [Detailed Endpoint Analysis](#detailed-endpoint-analysis) - - [Responses API (`/v1/responses`)](#responses-api-v1responses) - - [Chat Completions API (`/v1/chat/completions`)](#chat-completions-api-v1chatcompletions) - - [Gemini API (`/v1/gemini`)](#gemini-api-v1gemini) -- [Priority Gaps](#priority-gaps) -- [Architecture Notes](#architecture-notes) +### All Endpoints + +- ✅ Sync + streaming modes +- ✅ Model selection + validation +- ✅ OAuth auth check +- ✅ Timeout control +- ✅ Tool definitions, tool choice, tool results (OpenAI → Gemini auto-conversion) +- ✅ MITM bypass path for custom tools +- ✅ Thinking/reasoning in both sync and streaming +- ✅ Generation params forwarded via MITM (`temperature`, `top_p`, `top_k`, `max_output_tokens`, `stop_sequences`, `frequency_penalty`, `presence_penalty`) +- ✅ `reasoning_effort` / `thinkingLevel` — forwarded as `generationConfig.thinkingConfig.thinkingLevel` +- ✅ `response_format: {type: "json_object"}` — injected as `responseMimeType: "application/json"` +- ✅ Google Search grounding — `web_search: true` (Completions), `tools: [{type: "web_search_preview"}]` (Responses), `google_search: true` (Gemini) +- ✅ `/v1/search` endpoint — dedicated web search via Google Search grounding, returns structured results + citations + +### Reasoning Effort → Thinking Level Mapping + +| OpenAI `reasoning_effort` | Google `thinkingLevel` | Gemini 3 Pro | Gemini 3 Flash | +| :-----------------------: | :--------------------: | :----------: | :------------: | +| `"low"` | `"low"` | ✅ | ✅ | +| `"medium"` | `"medium"` | ❌ | ✅ | +| `"high"` | `"high"` | ✅ (default) | ✅ (default) | +| — | `"minimal"` | ❌ | ✅ | + +### Completions-Specific + +- ✅ `stream_options.include_usage` — final chunk with usage before `[DONE]` +- ✅ `completion_tokens_details.reasoning_tokens` — thinking token count +- ✅ `prompt_tokens_details.cached_tokens` — cache read tokens +- ✅ `temperature`, `top_p`, `max_tokens`, `max_completion_tokens`, `frequency_penalty`, `presence_penalty` +- ✅ `reasoning_effort` +- ✅ `stop` — string or array, forwarded as `generationConfig.stopSequences` +- ✅ `response_format: {type: "json_object"}` — injects `responseMimeType` +- ✅ `response_format: {type: "json_schema", json_schema: {...}}` — injects `responseMimeType` + `responseSchema` via MITM +- ✅ `n` (multiple choices) — fires N parallel cascades, collects into `choices[]` (sync only, capped at 5) +- ✅ `conversation` — session ID for multi-turn cascade reuse (custom extension) +- ✅ `reasoning_content` — thinking text in assistant message +- ✅ `system_fingerprint` — `fp_` in sync + all streaming chunks +- ✅ `service_tier` — `"default"` in sync + all streaming chunks +- ✅ `logprobs: null` — in every choice (sync + streaming) +- ✅ `metadata` — accepted in request, ignored +- ✅ `finish_reason` — correctly maps Google's `MAX_TOKENS`→`"length"`, `SAFETY`→`"content_filter"`, etc. +- ✅ Full `messages[]` history — all user, assistant, system, tool messages forwarded + +### Responses-Specific + +- ✅ Full streaming event set (all `response.*` events including reasoning summary) +- ✅ `temperature`, `top_p`, `max_output_tokens` +- ✅ `reasoning_effort` — echoed from client request +- ✅ `thinking_signature` for multi-turn thinking chains +- ✅ `instructions`, `metadata`, `user` — echoed in response +- ✅ Usage with MITM-intercepted real tokens +- ✅ `max_tool_calls` — limits tool calls returned per response +- ✅ `conversation` — session reuse +- ✅ `previous_response_id`, `store`, `parallel_tool_calls`, `truncation`, `text.format`, `tool_choice` — echoed +- ✅ `tools` — echoed from client request (was previously always `[]`) +- ✅ `text.format` — `{format: {type: "json_schema", ...}}` injects `responseMimeType` + `responseSchema` via MITM, echoed in response + +### Gemini-Specific + +- ✅ Native tool format (no conversion needed) +- ✅ `usageMetadata` in sync **and streaming** responses +- ✅ `temperature`, `topP`, `topK`, `maxOutputTokens`, `stopSequences` +- ✅ `thinkingLevel` +- ✅ Session/conversation reuse +- ✅ Array/multipart `input` — strings, string arrays, `{text: "..."}` object arrays --- -## Endpoint Overview +## Fixed Bugs -The proxy exposes three main API endpoints, each serving different client ecosystems: +| # | Bug | Fix | +| --- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| B1 | Messages history dropped | `extract_chat_input` now calls `build_conversation_with_tools` with ALL messages — full multi-turn via `messages[]` works. | +| B2 | `finish_reason` never `"length"` | `google_to_openai_finish_reason()` helper maps `MAX_TOKENS`→`"length"`, `SAFETY`/`RECITATION`/etc→`"content_filter"`. Applied to all paths. | +| B3 | `reasoning` always null | `build_response_object` now echoes client's `reasoning_effort` from `RequestParams`. | +| B4 | `tool_choice` always `"auto"` | Changed from `&'static str` to `serde_json::Value`. Echoes whatever the client sent. | +| B5 | `tools` always `[]` | Echoes the client's tools array in the response. | +| B7 | `temperature`/`top_p` wrong | Already defaults to `1.0` via `unwrap_or(1.0)`. Was a false positive — no fix needed. | -| Endpoint | Protocol | Primary Clients | Spec Reference | -| --------------------------- | --------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -| `POST /v1/responses` | OpenAI Responses API | Claude Code, Antigravity-native clients | [platform.openai.com/docs/api-reference/responses](https://platform.openai.com/docs/api-reference/responses) | -| `POST /v1/chat/completions` | OpenAI Chat Completions API | OpenCode, Vercel AI SDK, any OpenAI-compatible client | [platform.openai.com/docs/api-reference/chat](https://platform.openai.com/docs/api-reference/chat/create) | -| `POST /v1/gemini` | Custom Gemini-native API | Direct Gemini-format consumers | [ai.google.dev/api](https://ai.google.dev/api) (loosely based) | +### Acceptable / Won't Fix -All three endpoints share the same backend pipeline: - -``` -Client Request → Proxy Endpoint → LS (Language Server) → Google API - ↓ - MITM Proxy (captures real usage + injects generation params + tool calls) -``` +| # | Bug | Status | +| --- | ----------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| B6 | `Usage::estimate` fake tokens as fallback | Only triggers on timeout/error paths. Heuristic `len/4` is reasonable for timeouts where output tokens = 0. | --- -## Feature Parity Matrix +## TODO — New Features -### Core Features +### Trivial (all done ✅) -| Feature | Responses | Completions | Gemini | -| -------------------- | :-------: | :---------: | :----: | -| Sync mode | ✅ | ✅ | ✅ | -| Streaming mode (SSE) | ✅ | ✅ | ✅ | -| Model selection | ✅ | ✅ | ✅ | -| Model validation | ✅ | ✅ | ✅ | -| Auth check (OAuth) | ✅ | ✅ | ✅ | -| Timeout control | ✅ | ✅ | ✅ | +All trivial response shape fixes have been implemented. -### Generation Parameters (MITM-injected) +### Medium (schema injection via MITM) — all done ✅ -| Feature | Responses | Completions | Gemini | -| ------------------- | :-------: | :---------: | :----: | -| `temperature` | ✅ | ✅ | ✅ | -| `top_p` / `topP` | ✅ | ✅ | ✅ | -| `top_k` / `topK` | ❌ | ❌ | ✅ | -| `max_output_tokens` | ✅ | ✅ | ✅ | -| `stop_sequences` | ❌ | ❌ | ✅ | -| `frequency_penalty` | ❌ | ✅ | ❌ | -| `presence_penalty` | ❌ | ✅ | ❌ | +All structured output features have been implemented. -> **Note:** All generation parameters are forwarded to Google's API via MITM injection into `request.generationConfig`. They override the LS defaults. +### Hard (new features) -### Thinking / Reasoning +| # | Gap | API | Notes | +| --- | ------------------------- | ---- | ---------------------------------------------------------- | +| 7 | **`parallel_tool_calls`** | Both | Accept param, echo in response. Can't enforce server-side. | -| Feature | Responses | Completions | Gemini | -| ---------------------------------- | :-------------------------------: | :-------------------------------: | :---------------------: | -| Thinking — LS path (streaming) | ✅ `reasoning_summary_text.delta` | ✅ `reasoning_content` delta | ✅ `thought: true` part | -| Thinking — LS path (sync) | ✅ `reasoning` output item | ✅ `reasoning_content` in message | ✅ `thought: true` part | -| Thinking — Bypass path (streaming) | ✅ | ✅ | ✅ | -| Thinking — Bypass path (sync) | ✅ | ✅ | ✅ | -| Thinking signature (multi-turn) | ✅ `thinking_signature` field | ❌ Not applicable | ❌ Not applicable | +### Stretch (research needed) -### Tool Calls - -| Feature | Responses | Completions | Gemini | -| ---------------------------- | :-----------------------------: | :------------------------: | :-------------------------------------: | -| Tool definitions input | ✅ OpenAI format → Gemini | ✅ OpenAI format → Gemini | ✅ Native Gemini format | -| Tool choice control | ✅ `tool_choice` | ✅ `tool_choice` | ✅ `tool_config` | -| Tool call output (streaming) | ✅ `function_call` items | ✅ `tool_calls` in delta | ✅ `functionCall` parts | -| Tool call output (sync) | ✅ `function_call` items | ✅ `tool_calls` in message | ✅ `functionCall` parts | -| Tool result input | ✅ `function_call_output` items | ✅ `tool` role messages | ✅ `functionResponse` in `tool_results` | -| MITM bypass (custom tools) | ✅ | ✅ | ✅ | -| Stale state protection | ✅ | ✅ | ✅ | - -### Session Management - -| Feature | Responses | Completions | Gemini | -| ------------------------------------ | :---------------------: | :--------------: | :---------------------: | -| Session/conversation reuse | ✅ `conversation` field | ❌ Not supported | ✅ `conversation` field | -| Session listing (`GET /v1/sessions`) | ✅ Shared | ✅ Shared | ✅ Shared | -| Session deletion | ✅ Shared | ✅ Shared | ✅ Shared | - -### Usage / Token Tracking - -| Feature | Responses | Completions | Gemini | -| -------------------------------- | :---------------------------: | :-------------------------------: | :--------------------------: | -| Usage in sync response | ✅ MITM real tokens | ✅ MITM real tokens | ✅ `usageMetadata` | -| Usage in streaming (final chunk) | ❌ Not emitted | ✅ `stream_options.include_usage` | ❌ Not emitted | -| `reasoning_tokens` in usage | ✅ In `output_tokens_details` | ✅ In `completion_tokens_details` | ✅ `thoughtsTokenCount` | -| Cache tokens | ✅ `cached_tokens` | ✅ `cached_tokens` | ✅ `cachedContentTokenCount` | +| # | Gap | API | Notes | +| --- | -------------------------- | ---- | ---------------------------------------------------------------------------------------------------------------------------- | +| 12 | **Image/audio modalities** | Both | LS `sendMessage` is text-only. Need to reverse-engineer proto format for binary payloads. Gemini 3 supports vision natively. | --- -## Detailed Endpoint Analysis +## Won't Implement -### Responses API (`/v1/responses`) - -**Spec:** [OpenAI Responses API](https://platform.openai.com/docs/api-reference/responses) - -#### Request Fields - -| Field | Spec | Status | Implementation Details | -| ---------------------------- | ------------- | :----: | -------------------------------------------------------------------------------------------------------------------------------- | -| `model` | Required | ✅ | Mapped to internal model enum via `lookup_model()` | -| `input` | Required | ✅ | String or array. Array supports `message` items and `function_call_output` items | -| `instructions` | Optional | ✅ | Prepended to user text as system instructions | -| `stream` | Optional | ✅ | SSE stream with `response.*` events | -| `tools` | Optional | ✅ | OpenAI function format → auto-converted to Gemini `functionDeclarations` via `openai_tools_to_gemini()` | -| `tool_choice` | Optional | ✅ | `"auto"`, `"required"`, `"none"`, or `{"type":"function","function":{"name":"X"}}` → converted to Gemini `functionCallingConfig` | -| `store` | Optional | ✅ | Accepted, echoed in response. Not actually persisted. | -| `temperature` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig` injection. | -| `top_p` | Optional | ✅ | **Forwarded** to Google via MITM. | -| `max_output_tokens` | Optional | ✅ | **Forwarded** to Google via MITM. | -| `previous_response_id` | Optional | ✅ | Accepted, echoed. Not used for chaining (use `conversation` instead). | -| `metadata` | Optional | ✅ | Accepted, echoed back in response. | -| `user` | Optional | ✅ | Accepted, echoed. | -| `conversation` | **Extension** | ✅ | Proxy-specific: session ID for multi-turn cascade reuse. | -| `timeout` | **Extension** | ✅ | Proxy-specific: request timeout in seconds (default 120). | -| `reasoning.effort` | Optional | ❌ | Could map to model variant selection (e.g., `"high"` → Opus, `"low"` → Flash). | -| `reasoning.generate_summary` | Optional | ❌ | Not implemented. Could control thinking output inclusion. | -| `truncation` | Optional | ❌ | Not applicable — LS manages context window. | -| `parallel_tool_calls` | Optional | ✅ | Hardcoded `true` in response. | - -#### Response Object - -| Field | Spec | Status | Notes | -| ---------------------- | ------------- | :----: | ---------------------------------------------------------------- | -| `id` | Required | ✅ | `resp_` + UUID | -| `object` | Required | ✅ | Always `"response"` | -| `created_at` | Required | ✅ | Unix timestamp | -| `status` | Required | ✅ | `"completed"` or `"incomplete"` | -| `completed_at` | Required | ✅ | Unix timestamp or null | -| `error` | Required | ✅ | null on success | -| `incomplete_details` | Required | ✅ | null | -| `instructions` | Required | ✅ | Echoed from request | -| `max_output_tokens` | Required | ✅ | Echoed or null | -| `model` | Required | ✅ | Model name string | -| `output` | Required | ✅ | Array of `reasoning` and/or `message` items | -| `parallel_tool_calls` | Required | ✅ | `true` | -| `previous_response_id` | Required | ✅ | Echoed or null | -| `reasoning` | Required | ✅ | `{effort: null, summary: null}` | -| `store` | Required | ✅ | Echoed | -| `temperature` | Required | ✅ | Echoed (default 1.0) | -| `text` | Required | ✅ | `{format: {type: "text"}}` | -| `tool_choice` | Required | ✅ | `"auto"` | -| `tools` | Required | ✅ | Echoed or `[]` | -| `top_p` | Required | ✅ | Echoed (default 1.0) | -| `truncation` | Required | ✅ | `"disabled"` | -| `usage` | Required | ✅ | MITM-intercepted real tokens when available, estimated otherwise | -| `user` | Required | ✅ | Echoed or null | -| `metadata` | Required | ✅ | Echoed or `{}` | -| `thinking_signature` | **Extension** | ✅ | Proxy-specific: opaque blob for multi-turn thinking chain | - -#### Streaming Events - -| Event | Spec | Status | Notes | -| ---------------------------------------- | --------- | :----: | ------------------------------ | -| `response.created` | Required | ✅ | Initial response shell | -| `response.in_progress` | Required | ✅ | | -| `response.output_item.added` | Required | ✅ | For reasoning + message items | -| `response.content_part.added` | Required | ✅ | | -| `response.output_text.delta` | Required | ✅ | Progressive text deltas | -| `response.output_text.done` | Required | ✅ | | -| `response.content_part.done` | Required | ✅ | | -| `response.output_item.done` | Required | ✅ | | -| `response.completed` | Required | ✅ | Final event with full response | -| `response.reasoning_summary_text.delta` | Required | ✅ | Progressive thinking deltas | -| `response.reasoning_summary_text.done` | Required | ✅ | | -| `response.function_call_arguments.delta` | For tools | ✅ | Tool call argument streaming | -| `response.function_call_arguments.done` | For tools | ✅ | | - ---- - -### Chat Completions API (`/v1/chat/completions`) - -**Spec:** [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) - -#### Request Fields - -| Field | Spec | Status | Implementation Details | -| ------------------------- | ------------- | :----: | -------------------------------------------------------------------- | -| `model` | Required | ✅ | Mapped to internal model enum | -| `messages` | Required | ✅ | Supports `system`, `developer`, `user`, `assistant`, `tool` roles | -| `messages[].content` | Required | ✅ | String or array of `{type: "text", text: "..."}` objects | -| `messages[].tool_calls` | Optional | ✅ | For assistant messages with tool calls | -| `messages[].tool_call_id` | Optional | ✅ | For tool result messages | -| `stream` | Optional | ✅ | SSE with `chat.completion.chunk` events | -| `stream_options` | Optional | ✅ | `include_usage: true` emits final usage chunk before `[DONE]` | -| `tools` | Optional | ✅ | OpenAI function format → auto-converted to Gemini | -| `tool_choice` | Optional | ✅ | `"auto"`, `"none"`, `"required"`, or specific function | -| `timeout` | **Extension** | ✅ | Proxy-specific (default 120s) | -| `temperature` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.temperature` | -| `top_p` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.topP` | -| `max_tokens` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.maxOutputTokens` | -| `max_completion_tokens` | Optional | ✅ | **Forwarded** (same as `max_tokens`, newer OpenAI param) | -| `frequency_penalty` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.frequencyPenalty` | -| `presence_penalty` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.presencePenalty` | -| `user` | Optional | ✅ | Accepted, not used | -| `n` | Optional | ❌ | N/A — single generation only | -| `logprobs` | Optional | ❌ | N/A | -| `top_logprobs` | Optional | ❌ | N/A | -| `logit_bias` | Optional | ❌ | N/A | -| `response_format` | Optional | ❌ | Could be useful for JSON mode | -| `seed` | Optional | ❌ | N/A | -| `stop` | Optional | ❌ | Could be forwarded as `stopSequences` | - -#### Sync Response Object - -| Field | Spec | Status | Notes | -| ------------------------------------------------------------ | ----------- | :----: | -------------------------------------------- | -| `id` | Required | ✅ | `chatcmpl-` + UUID | -| `object` | Required | ✅ | `"chat.completion"` | -| `created` | Required | ✅ | Unix timestamp | -| `model` | Required | ✅ | Model name | -| `choices[0].index` | Required | ✅ | `0` | -| `choices[0].message.role` | Required | ✅ | `"assistant"` | -| `choices[0].message.content` | Required | ✅ | Response text | -| `choices[0].message.reasoning_content` | Extension | ✅ | Thinking text (when model produces thinking) | -| `choices[0].message.tool_calls` | Conditional | ✅ | When model returns tool calls | -| `choices[0].message.refusal` | Optional | ❌ | Not implemented | -| `choices[0].message.annotations` | Optional | ❌ | Not implemented | -| `choices[0].logprobs` | Optional | ❌ | Not implemented | -| `choices[0].finish_reason` | Required | ✅ | `"stop"` or `"tool_calls"` | -| `usage.prompt_tokens` | Required | ✅ | MITM real or estimated | -| `usage.completion_tokens` | Required | ✅ | MITM real or estimated | -| `usage.total_tokens` | Required | ✅ | Sum | -| `usage.prompt_tokens_details.cached_tokens` | Optional | ✅ | MITM cache read tokens | -| `usage.completion_tokens_details.reasoning_tokens` | Optional | ✅ | MITM thinking token count | -| `usage.completion_tokens_details.accepted_prediction_tokens` | Optional | ❌ | N/A | -| `usage.completion_tokens_details.rejected_prediction_tokens` | Optional | ❌ | N/A | -| `system_fingerprint` | Deprecated | ❌ | Cosmetic, not needed | -| `service_tier` | Optional | ❌ | Cosmetic, not needed | - -#### Streaming Chunk Object - -| Field | Spec | Status | Notes | -| ------------------------------------ | --------------- | :----: | ----------------------------------------------------- | -| `id` | Required | ✅ | Same across all chunks | -| `object` | Required | ✅ | `"chat.completion.chunk"` | -| `created` | Required | ✅ | Same across all chunks | -| `model` | Required | ✅ | | -| `choices[0].index` | Required | ✅ | `0` | -| `choices[0].delta.role` | First chunk | ✅ | `"assistant"` in first chunk | -| `choices[0].delta.content` | Text chunks | ✅ | Progressive text deltas | -| `choices[0].delta.reasoning_content` | Thinking chunks | ✅ | Progressive thinking deltas | -| `choices[0].delta.tool_calls` | Tool chunks | ✅ | Tool call data | -| `choices[0].delta` | Final chunk | ✅ | Empty `{}` | -| `choices[0].finish_reason` | Final chunk | ✅ | `"stop"` or `"tool_calls"` | -| `choices[0].logprobs` | Optional | ❌ | Not implemented | -| `usage` (final chunk) | Optional | ✅ | Emitted when `stream_options.include_usage` is `true` | -| `data: [DONE]` | Required | ✅ | Stream termination signal | - ---- - -### Gemini API (`/v1/gemini`) - -**Spec:** Custom endpoint loosely based on [Gemini REST API](https://ai.google.dev/api/generate-content) - -> **Note:** This is NOT a 1:1 Gemini API replica. It's a simplified proxy-native endpoint that uses Gemini's `functionDeclarations` / `functionCall` / `functionResponse` format directly, avoiding OpenAI ↔ Gemini format conversion overhead. - -#### Request Fields - -| Field | Spec | Status | Implementation Details | -| --------------------------------------- | -------- | :----: | --------------------------------------------------------------- | -| `model` | Required | ✅ | Mapped to internal model enum | -| `input` | Required | ✅ | String only (no array/multipart) | -| `tools` | Optional | ✅ | Native Gemini `[{functionDeclarations: [...]}]` format | -| `tool_config` | Optional | ✅ | Native Gemini `{functionCallingConfig: {mode: "AUTO"}}` | -| `tool_results` | Optional | ✅ | Array of `{functionResponse: {name, response}}` | -| `conversation` | Optional | ✅ | Session ID for cascade reuse | -| `stream` | Optional | ✅ | SSE streaming | -| `timeout` | Optional | ✅ | Default 120s | -| `temperature` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.temperature` | -| `top_p` / `topP` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.topP` | -| `top_k` / `topK` | Optional | ✅ | **Forwarded** to Google via MITM `generationConfig.topK` | -| `max_output_tokens` / `maxOutputTokens` | Optional | ✅ | **Forwarded** via MITM `generationConfig.maxOutputTokens` | -| `stop_sequences` / `stopSequences` | Optional | ✅ | **Forwarded** via MITM `generationConfig.stopSequences` | - -#### Sync Response Object - -| Field | Spec | Status | Notes | -| -------------------------------------------- | --------- | :----: | -------------------------------- | -| `candidates[0].content.parts` | Required | ✅ | Array of text/functionCall parts | -| `candidates[0].content.parts[].text` | Text | ✅ | Response text | -| `candidates[0].content.parts[].thought` | Extension | ✅ | `true` for thinking parts | -| `candidates[0].content.parts[].functionCall` | Tool call | ✅ | `{name, args}` | -| `candidates[0].content.role` | Required | ✅ | `"model"` | -| `candidates[0].finishReason` | Required | ✅ | `"STOP"` | -| `modelVersion` | Required | ✅ | Model name string | -| `usageMetadata` | Optional | ✅ | MITM-intercepted token counts | - -`usageMetadata` fields: - -| Field | Status | Notes | -| ------------------------- | :----: | ------------------------- | -| `promptTokenCount` | ✅ | Input tokens | -| `candidatesTokenCount` | ✅ | Output tokens | -| `totalTokenCount` | ✅ | Input + output | -| `thoughtsTokenCount` | ✅ | Thinking/reasoning tokens | -| `cachedContentTokenCount` | ✅ | Cache read tokens | - -#### Streaming Format - -Each SSE `data:` chunk is a complete Gemini-format JSON object with progressive `candidates[0].content.parts`: - -``` -data: {"candidates":[{"content":{"parts":[{"text":"thinking...","thought":true}],"role":"model"}}],"modelVersion":"opus-4.6"} - -data: {"candidates":[{"content":{"parts":[{"text":"Hello!"}],"role":"model"}}],"modelVersion":"opus-4.6"} - -data: {"candidates":[{"content":{"parts":[{"text":""}],"role":"model"},"finishReason":"STOP"}],"modelVersion":"opus-4.6"} - -data: [DONE] -``` - ---- - -## Priority Gaps - -### 🔴 High Priority — RESOLVED ✅ - -All high-priority gaps have been addressed: - -1. ~~**Completions: `stream_options.include_usage`**~~ → ✅ Implemented -2. ~~**Completions: `completion_tokens_details.reasoning_tokens`**~~ → ✅ Implemented -3. ~~**Completions: Accept `temperature`, `top_p`, `max_tokens`**~~ → ✅ Forwarded via MITM -4. ~~**Gemini: `usageMetadata`**~~ → ✅ Implemented - -### 🟡 Medium Priority - -5. **Responses: `reasoning.effort`** - - **What:** Map reasoning effort levels (`"high"`, `"medium"`, `"low"`) to model variant selection - - **Why:** Could automatically select Opus vs Flash based on reasoning needs - - **Effort:** Medium — needs model selection logic changes - -6. **Completions: Session/conversation support** - - **What:** Add session reuse similar to Responses and Gemini endpoints - - **Why:** Would allow multi-turn conversations via the completions API - - **Effort:** Medium — need a way to pass session ID (maybe via `user` field or custom header) - -7. **Completions: `stop` sequences** - - **What:** Forward `stop` to Google as `stopSequences` in `generationConfig` - - **Why:** Some clients use stop sequences to control generation - - **Effort:** Trivial — just add to `CompletionRequest` and `GenerationParams` - -8. **Completions: `response_format` (JSON mode)** - - **What:** Forward `response_format: {"type": "json_object"}` to Google's `responseMimeType` - - **Why:** Useful for structured output - - **Effort:** Low — inject `responseMimeType: "application/json"` in generationConfig - -### 🟢 Low Priority - -Cosmetic or not applicable to our architecture: - -9. **`system_fingerprint`** — OpenAI-specific field, meaningless for our proxy -10. **`service_tier`** — OpenAI billing concept, not applicable -11. **`n` > 1** — Multiple completions per request; our backend only generates one -12. **`logprobs`** — Would require token-level access we don't have -13. **`seed`** — Deterministic sampling not controllable through our proxy - ---- - -## Architecture Notes - -### Generation Parameter Injection - -Client-specified sampling parameters are forwarded to Google's API via the MITM request modification pipeline: - -``` -Client sends temperature=0.5 → API handler stores in MitmStore.generation_params - ↓ - LS sends request to Google API - ↓ - MITM intercepts request - ↓ - modify_request() reads generation_params - ↓ - Injects into request.generationConfig: - temperature, topP, topK, maxOutputTokens, - stopSequences, frequencyPenalty, presencePenalty - ↓ - Forwards modified request to Google -``` - -This approach overrides whatever defaults the LS sets, giving clients direct control over sampling parameters. - -### Dual Path Architecture - -All three endpoints share a dual-path architecture: - -``` - ┌─────────────────┐ - │ Has custom │ -Request ────────────► │ tools? │ - └────────┬────────┘ - │ - ┌──── Yes ──┴── No ────┐ - │ │ - ┌─────▼─────┐ ┌─────▼─────┐ - │ MITM │ │ LS Steps │ - │ Bypass │ │ Polling │ - │ Path │ │ Path │ - └─────┬─────┘ └─────┬─────┘ - │ │ - ┌─────▼─────┐ ┌─────▼─────┐ - │ Poll │ │ Poll │ - │ MitmStore │ │ get_steps │ - │ directly │ │ from LS │ - └─────┬─────┘ └─────┬─────┘ - │ │ - └──────────┬────────────┘ - │ - ┌─────▼─────┐ - │ Response │ - │ to client │ - └───────────┘ -``` - -- **Bypass Path:** When custom tools are present, the handler polls `MitmStore` directly for response text, thinking text, and function calls. The MITM proxy captures these from the Google API response before the LS processes them. - -- **LS Path:** When no custom tools are present, the handler polls the LS's `get_steps` API for progressive response data (text, thinking, status). - -### Stale State Protection - -All bypass paths include protection against stale `response_complete` flags from previous requests: - -```rust -if complete && text.is_empty() && thinking.is_none() { - warn!("stale response_complete detected — clearing"); - state.mitm_store.clear_response_async().await; - continue; // or retry -} -``` - -This handles the race condition where a previous request's MITM handler calls `mark_response_complete()` after the new request has already called `clear_response_async()`. - -### Tool Format Conversion - -``` -OpenAI tools ──► openai_tools_to_gemini() ──► Gemini functionDeclarations - │ - MitmStore.set_tools() - │ - MITM proxy injects into - outgoing LS request -``` - -The Gemini endpoint skips this conversion entirely — tools are stored in native Gemini format. +| # | Gap | Reason | +| --- | ------------------------------- | ------------------------------------------------------------------------ | +| 9 | `prediction` (Predicted Output) | Inference-level speculative decoding optimization. No Gemini equivalent. | +| 10 | `logprobs` / `top_logprobs` | Gemini never exposes token-level log probabilities. | diff --git a/src/api/completions.rs b/src/api/completions.rs index 0b8d547..4a242b4 100644 --- a/src/api/completions.rs +++ b/src/api/completions.rs @@ -15,45 +15,77 @@ use super::types::*; use super::util::{err_response, now_unix}; use super::AppState; +/// Extract a conversation/session ID from a flexible JSON value. +/// Accepts a plain string or an object with an "id" field. +fn extract_conversation_id(conv: &Option) -> Option { + match conv { + Some(serde_json::Value::String(s)) => Some(s.clone()), + Some(obj) => obj["id"].as_str().map(|s| s.to_string()), + None => None, + } +} + +/// System fingerprint for completions responses (derived from crate version at compile time). +fn system_fingerprint() -> String { + format!("fp_{}", env!("CARGO_PKG_VERSION").replace('.', "")) +} + +/// Build a streaming chunk JSON with all required OpenAI fields. +/// Includes system_fingerprint, service_tier, and logprobs:null in choices. +fn chunk_json( + id: &str, model: &str, + choices: serde_json::Value, + usage: Option, +) -> String { + let mut obj = serde_json::json!({ + "id": id, + "object": "chat.completion.chunk", + "created": now_unix(), + "model": model, + "system_fingerprint": system_fingerprint(), + "service_tier": "default", + "choices": choices, + }); + if let Some(u) = usage { + obj["usage"] = u; + } + serde_json::to_string(&obj).unwrap_or_default() +} + +/// Build a single choice for a streaming chunk (delta + finish_reason + logprobs). +fn chunk_choice(index: u32, delta: serde_json::Value, finish_reason: Option<&str>) -> serde_json::Value { + serde_json::json!({ + "index": index, + "delta": delta, + "logprobs": serde_json::Value::Null, + "finish_reason": finish_reason, + }) +} + +// ─── Finish reason mapping ─────────────────────────────────────────────────── + +/// Map Google's finishReason → OpenAI's finish_reason. +/// Google: STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER, BLOCKLIST, PROHIBITED_CONTENT +/// OpenAI: stop, length, content_filter, tool_calls (handled separately) +fn google_to_openai_finish_reason(stop_reason: Option<&str>) -> &'static str { + match stop_reason { + Some("MAX_TOKENS") => "length", + Some("SAFETY") | Some("RECITATION") | Some("BLOCKLIST") | Some("PROHIBITED_CONTENT") => "content_filter", + _ => "stop", + } +} + // ─── Input extraction ──────────────────────────────────────────────────────── /// Extract user text from Chat Completions messages array. /// -/// When tool results are present, builds the full conversation including -/// tool call results so the model can continue after tool use. +/// Builds the full conversation context including all messages (system, user, +/// assistant, tool) so the model has complete history — matching how OpenAI +/// sends the entire messages array to the model. fn extract_chat_input(messages: &[CompletionMessage]) -> String { - let has_tool_results = messages.iter().any(|m| m.role == "tool"); - - if has_tool_results { - // Build full conversation context including tool results - return build_conversation_with_tools(messages); - } - - // Simple path: no tools, just extract system + last user message - let mut system_parts = Vec::new(); - let mut user_parts = Vec::new(); - - for msg in messages { - let text = extract_message_text(&msg.content); - if text.is_empty() { - continue; - } - match msg.role.as_str() { - "system" | "developer" => system_parts.push(text), - "user" => user_parts.push(text), - _ => {} - } - } - - let mut result = String::new(); - if !system_parts.is_empty() { - result.push_str(&system_parts.join("\n")); - result.push_str("\n\n"); - } - if let Some(last) = user_parts.last() { - result.push_str(last); - } - result.trim().to_string() + // Always build the full conversation — we used to only take the last user + // message which broke multi-turn conversations via the messages array. + build_conversation_with_tools(messages) } /// Extract text content from a message's content field (string or array). @@ -179,18 +211,36 @@ pub(crate) async fn handle_completions( // Store generation parameters for MITM injection { use crate::mitm::store::GenerationParams; + let (response_mime_type, response_schema) = match body.response_format.as_ref() { + Some(rf) => match rf.format_type.as_str() { + "json_object" | "json" => (Some("application/json".to_string()), None), + "json_schema" => { + let schema = rf.json_schema.as_ref().and_then(|js| js.schema.clone()); + (Some("application/json".to_string()), schema) + } + _ => (None, None), + }, + None => (None, None), + }; let gp = GenerationParams { temperature: body.temperature, top_p: body.top_p, top_k: None, // OpenAI doesn't have top_k max_output_tokens: body.max_tokens.or(body.max_completion_tokens), - stop_sequences: None, // TODO: body.stop + stop_sequences: body.stop.clone().map(|s| s.into_vec()), frequency_penalty: body.frequency_penalty, presence_penalty: body.presence_penalty, + reasoning_effort: body.reasoning_effort.clone(), + response_mime_type, + response_schema, + google_search: body.web_search, }; // Only store if at least one param is set if gp.temperature.is_some() || gp.top_p.is_some() || gp.max_output_tokens.is_some() || gp.frequency_penalty.is_some() || gp.presence_penalty.is_some() + || gp.reasoning_effort.is_some() || gp.stop_sequences.is_some() + || gp.response_mime_type.is_some() || gp.response_schema.is_some() + || gp.google_search { state.mitm_store.set_generation_params(gp).await; } else { @@ -216,8 +266,28 @@ pub(crate) async fn handle_completions( ); } - // Fresh cascade per request - let cascade_id = match state.backend.create_cascade().await { + let n = (body.n.max(1)).min(5); // Cap at 5 to prevent abuse + if n > 1 && body.stream { + warn!("n={n} requested with streaming — streaming only supports n=1, ignoring n"); + } + + // Session/conversation: reuse cascade if conversation ID provided + let session_id_str = extract_conversation_id(&body.conversation); + + // Helper to create a cascade (reuses session or creates fresh) + let create_cascade = |state: Arc, session_id: Option| async move { + if let Some(ref sid) = session_id { + state + .sessions + .get_or_create(Some(sid), || state.backend.create_cascade()) + .await + .map(|sr| sr.cascade_id) + } else { + state.backend.create_cascade().await + } + }; + + let cascade_id = match create_cascade(Arc::clone(&state), session_id_str.clone()).await { Ok(cid) => cid, Err(e) => { return err_response( @@ -228,7 +298,7 @@ pub(crate) async fn handle_completions( } }; - // Send message + // Send message on primary cascade state.mitm_store.set_active_cascade(&cascade_id).await; match state .backend @@ -275,7 +345,7 @@ pub(crate) async fn handle_completions( include_usage, ) .await - } else { + } else if n <= 1 { chat_completions_sync( state, completion_id, @@ -284,6 +354,108 @@ pub(crate) async fn handle_completions( body.timeout, ) .await + } else { + // n > 1: fire additional (n-1) parallel cascades + let mut extra_cascade_ids = Vec::with_capacity((n - 1) as usize); + for _ in 1..n { + match state.backend.create_cascade().await { + Ok(cid) => { + // Send the same message on each extra cascade + match state.backend.send_message(&cid, &user_text, model.model_enum).await { + Ok((200, _)) => { + let bg = Arc::clone(&state.backend); + let cid2 = cid.clone(); + tokio::spawn(async move { let _ = bg.update_annotations(&cid2).await; }); + extra_cascade_ids.push(cid); + } + _ => {} // Skip failed cascades + } + } + Err(_) => {} // Skip failed cascade creation + } + } + + // Poll all cascades in parallel + let mut handles = Vec::with_capacity(n as usize); + let all_cascade_ids: Vec = std::iter::once(cascade_id.clone()) + .chain(extra_cascade_ids) + .collect(); + + for cid in &all_cascade_ids { + let st = Arc::clone(&state); + let cid = cid.clone(); + let timeout = body.timeout; + handles.push(tokio::spawn(async move { + let result = poll_for_response(&st, &cid, timeout).await; + let mitm = match st.mitm_store.take_usage(&cid).await { + Some(u) => Some(u), + None => st.mitm_store.take_usage("_latest").await, + }; + (result, mitm) + })); + } + + let mut choices = Vec::with_capacity(n as usize); + let mut total_prompt = 0u64; + let mut total_completion = 0u64; + let mut total_cached = 0u64; + let mut total_thinking = 0u64; + + for (i, handle) in handles.into_iter().enumerate() { + if let Ok((result, mitm)) = handle.await { + let finish_reason = google_to_openai_finish_reason( + mitm.as_ref().and_then(|u| u.stop_reason.as_deref()), + ); + let (pt, ct, cached, thinking) = if let Some(ref mu) = mitm { + (mu.input_tokens, mu.output_tokens, mu.cache_read_input_tokens, mu.thinking_output_tokens) + } else if let Some(u) = &result.usage { + (u.input_tokens, u.output_tokens, 0, 0) + } else { + (0, 0, 0, 0) + }; + total_prompt += pt; + total_completion += ct; + total_cached += cached; + total_thinking += thinking; + + let mut message = serde_json::json!({ + "role": "assistant", + "content": result.text, + }); + if let Some(ref thinking_text) = result.thinking { + message["reasoning_content"] = serde_json::json!(thinking_text); + } + + choices.push(serde_json::json!({ + "index": i, + "message": message, + "logprobs": serde_json::Value::Null, + "finish_reason": finish_reason, + })); + } + } + + Json(serde_json::json!({ + "id": completion_id, + "object": "chat.completion", + "created": now_unix(), + "model": model_name, + "system_fingerprint": system_fingerprint(), + "service_tier": "default", + "choices": choices, + "usage": { + "prompt_tokens": total_prompt, + "completion_tokens": total_completion, + "total_tokens": total_prompt + total_completion, + "prompt_tokens_details": { + "cached_tokens": total_cached, + }, + "completion_tokens_details": { + "reasoning_tokens": total_thinking, + }, + }, + })) + .into_response() } } @@ -307,21 +479,26 @@ async fn chat_completions_stream( state.mitm_store.clear_response_async().await; // Initial role chunk - yield Ok::<_, std::convert::Infallible>(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"role": "assistant", "content": ""}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); + yield Ok::<_, std::convert::Infallible>(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"role": "assistant", "content": ""}), None)]), + None, + ))); let mut keepalive_counter: u64 = 0; let mut last_thinking_len: usize = 0; + // Helper: build usage JSON from MITM tokens + let build_usage = |pt: u64, ct: u64, crt: u64, tt: u64| -> serde_json::Value { + serde_json::json!({ + "prompt_tokens": pt, + "completion_tokens": ct, + "total_tokens": pt + ct, + "prompt_tokens_details": { "cached_tokens": crt }, + "completion_tokens_details": { "reasoning_tokens": tt }, + }) + }; + while start.elapsed().as_secs() < timeout { // ── Check for MITM-captured function calls FIRST ── // This runs independently of LS steps — the MITM captures tool calls @@ -346,49 +523,28 @@ async fn chat_completions_stream( }, })); } - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"tool_calls": tool_calls}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"tool_calls": tool_calls}), None)]), + None, + ))); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {}, - "finish_reason": "tool_calls", - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({}), Some("tool_calls"))]), + None, + ))); if include_usage { let mitm = state.mitm_store.take_usage(&cascade_id).await .or(state.mitm_store.take_usage("_latest").await); let (pt, ct, crt, tt) = if let Some(ref u) = mitm { (u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens) } else { (0, 0, 0, 0) }; - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [], - "usage": { - "prompt_tokens": pt, - "completion_tokens": ct, - "total_tokens": pt + ct, - "prompt_tokens_details": { "cached_tokens": crt }, - "completion_tokens_details": { "reasoning_tokens": tt }, - }, - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([]), + Some(build_usage(pt, ct, crt, tt)), + ))); } yield Ok(Event::default().data("[DONE]")); return; @@ -413,17 +569,11 @@ async fn chat_completions_stream( let delta = &tc[last_thinking_len..]; last_thinking_len = tc.len(); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"reasoning_content": delta}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"reasoning_content": delta}), None)]), + None, + ))); } } @@ -436,17 +586,11 @@ async fn chat_completions_stream( }; if !delta.is_empty() { - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"content": delta}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"content": delta}), None)]), + None, + ))); last_text = text; } } @@ -454,37 +598,24 @@ async fn chat_completions_stream( // Check if MITM response is complete if state.mitm_store.is_response_complete() && !last_text.is_empty() { debug!("Completions: MITM response complete (bypass), text length={}", last_text.len()); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {}, - "finish_reason": "stop", - }], - })).unwrap_or_default())); + // Take usage FIRST so we can read stop_reason for finish_reason + let mitm = state.mitm_store.take_usage(&cascade_id).await + .or(state.mitm_store.take_usage("_latest").await); + let fr = google_to_openai_finish_reason(mitm.as_ref().and_then(|u| u.stop_reason.as_deref())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({}), Some(fr))]), + None, + ))); if include_usage { - let mitm = state.mitm_store.take_usage(&cascade_id).await - .or(state.mitm_store.take_usage("_latest").await); let (pt, ct, crt, tt) = if let Some(ref u) = mitm { (u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens) } else { (0, 0, 0, 0) }; - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [], - "usage": { - "prompt_tokens": pt, - "completion_tokens": ct, - "total_tokens": pt + ct, - "prompt_tokens_details": { "cached_tokens": crt }, - "completion_tokens_details": { "reasoning_tokens": tt }, - }, - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([]), + Some(build_usage(pt, ct, crt, tt)), + ))); } yield Ok(Event::default().data("[DONE]")); return; @@ -514,48 +645,27 @@ async fn chat_completions_stream( }, })); } - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"tool_calls": tool_calls}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {}, - "finish_reason": "tool_calls", - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"tool_calls": tool_calls}), None)]), + None, + ))); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({}), Some("tool_calls"))]), + None, + ))); if include_usage { let mitm = state.mitm_store.take_usage(&cascade_id).await .or(state.mitm_store.take_usage("_latest").await); let (pt, ct, crt, tt) = if let Some(ref u) = mitm { (u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens) } else { (0, 0, 0, 0) }; - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [], - "usage": { - "prompt_tokens": pt, - "completion_tokens": ct, - "total_tokens": pt + ct, - "prompt_tokens_details": { "cached_tokens": crt }, - "completion_tokens_details": { "reasoning_tokens": tt }, - }, - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([]), + Some(build_usage(pt, ct, crt, tt)), + ))); } yield Ok(Event::default().data("[DONE]")); return; @@ -587,17 +697,11 @@ async fn chat_completions_stream( let delta = &tc[last_thinking_len..]; last_thinking_len = tc.len(); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"reasoning_content": delta}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"reasoning_content": delta}), None)]), + None, + ))); } } @@ -611,17 +715,11 @@ async fn chat_completions_stream( }; if !delta.is_empty() { - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"content": delta}, - "finish_reason": serde_json::Value::Null, - }], - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"content": delta}), None)]), + None, + ))); last_text = text.to_string(); } } @@ -629,37 +727,23 @@ async fn chat_completions_stream( // Done check: need DONE status AND non-empty text if is_response_done(steps) && !last_text.is_empty() { debug!("Completions stream done, text length={}", last_text.len()); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {}, - "finish_reason": "stop", - }], - })).unwrap_or_default())); + let mitm = state.mitm_store.take_usage(&cascade_id).await + .or(state.mitm_store.take_usage("_latest").await); + let fr = google_to_openai_finish_reason(mitm.as_ref().and_then(|u| u.stop_reason.as_deref())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({}), Some(fr))]), + None, + ))); if include_usage { - let mitm = state.mitm_store.take_usage(&cascade_id).await - .or(state.mitm_store.take_usage("_latest").await); let (pt, ct, crt, tt) = if let Some(ref u) = mitm { (u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens) } else { (0, 0, 0, 0) }; - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [], - "usage": { - "prompt_tokens": pt, - "completion_tokens": ct, - "total_tokens": pt + ct, - "prompt_tokens_details": { "cached_tokens": crt }, - "completion_tokens_details": { "reasoning_tokens": tt }, - }, - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([]), + Some(build_usage(pt, ct, crt, tt)), + ))); } yield Ok(Event::default().data("[DONE]")); return; @@ -673,37 +757,23 @@ async fn chat_completions_stream( let run_status = td["status"].as_str().unwrap_or(""); if run_status.contains("IDLE") && !last_text.is_empty() { debug!("Completions IDLE, text length={}", last_text.len()); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {}, - "finish_reason": "stop", - }], - })).unwrap_or_default())); + let mitm = state.mitm_store.take_usage(&cascade_id).await + .or(state.mitm_store.take_usage("_latest").await); + let fr = google_to_openai_finish_reason(mitm.as_ref().and_then(|u| u.stop_reason.as_deref())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({}), Some(fr))]), + None, + ))); if include_usage { - let mitm = state.mitm_store.take_usage(&cascade_id).await - .or(state.mitm_store.take_usage("_latest").await); let (pt, ct, crt, tt) = if let Some(ref u) = mitm { (u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens) } else { (0, 0, 0, 0) }; - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [], - "usage": { - "prompt_tokens": pt, - "completion_tokens": ct, - "total_tokens": pt + ct, - "prompt_tokens_details": { "cached_tokens": crt }, - "completion_tokens_details": { "reasoning_tokens": tt }, - }, - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([]), + Some(build_usage(pt, ct, crt, tt)), + ))); } yield Ok(Event::default().data("[DONE]")); return; @@ -728,37 +798,23 @@ async fn chat_completions_stream( // Timeout warn!("Completions stream timeout after {}s", timeout); - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [{ - "index": 0, - "delta": {"content": if last_text.is_empty() { "[Timeout waiting for response]" } else { "" }}, - "finish_reason": "stop", - }], - })).unwrap_or_default())); + let mitm = state.mitm_store.take_usage(&cascade_id).await + .or(state.mitm_store.take_usage("_latest").await); + let fr = google_to_openai_finish_reason(mitm.as_ref().and_then(|u| u.stop_reason.as_deref())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([chunk_choice(0, serde_json::json!({"content": if last_text.is_empty() { "[Timeout waiting for response]" } else { "" }}), Some(fr))]), + None, + ))); if include_usage { - let mitm = state.mitm_store.take_usage(&cascade_id).await - .or(state.mitm_store.take_usage("_latest").await); let (pt, ct, crt, tt) = if let Some(ref u) = mitm { (u.input_tokens, u.output_tokens, u.cache_read_input_tokens, u.thinking_output_tokens) } else { (0, 0, 0, 0) }; - yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ - "id": completion_id, - "object": "chat.completion.chunk", - "created": now_unix(), - "model": model_name, - "choices": [], - "usage": { - "prompt_tokens": pt, - "completion_tokens": ct, - "total_tokens": pt + ct, - "prompt_tokens_details": { "cached_tokens": crt }, - "completion_tokens_details": { "reasoning_tokens": tt }, - }, - })).unwrap_or_default())); + yield Ok(Event::default().data(chunk_json( + &completion_id, &model_name, + serde_json::json!([]), + Some(build_usage(pt, ct, crt, tt)), + ))); } yield Ok(Event::default().data("[DONE]")); }; @@ -789,7 +845,10 @@ async fn chat_completions_sync( Some(u) => Some(u), None => state.mitm_store.take_usage("_latest").await, }; - let (prompt_tokens, completion_tokens, cached_tokens, thinking_tokens) = if let Some(mitm_usage) = mitm { + + let finish_reason = google_to_openai_finish_reason(mitm.as_ref().and_then(|u| u.stop_reason.as_deref())); + + let (prompt_tokens, completion_tokens, cached_tokens, thinking_tokens) = if let Some(ref mitm_usage) = mitm { (mitm_usage.input_tokens, mitm_usage.output_tokens, mitm_usage.cache_read_input_tokens, mitm_usage.thinking_output_tokens) } else if let Some(u) = &result.usage { (u.input_tokens, u.output_tokens, 0, 0) @@ -811,10 +870,13 @@ async fn chat_completions_sync( "object": "chat.completion", "created": now_unix(), "model": model_name, + "system_fingerprint": system_fingerprint(), + "service_tier": "default", "choices": [{ "index": 0, "message": message, - "finish_reason": "stop", + "logprobs": serde_json::Value::Null, + "finish_reason": finish_reason, }], "usage": { "prompt_tokens": prompt_tokens, diff --git a/src/api/gemini.rs b/src/api/gemini.rs index e1e7b09..ba33783 100644 --- a/src/api/gemini.rs +++ b/src/api/gemini.rs @@ -57,6 +57,14 @@ pub(crate) struct GeminiRequest { /// Stop sequences. #[serde(default, alias = "stopSequences")] pub stop_sequences: Option>, + /// Thinking level for Gemini 3 models: "minimal", "low", "medium", "high". + /// Maps directly to thinkingConfig.thinkingLevel. + #[serde(default, alias = "thinkingLevel")] + pub thinking_level: Option, + /// Enable Google Search grounding. See Gemini API docs. + /// When true, injects {"googleSearch": {}} into tools via MITM. + #[serde(default, alias = "googleSearch")] + pub google_search: bool, } fn default_timeout() -> u64 { @@ -130,10 +138,33 @@ pub(crate) async fn handle_gemini( // Extract user text let user_text = match &body.input { serde_json::Value::String(s) => s.clone(), + serde_json::Value::Array(arr) => { + // Support array input: can be strings or {text: "..."} objects + let mut parts: Vec = Vec::new(); + for item in arr { + match item { + serde_json::Value::String(s) => parts.push(s.clone()), + serde_json::Value::Object(obj) => { + if let Some(text) = obj.get("text").and_then(|v| v.as_str()) { + parts.push(text.to_string()); + } + } + _ => {} + } + } + if parts.is_empty() { + return err_response( + StatusCode::BAD_REQUEST, + "Gemini input array contains no text parts".to_string(), + "invalid_request_error", + ); + } + parts.join("\n") + } _ => { return err_response( StatusCode::BAD_REQUEST, - "Gemini endpoint requires input as a string".to_string(), + "Gemini endpoint requires input as a string or array of text parts".to_string(), "invalid_request_error", ); } @@ -176,9 +207,14 @@ pub(crate) async fn handle_gemini( stop_sequences: body.stop_sequences.clone(), frequency_penalty: None, presence_penalty: None, + reasoning_effort: body.thinking_level.clone(), + response_mime_type: None, + response_schema: None, + google_search: body.google_search, }; if gp.temperature.is_some() || gp.top_p.is_some() || gp.top_k.is_some() || gp.max_output_tokens.is_some() || gp.stop_sequences.is_some() + || gp.reasoning_effort.is_some() || gp.google_search { state.mitm_store.set_generation_params(gp).await; } else { @@ -443,6 +479,7 @@ async fn gemini_stream( }) .collect(); + let usage_meta = build_usage_metadata(&state.mitm_store, &cascade_id).await; yield Ok::<_, std::convert::Infallible>(Event::default().data(serde_json::to_string(&serde_json::json!({ "candidates": [{ "content": { @@ -451,6 +488,7 @@ async fn gemini_stream( }, "finishReason": "STOP", }], + "usageMetadata": usage_meta, "modelVersion": model_name, })).unwrap_or_default())); yield Ok(Event::default().data("[DONE]")); @@ -499,7 +537,8 @@ async fn gemini_stream( // Check completion let complete = state.mitm_store.is_response_complete(); if complete && !last_text.is_empty() { - // Final chunk with finishReason + // Final chunk with finishReason + usageMetadata + let usage_meta = build_usage_metadata(&state.mitm_store, &cascade_id).await; yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ "candidates": [{ "content": { @@ -508,6 +547,7 @@ async fn gemini_stream( }, "finishReason": "STOP", }], + "usageMetadata": usage_meta, "modelVersion": model_name, })).unwrap_or_default())); yield Ok(Event::default().data("[DONE]")); @@ -570,6 +610,7 @@ async fn gemini_stream( // Done check if is_response_done(steps) && !last_text.is_empty() { + let usage_meta = build_usage_metadata(&state.mitm_store, &cascade_id).await; yield Ok(Event::default().data(serde_json::to_string(&serde_json::json!({ "candidates": [{ "content": { @@ -578,6 +619,7 @@ async fn gemini_stream( }, "finishReason": "STOP", }], + "usageMetadata": usage_meta, "modelVersion": model_name, })).unwrap_or_default())); yield Ok(Event::default().data("[DONE]")); diff --git a/src/api/responses.rs b/src/api/responses.rs index a1c2ab8..03c6ce7 100644 --- a/src/api/responses.rs +++ b/src/api/responses.rs @@ -142,12 +142,15 @@ fn build_response_object(data: ResponseData, params: &RequestParams) -> Response output: data.output, parallel_tool_calls: true, previous_response_id: params.previous_response_id.clone(), - reasoning: Reasoning::default(), + reasoning: Reasoning { + effort: params.reasoning_effort.clone(), + summary: None, + }, store: params.store, temperature: params.temperature, - text: TextFormat::default(), - tool_choice: "auto", - tools: vec![], + text: params.text_format.clone(), + tool_choice: params.tool_choice.clone(), + tools: params.tools.clone(), top_p: params.top_p, truncation: "disabled", usage: data.usage, @@ -230,6 +233,13 @@ pub(crate) async fn handle_responses( } // Store client tools in MitmStore for MITM injection + // Detect web_search_preview tool (OpenAI spec) → enable Google Search grounding + let has_web_search = body.tools.as_ref().map_or(false, |tools| { + tools.iter().any(|t| { + let t_type = t["type"].as_str().unwrap_or(""); + t_type == "web_search_preview" || t_type == "web_search" + }) + }); if let Some(ref tools) = body.tools { let gemini_tools = openai_tools_to_gemini(tools); if !gemini_tools.is_empty() { @@ -243,6 +253,28 @@ pub(crate) async fn handle_responses( } // Store generation parameters for MITM injection + // Extract text.format for structured output (json_schema) + let (response_mime_type, response_schema, text_format) = if let Some(ref text_val) = body.text { + let fmt_type = text_val["format"]["type"].as_str().unwrap_or("text"); + if fmt_type == "json_schema" { + let name = text_val["format"]["name"].as_str().map(|s| s.to_string()); + let schema = text_val["format"]["schema"].as_object().map(|o| serde_json::Value::Object(o.clone())); + let strict = text_val["format"]["strict"].as_bool(); + let tf = TextFormat { + format: TextFormatInner { + format_type: "json_schema".to_string(), + name: name.clone(), + schema: schema.clone(), + strict, + }, + }; + (Some("application/json".to_string()), schema, tf) + } else { + (None, None, TextFormat::default()) + } + } else { + (None, None, TextFormat::default()) + }; { use crate::mitm::store::GenerationParams; let gp = GenerationParams { @@ -253,8 +285,15 @@ pub(crate) async fn handle_responses( stop_sequences: None, frequency_penalty: None, presence_penalty: None, + reasoning_effort: body.reasoning_effort.clone(), + response_mime_type, + response_schema, + google_search: has_web_search, }; - if gp.temperature.is_some() || gp.top_p.is_some() || gp.max_output_tokens.is_some() { + if gp.temperature.is_some() || gp.top_p.is_some() || gp.max_output_tokens.is_some() + || gp.reasoning_effort.is_some() || gp.response_mime_type.is_some() + || gp.response_schema.is_some() || gp.google_search + { state.mitm_store.set_generation_params(gp).await; } else { state.mitm_store.clear_generation_params().await; @@ -337,6 +376,11 @@ pub(crate) async fn handle_responses( previous_response_id: body.previous_response_id.clone(), user: body.user.clone(), metadata: body.metadata.clone().unwrap_or(serde_json::json!({})), + max_tool_calls: body.max_tool_calls, + reasoning_effort: body.reasoning_effort.clone(), + tool_choice: body.tool_choice.clone().unwrap_or(serde_json::json!("auto")), + tools: body.tools.clone().unwrap_or_default(), + text_format, }; if body.stream { @@ -365,6 +409,11 @@ struct RequestParams { previous_response_id: Option, user: Option, metadata: serde_json::Value, + max_tool_calls: Option, + reasoning_effort: Option, + tool_choice: serde_json::Value, + tools: Vec, + text_format: TextFormat, } /// Build Usage from the best available source, and extract thinking text from MITM: @@ -471,10 +520,15 @@ async fn handle_responses_sync( while start.elapsed().as_secs() < timeout { // Check for function calls let captured = state.mitm_store.take_any_function_calls().await; - if let Some(ref calls) = captured { + if let Some(ref raw_calls) = captured { + let calls: Vec<_> = if let Some(max) = params.max_tool_calls { + raw_calls.iter().take(max as usize).collect() + } else { + raw_calls.iter().collect() + }; if !calls.is_empty() { let mut output_items: Vec = Vec::new(); - for fc in calls { + for fc in &calls { let call_id = format!( "call_{}", uuid::Uuid::new_v4().to_string().replace('-', "")[..24].to_string() @@ -567,6 +621,14 @@ async fn handle_responses_sync( // Check for captured function calls from MITM (clears the active flag) let captured_tool_calls = state.mitm_store.take_any_function_calls().await; + // Enforce max_tool_calls limit + let captured_tool_calls = captured_tool_calls.map(|mut calls| { + if let Some(max) = params.max_tool_calls { + calls.truncate(max as usize); + } + calls + }); + // If we have captured tool calls, return them as function_call output items if let Some(ref calls) = captured_tool_calls { info!( @@ -714,7 +776,12 @@ async fn handle_responses_stream( while start.elapsed().as_secs() < timeout { // Check for function calls first let captured = state.mitm_store.take_any_function_calls().await; - if let Some(ref calls) = captured { + if let Some(ref raw_calls) = captured { + let calls: Vec<_> = if let Some(max) = params.max_tool_calls { + raw_calls.iter().take(max as usize).collect() + } else { + raw_calls.iter().collect() + }; if !calls.is_empty() { let msg_output_index: u32 = if thinking_emitted { 1 } else { 0 }; for (i, fc) in calls.iter().enumerate() { @@ -762,7 +829,7 @@ async fn handle_responses_stream( // Build output for final response let mut output_items: Vec = Vec::new(); - for fc in calls { + for fc in &calls { let call_id = format!( "call_{}", uuid::Uuid::new_v4().to_string().replace('-', "")[..24].to_string() diff --git a/src/api/search.rs b/src/api/search.rs new file mode 100644 index 0000000..73b8a4e --- /dev/null +++ b/src/api/search.rs @@ -0,0 +1,288 @@ +//! Pure search endpoint (/v1/search) — triggers Google Search via the LS. +//! +//! Sends a minimal prompt to the LS with Google Search grounding enabled, +//! captures the grounding metadata from the response, and returns the +//! search results without the model's generated text. +//! +//! The LS triggers the actual search request to Google's servers; +//! we capture the results via MITM, never calling Google directly. + +use axum::{ + extract::State, + http::StatusCode, + response::{IntoResponse, Json}, +}; +use std::sync::Arc; +use tracing::{info, warn}; + +use super::models::{lookup_model, MODELS}; +use super::polling::poll_for_response; +use super::util::err_response; +use super::AppState; + +/// Search request body. +#[derive(serde::Deserialize)] +pub(crate) struct SearchRequest { + /// Search query. + pub query: String, + /// Model to use for grounding. Defaults to gemini-3-flash (cheapest). + #[serde(default = "default_search_model")] + pub model: String, + /// Timeout in seconds. + #[serde(default = "default_search_timeout")] + pub timeout: u64, + /// Conversation/session ID for context reuse. + #[serde(default)] + pub conversation: Option, + /// Max output tokens — keep low since we only want grounding metadata. + #[serde(default = "default_search_max_tokens")] + pub max_output_tokens: u64, +} + +fn default_search_model() -> String { + "gemini-3-flash".to_string() +} + +fn default_search_timeout() -> u64 { + 30 +} + +fn default_search_max_tokens() -> u64 { + 256 +} + +/// GET /v1/search?q=... — quick search via query param. +pub(crate) async fn handle_search_get( + State(state): State>, + axum::extract::Query(params): axum::extract::Query, +) -> axum::response::Response { + info!("GET /v1/search q={}", params.q); + + let body = SearchRequest { + query: params.q, + model: params.model.unwrap_or_else(default_search_model), + timeout: params.timeout.unwrap_or(default_search_timeout()), + conversation: None, + max_output_tokens: params.max_tokens.unwrap_or(default_search_max_tokens()), + }; + + do_search(state, body).await +} + +#[derive(serde::Deserialize)] +pub(crate) struct SearchQueryParams { + pub q: String, + #[serde(default)] + pub model: Option, + #[serde(default)] + pub timeout: Option, + #[serde(default)] + pub max_tokens: Option, +} + +/// POST /v1/search — full search request. +pub(crate) async fn handle_search_post( + State(state): State>, + Json(body): Json, +) -> axum::response::Response { + info!("POST /v1/search q={}", body.query); + do_search(state, body).await +} + +async fn do_search(state: Arc, body: SearchRequest) -> axum::response::Response { + let model = match lookup_model(&body.model) { + Some(m) => m, + None => { + let names: Vec<&str> = MODELS.iter().map(|m| m.name).collect(); + return err_response( + StatusCode::BAD_REQUEST, + format!("Unknown model: {}. Available: {names:?}", body.model), + "invalid_request_error", + ); + } + }; + + let token = state.backend.oauth_token().await; + if token.is_empty() { + return err_response( + StatusCode::UNAUTHORIZED, + "No OAuth token.".into(), + "authentication_error", + ); + } + + // Enable Google Search grounding via GenerationParams + { + use crate::mitm::store::GenerationParams; + let gp = GenerationParams { + max_output_tokens: Some(body.max_output_tokens), + google_search: true, + ..Default::default() + }; + state.mitm_store.set_generation_params(gp).await; + } + + // Clear any stale tools — we only want googleSearch + state.mitm_store.clear_tools().await; + + // Create a prompt that encourages the model to ground its response + let search_prompt = format!( + "Search the web for the following query and provide a brief summary of the results:\n\n{}", + body.query + ); + + // Session management + let session_id_str = body.conversation.clone(); + let cascade_id = if let Some(ref sid) = session_id_str { + match state + .sessions + .get_or_create(Some(sid), || state.backend.create_cascade()) + .await + { + Ok(sr) => sr.cascade_id, + Err(e) => { + return err_response( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Failed to create session: {e}"), + "server_error", + ); + } + } + } else { + match state.backend.create_cascade().await { + Ok(id) => id, + Err(e) => { + return err_response( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Failed to create cascade: {e}"), + "server_error", + ); + } + } + }; + + // Set active cascade for MITM correlation + state.mitm_store.set_active_cascade(&cascade_id).await; + + // Send the search message + if let Err(e) = state + .backend + .send_message(&cascade_id, &search_prompt, model.model_enum) + .await + { + state.mitm_store.clear_active_cascade().await; + state.mitm_store.clear_generation_params().await; + return err_response( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Failed to send search message: {e}"), + "server_error", + ); + } + + // Poll for response + let poll_result = poll_for_response(&state, &cascade_id, body.timeout).await; + + // Extract grounding metadata + let grounding = state.mitm_store.take_grounding().await; + + // The poll result text contains the model's summary (grounded response) + let response_text = if !poll_result.text.is_empty() { + poll_result.text.clone() + } else { + // Fall back to MITM captured text + state.mitm_store.take_response_text().await.unwrap_or_default() + }; + + // Clean up + state.mitm_store.clear_active_cascade().await; + state.mitm_store.clear_generation_params().await; + state.mitm_store.clear_response_async().await; + + // Build the search response + let mut response = serde_json::json!({ + "object": "search_result", + "query": body.query, + "model": model.name, + "summary": response_text, + }); + + // Include grounding metadata if available + if let Some(ref gm) = grounding { + // Extract structured search results + let mut search_results = Vec::new(); + + // groundingChunks → individual web results + if let Some(chunks) = gm.get("groundingChunks").and_then(|v| v.as_array()) { + for chunk in chunks { + if let Some(web) = chunk.get("web") { + search_results.push(serde_json::json!({ + "title": web.get("title").and_then(|v| v.as_str()).unwrap_or(""), + "url": web.get("uri").and_then(|v| v.as_str()).unwrap_or(""), + })); + } + } + } + + // groundingSupports → citations with source references + let mut citations = Vec::new(); + if let Some(supports) = gm.get("groundingSupports").and_then(|v| v.as_array()) { + for support in supports { + let text = support.get("segment") + .and_then(|s| s.get("text")) + .and_then(|v| v.as_str()) + .unwrap_or(""); + let indices: Vec = support.get("groundingChunkIndices") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter().filter_map(|i| i.as_u64()).collect()) + .unwrap_or_default(); + let scores: Vec = support.get("confidenceScores") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter().filter_map(|s| s.as_f64()).collect()) + .unwrap_or_default(); + + citations.push(serde_json::json!({ + "text": text, + "source_indices": indices, + "confidence_scores": scores, + })); + } + } + + // searchEntryPoint → rendered search widget HTML + let search_url = gm.get("searchEntryPoint") + .and_then(|sep| sep.get("renderedContent")) + .and_then(|v| v.as_str()); + + // webSearchQueries → the actual queries Google used + let queries = gm.get("webSearchQueries") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter().filter_map(|q| q.as_str().map(|s| s.to_string())).collect::>()); + + response["results"] = serde_json::json!(search_results); + response["citations"] = serde_json::json!(citations); + if let Some(qs) = queries { + response["search_queries"] = serde_json::json!(qs); + } + if let Some(url) = search_url { + response["search_widget_html"] = serde_json::json!(url); + } + + // Include raw grounding metadata for advanced consumers + response["grounding_metadata"] = gm.clone(); + } else { + response["results"] = serde_json::json!([]); + response["citations"] = serde_json::json!([]); + warn!("Search completed but no grounding metadata captured — model may not have grounded"); + } + + // Include usage if available + if let Some(ref u) = poll_result.usage { + response["usage"] = serde_json::json!({ + "input_tokens": u.input_tokens, + "output_tokens": u.output_tokens, + "total_tokens": u.input_tokens + u.output_tokens, + }); + } + + Json(response).into_response() +} diff --git a/src/api/types.rs b/src/api/types.rs index b54c95b..53f966c 100644 --- a/src/api/types.rs +++ b/src/api/types.rs @@ -38,6 +38,17 @@ pub(crate) struct ResponsesRequest { /// Tool choice: "auto", "required", "none", or {"type":"function","function":{"name":"X"}}. #[serde(default)] pub tool_choice: Option, + /// Reasoning effort — forwarded as thinkingConfig.thinkingLevel to Google. + /// Values: "low", "medium", "high". + #[serde(default)] + pub reasoning_effort: Option, + /// Maximum number of tool calls allowed per response. + #[serde(default)] + pub max_tool_calls: Option, + /// Text output format — {format: {type: "json_schema", name: "...", schema: {...}}}. + /// When json_schema, injects responseMimeType + responseSchema via MITM. + #[serde(default)] + pub text: Option, } /// Stream options for Chat Completions (controls usage emission in final chunk). @@ -85,6 +96,80 @@ pub(crate) struct CompletionRequest { /// Presence penalty — forwarded to Google via MITM. #[serde(default)] pub presence_penalty: Option, + /// Reasoning effort — forwarded as thinkingConfig.thinkingLevel to Google. + /// Values: "low", "medium", "high". + #[serde(default)] + pub reasoning_effort: Option, + /// Stop sequences — forwarded as generationConfig.stopSequences to Google. + /// Up to 4 sequences where the API will stop generating further tokens. + #[serde(default)] + pub stop: Option, + /// Response format — {"type": "json_object"} or {"type": "json_schema", "json_schema": {...}}. + /// Injected as responseMimeType (+ responseSchema) in generationConfig via MITM. + #[serde(default)] + pub response_format: Option, + /// Session/conversation ID for multi-turn reuse (custom extension). + #[serde(default)] + pub conversation: Option, + /// Metadata — accepted and ignored (no upstream equivalent). + #[serde(default)] + pub metadata: Option, + /// Number of completions to generate. Each uses a separate cascade (costs N× quota). + /// Defaults to 1. Only supported in sync mode; streaming always uses n=1. + #[serde(default = "default_n")] + pub n: u32, + /// Enable Google Search grounding. When true, the model can search the web + /// and responses include grounding metadata with search results/citations. + #[serde(default)] + pub web_search: bool, +} + +fn default_n() -> u32 { 1 } + +/// Stop sequence can be a single string or array of strings (OpenAI accepts both). +#[derive(Deserialize, Clone)] +#[serde(untagged)] +pub(crate) enum StopSequence { + Single(String), + Multiple(Vec), +} + +impl StopSequence { + pub fn into_vec(self) -> Vec { + match self { + StopSequence::Single(s) => vec![s], + StopSequence::Multiple(v) => v, + } + } +} + +/// Response format for structured output. +/// Supports: +/// - `{"type": "json_object"}` — JSON mode (responseMimeType only) +/// - `{"type": "json_schema", "json_schema": {"name": "...", "schema": {...}}}` — structured output (responseMimeType + responseSchema) +/// - `{"type": "text"}` — plain text (default, no injection) +#[derive(Deserialize, Clone)] +pub(crate) struct ResponseFormat { + #[serde(rename = "type")] + pub format_type: String, + /// JSON schema definition for structured output. + /// Only used when format_type is "json_schema". + #[serde(default)] + pub json_schema: Option, +} + +/// JSON schema structured output format. +#[derive(Deserialize, Clone)] +pub(crate) struct JsonSchemaFormat { + /// Schema name (for client identification). + #[serde(default)] + pub name: Option, + /// The actual JSON schema object — forwarded as Gemini's responseSchema. + #[serde(default)] + pub schema: Option, + /// Whether to enable strict schema adherence. + #[serde(default)] + pub strict: Option, } #[derive(Deserialize)] @@ -132,7 +217,7 @@ pub(crate) struct ResponsesResponse { pub store: bool, pub temperature: f64, pub text: TextFormat, - pub tool_choice: &'static str, + pub tool_choice: serde_json::Value, pub tools: Vec, pub top_p: f64, pub truncation: &'static str, @@ -180,7 +265,16 @@ pub(crate) struct TextFormat { #[derive(Serialize, Clone)] pub(crate) struct TextFormatInner { #[serde(rename = "type")] - pub format_type: &'static str, + pub format_type: String, + /// JSON schema — present when format_type is "json_schema". + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// The actual schema object. + #[serde(skip_serializing_if = "Option::is_none")] + pub schema: Option, + /// Whether strict mode was requested. + #[serde(skip_serializing_if = "Option::is_none")] + pub strict: Option, } impl Usage { @@ -220,7 +314,10 @@ impl Default for TextFormat { fn default() -> Self { Self { format: TextFormatInner { - format_type: "text", + format_type: "text".to_string(), + name: None, + schema: None, + strict: None, }, } } diff --git a/src/mitm/proxy.rs b/src/mitm/proxy.rs index ed8347d..2fa9aad 100644 --- a/src/mitm/proxy.rs +++ b/src/mitm/proxy.rs @@ -755,7 +755,7 @@ async fn handle_http_over_tls( info!("MITM: stored {} function call(s) from initial body", streaming_acc.function_calls.len()); } - // Capture response + thinking text directly into MitmStore + // Capture response + thinking text + grounding directly into MitmStore if bypass_ls { if !streaming_acc.response_text.is_empty() { store.set_response_text(&streaming_acc.response_text).await; @@ -763,6 +763,9 @@ async fn handle_http_over_tls( if !streaming_acc.thinking_text.is_empty() { store.set_thinking_text(&streaming_acc.thinking_text).await; } + if let Some(ref gm) = streaming_acc.grounding_metadata { + store.set_grounding(gm.clone()).await; + } if streaming_acc.is_complete { store.mark_response_complete(); } @@ -827,7 +830,7 @@ async fn handle_http_over_tls( info!("MITM: stored {} function call(s) from body chunk", streaming_acc.function_calls.len()); } - // Capture response + thinking text directly into MitmStore + // Capture response + thinking text + grounding directly into MitmStore if bypass_ls { if !streaming_acc.response_text.is_empty() { store.set_response_text(&streaming_acc.response_text).await; @@ -835,6 +838,9 @@ async fn handle_http_over_tls( if !streaming_acc.thinking_text.is_empty() { store.set_thinking_text(&streaming_acc.thinking_text).await; } + if let Some(ref gm) = streaming_acc.grounding_metadata { + store.set_grounding(gm.clone()).await; + } if streaming_acc.is_complete { store.mark_response_complete(); } @@ -883,6 +889,10 @@ async fn handle_http_over_tls( // Capture usage data if is_streaming_response { + // Store grounding metadata before consuming the accumulator + if let Some(ref gm) = streaming_acc.grounding_metadata { + store.set_grounding(gm.clone()).await; + } if streaming_acc.is_complete || streaming_acc.output_tokens > 0 { // Function calls are stored immediately when detected (above), // so no need to store them again here.