fix: block ALL LS follow-up requests across connections

Move the in-flight blocking check to the top of the LLM request flow,
BEFORE request modification. This catches follow-ups on ALL connections
(the LS opens multiple parallel TLS connections). Only the very first
modified request reaches Google — all others get fake STOP responses.

Previously, each new connection independently allowed one request
through before blocking, letting 4-5 requests leak per turn.
This commit is contained in:
Nikketryhard
2026-02-16 00:57:33 -06:00
parent a8f3c8915f
commit 3fdd0368a0
23 changed files with 992 additions and 568 deletions

View File

@@ -78,15 +78,21 @@ impl StreamingAccumulator {
Self::default()
}
/// Process a single SSE event.
/// Process a single SSE event.
pub fn process_event(&mut self, event: &Value) {
// ── Google format: {"response": {"usageMetadata": {...}, "modelVersion": "..."}} ──
if let Some(response) = event.get("response") {
// Extract usage metadata (each event has cumulative counts)
if let Some(usage) = response.get("usageMetadata") {
self.input_tokens = usage["promptTokenCount"].as_u64().unwrap_or(self.input_tokens);
self.output_tokens = usage["candidatesTokenCount"].as_u64().unwrap_or(self.output_tokens);
self.thinking_tokens = usage["thoughtsTokenCount"].as_u64().unwrap_or(self.thinking_tokens);
self.input_tokens = usage["promptTokenCount"]
.as_u64()
.unwrap_or(self.input_tokens);
self.output_tokens = usage["candidatesTokenCount"]
.as_u64()
.unwrap_or(self.output_tokens);
self.thinking_tokens = usage["thoughtsTokenCount"]
.as_u64()
.unwrap_or(self.thinking_tokens);
}
if let Some(model) = response["modelVersion"].as_str() {
self.model = Some(model.to_string());
@@ -170,8 +176,10 @@ impl StreamingAccumulator {
"message_start" => {
if let Some(usage) = event.get("message").and_then(|m| m.get("usage")) {
self.input_tokens = usage["input_tokens"].as_u64().unwrap_or(0);
self.cache_creation_input_tokens = usage["cache_creation_input_tokens"].as_u64().unwrap_or(0);
self.cache_read_input_tokens = usage["cache_read_input_tokens"].as_u64().unwrap_or(0);
self.cache_creation_input_tokens =
usage["cache_creation_input_tokens"].as_u64().unwrap_or(0);
self.cache_read_input_tokens =
usage["cache_read_input_tokens"].as_u64().unwrap_or(0);
}
if let Some(model) = event.get("message").and_then(|m| m["model"].as_str()) {
self.model = Some(model.to_string());
@@ -181,7 +189,9 @@ impl StreamingAccumulator {
}
"message_delta" => {
if let Some(usage) = event.get("usage") {
self.output_tokens = usage["output_tokens"].as_u64().unwrap_or(self.output_tokens);
self.output_tokens = usage["output_tokens"]
.as_u64()
.unwrap_or(self.output_tokens);
}
if let Some(reason) = event["delta"]["stop_reason"].as_str() {
self.stop_reason = Some(reason.to_string());
@@ -235,7 +245,10 @@ impl StreamingAccumulator {
response_output_tokens: 0,
model: self.model,
stop_reason: self.stop_reason,
api_provider: self.api_provider.unwrap_or_else(|| "unknown".to_string()).into(),
api_provider: self
.api_provider
.unwrap_or_else(|| "unknown".to_string())
.into(),
grpc_method: None,
captured_at: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)