fix: block ALL LS follow-up requests across connections
Move the in-flight blocking check to the top of the LLM request flow, BEFORE request modification, so that it catches follow-ups on ALL connections (the LS opens multiple parallel TLS connections). Only the very first modified request reaches Google; all others get fake STOP responses. Previously, each new connection independently allowed one request through before blocking, which let 4–5 requests leak per turn.
This commit is contained in:
@@ -78,15 +78,21 @@ impl StreamingAccumulator {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Process a single SSE event.
|
||||
/// Process a single SSE event.
|
||||
pub fn process_event(&mut self, event: &Value) {
|
||||
// ── Google format: {"response": {"usageMetadata": {...}, "modelVersion": "..."}} ──
|
||||
if let Some(response) = event.get("response") {
|
||||
// Extract usage metadata (each event has cumulative counts)
|
||||
if let Some(usage) = response.get("usageMetadata") {
|
||||
self.input_tokens = usage["promptTokenCount"].as_u64().unwrap_or(self.input_tokens);
|
||||
self.output_tokens = usage["candidatesTokenCount"].as_u64().unwrap_or(self.output_tokens);
|
||||
self.thinking_tokens = usage["thoughtsTokenCount"].as_u64().unwrap_or(self.thinking_tokens);
|
||||
self.input_tokens = usage["promptTokenCount"]
|
||||
.as_u64()
|
||||
.unwrap_or(self.input_tokens);
|
||||
self.output_tokens = usage["candidatesTokenCount"]
|
||||
.as_u64()
|
||||
.unwrap_or(self.output_tokens);
|
||||
self.thinking_tokens = usage["thoughtsTokenCount"]
|
||||
.as_u64()
|
||||
.unwrap_or(self.thinking_tokens);
|
||||
}
|
||||
if let Some(model) = response["modelVersion"].as_str() {
|
||||
self.model = Some(model.to_string());
|
||||
@@ -170,8 +176,10 @@ impl StreamingAccumulator {
|
||||
"message_start" => {
|
||||
if let Some(usage) = event.get("message").and_then(|m| m.get("usage")) {
|
||||
self.input_tokens = usage["input_tokens"].as_u64().unwrap_or(0);
|
||||
self.cache_creation_input_tokens = usage["cache_creation_input_tokens"].as_u64().unwrap_or(0);
|
||||
self.cache_read_input_tokens = usage["cache_read_input_tokens"].as_u64().unwrap_or(0);
|
||||
self.cache_creation_input_tokens =
|
||||
usage["cache_creation_input_tokens"].as_u64().unwrap_or(0);
|
||||
self.cache_read_input_tokens =
|
||||
usage["cache_read_input_tokens"].as_u64().unwrap_or(0);
|
||||
}
|
||||
if let Some(model) = event.get("message").and_then(|m| m["model"].as_str()) {
|
||||
self.model = Some(model.to_string());
|
||||
@@ -181,7 +189,9 @@ impl StreamingAccumulator {
|
||||
}
|
||||
"message_delta" => {
|
||||
if let Some(usage) = event.get("usage") {
|
||||
self.output_tokens = usage["output_tokens"].as_u64().unwrap_or(self.output_tokens);
|
||||
self.output_tokens = usage["output_tokens"]
|
||||
.as_u64()
|
||||
.unwrap_or(self.output_tokens);
|
||||
}
|
||||
if let Some(reason) = event["delta"]["stop_reason"].as_str() {
|
||||
self.stop_reason = Some(reason.to_string());
|
||||
@@ -235,7 +245,10 @@ impl StreamingAccumulator {
|
||||
response_output_tokens: 0,
|
||||
model: self.model,
|
||||
stop_reason: self.stop_reason,
|
||||
api_provider: self.api_provider.unwrap_or_else(|| "unknown".to_string()).into(),
|
||||
api_provider: self
|
||||
.api_provider
|
||||
.unwrap_or_else(|| "unknown".to_string())
|
||||
.into(),
|
||||
grpc_method: None,
|
||||
captured_at: std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
|
||||
Reference in New Issue
Block a user