fix: block ALL LS follow-up requests across connections

Move the in-flight blocking check to the top of the LLM request flow,
BEFORE request modification. This catches follow-ups on ALL connections
(the LS opens multiple parallel TLS connections). Only the very first
modified request reaches Google — all others get fake STOP responses.

Previously, each new connection independently allowed one request
through before blocking, letting 4-5 requests leak per turn.
This commit is contained in:
Nikketryhard
2026-02-16 00:57:33 -06:00
parent a8f3c8915f
commit 3fdd0368a0
23 changed files with 992 additions and 568 deletions

View File

@@ -251,7 +251,10 @@ fn looks_like_valid_message(fields: &[ProtoField], original_len: usize) -> bool
// (e.g., a long string that happened to have a valid first-field prefix)
if fields.len() == 1 && original_len > 100 {
// Single-field messages of >100 bytes are suspicious unless the field is bytes/message
matches!(&fields[0].value, ProtoValue::Bytes(_) | ProtoValue::Message(_))
matches!(
&fields[0].value,
ProtoValue::Bytes(_) | ProtoValue::Message(_)
)
} else {
true
}
@@ -328,7 +331,9 @@ fn try_extract_usage(fields: &[ProtoField]) -> Option<GrpcUsage> {
.iter()
.filter_map(|f| {
if let ProtoValue::Bytes(ref b) = f.value {
std::str::from_utf8(b).ok().map(|s| (f.number, s.to_string()))
std::str::from_utf8(b)
.ok()
.map(|s| (f.number, s.to_string()))
} else {
None
}
@@ -361,14 +366,23 @@ fn try_extract_usage(fields: &[ProtoField]) -> Option<GrpcUsage> {
// Check if there's a model-like string: field 7 (message_id) or field 11 (response_id)
// can contain model names, or model enum values may map to known names.
let has_model_string = string_fields.iter().any(|(_, s)| {
s.contains("claude") || s.contains("gemini") || s.contains("gpt")
|| s.starts_with("models/") || s.contains("sonnet") || s.contains("opus")
|| s.contains("flash") || s.contains("pro")
s.contains("claude")
|| s.contains("gemini")
|| s.contains("gpt")
|| s.starts_with("models/")
|| s.contains("sonnet")
|| s.contains("opus")
|| s.contains("flash")
|| s.contains("pro")
});
// Check for fields at the known ModelUsageStats field numbers
let has_field_2 = fields.iter().any(|f| f.number == 2 && matches!(f.value, ProtoValue::Varint(_)));
let has_field_3 = fields.iter().any(|f| f.number == 3 && matches!(f.value, ProtoValue::Varint(_)));
let has_field_2 = fields
.iter()
.any(|f| f.number == 2 && matches!(f.value, ProtoValue::Varint(_)));
let has_field_3 = fields
.iter()
.any(|f| f.number == 3 && matches!(f.value, ProtoValue::Varint(_)));
// Strong signal: has both input and output token fields
let is_likely_usage = (has_field_2 && has_field_3) || has_model_string;
@@ -392,8 +406,8 @@ fn try_extract_usage(fields: &[ProtoField]) -> Option<GrpcUsage> {
// field 1 = model enum (varint, not string!)
2 => usage.input_tokens = v,
3 => usage.output_tokens = v,
4 => usage.cache_write_tokens = v, // VERIFIED: field 4
5 => usage.cache_read_tokens = v, // VERIFIED: field 5
4 => usage.cache_write_tokens = v, // VERIFIED: field 4
5 => usage.cache_read_tokens = v, // VERIFIED: field 5
// field 6 = api_provider enum (varint)
9 => usage.thinking_output_tokens = v, // VERIFIED: field 9
10 => usage.response_output_tokens = v, // VERIFIED: field 10
@@ -486,11 +500,11 @@ pub fn parse_grpc_response_for_usage(body: &[u8]) -> Option<GrpcUsage> {
fn model_enum_name(enum_val: u64) -> &'static str {
match enum_val {
// Placeholder models (1000 + N)
1007 => "gemini-3-pro", // MODEL_PLACEHOLDER_M7
1008 => "gemini-3-pro-high", // MODEL_PLACEHOLDER_M8
1012 => "claude-opus-4.5", // MODEL_PLACEHOLDER_M12
1018 => "gemini-3-flash", // MODEL_PLACEHOLDER_M18
1026 => "claude-opus-4.6", // MODEL_PLACEHOLDER_M26
1007 => "gemini-3-pro", // MODEL_PLACEHOLDER_M7
1008 => "gemini-3-pro-high", // MODEL_PLACEHOLDER_M8
1012 => "claude-opus-4.5", // MODEL_PLACEHOLDER_M12
1018 => "gemini-3-flash", // MODEL_PLACEHOLDER_M18
1026 => "claude-opus-4.6", // MODEL_PLACEHOLDER_M26
// Claude models (named)
281 => "claude-4-sonnet",
@@ -629,13 +643,13 @@ mod tests {
data.push(v as u8);
}
encode_varint_field(&mut data, 1, 5); // model enum
encode_varint_field(&mut data, 2, 1000); // input_tokens
encode_varint_field(&mut data, 3, 500); // output_tokens
encode_varint_field(&mut data, 4, 100); // cache_write_tokens
encode_varint_field(&mut data, 5, 200); // cache_read_tokens
encode_varint_field(&mut data, 9, 300); // thinking_output_tokens
encode_varint_field(&mut data, 10, 200); // response_output_tokens
encode_varint_field(&mut data, 1, 5); // model enum
encode_varint_field(&mut data, 2, 1000); // input_tokens
encode_varint_field(&mut data, 3, 500); // output_tokens
encode_varint_field(&mut data, 4, 100); // cache_write_tokens
encode_varint_field(&mut data, 5, 200); // cache_read_tokens
encode_varint_field(&mut data, 9, 300); // thinking_output_tokens
encode_varint_field(&mut data, 10, 200); // response_output_tokens
let fields = decode_proto(&data);
let usage = try_extract_usage(&fields).expect("should extract usage");