Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
4f2c206
feat(config): make conversation history limit configurable
Apr 27, 2026
46578d1
feat(memory): thread STM context through gRPC stack and schedule STMO…
Apr 27, 2026
3143e6c
test(memory): add unit tests for STMO boundary and count logic
Apr 27, 2026
5da7776
refactor(grpc): group storage handles into PersistenceHandles struct
Apr 27, 2026
d6d6960
refactor(memory): inline STMO enqueue — remove tokio::spawn
Apr 27, 2026
0ed7b64
fix(memory): count STM turns from assembled conversation history
Apr 27, 2026
f499c27
refactor(grpc): add with_request_context() to ResponsesContext
Apr 27, 2026
4f7f5e4
fix(clippy): suppress too_many_arguments, remove unused with_request_context
Apr 27, 2026
79aca7f
refactor(memory): align stm_enabled with MemoryExecutionState pattern
Apr 27, 2026
455c6c7
fix(memory): use raw DB item count for STMO total_items in conversati…
Apr 27, 2026
09c5213
fix(memory): reject oversized conversations when STMO is active
Apr 28, 2026
f1dec4d
fix(memory): skip STMO when conversation history load fails
Apr 28, 2026
ea0e87e
fix(memory): skip STMO in Harmony path when conversation history not …
Apr 28, 2026
7925e8f
fix(memory): skip raw-count correction when response chain also loaded
Apr 28, 2026
9e7d765
fix(memory): gate conversation_too_large rejection on store=true
Apr 28, 2026
3598d93
fix(memory): skip STMO cap check for MCP streaming path
Apr 28, 2026
5410bb1
refactor(memory): simplify fetch_limit to always use cap+1
Apr 28, 2026
acf8389
refactor(memory): remove unused turn_info param from execute_tool_loop
Apr 28, 2026
ff8c4c2
fix(memory): use cap fetch with >= check instead of cap+1 probe
Apr 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions model_gateway/src/config/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ impl RouterConfigBuilder {
self
}

/// Overrides the maximum number of conversation history items loaded
/// into the request context (serde default is 100; must be > 0, which
/// is enforced by config validation, not here).
pub fn max_conversation_history_items(mut self, max_items: usize) -> Self {
    // Builder-style setter: mutate the owned config, then hand the
    // builder back so calls can keep chaining.
    self.config.max_conversation_history_items = max_items;
    self
}

pub fn worker_startup_timeout_secs(mut self, timeout: u64) -> Self {
self.config.worker_startup_timeout_secs = timeout;
self
Expand Down
14 changes: 12 additions & 2 deletions model_gateway/src/config/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ pub struct RouterConfig {
pub storage_context_headers: HashMap<String, String>,
#[serde(default)]
pub memory_runtime: MemoryRuntimeConfig,
/// Maximum conversation items to load into request context.
#[serde(default = "default_max_conversation_history_items")]
pub max_conversation_history_items: usize,
#[serde(default)]
pub background: BackgroundConfig,
#[serde(default)]
Expand Down Expand Up @@ -219,6 +222,10 @@ fn default_load_monitor_interval_secs() -> u64 {
10
}

/// Serde default for `RouterConfig::max_conversation_history_items`:
/// at most 100 conversation items are loaded when the field is absent.
fn default_max_conversation_history_items() -> usize {
    // Named constant keeps the default discoverable in one place.
    const DEFAULT_ITEMS: usize = 100;
    DEFAULT_ITEMS
}

/// Serde default for the L0 flag: disabled unless explicitly opted in.
fn default_enable_l0() -> bool {
    // Off by default; callers must enable L0 in their config.
    false
}
Expand Down Expand Up @@ -628,8 +635,9 @@ impl Default for RouterConfig {
policy: PolicyConfig::Random,
host: "0.0.0.0".to_string(),
port: 3001,
max_payload_size: 536_870_912, // 512MB
request_timeout_secs: 1800, // 30 minutes
max_payload_size: 536_870_912, // 512MB
request_timeout_secs: 1800, // 30 minutes
max_conversation_history_items: default_max_conversation_history_items(),
worker_startup_timeout_secs: 1800, // 30 minutes for large model loading
worker_startup_check_interval_secs: 30,
load_monitor_interval_secs: 10,
Expand Down Expand Up @@ -766,6 +774,7 @@ mod tests {
assert_eq!(config.port, 3001);
assert_eq!(config.max_payload_size, 536_870_912);
assert_eq!(config.request_timeout_secs, 1800);
assert_eq!(config.max_conversation_history_items, 100);
assert_eq!(config.worker_startup_timeout_secs, 1800);
assert_eq!(config.worker_startup_check_interval_secs, 30);
assert_eq!(config.load_monitor_interval_secs, 10);
Expand Down Expand Up @@ -955,6 +964,7 @@ stream_retention_secs: 3600

assert!(deserialized.skills_enabled);
assert!(deserialized.skills.is_none());
assert_eq!(deserialized.max_conversation_history_items, 100);
assert!(!deserialized.tenant_resolution.trust_tenant_header);
assert_eq!(
deserialized.tenant_resolution.tenant_header_name,
Expand Down
8 changes: 8 additions & 0 deletions model_gateway/src/config/validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,14 @@ impl ConfigValidator {
});
}

if config.max_conversation_history_items == 0 {
return Err(ConfigError::InvalidValue {
field: "max_conversation_history_items".to_string(),
value: config.max_conversation_history_items.to_string(),
reason: "Must be > 0".to_string(),
});
}

if config.queue_size > 0 && config.queue_timeout_secs == 0 {
return Err(ConfigError::InvalidValue {
field: "queue_timeout_secs".to_string(),
Expand Down
17 changes: 16 additions & 1 deletion model_gateway/src/memory/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ pub struct MemoryExecutionContext {
pub subject_id: Option<String>,
pub embedding_model: Option<String>,
pub extraction_model: Option<String>,
pub stm_enabled: MemoryExecutionState,
pub stm_condenser_model_id: Option<String>,
}

impl MemoryExecutionContext {
Expand All @@ -68,6 +70,8 @@ impl MemoryExecutionContext {
}
let store_ltm_requested = policy.allows_ltm_store();
let recall_requested = policy.allows_recall();
let stm_enabled =
MemoryExecutionState::from_requested_and_runtime(headers.stm_enabled, runtime.enabled);

Self {
store_ltm: MemoryExecutionState::from_requested_and_runtime(
Expand All @@ -82,6 +86,11 @@ impl MemoryExecutionContext {
subject_id: headers.subject_id.clone(),
embedding_model: headers.embedding_model.clone(),
extraction_model: headers.extraction_model.clone(),
stm_enabled,
stm_condenser_model_id: stm_enabled
.active()
.then_some(headers.stm_condenser_model_id.clone())
.flatten(),
}
}
}
Expand Down Expand Up @@ -151,6 +160,8 @@ mod tests {
fn store_and_recall_requested_but_not_active_when_runtime_disabled() {
let headers = MemoryHeaderView {
policy: Some("store_and_recall".to_string()),
stm_enabled: true,
stm_condenser_model_id: Some("condense-1".to_string()),
..MemoryHeaderView::default()
};

Expand All @@ -159,6 +170,8 @@ mod tests {
assert_eq!(ctx.store_ltm, MemoryExecutionState::GatedOff);
assert_eq!(ctx.recall, MemoryExecutionState::GatedOff);
assert_eq!(ctx.policy_mode, MemoryPolicyMode::StoreAndRecall);
assert_eq!(ctx.stm_enabled, MemoryExecutionState::GatedOff);
assert_eq!(ctx.stm_condenser_model_id, None);
}

#[test]
Expand All @@ -167,7 +180,7 @@ mod tests {
headers.insert(
"x-conversation-memory-config",
HeaderValue::from_static(
r#"{"long_term_memory":{"enabled":true,"policy":"store_and_recall","subject_id":" subject_abc ","embedding_model_id":" text-embedding-3-small ","extraction_model_id":" gpt-4.1-mini "}}"#,
r#"{"long_term_memory":{"enabled":true,"policy":"store_and_recall","subject_id":" subject_abc ","embedding_model_id":" text-embedding-3-small ","extraction_model_id":" gpt-4.1-mini "},"short_term_memory":{"enabled":true,"condenser_model_id":" condense-1 "}}"#,
),
);

Expand All @@ -182,5 +195,7 @@ mod tests {
Some("text-embedding-3-small")
);
assert_eq!(ctx.extraction_model.as_deref(), Some("gpt-4.1-mini"));
assert_eq!(ctx.stm_enabled, MemoryExecutionState::Active);
assert_eq!(ctx.stm_condenser_model_id.as_deref(), Some("condense-1"));
}
}
13 changes: 13 additions & 0 deletions model_gateway/src/routers/common/header_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub struct MemoryHeaderView {
pub subject_id: Option<String>,
pub embedding_model: Option<String>,
pub extraction_model: Option<String>,
pub stm_enabled: bool,
pub stm_condenser_model_id: Option<String>,
}

impl MemoryHeaderView {
Expand All @@ -35,6 +37,7 @@ impl MemoryHeaderView {
return Self::default();
};
let ltm_enabled = config.long_term_memory.enabled;
let stm_enabled = config.short_term_memory.enabled;
let policy = if ltm_enabled {
config
.long_term_memory
Expand All @@ -54,6 +57,10 @@ impl MemoryHeaderView {
extraction_model: ltm_enabled
.then_some(config.long_term_memory.extraction_model_id)
.flatten(),
stm_enabled,
stm_condenser_model_id: stm_enabled
.then_some(config.short_term_memory.condenser_model_id)
.flatten(),
}
}
}
Expand Down Expand Up @@ -543,6 +550,8 @@ mod tests {
assert_eq!(view.subject_id, None);
assert_eq!(view.embedding_model, None);
assert_eq!(view.extraction_model, None);
assert!(!view.stm_enabled);
assert_eq!(view.stm_condenser_model_id, None);
}

#[test]
Expand All @@ -561,6 +570,8 @@ mod tests {
assert_eq!(view.subject_id, None);
assert_eq!(view.embedding_model, None);
assert_eq!(view.extraction_model, None);
assert!(view.stm_enabled);
assert_eq!(view.stm_condenser_model_id.as_deref(), Some("cond-1"));
}

#[test]
Expand All @@ -582,6 +593,8 @@ mod tests {
Some("text-embedding-3-small")
);
assert_eq!(view.extraction_model.as_deref(), Some("gpt-4.1-mini"));
assert!(!view.stm_enabled);
assert_eq!(view.stm_condenser_model_id, None);
}

#[test]
Expand Down
Loading
Loading