Skip to content

Commit 011e491

Browse files
test(stack): add failure cascade tests for executor, health poller, orchestrator
14 new unit tests covering production failure scenarios: Executor (6 tests): - stop_failure_still_attempts_remove: stop fails → remove still called - remove_failure_still_updates_state_to_stopped: state transitions despite errors - stop_and_remove_both_fail_still_updates_state: double failure → clean state - ports_released_on_remove_even_when_stop_fails: no port leaks on failure - ports_released_when_create_fails_on_retry: port reallocation after create failure - replica_scale_down_removes_excess_replicas: reconciler + executor scale-down Health poller (4 tests): - multiple_services_timeout_simultaneously: parallel timeouts don't block - timeouts_count_toward_retry_exhaustion: timeouts exhaust retries like failures - exec_error_counted_as_failure_toward_exhaustion: exec errors count as failures - service_recovers_after_retry_exhaustion: service can become healthy again Orchestrator (4 tests): - max_rounds_exhaustion_reports_pending_services: no transient state on exit - max_rounds_exhaustion_with_create_failure_leaves_failed_state: terminal state - orchestrator_resumes_cleanly_after_max_rounds: second run converges - concurrent_health_check_failures_dont_block_convergence: multi-service recovery Also adds fail_remove flag to MockContainerRuntime and runtime_mut/ports_mut test accessors to StackExecutor. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 440f5c0 commit 011e491

3 files changed

Lines changed: 608 additions & 0 deletions

File tree

crates/vz-stack/src/executor.rs

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,11 +445,23 @@ impl<R: ContainerRuntime> StackExecutor<R> {
445445
&self.ports
446446
}
447447

448+
/// Mutable access to the port tracker (for test reallocation checks).
449+
#[cfg(test)]
450+
pub fn ports_mut(&mut self) -> &mut PortTracker {
451+
&mut self.ports
452+
}
453+
448454
/// Access the underlying container runtime.
449455
pub fn runtime(&self) -> &R {
450456
&self.runtime
451457
}
452458

459+
/// Mutable access to the underlying container runtime (for test failure injection).
460+
#[cfg(test)]
461+
pub fn runtime_mut(&mut self) -> &mut R {
462+
&mut self.runtime
463+
}
464+
453465
/// Execute a batch of reconciler actions for the given stack spec.
454466
///
455467
/// Services at the same topological level (no dependency edges
@@ -1307,6 +1319,8 @@ pub(crate) mod tests_support {
13071319
pub fail_create: bool,
13081320
/// Whether stop should fail.
13091321
pub fail_stop: bool,
1322+
/// Whether remove should fail.
1323+
pub fail_remove: bool,
13101324
/// Exit code to return from exec calls.
13111325
pub exec_exit_code: i32,
13121326
/// Whether exec should fail with an error (not just non-zero exit).
@@ -1337,6 +1351,7 @@ pub(crate) mod tests_support {
13371351
fail_pull: false,
13381352
fail_create: false,
13391353
fail_stop: false,
1354+
fail_remove: false,
13401355
exec_exit_code: 0,
13411356
fail_exec: false,
13421357
exec_delay: None,
@@ -1431,6 +1446,9 @@ pub(crate) mod tests_support {
14311446
.lock()
14321447
.unwrap()
14331448
.push(("remove".to_string(), container_id.to_string()));
1449+
if self.fail_remove {
1450+
return Err(StackError::InvalidSpec("mock remove failure".to_string()));
1451+
}
14341452
Ok(())
14351453
}
14361454

@@ -1579,6 +1597,7 @@ mod tests {
15791597

15801598
use super::tests_support::MockContainerRuntime;
15811599
use super::*;
1600+
use crate::reconcile::apply;
15821601
use crate::spec::MountSpec as StackMountSpec;
15831602
use crate::spec::{PortSpec, ResourcesSpec, ServiceKind, StackSpec, VolumeSpec};
15841603
use std::collections::HashMap;
@@ -3234,4 +3253,248 @@ mod tests {
32343253
// Ensure Debug is derived.
32353254
let _debug = format!("{:?}", cloned);
32363255
}
3256+
3257+
// ── Stop/remove failure cascade tests ──
3258+
3259+
#[test]
3260+
fn stop_failure_still_attempts_remove() {
3261+
let mut runtime = MockContainerRuntime::new();
3262+
runtime.fail_stop = true;
3263+
let mut executor = make_executor(runtime);
3264+
let spec = stack("myapp", vec![]);
3265+
3266+
// Simulate existing running container.
3267+
executor
3268+
.store()
3269+
.save_observed_state(
3270+
"myapp",
3271+
&ServiceObservedState {
3272+
service_name: "web".to_string(),
3273+
phase: ServicePhase::Running,
3274+
container_id: Some("ctr-web".to_string()),
3275+
last_error: None,
3276+
ready: false,
3277+
},
3278+
)
3279+
.unwrap();
3280+
3281+
let actions = vec![Action::ServiceRemove {
3282+
service_name: "web".to_string(),
3283+
}];
3284+
3285+
let result = executor.execute(&spec, &actions).unwrap();
3286+
assert!(result.all_succeeded(), "remove should succeed despite stop failure");
3287+
3288+
// Verify both stop AND remove were attempted.
3289+
let calls = executor.runtime().call_log();
3290+
assert!(calls.iter().any(|(op, _)| op == "stop"), "stop should be attempted");
3291+
assert!(
3292+
calls.iter().any(|(op, _)| op == "remove"),
3293+
"remove should still be called after stop failure"
3294+
);
3295+
3296+
// State should be Stopped (not stuck in Running).
3297+
let observed = executor.store().load_observed_state("myapp").unwrap();
3298+
let web = observed.iter().find(|o| o.service_name == "web").unwrap();
3299+
assert_eq!(web.phase, ServicePhase::Stopped);
3300+
assert!(web.container_id.is_none());
3301+
}
3302+
3303+
#[test]
3304+
fn remove_failure_still_updates_state_to_stopped() {
3305+
let mut runtime = MockContainerRuntime::new();
3306+
runtime.fail_remove = true;
3307+
let mut executor = make_executor(runtime);
3308+
let spec = stack("myapp", vec![]);
3309+
3310+
// Simulate existing running container.
3311+
executor
3312+
.store()
3313+
.save_observed_state(
3314+
"myapp",
3315+
&ServiceObservedState {
3316+
service_name: "web".to_string(),
3317+
phase: ServicePhase::Running,
3318+
container_id: Some("ctr-web".to_string()),
3319+
last_error: None,
3320+
ready: false,
3321+
},
3322+
)
3323+
.unwrap();
3324+
3325+
let actions = vec![Action::ServiceRemove {
3326+
service_name: "web".to_string(),
3327+
}];
3328+
3329+
let result = executor.execute(&spec, &actions).unwrap();
3330+
assert!(result.all_succeeded(), "remove should succeed even when runtime remove fails");
3331+
3332+
// State should be Stopped (not stuck in Running).
3333+
let observed = executor.store().load_observed_state("myapp").unwrap();
3334+
let web = observed.iter().find(|o| o.service_name == "web").unwrap();
3335+
assert_eq!(web.phase, ServicePhase::Stopped);
3336+
assert!(web.container_id.is_none());
3337+
}
3338+
3339+
#[test]
3340+
fn stop_and_remove_both_fail_still_updates_state() {
3341+
let mut runtime = MockContainerRuntime::new();
3342+
runtime.fail_stop = true;
3343+
runtime.fail_remove = true;
3344+
let mut executor = make_executor(runtime);
3345+
let spec = stack("myapp", vec![]);
3346+
3347+
executor
3348+
.store()
3349+
.save_observed_state(
3350+
"myapp",
3351+
&ServiceObservedState {
3352+
service_name: "web".to_string(),
3353+
phase: ServicePhase::Running,
3354+
container_id: Some("ctr-web".to_string()),
3355+
last_error: None,
3356+
ready: false,
3357+
},
3358+
)
3359+
.unwrap();
3360+
3361+
let actions = vec![Action::ServiceRemove {
3362+
service_name: "web".to_string(),
3363+
}];
3364+
3365+
let result = executor.execute(&spec, &actions).unwrap();
3366+
// Executor marks result as succeeded because state is updated
3367+
// regardless of stop/remove runtime errors (best-effort cleanup).
3368+
assert!(result.all_succeeded());
3369+
3370+
let observed = executor.store().load_observed_state("myapp").unwrap();
3371+
let web = observed.iter().find(|o| o.service_name == "web").unwrap();
3372+
assert_eq!(web.phase, ServicePhase::Stopped);
3373+
}
3374+
3375+
#[test]
3376+
fn ports_released_on_remove_even_when_stop_fails() {
3377+
let mut runtime = MockContainerRuntime::new();
3378+
runtime.fail_stop = true;
3379+
let mut executor = make_executor(runtime);
3380+
3381+
let mut web = svc("web", "nginx:latest");
3382+
web.ports = vec![PortSpec {
3383+
protocol: "tcp".to_string(),
3384+
container_port: 80,
3385+
host_port: Some(8080),
3386+
}];
3387+
let spec = stack("myapp", vec![web.clone()]);
3388+
3389+
// Create the service first.
3390+
let actions = vec![Action::ServiceCreate {
3391+
service_name: "web".to_string(),
3392+
}];
3393+
let result = executor.execute(&spec, &actions).unwrap();
3394+
assert!(result.all_succeeded());
3395+
assert!(executor.ports().in_use().contains(&8080));
3396+
3397+
// Now remove — stop will fail but ports should still be released.
3398+
let remove_spec = stack("myapp", vec![]);
3399+
let remove_actions = vec![Action::ServiceRemove {
3400+
service_name: "web".to_string(),
3401+
}];
3402+
let result = executor.execute(&remove_spec, &remove_actions).unwrap();
3403+
assert!(result.all_succeeded());
3404+
assert!(
3405+
!executor.ports().in_use().contains(&8080),
3406+
"port 8080 should be released even when stop fails"
3407+
);
3408+
}
3409+
3410+
#[test]
3411+
fn ports_released_when_create_fails_on_retry() {
3412+
let mut runtime = MockContainerRuntime::new();
3413+
runtime.fail_create = true;
3414+
let mut executor = make_executor(runtime);
3415+
3416+
let mut web = svc("web", "nginx:latest");
3417+
web.ports = vec![PortSpec {
3418+
protocol: "tcp".to_string(),
3419+
container_port: 80,
3420+
host_port: Some(8080),
3421+
}];
3422+
let spec = stack("myapp", vec![web.clone()]);
3423+
3424+
// Create fails — ports were allocated during prepare_create but
3425+
// service is marked Failed. Verify port state is usable for retry.
3426+
let actions = vec![Action::ServiceCreate {
3427+
service_name: "web".to_string(),
3428+
}];
3429+
let result = executor.execute(&spec, &actions).unwrap();
3430+
assert_eq!(result.failed, 1);
3431+
3432+
// Port should still be allocated (not released) because the service
3433+
// will be retried — release only happens on ServiceRemove.
3434+
// But crucially, a second create attempt should not conflict.
3435+
let mut retry_runtime = MockContainerRuntime::new();
3436+
retry_runtime.fail_create = false;
3437+
// We can't swap the runtime, but we can verify port tracker state
3438+
// allows reallocation for the same service.
3439+
let reallocated = executor.ports_mut().allocate("web", &web.ports);
3440+
assert!(
3441+
reallocated.is_ok(),
3442+
"same service should be able to reallocate its ports on retry: {:?}",
3443+
reallocated.err()
3444+
);
3445+
}
3446+
3447+
// ── Partial replica scale-down failure tests ──
3448+
3449+
#[test]
3450+
fn replica_scale_down_removes_excess_replicas() {
3451+
let runtime = MockContainerRuntime::new();
3452+
let mut executor = make_executor(runtime);
3453+
let spec_name = "replica-sd";
3454+
3455+
// Simulate 3 running replicas.
3456+
for (name, cid) in [("web", "ctr-web"), ("web-2", "ctr-web-2"), ("web-3", "ctr-web-3")] {
3457+
executor
3458+
.store()
3459+
.save_observed_state(
3460+
spec_name,
3461+
&ServiceObservedState {
3462+
service_name: name.to_string(),
3463+
phase: ServicePhase::Running,
3464+
container_id: Some(cid.to_string()),
3465+
last_error: None,
3466+
ready: false,
3467+
},
3468+
)
3469+
.unwrap();
3470+
}
3471+
3472+
// Scale down to 1 replica.
3473+
let mut web = svc("web", "nginx:latest");
3474+
web.resources.replicas = 1;
3475+
let spec = stack(spec_name, vec![web]);
3476+
3477+
let health = HashMap::new();
3478+
let reconcile = apply(&spec, executor.store(), &health).unwrap();
3479+
3480+
// Should generate 2 remove actions (for web-2 and web-3).
3481+
let remove_count = reconcile
3482+
.actions
3483+
.iter()
3484+
.filter(|a| matches!(a, Action::ServiceRemove { .. }))
3485+
.count();
3486+
assert_eq!(remove_count, 2, "should remove 2 excess replicas");
3487+
3488+
let result = executor.execute(&spec, &reconcile.actions).unwrap();
3489+
assert_eq!(result.failed, 0, "all removals should succeed");
3490+
3491+
// Only web (base replica) should remain running.
3492+
let observed = executor.store().load_observed_state(spec_name).unwrap();
3493+
let running: Vec<&str> = observed
3494+
.iter()
3495+
.filter(|o| matches!(o.phase, ServicePhase::Running))
3496+
.map(|o| o.service_name.as_str())
3497+
.collect();
3498+
assert_eq!(running, vec!["web"]);
3499+
}
32373500
}

0 commit comments

Comments
 (0)