@@ -445,11 +445,23 @@ impl<R: ContainerRuntime> StackExecutor<R> {
445445 & self . ports
446446 }
447447
/// Returns an exclusive borrow of the executor's [`PortTracker`].
///
/// Only compiled under `cfg(test)`: tests use it to drive the tracker
/// directly, e.g. to check that a service can reallocate its own ports.
#[cfg(test)]
pub fn ports_mut(&mut self) -> &mut PortTracker {
    &mut self.ports
}
453+
448454 /// Access the underlying container runtime.
449455 pub fn runtime ( & self ) -> & R {
450456 & self . runtime
451457 }
452458
/// Returns an exclusive borrow of the container runtime backing this executor.
///
/// Only compiled under `cfg(test)`: intended for tests that need to
/// reconfigure the runtime (failure injection) after the executor is built.
#[cfg(test)]
pub fn runtime_mut(&mut self) -> &mut R {
    &mut self.runtime
}
464+
453465 /// Execute a batch of reconciler actions for the given stack spec.
454466 ///
455467 /// Services at the same topological level (no dependency edges
@@ -1307,6 +1319,8 @@ pub(crate) mod tests_support {
13071319 pub fail_create : bool ,
13081320 /// Whether stop should fail.
13091321 pub fail_stop : bool ,
1322+ /// Whether remove should fail.
1323+ pub fail_remove : bool ,
13101324 /// Exit code to return from exec calls.
13111325 pub exec_exit_code : i32 ,
13121326 /// Whether exec should fail with an error (not just non-zero exit).
@@ -1337,6 +1351,7 @@ pub(crate) mod tests_support {
13371351 fail_pull : false ,
13381352 fail_create : false ,
13391353 fail_stop : false ,
1354+ fail_remove : false ,
13401355 exec_exit_code : 0 ,
13411356 fail_exec : false ,
13421357 exec_delay : None ,
@@ -1431,6 +1446,9 @@ pub(crate) mod tests_support {
14311446 . lock ( )
14321447 . unwrap ( )
14331448 . push ( ( "remove" . to_string ( ) , container_id. to_string ( ) ) ) ;
1449+ if self . fail_remove {
1450+ return Err ( StackError :: InvalidSpec ( "mock remove failure" . to_string ( ) ) ) ;
1451+ }
14341452 Ok ( ( ) )
14351453 }
14361454
@@ -1579,6 +1597,7 @@ mod tests {
15791597
15801598 use super :: tests_support:: MockContainerRuntime ;
15811599 use super :: * ;
1600+ use crate :: reconcile:: apply;
15821601 use crate :: spec:: MountSpec as StackMountSpec ;
15831602 use crate :: spec:: { PortSpec , ResourcesSpec , ServiceKind , StackSpec , VolumeSpec } ;
15841603 use std:: collections:: HashMap ;
@@ -3234,4 +3253,248 @@ mod tests {
32343253 // Ensure Debug is derived.
32353254 let _debug = format ! ( "{:?}" , cloned) ;
32363255 }
3256+
3257+ // ── Stop/remove failure cascade tests ──
3258+
/// A failing `stop` must not prevent the subsequent `remove`, and the
/// service must end up recorded as `Stopped` with no container id.
#[test]
fn stop_failure_still_attempts_remove() {
    let mut failing_runtime = MockContainerRuntime::new();
    failing_runtime.fail_stop = true;
    let mut executor = make_executor(failing_runtime);
    let spec = stack("myapp", vec![]);

    // Seed the store so the executor sees an existing running container.
    let running_web = ServiceObservedState {
        service_name: "web".to_string(),
        phase: ServicePhase::Running,
        container_id: Some("ctr-web".to_string()),
        last_error: None,
        ready: false,
    };
    executor
        .store()
        .save_observed_state("myapp", &running_web)
        .unwrap();

    let actions = vec![Action::ServiceRemove {
        service_name: "web".to_string(),
    }];

    let result = executor.execute(&spec, &actions).unwrap();
    assert!(
        result.all_succeeded(),
        "remove should succeed despite stop failure"
    );

    // Both runtime operations must show up in the mock's call log.
    let calls = executor.runtime().call_log();
    let stop_attempted = calls.iter().any(|(op, _)| op == "stop");
    let remove_attempted = calls.iter().any(|(op, _)| op == "remove");
    assert!(stop_attempted, "stop should be attempted");
    assert!(
        remove_attempted,
        "remove should still be called after stop failure"
    );

    // The persisted state must have left Running behind.
    let observed = executor.store().load_observed_state("myapp").unwrap();
    let web_state = observed
        .iter()
        .find(|entry| entry.service_name == "web")
        .unwrap();
    assert_eq!(web_state.phase, ServicePhase::Stopped);
    assert!(web_state.container_id.is_none());
}
3302+
/// A runtime `remove` error is treated as best-effort: the action still
/// succeeds and the service is persisted as `Stopped` with no container id.
#[test]
fn remove_failure_still_updates_state_to_stopped() {
    let mut failing_runtime = MockContainerRuntime::new();
    failing_runtime.fail_remove = true;
    let mut executor = make_executor(failing_runtime);
    let spec = stack("myapp", vec![]);

    // Seed the store so the executor sees an existing running container.
    let running_web = ServiceObservedState {
        service_name: "web".to_string(),
        phase: ServicePhase::Running,
        container_id: Some("ctr-web".to_string()),
        last_error: None,
        ready: false,
    };
    executor
        .store()
        .save_observed_state("myapp", &running_web)
        .unwrap();

    let actions = vec![Action::ServiceRemove {
        service_name: "web".to_string(),
    }];

    let result = executor.execute(&spec, &actions).unwrap();
    assert!(
        result.all_succeeded(),
        "remove should succeed even when runtime remove fails"
    );

    // The persisted state must have left Running behind.
    let observed = executor.store().load_observed_state("myapp").unwrap();
    let web_state = observed
        .iter()
        .find(|entry| entry.service_name == "web")
        .unwrap();
    assert_eq!(web_state.phase, ServicePhase::Stopped);
    assert!(web_state.container_id.is_none());
}
3338+
/// Even when both `stop` and `remove` fail in the runtime, the remove
/// action is reported as succeeded and the service is persisted as `Stopped`.
#[test]
fn stop_and_remove_both_fail_still_updates_state() {
    let mut failing_runtime = MockContainerRuntime::new();
    failing_runtime.fail_stop = true;
    failing_runtime.fail_remove = true;
    let mut executor = make_executor(failing_runtime);
    let spec = stack("myapp", vec![]);

    // Seed the store so the executor sees an existing running container.
    let running_web = ServiceObservedState {
        service_name: "web".to_string(),
        phase: ServicePhase::Running,
        container_id: Some("ctr-web".to_string()),
        last_error: None,
        ready: false,
    };
    executor
        .store()
        .save_observed_state("myapp", &running_web)
        .unwrap();

    let actions = vec![Action::ServiceRemove {
        service_name: "web".to_string(),
    }];

    let result = executor.execute(&spec, &actions).unwrap();
    // Executor marks result as succeeded because state is updated
    // regardless of stop/remove runtime errors (best-effort cleanup).
    assert!(result.all_succeeded());

    let observed = executor.store().load_observed_state("myapp").unwrap();
    let web_state = observed
        .iter()
        .find(|entry| entry.service_name == "web")
        .unwrap();
    assert_eq!(web_state.phase, ServicePhase::Stopped);
}
3374+
/// Host ports held by a service must be released when the service is
/// removed, even if the runtime's `stop` call fails along the way.
#[test]
fn ports_released_on_remove_even_when_stop_fails() {
    let mut runtime = MockContainerRuntime::new();
    runtime.fail_stop = true;
    let mut executor = make_executor(runtime);

    let mut web = svc("web", "nginx:latest");
    web.ports = vec![PortSpec {
        protocol: "tcp".to_string(),
        container_port: 80,
        host_port: Some(8080),
    }];
    // `web` is not needed after this point, so move it instead of cloning.
    let spec = stack("myapp", vec![web]);

    // Create the service first so that port 8080 is tracked as in use.
    let actions = vec![Action::ServiceCreate {
        service_name: "web".to_string(),
    }];
    let result = executor.execute(&spec, &actions).unwrap();
    assert!(result.all_succeeded());
    assert!(executor.ports().in_use().contains(&8080));

    // Now remove — stop will fail but ports should still be released.
    let remove_spec = stack("myapp", vec![]);
    let remove_actions = vec![Action::ServiceRemove {
        service_name: "web".to_string(),
    }];
    let result = executor.execute(&remove_spec, &remove_actions).unwrap();
    assert!(result.all_succeeded());
    assert!(
        !executor.ports().in_use().contains(&8080),
        "port 8080 should be released even when stop fails"
    );
}
3409+
/// After a failed create, the port tracker must still let the same service
/// allocate its ports again, so that a retry does not hit a self-conflict.
#[test]
fn ports_released_when_create_fails_on_retry() {
    let mut runtime = MockContainerRuntime::new();
    runtime.fail_create = true;
    let mut executor = make_executor(runtime);

    let mut web = svc("web", "nginx:latest");
    web.ports = vec![PortSpec {
        protocol: "tcp".to_string(),
        container_port: 80,
        host_port: Some(8080),
    }];
    // `web.ports` is reused below, so the spec gets its own copy.
    let spec = stack("myapp", vec![web.clone()]);

    // Create fails — ports were allocated during prepare_create but the
    // service is marked Failed.
    let actions = vec![Action::ServiceCreate {
        service_name: "web".to_string(),
    }];
    let result = executor.execute(&spec, &actions).unwrap();
    assert_eq!(result.failed, 1);

    // The port may still be held for the failed service (release only
    // happens on ServiceRemove). What matters for a retry is that the
    // tracker accepts a second allocation of the same ports by the same
    // service, which is checked directly against the tracker here.
    let reallocated = executor.ports_mut().allocate("web", &web.ports);
    assert!(
        reallocated.is_ok(),
        "same service should be able to reallocate its ports on retry: {:?}",
        reallocated.err()
    );
}
3446+
3447+ // ── Partial replica scale-down failure tests ──
3448+
/// Scaling a service from three observed replicas down to one must plan
/// two removals, execute them without failures, and leave only the base
/// replica in the `Running` phase.
#[test]
fn replica_scale_down_removes_excess_replicas() {
    let mut executor = make_executor(MockContainerRuntime::new());
    let spec_name = "replica-sd";

    // Simulate 3 running replicas.
    let replicas = [
        ("web", "ctr-web"),
        ("web-2", "ctr-web-2"),
        ("web-3", "ctr-web-3"),
    ];
    for (replica, container) in replicas {
        let running = ServiceObservedState {
            service_name: replica.to_string(),
            phase: ServicePhase::Running,
            container_id: Some(container.to_string()),
            last_error: None,
            ready: false,
        };
        executor
            .store()
            .save_observed_state(spec_name, &running)
            .unwrap();
    }

    // Scale down to 1 replica.
    let mut web = svc("web", "nginx:latest");
    web.resources.replicas = 1;
    let spec = stack(spec_name, vec![web]);

    let health = HashMap::new();
    let reconcile = apply(&spec, executor.store(), &health).unwrap();

    // Should generate 2 remove actions (for web-2 and web-3).
    let remove_count = reconcile
        .actions
        .iter()
        .filter(|action| matches!(action, Action::ServiceRemove { .. }))
        .count();
    assert_eq!(remove_count, 2, "should remove 2 excess replicas");

    let result = executor.execute(&spec, &reconcile.actions).unwrap();
    assert_eq!(result.failed, 0, "all removals should succeed");

    // Only web (base replica) should remain running.
    let observed = executor.store().load_observed_state(spec_name).unwrap();
    let running: Vec<&str> = observed
        .iter()
        .filter_map(|entry| {
            matches!(entry.phase, ServicePhase::Running).then(|| entry.service_name.as_str())
        })
        .collect();
    assert_eq!(running, vec!["web"]);
}
32373500}
0 commit comments