@@ -17,7 +17,7 @@ use serde::Deserialize;
1717use sha2:: { Digest , Sha256 } ;
1818use tracing:: debug;
1919use vz_runtime_proto:: runtime_v2;
20- use vz_runtimed_client:: DaemonClient ;
20+ use vz_runtimed_client:: { DaemonClient , DaemonClientError } ;
2121
2222use super :: runtime_daemon:: { connect_control_plane_for_state_db, default_state_db_path} ;
2323
@@ -61,6 +61,10 @@ pub struct DevStopArgs {
6161 /// Path to vz.json (default: search cwd and parents).
6262 #[ arg( long) ]
6363 pub config : Option < PathBuf > ,
64+
65+ /// Stop all running `vz run` sandboxes (not just current project).
66+ #[ arg( long) ]
67+ pub all : bool ,
6468}
6569
6670// ── vz.json schema ─────────────────────────────────────────────────
@@ -93,6 +97,11 @@ struct VzConfig {
9397 /// Resource limits.
9498 #[ serde( default ) ]
9599 resources : ResourceConfig ,
100+
101+ /// Device nodes to create inside the container; only "/dev/kvm" and "/dev/net/tun" are currently supported.
102+ /// They are created via mknod with fixed major/minor numbers just before the setup commands run.
103+ #[ serde( default ) ]
104+ devices : Vec < String > ,
96105}
97106
98107fn default_image ( ) -> String {
@@ -135,6 +144,16 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
135144 ) ?;
136145
137146 let volume_mounts = build_volume_mounts ( & config, & project_dir) ?;
147+
148+ // --fresh: delete persistent disk so the container starts with a clean filesystem.
149+ if args. fresh {
150+ let run_dir = home_dir ( ) ?. join ( ".vz" ) . join ( "run" ) . join ( & sandbox_id) ;
151+ let disk_path = run_dir. join ( "disk.img" ) ;
152+ if disk_path. exists ( ) {
153+ let _ = std:: fs:: remove_file ( & disk_path) ;
154+ }
155+ }
156+
138157 let disk_image_path = ensure_project_disk ( & sandbox_id) ?;
139158
140159 let state_db = default_state_db_path ( ) ;
@@ -307,6 +326,10 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
307326 runtime_v2:: create_execution_request:: PtyMode :: Disabled as i32
308327 } ;
309328
329+ // Keep copies for potential retry on terminal state conflict.
330+ let retry_container_id = container_id. clone ( ) ;
331+ let retry_full_command = full_command. clone ( ) ;
332+
310333 let execution = client
311334 . create_execution ( runtime_v2:: CreateExecutionRequest {
312335 metadata : None ,
@@ -387,13 +410,49 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
387410 None
388411 } ;
389412
390- let mut stream = client
413+ let stream_result = client
391414 . stream_exec_output ( runtime_v2:: StreamExecOutputRequest {
392- execution_id,
415+ execution_id : execution_id . clone ( ) ,
393416 metadata : None ,
394417 } )
395- . await
396- . context ( "failed to stream execution output" ) ?;
418+ . await ;
419+
420+ // If the execution is already in a terminal state (e.g., from a previous failed run),
421+ // create a fresh execution and retry. This recovers from stale state_conflict errors
422+ // without requiring a manual daemon kill.
423+ let is_terminal_err = stream_result. as_ref ( ) . err ( ) . is_some_and ( is_terminal_state_error) ;
424+ let mut stream = match stream_result {
425+ Ok ( s) => s,
426+ Err ( _) if is_terminal_err => {
427+ debug ! ( execution_id = %execution_id, "execution in terminal state, creating fresh execution" ) ;
428+ let retry = client
429+ . create_execution ( runtime_v2:: CreateExecutionRequest {
430+ metadata : None ,
431+ container_id : retry_container_id,
432+ cmd : vec ! [ "/bin/sh" . to_string( ) ] ,
433+ args : vec ! [ "-c" . to_string( ) , retry_full_command] ,
434+ env_override : HashMap :: new ( ) ,
435+ timeout_secs : 3600 ,
436+ pty_mode,
437+ } )
438+ . await
439+ . context ( "failed to create retry execution" ) ?;
440+
441+ let retry_id = retry
442+ . execution
443+ . ok_or_else ( || anyhow ! ( "daemon missing execution payload on retry" ) ) ?
444+ . execution_id ;
445+
446+ client
447+ . stream_exec_output ( runtime_v2:: StreamExecOutputRequest {
448+ execution_id : retry_id,
449+ metadata : None ,
450+ } )
451+ . await
452+ . context ( "failed to stream retry execution output" ) ?
453+ }
454+ Err ( e) => return Err ( e) . context ( "failed to stream execution output" ) ,
455+ } ;
397456
398457 let mut exit_code = 0i32 ;
399458 while let Some ( event) = stream
@@ -440,14 +499,40 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
440499}
441500
442501pub async fn cmd_stop ( args : DevStopArgs ) -> anyhow:: Result < ( ) > {
443- let ( _config, project_dir) = load_config ( args. config . as_deref ( ) ) ?;
444- let sandbox_id = sandbox_id_for_project ( & project_dir) ;
445-
446502 let state_db = default_state_db_path ( ) ;
447503 let mut client = connect_control_plane_for_state_db ( & state_db) . await ?;
448504
449- terminate_sandbox ( & mut client, & sandbox_id) . await ?;
450- eprintln ! ( "Stopped VM for {}" , project_dir. display( ) ) ;
505+ if args. all {
506+ // Stop all vz-run sandboxes.
507+ let response = client
508+ . list_sandboxes ( runtime_v2:: ListSandboxesRequest { metadata : None } )
509+ . await
510+ . context ( "failed to list sandboxes" ) ?;
511+
512+ let run_sandboxes: Vec < _ > = response
513+ . sandboxes
514+ . iter ( )
515+ . filter ( |s| s. sandbox_id . starts_with ( "vz-run-" ) )
516+ . filter ( |s| s. state == "ready" || s. state == "active" )
517+ . collect ( ) ;
518+
519+ if run_sandboxes. is_empty ( ) {
520+ eprintln ! ( "No running `vz run` VMs found." ) ;
521+ return Ok ( ( ) ) ;
522+ }
523+
524+ for sandbox in & run_sandboxes {
525+ let _ = terminate_sandbox ( & mut client, & sandbox. sandbox_id ) . await ;
526+ eprintln ! ( "Stopped {}" , sandbox. sandbox_id) ;
527+ }
528+ eprintln ! ( "Stopped {} VM(s)." , run_sandboxes. len( ) ) ;
529+ } else {
530+ let ( _config, project_dir) = load_config ( args. config . as_deref ( ) ) ?;
531+ let sandbox_id = sandbox_id_for_project ( & project_dir) ;
532+ terminate_sandbox ( & mut client, & sandbox_id) . await ?;
533+ eprintln ! ( "Stopped VM for {}" , project_dir. display( ) ) ;
534+ }
535+
451536 Ok ( ( ) )
452537}
453538
@@ -577,13 +662,29 @@ fn ensure_project_disk(sandbox_id: &str) -> anyhow::Result<PathBuf> {
577662
578663// ── Setup caching ──────────────────────────────────────────────────
579664
580- /// Compute the setup hash for a set of commands.
581- fn compute_setup_hash ( commands : & [ String ] ) -> String {
665+ /// Compute a setup hash over the full vz.json config.
666+ ///
667+ /// Includes image, setup commands, devices, and resources so that
668+ /// changes to any of these trigger re-execution of setup.
669+ fn compute_setup_hash ( config : & VzConfig ) -> String {
582670 let mut hasher = Sha256 :: new ( ) ;
583- for cmd in commands {
671+ hasher. update ( config. image . as_bytes ( ) ) ;
672+ hasher. update ( b"\n " ) ;
673+ for cmd in & config. setup {
584674 hasher. update ( cmd. as_bytes ( ) ) ;
585675 hasher. update ( b"\n " ) ;
586676 }
677+ for dev in & config. devices {
678+ hasher. update ( b"dev:" ) ;
679+ hasher. update ( dev. as_bytes ( ) ) ;
680+ hasher. update ( b"\n " ) ;
681+ }
682+ if let Some ( cpus) = config. resources . cpus {
683+ hasher. update ( format ! ( "cpus:{cpus}\n " ) . as_bytes ( ) ) ;
684+ }
685+ if let Some ( ref mem) = config. resources . memory {
686+ hasher. update ( format ! ( "mem:{mem}\n " ) . as_bytes ( ) ) ;
687+ }
587688 hasher. finalize ( ) [ ..8 ]
588689 . iter ( )
589690 . map ( |b| format ! ( "{b:02x}" ) )
@@ -626,7 +727,7 @@ async fn run_setup_if_needed(
626727 return Ok ( ( ) ) ;
627728 }
628729
629- let setup_hash = compute_setup_hash ( & config. setup ) ;
730+ let setup_hash = compute_setup_hash ( config) ;
630731
631732 // Check guest-side hash first (persistent disk), then host-side fallback.
632733 let container_id = resolve_container ( client, sandbox_id) . await ?;
@@ -657,6 +758,27 @@ async fn run_setup_if_needed(
657758 return Ok ( ( ) ) ;
658759 }
659760
761+ // Create requested device nodes before running user setup commands.
762+ if !config. devices . is_empty ( ) {
763+ let device_cmds: Vec < String > = config
764+ . devices
765+ . iter ( )
766+ . map ( |dev| {
767+ // Read major:minor from /proc/misc or /sys for well-known devices.
768+ // For now, handle the common cases directly.
769+ match dev. as_str ( ) {
770+ "/dev/kvm" => "mknod /dev/kvm c 10 232 2>/dev/null || true" . to_string ( ) ,
771+ "/dev/net/tun" => "mkdir -p /dev/net && mknod /dev/net/tun c 10 200 2>/dev/null || true" . to_string ( ) ,
772+ _ => format ! ( "echo 'unsupported device: {dev}'" ) ,
773+ }
774+ } )
775+ . collect ( ) ;
776+
777+ for cmd in & device_cmds {
778+ let _ = exec_quiet ( client, & container_id, cmd) . await ;
779+ }
780+ }
781+
660782 eprintln ! ( "Running setup commands..." ) ;
661783 for ( i, cmd) in config. setup . iter ( ) . enumerate ( ) {
662784 eprintln ! ( " [{}/{}] {}" , i + 1 , config. setup. len( ) , cmd) ;
@@ -854,6 +976,17 @@ fn home_dir() -> anyhow::Result<PathBuf> {
854976 . context ( "HOME environment variable not set" )
855977}
856978
979+ /// Check if a daemon client error is a "terminal state" conflict
980+ /// that can be recovered by creating a new execution.
981+ fn is_terminal_state_error ( error : & DaemonClientError ) -> bool {
982+ matches ! (
983+ error,
984+ DaemonClientError :: Grpc ( status)
985+ if status. code( ) == tonic:: Code :: FailedPrecondition
986+ && status. message( ) . contains( "terminal state" )
987+ )
988+ }
989+
857990fn parse_memory ( raw : Option < & str > ) -> anyhow:: Result < u64 > {
858991 match raw {
859992 None => Ok ( 8192 ) ,
0 commit comments