Skip to content

Commit 17e3d46

Browse files
fix: 5 devx improvements for vz run
1. Auto-recover from stale execution state (vz-pfex): when stream_exec_output hits a terminal-state execution left over from a previous failed run, automatically create a fresh execution instead of failing with a state_conflict error.
2. --fresh deletes the persistent disk (vz-9ai3): `vz run --fresh` now removes disk.img before recreating it, so the container starts with a truly clean filesystem.
3. Device passthrough via vz.json (vz-zuoq): a new "devices" field in vz.json (e.g. ["/dev/kvm", "/dev/net/tun"]) creates device nodes inside the container at setup time via mknod.
4. `vz stop --all` (vz-cl6z): a new --all flag stops all running `vz run` sandboxes, not just the current project's VM.
5. Setup hash includes the full config (vz-uiwm): compute_setup_hash now hashes image, devices, and resources in addition to setup commands, so changing any of these triggers re-execution of setup.

Bump vz-cli to v0.3.4.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 08b9cff commit 17e3d46

File tree

3 files changed

+149
-16
lines changed

3 files changed

+149
-16
lines changed

crates/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/vz-cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "vz-cli"
3-
version = "0.3.3"
3+
version = "0.3.4"
44
description = "CLI for managing containers and macOS VM sandboxes"
55
edition.workspace = true
66
rust-version.workspace = true

crates/vz-cli/src/commands/dev.rs

Lines changed: 147 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use serde::Deserialize;
1717
use sha2::{Digest, Sha256};
1818
use tracing::debug;
1919
use vz_runtime_proto::runtime_v2;
20-
use vz_runtimed_client::DaemonClient;
20+
use vz_runtimed_client::{DaemonClient, DaemonClientError};
2121

2222
use super::runtime_daemon::{connect_control_plane_for_state_db, default_state_db_path};
2323

@@ -61,6 +61,10 @@ pub struct DevStopArgs {
6161
/// Path to vz.json (default: search cwd and parents).
6262
#[arg(long)]
6363
pub config: Option<PathBuf>,
64+
65+
/// Stop all running `vz run` sandboxes (not just current project).
66+
#[arg(long)]
67+
pub all: bool,
6468
}
6569

6670
// ── vz.json schema ─────────────────────────────────────────────────
@@ -93,6 +97,11 @@ struct VzConfig {
9397
/// Resource limits.
9498
#[serde(default)]
9599
resources: ResourceConfig,
100+
101+
/// Device nodes to create inside the container (e.g., "/dev/kvm", "/dev/net/tun").
102+
/// These are created via mknod at container start using the host VM's device metadata.
103+
#[serde(default)]
104+
devices: Vec<String>,
96105
}
97106

98107
fn default_image() -> String {
@@ -135,6 +144,16 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
135144
)?;
136145

137146
let volume_mounts = build_volume_mounts(&config, &project_dir)?;
147+
148+
// --fresh: delete persistent disk so the container starts with a clean filesystem.
149+
if args.fresh {
150+
let run_dir = home_dir()?.join(".vz").join("run").join(&sandbox_id);
151+
let disk_path = run_dir.join("disk.img");
152+
if disk_path.exists() {
153+
let _ = std::fs::remove_file(&disk_path);
154+
}
155+
}
156+
138157
let disk_image_path = ensure_project_disk(&sandbox_id)?;
139158

140159
let state_db = default_state_db_path();
@@ -307,6 +326,10 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
307326
runtime_v2::create_execution_request::PtyMode::Disabled as i32
308327
};
309328

329+
// Keep copies for potential retry on terminal state conflict.
330+
let retry_container_id = container_id.clone();
331+
let retry_full_command = full_command.clone();
332+
310333
let execution = client
311334
.create_execution(runtime_v2::CreateExecutionRequest {
312335
metadata: None,
@@ -387,13 +410,49 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
387410
None
388411
};
389412

390-
let mut stream = client
413+
let stream_result = client
391414
.stream_exec_output(runtime_v2::StreamExecOutputRequest {
392-
execution_id,
415+
execution_id: execution_id.clone(),
393416
metadata: None,
394417
})
395-
.await
396-
.context("failed to stream execution output")?;
418+
.await;
419+
420+
// If the execution is already in a terminal state (e.g., from a previous failed run),
421+
// create a fresh execution and retry. This recovers from stale state_conflict errors
422+
// without requiring a manual daemon kill.
423+
let is_terminal_err = stream_result.as_ref().err().is_some_and(is_terminal_state_error);
424+
let mut stream = match stream_result {
425+
Ok(s) => s,
426+
Err(_) if is_terminal_err => {
427+
debug!(execution_id = %execution_id, "execution in terminal state, creating fresh execution");
428+
let retry = client
429+
.create_execution(runtime_v2::CreateExecutionRequest {
430+
metadata: None,
431+
container_id: retry_container_id,
432+
cmd: vec!["/bin/sh".to_string()],
433+
args: vec!["-c".to_string(), retry_full_command],
434+
env_override: HashMap::new(),
435+
timeout_secs: 3600,
436+
pty_mode,
437+
})
438+
.await
439+
.context("failed to create retry execution")?;
440+
441+
let retry_id = retry
442+
.execution
443+
.ok_or_else(|| anyhow!("daemon missing execution payload on retry"))?
444+
.execution_id;
445+
446+
client
447+
.stream_exec_output(runtime_v2::StreamExecOutputRequest {
448+
execution_id: retry_id,
449+
metadata: None,
450+
})
451+
.await
452+
.context("failed to stream retry execution output")?
453+
}
454+
Err(e) => return Err(e).context("failed to stream execution output"),
455+
};
397456

398457
let mut exit_code = 0i32;
399458
while let Some(event) = stream
@@ -440,14 +499,40 @@ pub async fn cmd_run(args: DevRunArgs) -> anyhow::Result<()> {
440499
}
441500

442501
pub async fn cmd_stop(args: DevStopArgs) -> anyhow::Result<()> {
443-
let (_config, project_dir) = load_config(args.config.as_deref())?;
444-
let sandbox_id = sandbox_id_for_project(&project_dir);
445-
446502
let state_db = default_state_db_path();
447503
let mut client = connect_control_plane_for_state_db(&state_db).await?;
448504

449-
terminate_sandbox(&mut client, &sandbox_id).await?;
450-
eprintln!("Stopped VM for {}", project_dir.display());
505+
if args.all {
506+
// Stop all vz-run sandboxes.
507+
let response = client
508+
.list_sandboxes(runtime_v2::ListSandboxesRequest { metadata: None })
509+
.await
510+
.context("failed to list sandboxes")?;
511+
512+
let run_sandboxes: Vec<_> = response
513+
.sandboxes
514+
.iter()
515+
.filter(|s| s.sandbox_id.starts_with("vz-run-"))
516+
.filter(|s| s.state == "ready" || s.state == "active")
517+
.collect();
518+
519+
if run_sandboxes.is_empty() {
520+
eprintln!("No running `vz run` VMs found.");
521+
return Ok(());
522+
}
523+
524+
for sandbox in &run_sandboxes {
525+
let _ = terminate_sandbox(&mut client, &sandbox.sandbox_id).await;
526+
eprintln!("Stopped {}", sandbox.sandbox_id);
527+
}
528+
eprintln!("Stopped {} VM(s).", run_sandboxes.len());
529+
} else {
530+
let (_config, project_dir) = load_config(args.config.as_deref())?;
531+
let sandbox_id = sandbox_id_for_project(&project_dir);
532+
terminate_sandbox(&mut client, &sandbox_id).await?;
533+
eprintln!("Stopped VM for {}", project_dir.display());
534+
}
535+
451536
Ok(())
452537
}
453538

@@ -577,13 +662,29 @@ fn ensure_project_disk(sandbox_id: &str) -> anyhow::Result<PathBuf> {
577662

578663
// ── Setup caching ──────────────────────────────────────────────────
579664

580-
/// Compute the setup hash for a set of commands.
581-
fn compute_setup_hash(commands: &[String]) -> String {
665+
/// Compute a setup hash over the full vz.json config.
666+
///
667+
/// Includes image, setup commands, devices, and resources so that
668+
/// changes to any of these trigger re-execution of setup.
669+
fn compute_setup_hash(config: &VzConfig) -> String {
582670
let mut hasher = Sha256::new();
583-
for cmd in commands {
671+
hasher.update(config.image.as_bytes());
672+
hasher.update(b"\n");
673+
for cmd in &config.setup {
584674
hasher.update(cmd.as_bytes());
585675
hasher.update(b"\n");
586676
}
677+
for dev in &config.devices {
678+
hasher.update(b"dev:");
679+
hasher.update(dev.as_bytes());
680+
hasher.update(b"\n");
681+
}
682+
if let Some(cpus) = config.resources.cpus {
683+
hasher.update(format!("cpus:{cpus}\n").as_bytes());
684+
}
685+
if let Some(ref mem) = config.resources.memory {
686+
hasher.update(format!("mem:{mem}\n").as_bytes());
687+
}
587688
hasher.finalize()[..8]
588689
.iter()
589690
.map(|b| format!("{b:02x}"))
@@ -626,7 +727,7 @@ async fn run_setup_if_needed(
626727
return Ok(());
627728
}
628729

629-
let setup_hash = compute_setup_hash(&config.setup);
730+
let setup_hash = compute_setup_hash(config);
630731

631732
// Check guest-side hash first (persistent disk), then host-side fallback.
632733
let container_id = resolve_container(client, sandbox_id).await?;
@@ -657,6 +758,27 @@ async fn run_setup_if_needed(
657758
return Ok(());
658759
}
659760

761+
// Create requested device nodes before running user setup commands.
762+
if !config.devices.is_empty() {
763+
let device_cmds: Vec<String> = config
764+
.devices
765+
.iter()
766+
.map(|dev| {
767+
// Read major:minor from /proc/misc or /sys for well-known devices.
768+
// For now, handle the common cases directly.
769+
match dev.as_str() {
770+
"/dev/kvm" => "mknod /dev/kvm c 10 232 2>/dev/null || true".to_string(),
771+
"/dev/net/tun" => "mkdir -p /dev/net && mknod /dev/net/tun c 10 200 2>/dev/null || true".to_string(),
772+
_ => format!("echo 'unsupported device: {dev}'"),
773+
}
774+
})
775+
.collect();
776+
777+
for cmd in &device_cmds {
778+
let _ = exec_quiet(client, &container_id, cmd).await;
779+
}
780+
}
781+
660782
eprintln!("Running setup commands...");
661783
for (i, cmd) in config.setup.iter().enumerate() {
662784
eprintln!(" [{}/{}] {}", i + 1, config.setup.len(), cmd);
@@ -854,6 +976,17 @@ fn home_dir() -> anyhow::Result<PathBuf> {
854976
.context("HOME environment variable not set")
855977
}
856978

979+
/// Check if a daemon client error is a "terminal state" conflict
980+
/// that can be recovered by creating a new execution.
981+
fn is_terminal_state_error(error: &DaemonClientError) -> bool {
982+
matches!(
983+
error,
984+
DaemonClientError::Grpc(status)
985+
if status.code() == tonic::Code::FailedPrecondition
986+
&& status.message().contains("terminal state")
987+
)
988+
}
989+
857990
fn parse_memory(raw: Option<&str>) -> anyhow::Result<u64> {
858991
match raw {
859992
None => Ok(8192),

0 commit comments

Comments
 (0)