run_openshell_init_dropins() {
    # Run executable drop-ins from /opt/openshell/init.d in deterministic
    # ASCII-sorted order (LC_ALL=C). Drop-ins are *executed* as child
    # processes rather than sourced, so they cannot mutate parent shell state
    # or exit the caller. They inherit `OPENSHELL_VM_INIT_PHASE`,
    # `ROOT_PREFIX`, and any `OPENSHELL_VM_DATA_*` env vars set by lifecycle
    # extensions.
    #
    # A drop-in that exits non-zero is logged as a WARNING; the remaining
    # drop-ins still run and this function still returns normally.
    local init_dir
    init_dir="$(root_path /opt/openshell/init.d)"
    [ -d "$init_dir" ] || return 0

    export OPENSHELL_VM_INIT_PHASE="before-supervisor"
    export ROOT_PREFIX="${ROOT_PREFIX:-}"

    local dropin rc
    # The drop-in list is read on fd 3 instead of stdin. If the list were fed
    # through stdin, every drop-in would inherit it and any drop-in that reads
    # stdin (e.g. `cat`, `read`) would silently swallow the names of the
    # drop-ins that have not run yet.
    while IFS= read -r dropin <&3; do
        [ -n "$dropin" ] || continue
        [ -f "$dropin" ] || continue
        [ -x "$dropin" ] || continue

        ts "running OpenShell VM init drop-in ${dropin##*/}"
        rc=0
        # `cmd || rc=$?` captures the failure without tripping errexit and
        # without toggling `set -e`, so the caller's shell options are left
        # exactly as they were. fd 3 is closed for the child so the list does
        # not leak into the drop-in.
        "$dropin" 3<&- || rc=$?
        if [ "$rc" -ne 0 ]; then
            ts "WARNING: OpenShell VM init drop-in ${dropin##*/} failed with exit code ${rc}"
        fi
    done 3< <(LC_ALL=C find "$init_dir" -mindepth 1 -maxdepth 1 -type f -print | LC_ALL=C sort)
}
@@ -729,6 +760,8 @@ if [ -d /sandbox ]; then fi fi +run_openshell_init_dropins + rewrite_openshell_endpoint_if_needed # Log supervisor connectivity state for debugging stuck-in-Provisioning issues diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index f09f1ebc3..084349382 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -1,6 +1,10 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use crate::lifecycle::{ + BackendFeature, GuestInitDropin, LaunchAbortReason, LaunchPlan, LifecycleExtensionRegistry, + RestoreContext, extension_state_dir, +}; use crate::gpu::{ GpuInventory, SubnetAllocator, allocate_vsock_cid, mac_from_sandbox_id, tap_device_name, }; @@ -9,6 +13,7 @@ use crate::rootfs::{ extract_rootfs_archive_to, prepare_sandbox_rootfs_from_image_root, sandbox_guest_init_path, set_rootfs_image_file_mode, write_rootfs_image_file, }; +use crate::runtime::VmBackend; use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::ContainerCreateBody; @@ -101,6 +106,7 @@ const GUEST_TLS_CA_PATH: &str = "/opt/openshell/tls/ca.crt"; const GUEST_TLS_CERT_PATH: &str = "/opt/openshell/tls/tls.crt"; const GUEST_TLS_KEY_PATH: &str = "/opt/openshell/tls/tls.key"; const GUEST_SANDBOX_TOKEN_PATH: &str = "/opt/openshell/auth/sandbox.jwt"; +const GUEST_INIT_DROPIN_DIR: &str = "/opt/openshell/init.d"; const IMAGE_CACHE_ROOT_DIR: &str = "images"; const IMAGE_CACHE_ROOTFS_IMAGE: &str = "rootfs.ext4"; const OVERLAY_TEMPLATE_CACHE_DIR: &str = "overlay-templates"; @@ -280,6 +286,7 @@ struct SandboxRecord { process: Option>>, provisioning_task: Option>, gpu_bdf: Option, + qemu_network_allocated: bool, deleting: bool, } @@ -298,10 +305,21 @@ pub struct VmDriver { events: broadcast::Sender, gpu_inventory: Option>>, subnet_allocator: Arc>, + lifecycle_extensions: Arc, } impl 
VmDriver { pub async fn new(config: VmDriverConfig) -> Result { + Self::new_with_extensions(config, LifecycleExtensionRegistry::new()).await + } + + pub async fn new_with_extensions( + config: VmDriverConfig, + lifecycle_extensions: LifecycleExtensionRegistry, + ) -> Result { + lifecycle_extensions + .validate() + .map_err(|err| err.message().to_string())?; if config.openshell_endpoint.trim().is_empty() { return Err("openshell endpoint is required".to_string()); } @@ -366,6 +384,7 @@ impl VmDriver { events, gpu_inventory, subnet_allocator, + lifecycle_extensions: Arc::new(lifecycle_extensions), }; driver.restore_persisted_sandboxes().await; Ok(driver) @@ -438,6 +457,7 @@ impl VmDriver { process: None, provisioning_task: None, gpu_bdf: None, + qemu_network_allocated: false, deleting: false, }, ); @@ -458,6 +478,13 @@ impl VmDriver { return Err(Status::internal(format!("create state dir failed: {err}"))); } + if let Err(err) = self.ensure_extension_state_dirs(&state_dir).await { + let mut registry = self.registry.lock().await; + registry.remove(&sandbox.id); + let _ = tokio::fs::remove_dir_all(&state_dir).await; + return Err(err); + } + if let Err(err) = write_sandbox_request(&state_dir, sandbox).await { let mut registry = self.registry.lock().await; registry.remove(&sandbox.id); @@ -561,6 +588,7 @@ impl VmDriver { overlay_preparation: OverlayPreparation, ) -> Result<(), Status> { self.ensure_provisioning_active(&sandbox.id).await?; + let is_gpu = sandbox.spec.as_ref().is_some_and(|spec| spec.gpu); self.publish_platform_event( sandbox.id.clone(), platform_event( @@ -622,15 +650,129 @@ impl VmDriver { ))); } - let spec = sandbox.spec.as_ref(); - let is_gpu = spec.is_some_and(|s| s.gpu); - let gpu_device = spec.map_or("", |s| s.gpu_device.as_str()); + let gpu_device = sandbox.spec.as_ref().map_or("", |s| s.gpu_device.as_str()); let gpu_bdf = if is_gpu { Some(self.assign_gpu_to_record(&sandbox.id, gpu_device).await?) 
} else { None }; + let needs_qemu = is_gpu; + + let mut plan = + match self.build_vm_launch_plan(&sandbox.id, needs_qemu, is_gpu, gpu_bdf.clone()) { + Ok(plan) => plan, + Err(err) => { + self.release_gpu_and_subnet(&sandbox.id); + return Err(err); + } + }; + + if let Err(err) = self + .lifecycle_extensions + .configure_launch(&sandbox, &state_dir, &mut plan) + .await + { + self.lifecycle_extensions + .after_launch_failed( + &sandbox, + &state_dir, + LaunchAbortReason::BeforeLaunchHookFailed, + ) + .await; + self.release_gpu_and_subnet(&sandbox.id); + let message = format!( + "vm lifecycle extension rejected sandbox launch plan: {}", + err.message() + ); + return Err(if err.is_resource_exhausted() { + Status::resource_exhausted(message) + } else { + Status::failed_precondition(message) + }); + } + + // Resolve and validate the backend from the requirements that + // `configure_launch` extensions contributed. After this point the + // plan's backend, sizing, and host allocations (subnet, tap, vsock) + // are final; the `before_launch` hook below may still mutate + // `plan.env` and `plan.guest_init_dropins` and may abort the launch, + // but it MUST NOT change `plan.backend`, `plan.required_backends`, + // or `plan.required_backend_features` -- those are enforced as a + // documented trait contract, not a runtime check. 
+ if let Err(err) = + self.resolve_launch_plan_backend(&sandbox.id, is_gpu, gpu_bdf.clone(), &mut plan) + { + self.lifecycle_extensions + .after_launch_failed( + &sandbox, + &state_dir, + LaunchAbortReason::BeforeLaunchHookFailed, + ) + .await; + self.release_gpu_and_subnet(&sandbox.id); + return Err(err); + } + + if let Err(err) = Self::validate_launch_plan_backend(is_gpu, &plan) { + self.lifecycle_extensions + .after_launch_failed( + &sandbox, + &state_dir, + LaunchAbortReason::BeforeLaunchHookFailed, + ) + .await; + self.release_gpu_and_subnet(&sandbox.id); + return Err(err); + } + + if plan.backend == VmBackend::Qemu + && let Err(err) = self.mark_qemu_network_allocated(&sandbox.id).await + { + self.release_gpu_and_subnet(&sandbox.id); + return Err(err); + } + + if let Err(err) = self + .lifecycle_extensions + .before_launch(&sandbox, &state_dir, &mut plan) + .await + { + self.lifecycle_extensions + .after_launch_failed( + &sandbox, + &state_dir, + LaunchAbortReason::BeforeLaunchHookFailed, + ) + .await; + self.release_gpu_and_subnet(&sandbox.id); + let message = format!( + "vm lifecycle extension rejected sandbox launch: {}", + err.message() + ); + return Err(if err.is_resource_exhausted() { + Status::resource_exhausted(message) + } else { + Status::failed_precondition(message) + }); + } + + if let Err(err) = inject_guest_init_dropins(&overlay_disk, &plan.guest_init_dropins) { + self.lifecycle_extensions + .after_launch_failed(&sandbox, &state_dir, LaunchAbortReason::GuestPrepareFailed) + .await; + self.release_gpu_and_subnet(&sandbox.id); + return Err(err); + } + + let endpoint_override = if plan.backend == VmBackend::Qemu { + plan.host_ip.as_deref().map(|host_ip| { + guest_visible_openshell_endpoint_for_tap(&self.config.openshell_endpoint, host_ip) + }) + } else { + None + }; + let console_output = state_dir.join("rootfs-console.log"); let mut command = Command::new(&self.launcher_bin); command.kill_on_drop(true); @@ -646,66 +788,37 @@ impl VmDriver { 
command.arg("--vm-exec").arg(sandbox_guest_init_path()); command.arg("--vm-workdir").arg("/"); command.arg("--vm-console-output").arg(&console_output); + command.arg("--vm-vcpus").arg(plan.vcpus.to_string()); + command.arg("--vm-mem-mib").arg(plan.mem_mib.to_string()); + if let Some(kernel_image) = &plan.kernel_image { + command.arg("--vm-kernel-image").arg(kernel_image); + } - // Compute the endpoint override before building the env so - // there is a single OPENSHELL_ENDPOINT value in the env list. - let endpoint_override = if let Some(bdf) = gpu_bdf.as_ref() { - let subnet = match self - .subnet_allocator - .lock() - .map_err(|e| Status::internal(format!("subnet allocator lock poisoned: {e}"))) - .and_then(|mut alloc| { - alloc - .allocate(&sandbox.id) - .map_err(Status::failed_precondition) - }) { - Ok(s) => s, - Err(err) => { - self.release_gpu_and_subnet(&sandbox.id); - return Err(err); - } - }; - let vsock_cid = allocate_vsock_cid(); - let mac = mac_from_sandbox_id(&sandbox.id); - let mac_str = format!( - "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", - mac[0], mac[1], mac[2], mac[3], mac[4], mac[5] - ); - let tap = tap_device_name(&sandbox.id); - - let tap_endpoint = guest_visible_openshell_endpoint_for_tap( - &self.config.openshell_endpoint, - &subnet.host_ip.to_string(), - ); - + if plan.backend == VmBackend::Qemu { command.arg("--vm-backend").arg("qemu"); - command - .arg("--vm-vcpus") - .arg(self.config.gpu_vcpus.to_string()); - command - .arg("--vm-mem-mib") - .arg(self.config.gpu_mem_mib.to_string()); - command.arg("--vm-gpu-bdf").arg(bdf); - command.arg("--vm-tap-device").arg(&tap); - command - .arg("--vm-guest-ip") - .arg(subnet.guest_ip.to_string()); - command.arg("--vm-host-ip").arg(subnet.host_ip.to_string()); - command.arg("--vm-vsock-cid").arg(vsock_cid.to_string()); - command.arg("--vm-guest-mac").arg(&mac_str); - - if let Some(port) = gateway_port_from_endpoint(&self.config.openshell_endpoint) { + if let Some(bdf) = plan.gpu_bdf.as_deref() { + 
command.arg("--vm-gpu-bdf").arg(bdf); + } + if let Some(tap) = plan.tap_device.as_deref() { + command.arg("--vm-tap-device").arg(tap); + } + if let Some(guest_ip) = plan.guest_ip.as_deref() { + command.arg("--vm-guest-ip").arg(guest_ip); + } + if let Some(host_ip) = plan.host_ip.as_deref() { + command.arg("--vm-host-ip").arg(host_ip); + } + if let Some(vsock_cid) = plan.vsock_cid { + command.arg("--vm-vsock-cid").arg(vsock_cid.to_string()); + } + if let Some(guest_mac) = plan.guest_mac.as_deref() { + command.arg("--vm-guest-mac").arg(guest_mac); + } + if let Some(port) = plan.gateway_port { command.arg("--vm-gateway-port").arg(port.to_string()); } + } - Some(tap_endpoint) - } else { - command.arg("--vm-vcpus").arg(self.config.vcpus.to_string()); - command - .arg("--vm-mem-mib") - .arg(self.config.mem_mib.to_string()); - None - }; self.ensure_provisioning_active(&sandbox.id).await?; command @@ -715,6 +828,9 @@ impl VmDriver { for env in build_guest_environment(&sandbox, &self.config, endpoint_override.as_deref()) { command.arg("--vm-env").arg(env); } + for env in &plan.env { + command.arg("--vm-env").arg(env); + } info!( sandbox_id = %sandbox.id, @@ -730,9 +846,14 @@ impl VmDriver { error = %err, "vm driver: launcher spawn failed" ); - if gpu_bdf.is_some() { - self.release_gpu_and_subnet(&sandbox.id); - } + self.lifecycle_extensions + .after_launch_failed( + &sandbox, + &state_dir, + LaunchAbortReason::LauncherSpawnFailed, + ) + .await; + self.release_gpu_and_subnet(&sandbox.id); return Err(Status::internal(format!( "failed to launch vm helper '{}': {err}", self.launcher_bin.display() @@ -757,6 +878,7 @@ impl VmDriver { Some(record) if !record.deleting => { record.process = Some(process.clone()); record.gpu_bdf.clone_from(&gpu_bdf); + record.qemu_network_allocated = plan.backend == VmBackend::Qemu; record.provisioning_task = None; snapshot_to_publish = Some(record.snapshot.clone()); } @@ -785,6 +907,15 @@ impl VmDriver { if let Some(snapshot) = snapshot_to_publish { 
self.publish_snapshot(snapshot); } + if overlay_preparation == OverlayPreparation::PreserveExisting { + let persisted = RestoreContext { + sandbox: sandbox.clone(), + state_dir: state_dir.clone(), + }; + self.lifecycle_extensions + .after_restore(&persisted) + .await; + } tokio::spawn({ let driver = self.clone(); let sandbox_id = sandbox.id.clone(); @@ -821,7 +952,14 @@ impl VmDriver { return Ok(DeleteSandboxResponse { deleted: false }); }; - let (state_dir, process, gpu_bdf, provisioning_task) = { + let ( + state_dir, + process, + gpu_bdf, + qemu_network_allocated, + provisioning_task, + sandbox_snapshot, + ) = { let mut registry = self.registry.lock().await; let Some(record) = registry.get_mut(&record_id) else { return Ok(DeleteSandboxResponse { deleted: false }); @@ -831,7 +969,9 @@ impl VmDriver { record.state_dir.clone(), record.process.clone(), record.gpu_bdf.clone(), + record.qemu_network_allocated, record.provisioning_task.take(), + record.snapshot.clone(), ) }; @@ -854,9 +994,11 @@ impl VmDriver { .map_err(|err| Status::internal(format!("failed to stop vm: {err}")))?; } - if gpu_bdf.is_some() { - self.release_gpu_and_subnet(&record_id); - } + self.lifecycle_extensions + .after_delete(&sandbox_snapshot, &state_dir) + .await; + + self.release_allocations(&record_id, gpu_bdf.is_some(), qemu_network_allocated); remove_sandbox_state_dir(&self.config.state_dir, &state_dir).await?; @@ -998,6 +1140,36 @@ impl VmDriver { } }; + if let Err(err) = self.ensure_extension_state_dirs(&state_dir).await { + warn!( + sandbox_id = %sandbox.id, + sandbox_name = %sandbox.name, + state_dir = %state_dir.display(), + error = %err.message(), + "vm driver: cannot restore persisted sandbox extension state" + ); + return; + } + + let persisted = RestoreContext { + sandbox: sandbox.clone(), + state_dir: state_dir.clone(), + }; + if let Err(err) = self + .lifecycle_extensions + .before_restore(&persisted) + .await + { + warn!( + sandbox_id = %sandbox.id, + sandbox_name = %sandbox.name, 
+ state_dir = %state_dir.display(), + error = %err, + "vm driver: lifecycle extension rejected persisted sandbox restore" + ); + return; + } + let snapshot = sandbox_snapshot(&sandbox, provisioning_condition(), false); { let mut registry = self.registry.lock().await; @@ -1012,6 +1184,7 @@ impl VmDriver { process: None, provisioning_task: None, gpu_bdf: None, + qemu_network_allocated: false, deleting: false, }, ); @@ -1054,17 +1227,278 @@ impl VmDriver { } } - fn release_gpu_and_subnet(&self, sandbox_id: &str) { + fn release_gpu(&self, sandbox_id: &str) { if let Some(inventory) = self.gpu_inventory.as_ref() && let Ok(mut inv) = inventory.lock() { inv.release(sandbox_id); } + } + + fn release_subnet(&self, sandbox_id: &str) { if let Ok(mut alloc) = self.subnet_allocator.lock() { alloc.release(sandbox_id); } } + fn release_allocations(&self, sandbox_id: &str, has_gpu: bool, has_qemu_network: bool) { + if has_gpu { + self.release_gpu(sandbox_id); + } + if has_qemu_network { + self.release_subnet(sandbox_id); + } + } + + fn release_gpu_and_subnet(&self, sandbox_id: &str) { + self.release_gpu(sandbox_id); + self.release_subnet(sandbox_id); + } + + async fn ensure_extension_state_dirs(&self, state_dir: &Path) -> Result<(), Status> { + for extension_name in self.lifecycle_extensions.names() { + let extension_dir = extension_state_dir(state_dir, &extension_name).map_err(|err| { + Status::failed_precondition(format!( + "invalid VM lifecycle extension '{}': {}", + extension_name, + err.message() + )) + })?; + create_private_dir_all(&extension_dir) + .await + .map_err(|err| { + Status::internal(format!( + "create VM lifecycle extension state dir '{}' failed: {err}", + extension_dir.display() + )) + })?; + } + Ok(()) + } + + #[allow(clippy::result_large_err)] + fn resolve_launch_plan_backend( + &self, + sandbox_id: &str, + is_gpu: bool, + gpu_bdf: Option, + plan: &mut LaunchPlan, + ) -> Result<(), Status> { + if plan.kernel_image.is_some() { + 
plan.require_backend_feature(BackendFeature::ExternalKernelImage); + } + if !plan.guest_init_dropins.is_empty() { + plan.require_backend_feature(BackendFeature::GuestInitDropins); + } + + if plan.required_backends.contains(&VmBackend::Qemu) + || plan.backend == VmBackend::Qemu + || plan + .required_backend_features + .iter() + .any(|feature| feature.requires_qemu()) + { + self.configure_qemu_launch_plan(sandbox_id, is_gpu, gpu_bdf, plan)?; + } + + Ok(()) + } + + #[allow(clippy::result_large_err)] + fn configure_qemu_launch_plan( + &self, + sandbox_id: &str, + is_gpu: bool, + gpu_bdf: Option, + plan: &mut LaunchPlan, + ) -> Result<(), Status> { + plan.backend = VmBackend::Qemu; + if is_gpu { + plan.vcpus = self.config.gpu_vcpus; + plan.mem_mib = self.config.gpu_mem_mib; + } + if plan.gpu_bdf.is_none() { + plan.gpu_bdf = gpu_bdf; + } + if has_complete_qemu_network(plan) { + return Ok(()); + } + + let subnet = self + .subnet_allocator + .lock() + .map_err(|e| Status::internal(format!("subnet allocator lock poisoned: {e}")))? + .allocate(sandbox_id) + .map_err(Status::failed_precondition)?; + let mac = mac_from_sandbox_id(sandbox_id); + plan.tap_device = Some(tap_device_name(sandbox_id)); + plan.guest_ip = Some(subnet.guest_ip.to_string()); + plan.host_ip = Some(subnet.host_ip.to_string()); + plan.vsock_cid = Some(allocate_vsock_cid()); + plan.guest_mac = Some(format!( + "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5] + )); + plan.gateway_port = gateway_port_from_endpoint(&self.config.openshell_endpoint); + Ok(()) + } + + #[allow(clippy::result_large_err)] + fn validate_launch_plan_backend(is_gpu: bool, plan: &LaunchPlan) -> Result<(), Status> { + // NOTE: this guard exists because the non-GPU QEMU launch path + // (PCI device transport, VFIO root port wiring) has not landed + // yet. 
Until then, even though the resolver will happily promote + // a plan to QEMU when an extension requires `PciPassthrough` or + // `ExternalKernelImage`, the launch itself is blocked here so we + // don't spawn a QEMU instance with no concrete device backing. + // Remove this guard once the non-GPU QEMU launch path supports + // emitting `pcie-root-port` + `vfio-pci` for arbitrary device + // descriptors. + if plan.backend == VmBackend::Qemu && !is_gpu { + let offending_feature = plan + .required_backend_features + .iter() + .find(|feature| feature.requires_qemu()) + .map_or("(explicit QEMU backend requirement)", |feature| { + feature.as_str() + }); + return Err(Status::failed_precondition(format!( + "vm lifecycle extension required '{offending_feature}', which resolves to the QEMU backend, \ + but non-GPU QEMU launch is not yet supported (pending PCI device transport)" + ))); + } + if plan.backend != VmBackend::Qemu && is_gpu { + return Err(Status::failed_precondition( + "GPU sandbox launch requires the QEMU backend", + )); + } + if plan.required_backends.contains(&VmBackend::Libkrun) + && plan.required_backends.contains(&VmBackend::Qemu) + { + return Err(Status::failed_precondition( + "VM lifecycle extensions requested conflicting VM backends", + )); + } + if plan.required_backends.contains(&VmBackend::Libkrun) + && plan.backend != VmBackend::Libkrun + { + return Err(Status::failed_precondition( + "VM lifecycle extension requires the libkrun backend", + )); + } + if plan.required_backends.contains(&VmBackend::Qemu) && plan.backend != VmBackend::Qemu { + return Err(Status::failed_precondition( + "VM lifecycle extension requires the QEMU backend", + )); + } + if plan.backend != VmBackend::Qemu + && let Some(feature) = plan + .required_backend_features + .iter() + .find(|feature| feature.requires_qemu()) + { + return Err(Status::failed_precondition(format!( + "VM backend feature '{}' requires a VM backend with PCI-style launch support", + feature.as_str() + ))); + } 
+ if plan.kernel_image.is_some() && plan.backend != VmBackend::Qemu { + return Err(Status::failed_precondition( + "selected kernel image requires a VM backend that supports external kernel images", + )); + } + if let Some(kernel_image) = &plan.kernel_image + && !kernel_image.is_file() + { + return Err(Status::failed_precondition(format!( + "selected kernel image does not exist: {}", + kernel_image.display() + ))); + } + Ok(()) + } + + async fn mark_qemu_network_allocated(&self, sandbox_id: &str) -> Result<(), Status> { + let mut registry = self.registry.lock().await; + match registry.get_mut(sandbox_id) { + Some(record) if !record.deleting => { + record.qemu_network_allocated = true; + Ok(()) + } + _ => Err(Status::cancelled("sandbox provisioning cancelled")), + } + } + + #[allow(clippy::result_large_err)] + fn build_vm_launch_plan( + &self, + sandbox_id: &str, + needs_qemu: bool, + is_gpu: bool, + gpu_bdf: Option, + ) -> Result { + if !needs_qemu { + return Ok(LaunchPlan { + backend: VmBackend::Libkrun, + vcpus: self.config.vcpus, + mem_mib: self.config.mem_mib, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: None, + guest_ip: None, + host_ip: None, + vsock_cid: None, + guest_mac: None, + gateway_port: None, + guest_init_dropins: Vec::new(), + env: Vec::new(), + }); + } + + let subnet = self + .subnet_allocator + .lock() + .map_err(|e| Status::internal(format!("subnet allocator lock poisoned: {e}")))? 
+ .allocate(sandbox_id) + .map_err(Status::failed_precondition)?; + let vsock_cid = allocate_vsock_cid(); + let mac = mac_from_sandbox_id(sandbox_id); + let mac_str = format!( + "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5] + ); + let tap = tap_device_name(sandbox_id); + let gateway_port = gateway_port_from_endpoint(&self.config.openshell_endpoint); + + let (vcpus, mem_mib) = if is_gpu { + (self.config.gpu_vcpus, self.config.gpu_mem_mib) + } else { + (self.config.vcpus, self.config.mem_mib) + }; + + Ok(LaunchPlan { + backend: VmBackend::Qemu, + vcpus, + mem_mib, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf, + tap_device: Some(tap), + guest_ip: Some(subnet.guest_ip.to_string()), + host_ip: Some(subnet.host_ip.to_string()), + vsock_cid: Some(vsock_cid), + guest_mac: Some(mac_str), + gateway_port, + guest_init_dropins: Vec::new(), + env: Vec::new(), + }) + } + async fn ensure_provisioning_active(&self, sandbox_id: &str) -> Result<(), Status> { let registry = self.registry.lock().await; match registry.get(sandbox_id) { @@ -1128,6 +1562,7 @@ impl VmDriver { record.process = None; record.provisioning_task = None; record.gpu_bdf = None; + record.qemu_network_allocated = false; record.snapshot.status = Some(status_with_condition( &record.snapshot, error_condition(reason, message), @@ -2347,16 +2782,13 @@ impl VmDriver { sandbox_id.clone(), platform_event("vm", "Warning", "ProcessExited", message), ); - let has_gpu = { + let (has_gpu, has_qemu_network) = { let registry = self.registry.lock().await; - registry - .get(&sandbox_id) - .and_then(|r| r.gpu_bdf.as_ref()) - .is_some() + registry.get(&sandbox_id).map_or((false, false), |record| { + (record.gpu_bdf.is_some(), record.qemu_network_allocated) + }) }; - if has_gpu { - self.release_gpu_and_subnet(&sandbox_id); - } + self.release_allocations(&sandbox_id, has_gpu, has_qemu_network); return; } 
@@ -3437,6 +3869,14 @@ fn gateway_port_from_endpoint(endpoint: &str) -> Option { Url::parse(endpoint).ok().and_then(|url| url.port()) } +fn has_complete_qemu_network(plan: &LaunchPlan) -> bool { + plan.tap_device.is_some() + && plan.guest_ip.is_some() + && plan.host_ip.is_some() + && plan.vsock_cid.is_some() + && plan.guest_mac.is_some() +} + fn guest_visible_openshell_endpoint_for_tap(endpoint: &str, host_ip: &str) -> String { let Ok(mut url) = Url::parse(endpoint) else { return endpoint.to_string(); @@ -4033,6 +4473,61 @@ fn inject_guest_sandbox_token(overlay_disk: &Path, token: &str) -> Result<(), St set_rootfs_image_file_mode(overlay_disk, &token_path, 0o600) } +#[allow(clippy::result_large_err)] +fn inject_guest_init_dropins( + overlay_disk: &Path, + dropins: &[GuestInitDropin], +) -> Result<(), Status> { + validate_guest_init_dropins(dropins).map_err(Status::failed_precondition)?; + + // Drop-ins are *executed* in a child shell by run_openshell_init_dropins + // in the guest init script, not sourced into the parent. Mode 0o755 is + // required (the runner skips anything that is not `-x`) and is the + // contract drop-in authors should rely on. 
+ for dropin in dropins { + let guest_path = overlay_upper_path(&format!("{GUEST_INIT_DROPIN_DIR}/{}", dropin.name)); + write_rootfs_image_file(overlay_disk, &guest_path, &dropin.contents).map_err(|err| { + Status::internal(format!( + "write VM guest init drop-in '{}' failed: {err}", + dropin.name + )) + })?; + set_rootfs_image_file_mode(overlay_disk, &guest_path, 0o755).map_err(|err| { + Status::internal(format!( + "set VM guest init drop-in '{}' executable failed: {err}", + dropin.name + )) + })?; + } + Ok(()) +} + +fn validate_guest_init_dropins(dropins: &[GuestInitDropin]) -> Result<(), String> { + let mut names = HashSet::new(); + for dropin in dropins { + validate_guest_init_dropin_name(&dropin.name)?; + if !names.insert(dropin.name.clone()) { + return Err(format!("duplicate VM guest init drop-in '{}'", dropin.name)); + } + } + Ok(()) +} + +fn validate_guest_init_dropin_name(name: &str) -> Result<(), String> { + if name.is_empty() || name == "." || name == ".." { + return Err("VM guest init drop-in name is empty or reserved".to_string()); + } + if !name + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '.') + { + return Err(format!( + "VM guest init drop-in name '{name}' must contain only ASCII letters, numbers, '.', '-', or '_'" + )); + } + Ok(()) +} + fn overlay_upper_path(guest_path: &str) -> String { format!("/upper/{}", guest_path.trim_start_matches('/')) } @@ -4797,6 +5292,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; assert_eq!(driver.capabilities().default_image, "openshell/sandbox:dev"); @@ -4818,6 +5314,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; let sandbox = Sandbox { spec: Some(SandboxSpec { @@ -4852,6 +5349,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; let sandbox = 
Sandbox { spec: Some(SandboxSpec { @@ -4880,6 +5378,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; let sandbox = Sandbox { spec: Some(SandboxSpec { @@ -4909,6 +5408,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; assert_eq!( @@ -4933,6 +5433,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; assert_eq!( @@ -4954,6 +5455,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; assert_eq!( @@ -5295,6 +5797,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; let state_file = sandbox_state_dir(&driver_state, "sandbox-123").unwrap(); @@ -5358,6 +5861,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; let state_dir = sandbox_state_dir(&driver_state, "sandbox-123").unwrap(); @@ -5376,6 +5880,7 @@ mod tests { process: None, provisioning_task: None, gpu_bdf: None, + qemu_network_allocated: false, deleting: false, }, ); @@ -5412,6 +5917,7 @@ mod tests { Ipv4Addr::new(10, 0, 128, 0), 17, ))), + lifecycle_extensions: Arc::new(LifecycleExtensionRegistry::new()), }; let state_dir = sandbox_state_dir(&driver_state, "sandbox-123").unwrap(); @@ -5431,6 +5937,7 @@ mod tests { process: None, provisioning_task: None, gpu_bdf: None, + qemu_network_allocated: false, deleting: false, }, ); @@ -5752,8 +6259,350 @@ mod tests { process: Some(process), provisioning_task: None, gpu_bdf: None, + qemu_network_allocated: false, deleting: false, }, ); } + + use crate::lifecycle::{ + BackendFeature, LaunchPlan, LifecycleError, LifecycleExtension, + LifecycleExtensionRegistry, LifecycleResult, + }; + use crate::runtime::VmBackend; + + fn 
test_driver_with_extensions(extensions: LifecycleExtensionRegistry) -> VmDriver { + let (events, _) = broadcast::channel(WATCH_BUFFER); + VmDriver { + config: VmDriverConfig { + openshell_endpoint: "http://127.0.0.1:8080".to_string(), + vcpus: 2, + mem_mib: 2048, + gpu_vcpus: 8, + gpu_mem_mib: 16384, + ..Default::default() + }, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + image_cache_lock: Arc::new(Mutex::new(())), + events, + gpu_inventory: None, + subnet_allocator: Arc::new(std::sync::Mutex::new(SubnetAllocator::new( + Ipv4Addr::new(10, 0, 128, 0), + 17, + ))), + lifecycle_extensions: Arc::new(extensions), + } + } + + #[derive(Debug)] + struct QemuRequiringExtension { + name: String, + } + + #[tonic::async_trait] + impl LifecycleExtension for QemuRequiringExtension { + fn name(&self) -> &str { + &self.name + } + + async fn configure_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + plan.require_backend(VmBackend::Qemu); + plan.require_backend_feature(BackendFeature::PciPassthrough); + Ok(()) + } + + async fn before_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + if plan.backend != VmBackend::Qemu { + return Err(LifecycleError::new( + "qemu-requiring extension demands QEMU backend", + )); + } + plan.env.push("EXT_DECLARED_QEMU=1".to_string()); + Ok(()) + } + } + + #[derive(Debug)] + struct AlwaysFailsExtension; + + #[tonic::async_trait] + impl LifecycleExtension for AlwaysFailsExtension { + fn name(&self) -> &'static str { + "always-fails" + } + + async fn before_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + _plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + Err(LifecycleError::resource_exhausted("pool empty")) + } + } + + #[test] + fn empty_registry_keeps_non_gpu_sandbox_on_libkrun() { + let driver = 
test_driver_with_extensions(LifecycleExtensionRegistry::new()); + + let plan = driver + .build_vm_launch_plan("sandbox-x", false, false, None) + .expect("plan should build"); + + assert_eq!(plan.backend, VmBackend::Libkrun); + assert_eq!(plan.vcpus, 2); + assert_eq!(plan.mem_mib, 2048); + assert!(plan.tap_device.is_none()); + assert!(plan.guest_ip.is_none()); + assert!(plan.host_ip.is_none()); + assert!(plan.vsock_cid.is_none()); + assert!(plan.guest_mac.is_none()); + assert!(plan.gpu_bdf.is_none()); + assert!(plan.env.is_empty()); + } + + #[test] + fn empty_registry_has_no_extension_descriptors() { + let driver = test_driver_with_extensions(LifecycleExtensionRegistry::new()); + assert!(driver.lifecycle_extensions.descriptors().is_empty()); + } + + #[test] + fn gpu_sandbox_uses_qemu_backend_and_gpu_sizing() { + let driver = test_driver_with_extensions(LifecycleExtensionRegistry::new()); + + let plan = driver + .build_vm_launch_plan("sandbox-gpu", true, true, Some("0000:01:00.0".to_string())) + .expect("gpu plan should build"); + + assert_eq!(plan.backend, VmBackend::Qemu); + assert_eq!(plan.vcpus, 8); + assert_eq!(plan.mem_mib, 16384); + assert_eq!(plan.gpu_bdf.as_deref(), Some("0000:01:00.0")); + assert!(plan.tap_device.is_some()); + assert!(plan.guest_ip.is_some()); + assert!(plan.host_ip.is_some()); + assert!(plan.vsock_cid.is_some()); + assert!(plan.guest_mac.is_some()); + } + + #[test] + fn launch_plan_rejects_external_kernel_on_unsupported_backend() { + let mut plan = LaunchPlan { + backend: VmBackend::Libkrun, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: Some(PathBuf::from("/tmp/openshell-test-kernel")), + gpu_bdf: None, + tap_device: None, + guest_ip: None, + host_ip: None, + vsock_cid: None, + guest_mac: None, + gateway_port: None, + guest_init_dropins: Vec::new(), + env: Vec::new(), + }; + + let err = VmDriver::validate_launch_plan_backend(false, &plan) + 
.expect_err("external kernels require a compatible backend"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("external kernel images")); + + let base = unique_temp_dir(); + std::fs::create_dir_all(&base).unwrap(); + let kernel = base.join("vmlinux"); + std::fs::write(&kernel, b"kernel").unwrap(); + plan.backend = VmBackend::Qemu; + plan.kernel_image = Some(kernel); + VmDriver::validate_launch_plan_backend(true, &plan).expect("existing kernel is accepted"); + let _ = std::fs::remove_dir_all(base); + } + + #[test] + fn backend_feature_requirements_select_qemu_launch_plan() { + let driver = test_driver_with_extensions(LifecycleExtensionRegistry::new()); + let mut plan = driver + .build_vm_launch_plan("sandbox-vfio", false, false, None) + .expect("base plan should build"); + plan.require_backend_feature(BackendFeature::PciPassthrough); + + driver + .resolve_launch_plan_backend("sandbox-vfio", false, None, &mut plan) + .expect("backend feature should resolve"); + + assert_eq!(plan.backend, VmBackend::Qemu); + assert!(plan.tap_device.is_some()); + assert!(plan.guest_ip.is_some()); + assert!(plan.host_ip.is_some()); + assert!(plan.vsock_cid.is_some()); + assert!(plan.guest_mac.is_some()); + + driver.release_subnet("sandbox-vfio"); + } + + #[test] + fn explicit_backend_requirement_selects_qemu_launch_plan() { + let driver = test_driver_with_extensions(LifecycleExtensionRegistry::new()); + let mut plan = driver + .build_vm_launch_plan("sandbox-qemu", false, false, None) + .expect("base plan should build"); + plan.require_backend(VmBackend::Qemu); + + driver + .resolve_launch_plan_backend("sandbox-qemu", false, None, &mut plan) + .expect("backend requirement should resolve"); + + assert_eq!(plan.backend, VmBackend::Qemu); + assert!(plan.tap_device.is_some()); + assert!(plan.guest_ip.is_some()); + assert!(plan.host_ip.is_some()); + + driver.release_subnet("sandbox-qemu"); + } + + #[test] + fn 
guest_init_dropin_feature_does_not_force_qemu() { + let driver = test_driver_with_extensions(LifecycleExtensionRegistry::new()); + let mut plan = driver + .build_vm_launch_plan("sandbox-init", false, false, None) + .expect("base plan should build"); + plan.require_backend_feature(BackendFeature::GuestInitDropins); + + driver + .resolve_launch_plan_backend("sandbox-init", false, None, &mut plan) + .expect("guest init feature should resolve"); + + assert_eq!(plan.backend, VmBackend::Libkrun); + assert!(plan.tap_device.is_none()); + } + + #[test] + fn guest_init_dropin_validation_rejects_unsafe_or_duplicate_names() { + validate_guest_init_dropins(&[GuestInitDropin::new("50-vfio.sh", b"true\n".to_vec())]) + .expect("safe drop-in name is accepted"); + + let err = validate_guest_init_dropins(&[GuestInitDropin::new( + "../50-vfio.sh", + b"true\n".to_vec(), + )]) + .expect_err("path traversal is rejected"); + assert!(err.contains("must contain only ASCII")); + + let err = validate_guest_init_dropins(&[ + GuestInitDropin::new("50-vfio.sh", b"true\n".to_vec()), + GuestInitDropin::new("50-vfio.sh", b"true\n".to_vec()), + ]) + .expect_err("duplicate drop-ins are rejected"); + assert!(err.contains("duplicate")); + } + + #[tokio::test] + async fn extension_can_validate_backend_in_before_launch() { + let extension = Arc::new(QemuRequiringExtension { + name: "validate".to_string(), + }); + let extensions = LifecycleExtensionRegistry::with(vec![extension.clone()]); + let sandbox = Sandbox { + id: "sandbox-validate".to_string(), + name: "sandbox-validate".to_string(), + ..Default::default() + }; + + let mut libkrun_plan = LaunchPlan { + backend: VmBackend::Libkrun, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: None, + guest_ip: None, + host_ip: None, + vsock_cid: None, + guest_mac: None, + gateway_port: None, + guest_init_dropins: Vec::new(), + env: 
Vec::new(), + }; + let err = extensions + .before_launch(&sandbox, Path::new("/tmp/state"), &mut libkrun_plan) + .await + .expect_err("backend mismatch should fail validation"); + assert!(err.message().contains("demands QEMU")); + + let mut qemu_plan = LaunchPlan { + backend: VmBackend::Qemu, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: Some("vmtap-x".to_string()), + guest_ip: Some("10.0.0.2".to_string()), + host_ip: Some("10.0.0.1".to_string()), + vsock_cid: Some(7), + guest_mac: Some("02:00:00:00:00:01".to_string()), + gateway_port: Some(8080), + guest_init_dropins: Vec::new(), + env: Vec::new(), + }; + extensions + .before_launch(&sandbox, Path::new("/tmp/state"), &mut qemu_plan) + .await + .expect("QEMU backend should satisfy the extension"); + assert!(qemu_plan.env.contains(&"EXT_DECLARED_QEMU=1".to_string())); + } + + #[tokio::test] + async fn lifecycle_error_resource_exhausted_propagates() { + let extensions = LifecycleExtensionRegistry::with(vec![Arc::new(AlwaysFailsExtension)]); + let sandbox = Sandbox { + id: "sandbox-resource".to_string(), + name: "sandbox-resource".to_string(), + ..Default::default() + }; + let mut plan = LaunchPlan { + backend: VmBackend::Qemu, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: Some("vmtap-x".to_string()), + guest_ip: Some("10.0.0.2".to_string()), + host_ip: Some("10.0.0.1".to_string()), + vsock_cid: Some(7), + guest_mac: Some("02:00:00:00:00:01".to_string()), + gateway_port: Some(8080), + guest_init_dropins: Vec::new(), + env: Vec::new(), + }; + let err = extensions + .before_launch(&sandbox, Path::new("/tmp/state"), &mut plan) + .await + .expect_err("scripted pool exhaustion should surface"); + assert!(err.is_resource_exhausted()); + 
assert_eq!(err.message(), "pool empty"); + } } diff --git a/crates/openshell-driver-vm/src/extensions/mod.rs b/crates/openshell-driver-vm/src/extensions/mod.rs new file mode 100644 index 000000000..86d346469 --- /dev/null +++ b/crates/openshell-driver-vm/src/extensions/mod.rs @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Concrete implementations of [`crate::lifecycle::LifecycleExtension`]. +//! +//! The framework itself — the trait, registry, launch plan, descriptor, +//! and backend-feature resolver — lives in [`crate::lifecycle`]. This +//! module is the conventional home for concrete extensions that +//! implement the trait. +//! +//! # Conventions for a new extension +//! +//! - Each extension lives in its own submodule: +//! `crates/openshell-driver-vm/src/extensions/<name>/mod.rs`. +//! - The submodule exposes a single public type +//! `<Name>Extension` (e.g. `VfioPassthroughExtension`) that +//! implements [`crate::lifecycle::LifecycleExtension`]. +//! - Helpers (pool allocators, guest env builders, on-disk state +//! layouts) live alongside the extension in private modules within +//! the same directory. +//! - The extension's [`crate::lifecycle::LifecycleExtension::name`] +//! must match `<name>` (kebab-case ASCII). The driver creates +//! per-extension state at +//! `<sandbox_state_dir>/extensions/<name>/` for hooks to use. +//! - Re-export the extension type from this `mod.rs` so callers can +//! write `use openshell_driver_vm::extensions::VfioPassthroughExtension;`. +//! +//! New extension types are wired into a running driver by registering +//! them with [`crate::lifecycle::LifecycleExtensionRegistry`] and passing +//! the registry to [`crate::VmDriver::new_with_extensions`]. 
diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 5b2ddc2bc..88e2c3b20 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -5,12 +5,18 @@ pub mod driver; mod embedded_runtime; mod ffi; pub mod gpu; +pub mod lifecycle; mod nft_ruleset; pub mod procguard; mod rootfs; mod runtime; pub use driver::{VmDriver, VmDriverConfig}; +pub use lifecycle::{ + BackendFeature, ExtensionCapabilities, ExtensionDescriptor, GuestInitDropin, LaunchAbortReason, + LaunchPlan, LifecycleError, LifecycleExtension, LifecycleExtensionRegistry, LifecycleResult, + RestoreContext, +}; pub use runtime::{ VM_RUNTIME_DIR_ENV, VmBackend, VmLaunchConfig, cleanup_stale_tap_interfaces, configured_runtime_dir, run_vm, diff --git a/crates/openshell-driver-vm/src/lifecycle.rs b/crates/openshell-driver-vm/src/lifecycle.rs new file mode 100644 index 000000000..346897979 --- /dev/null +++ b/crates/openshell-driver-vm/src/lifecycle.rs @@ -0,0 +1,1029 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use openshell_core::proto::compute::v1::DriverSandbox as Sandbox; + +use crate::runtime::VmBackend; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LaunchAbortReason { + LauncherSpawnFailed, + BeforeLaunchHookFailed, + GuestPrepareFailed, +} + +#[derive(Debug, Clone)] +pub struct LifecycleError { + message: String, + resource_exhausted: bool, +} + +impl LifecycleError { + pub fn new(message: impl Into) -> Self { + Self { + message: message.into(), + resource_exhausted: false, + } + } + + pub fn resource_exhausted(message: impl Into) -> Self { + Self { + message: message.into(), + resource_exhausted: true, + } + } + + #[must_use] + pub fn message(&self) -> &str { + &self.message + } + + #[must_use] + pub fn is_resource_exhausted(&self) -> bool { + self.resource_exhausted + } +} + +impl std::fmt::Display for LifecycleError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.message) + } +} + +impl std::error::Error for LifecycleError {} + +pub type LifecycleResult = Result; + +/// A capability an extension can require from the VM backend. +/// +/// Extensions declare features they need (e.g. PCI passthrough or an +/// external kernel image) and the VM driver resolves a concrete +/// [`VmBackend`] that can satisfy them. The mapping from feature to +/// backend lives in [`BackendFeature::requires_qemu`] for now; once a +/// third backend exists this should evolve into a per-backend capability +/// table that the resolver intersects against feature requirements. +/// +/// # Current limitations +/// +/// Until the non-GPU QEMU launch path (PCI device transport / VFIO root +/// port wiring) lands, the driver still rejects launches where the +/// resolved backend is QEMU but the sandbox has no GPU. 
As a result, +/// declaring [`Self::PciPassthrough`] or [`Self::ExternalKernelImage`] on +/// a non-GPU sandbox is accepted by [`LifecycleExtensionRegistry::validate`] +/// at registration time but will fail provisioning with a +/// `FailedPrecondition` status. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum BackendFeature { + /// Extension supplies its own kernel image via + /// [`LaunchPlan::kernel_image`]. Currently QEMU-only. + ExternalKernelImage, + /// Extension contributes guest init drop-ins via + /// [`LaunchPlan::guest_init_dropins`]. Supported by all backends. + GuestInitDropins, + /// Extension needs PCI device passthrough on the guest. Currently + /// QEMU-only and currently rejected for non-GPU sandboxes pending the + /// non-GPU QEMU launch path landing. + PciPassthrough, + /// Extension needs a host TAP device wired into the guest. Currently + /// QEMU-only (libkrun does not expose a TAP transport). + TapNetworking, +} + +impl BackendFeature { + #[must_use] + pub fn as_str(self) -> &'static str { + match self { + Self::ExternalKernelImage => "external-kernel-image", + Self::GuestInitDropins => "guest-init-dropins", + Self::PciPassthrough => "pci-passthrough", + Self::TapNetworking => "tap-networking", + } + } + + /// Returns true when satisfying this feature requires the QEMU backend + /// today. This is the simplest possible resolver and is expected to be + /// replaced with a per-backend capability table once a third backend + /// exists. + #[must_use] + pub fn requires_qemu(self) -> bool { + matches!( + self, + Self::ExternalKernelImage | Self::PciPassthrough | Self::TapNetworking + ) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct ExtensionCapabilities { + pub kernel_profiles: Vec, + pub guest_init_dropins: Vec, + pub launch_features: Vec, + pub host_resources: Vec, +} + +/// A registration-time description of what a lifecycle extension provides +/// and requires. 
+/// +/// `required_backends` and `required_backend_features` are merged into the +/// launch plan unconditionally for every sandbox. An extension that wants +/// conditional behavior (e.g. only contribute requirements when the +/// sandbox spec asks for it) should leave the descriptor fields empty and +/// call [`LaunchPlan::require_backend`] / +/// [`LaunchPlan::require_backend_feature`] inside +/// [`LifecycleExtension::configure_launch`] instead. +/// +/// A future PR will add a per-sandbox activation protocol so the driver +/// can gate this merge on a sandbox spec field. Until that lands, the +/// only knob is "declare in the descriptor (always merged) vs decide in +/// the hook (per-sandbox)". +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ExtensionDescriptor { + pub name: String, + pub provides: ExtensionCapabilities, + pub required_backends: Vec, + pub required_backend_features: Vec, +} + +impl ExtensionDescriptor { + #[must_use] + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + provides: ExtensionCapabilities::default(), + required_backends: Vec::new(), + required_backend_features: Vec::new(), + } + } +} + +/// A guest-side init drop-in injected into the sandbox's overlay disk. +/// +/// Drop-ins land at `/opt/openshell/init.d/{name}` inside the guest with +/// mode `0o755`. The guest's init script *executes* drop-ins in a child +/// shell in deterministic ASCII-sorted order; it does not source them. +/// Authors should: +/// +/// - Begin the file with a `#!/bin/bash` (or equivalent) shebang. +/// - Use the `00-`, `50-`, `99-` prefix convention to control ordering. +/// - Treat the parent shell as immutable: env vars set in a drop-in do not +/// propagate to the rest of init. +/// +/// `name` must consist of ASCII letters, digits, `.`, `-`, or `_` (no +/// path separators, no `.`/`..`); duplicates across a single launch plan +/// are rejected by the driver. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GuestInitDropin { + pub name: String, + pub contents: Vec, +} + +impl GuestInitDropin { + #[must_use] + pub fn new(name: impl Into, contents: impl Into>) -> Self { + Self { + name: name.into(), + contents: contents.into(), + } + } +} + +#[derive(Debug, Clone)] +pub struct LaunchPlan { + pub backend: VmBackend, + pub vcpus: u8, + pub mem_mib: u32, + pub required_backends: Vec, + pub required_backend_features: Vec, + pub kernel_profile: Option, + pub kernel_image: Option, + pub gpu_bdf: Option, + pub tap_device: Option, + pub guest_ip: Option, + pub host_ip: Option, + pub vsock_cid: Option, + pub guest_mac: Option, + pub gateway_port: Option, + pub guest_init_dropins: Vec, + pub env: Vec, +} + +impl LaunchPlan { + pub fn require_backend(&mut self, backend: VmBackend) { + if !self.required_backends.contains(&backend) { + self.required_backends.push(backend); + } + } + + pub fn require_backend_feature(&mut self, feature: BackendFeature) { + if !self.required_backend_features.contains(&feature) { + self.required_backend_features.push(feature); + } + } + + pub fn require_backend_features( + &mut self, + features: impl IntoIterator, + ) { + for feature in features { + self.require_backend_feature(feature); + } + } +} + +#[derive(Debug, Clone)] +pub struct RestoreContext { + pub sandbox: Sandbox, + pub state_dir: PathBuf, +} + +/// Lifecycle hooks an extension can implement to participate in VM sandbox +/// provisioning, launch failure, deletion, and post-restart reconciliation. +/// +/// # Hook ordering during a successful launch +/// +/// 1. [`configure_launch`](Self::configure_launch) — contribute backend +/// requirements (via [`LaunchPlan::require_backend`] / +/// [`LaunchPlan::require_backend_feature`]) and provisioning inputs +/// (kernel profile, guest init drop-ins, etc.). Called before the driver +/// has resolved the final backend. +/// 2. 
Driver resolves [`LaunchPlan::backend`] from declared requirements +/// and allocates backend-specific host resources (subnet, tap, vsock). +/// 3. [`before_launch`](Self::before_launch) — perform host-side +/// side effects with the resolved plan in hand, optionally append +/// additional guest env via [`LaunchPlan::env`]. +/// 4. The driver spawns the VM launcher process. +/// +/// On launch failure or sandbox deletion, the driver invokes +/// [`after_launch_failed`](Self::after_launch_failed) or +/// [`after_delete`](Self::after_delete) in **reverse +/// registration order**, so cleanup mirrors setup. +#[tonic::async_trait] +pub trait LifecycleExtension: std::fmt::Debug + Send + Sync { + fn name(&self) -> &str; + + fn descriptor(&self) -> ExtensionDescriptor { + ExtensionDescriptor::new(self.name()) + } + + /// Contribute backend requirements and provisioning inputs to the plan + /// before the driver picks a backend. + /// + /// Use this hook to: + /// - Declare backend requirements with + /// [`LaunchPlan::require_backend`] or + /// [`LaunchPlan::require_backend_feature`]. + /// - Set [`LaunchPlan::kernel_profile`] or + /// [`LaunchPlan::kernel_image`]. + /// - Append [`LaunchPlan::guest_init_dropins`] entries. + /// + /// At this point [`LaunchPlan::backend`] is the driver's tentative + /// choice and may still change during backend resolution. Do not perform + /// host-side side effects here — defer them to + /// [`before_launch`](Self::before_launch). + async fn configure_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + _plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + Ok(()) + } + + /// Perform host-side preparation with the resolved launch plan. + /// + /// At this point [`LaunchPlan::backend`], + /// [`LaunchPlan::required_backends`], and + /// [`LaunchPlan::required_backend_features`] are finalized and any + /// backend-specific host resources (subnet, tap, vsock) have been + /// allocated. 
This hook is the right place to bind PCI devices, set + /// up filesystem state, or otherwise prepare the host. + /// + /// Implementations MAY append entries to [`LaunchPlan::env`] to + /// inject additional guest environment variables, and MAY return an + /// error to abort the launch. Implementations MUST NOT change + /// [`LaunchPlan::backend`], [`LaunchPlan::required_backends`], or + /// [`LaunchPlan::required_backend_features`]; those changes are + /// ignored by the driver once `before_launch` is reached. + /// + /// If this hook performs allocations that must be released on failure + /// or delete, implement + /// [`after_launch_failed`](Self::after_launch_failed) and + /// [`after_delete`](Self::after_delete) accordingly. + async fn before_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + _plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + Ok(()) + } + + /// Release anything this extension allocated during + /// [`configure_launch`](Self::configure_launch) or + /// [`before_launch`](Self::before_launch) when the launcher + /// could not be started or aborted before it became healthy. + /// + /// Invoked in reverse registration order. Errors are logged but do not + /// propagate; do best-effort cleanup and return [`Ok`] when possible. + /// This hook is invoked on every launcher failure, including failures + /// that happen during a persisted-sandbox restore (in that case + /// [`after_restore`](Self::after_restore) is *not* + /// invoked). + async fn after_launch_failed( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + _reason: LaunchAbortReason, + ) -> LifecycleResult<()> { + Ok(()) + } + + /// Release per-sandbox resources after a sandbox has been deleted. + /// + /// Invoked in reverse registration order. Errors are logged but do not + /// propagate. 
+ async fn after_delete( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + ) -> LifecycleResult<()> { + Ok(()) + } + + /// Inspect or reconcile persisted extension state before the driver + /// attempts to restore a sandbox after a process restart. + /// + /// Returning an error causes the driver to skip restoring this + /// sandbox; the persisted state is left on disk for operator + /// inspection. + async fn before_restore( + &self, + _sandbox: &RestoreContext, + ) -> LifecycleResult<()> { + Ok(()) + } + + /// Notify the extension that a persisted sandbox has been + /// successfully restored and its launcher is running again. + /// + /// Only invoked when restore succeeds. If the restore fails partway + /// through, [`after_launch_failed`](Self::after_launch_failed) + /// runs instead. + async fn after_restore( + &self, + _sandbox: &RestoreContext, + ) -> LifecycleResult<()> { + Ok(()) + } +} + +#[derive(Clone, Default)] +pub struct LifecycleExtensionRegistry { + extensions: Vec>, +} + +impl std::fmt::Debug for LifecycleExtensionRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LifecycleExtensionRegistry") + .field( + "names", + &self + .extensions + .iter() + .map(|ext| ext.name()) + .collect::>(), + ) + .finish() + } +} + +impl LifecycleExtensionRegistry { + #[must_use] + pub fn new() -> Self { + Self::default() + } + + #[must_use] + pub fn with(extensions: Vec>) -> Self { + Self { extensions } + } + + pub fn push(&mut self, extension: Arc) { + self.extensions.push(extension); + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.extensions.is_empty() + } + + #[must_use] + pub fn len(&self) -> usize { + self.extensions.len() + } + + #[must_use] + pub fn names(&self) -> Vec { + self.extensions + .iter() + .map(|ext| ext.name().to_string()) + .collect() + } + + #[must_use] + pub fn descriptors(&self) -> Vec { + self.extensions.iter().map(|ext| ext.descriptor()).collect() + } + + pub fn validate(&self) 
-> LifecycleResult<()> { + let mut names = HashSet::new(); + for ext in &self.extensions { + let descriptor = ext.descriptor(); + validate_extension_name(ext.name())?; + validate_extension_name(&descriptor.name)?; + if descriptor.name != ext.name() { + return Err(LifecycleError::new(format!( + "lifecycle extension '{}' descriptor name does not match '{}'", + ext.name(), + descriptor.name + ))); + } + validate_descriptor_strings(&descriptor)?; + if !names.insert(descriptor.name.clone()) { + return Err(LifecycleError::new(format!( + "duplicate lifecycle extension name: {}", + descriptor.name + ))); + } + } + Ok(()) + } + + pub async fn configure_launch( + &self, + sandbox: &Sandbox, + state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + for ext in &self.extensions { + let descriptor = ext.descriptor(); + for backend in descriptor.required_backends { + plan.require_backend(backend); + } + plan.require_backend_features(descriptor.required_backend_features); + // Snapshot fields where "last writer wins" could mask an + // extension conflict, so we can flag the conflict instead of + // silently dropping the earlier value. 
+ let prev_kernel_profile = plan.kernel_profile.clone(); + let prev_kernel_image = plan.kernel_image.clone(); + ext.configure_launch(sandbox, state_dir, plan).await?; + warn_on_singleton_overwrite( + ext.name(), + "kernel_profile", + prev_kernel_profile.as_deref(), + plan.kernel_profile.as_deref(), + ); + warn_on_singleton_overwrite( + ext.name(), + "kernel_image", + prev_kernel_image + .as_deref() + .map(|p| p.display().to_string()), + plan.kernel_image + .as_deref() + .map(|p| p.display().to_string()), + ); + } + Ok(()) + } + + pub async fn before_launch( + &self, + sandbox: &Sandbox, + state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + for ext in &self.extensions { + ext.before_launch(sandbox, state_dir, plan).await?; + } + Ok(()) + } + + pub async fn after_launch_failed( + &self, + sandbox: &Sandbox, + state_dir: &Path, + reason: LaunchAbortReason, + ) { + for ext in self.extensions.iter().rev() { + if let Err(err) = ext + .after_launch_failed(sandbox, state_dir, reason.clone()) + .await + { + tracing::warn!( + extension = ext.name(), + sandbox_id = %sandbox.id, + error = %err, + "vm driver: lifecycle extension after_launch_failed hook failed" + ); + } + } + } + + pub async fn after_delete(&self, sandbox: &Sandbox, state_dir: &Path) { + for ext in self.extensions.iter().rev() { + if let Err(err) = ext.after_delete(sandbox, state_dir).await { + tracing::warn!( + extension = ext.name(), + sandbox_id = %sandbox.id, + error = %err, + "vm driver: lifecycle extension after_delete hook failed" + ); + } + } + } + + pub async fn before_restore( + &self, + sandbox: &RestoreContext, + ) -> LifecycleResult<()> { + for ext in &self.extensions { + ext.before_restore(sandbox).await?; + } + Ok(()) + } + + pub async fn after_restore(&self, sandbox: &RestoreContext) { + for ext in &self.extensions { + if let Err(err) = ext.after_restore(sandbox).await { + tracing::warn!( + extension = ext.name(), + sandbox_id = %sandbox.sandbox.id, + error = %err, + "vm 
driver: lifecycle extension after_restore hook failed" + ); + } + } + } +} + +fn warn_on_singleton_overwrite( + extension_name: &str, + field: &str, + prev: Option, + next: Option, +) where + T: AsRef + std::fmt::Display + PartialEq, +{ + let (Some(prev), Some(next)) = (prev, next) else { + return; + }; + if prev == next { + return; + } + tracing::warn!( + extension = extension_name, + field, + previous = %prev, + next = %next, + "vm driver: lifecycle extension overwrote a singleton launch-plan field set by an earlier extension" + ); +} + +pub fn extension_state_dir( + sandbox_state_dir: &Path, + extension_name: &str, +) -> LifecycleResult { + validate_extension_name(extension_name)?; + Ok(sandbox_state_dir.join("extensions").join(extension_name)) +} + +fn validate_extension_name(name: &str) -> LifecycleResult<()> { + if name.is_empty() || name == "." || name == ".." { + return Err(LifecycleError::new( + "lifecycle extension name is empty or reserved", + )); + } + if !name + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '.') + { + return Err(LifecycleError::new(format!( + "lifecycle extension name '{name}' must contain only ASCII letters, numbers, '.', '-', or '_'" + ))); + } + Ok(()) +} + +fn validate_descriptor_strings( + descriptor: &ExtensionDescriptor, +) -> LifecycleResult<()> { + for value in descriptor + .provides + .kernel_profiles + .iter() + .chain(descriptor.provides.guest_init_dropins.iter()) + .chain(descriptor.provides.launch_features.iter()) + .chain(descriptor.provides.host_resources.iter()) + { + validate_extension_identifier(value).map_err(|err| { + LifecycleError::new(format!( + "lifecycle extension '{}' has invalid provided capability '{}': {err}", + descriptor.name, value + )) + })?; + } + Ok(()) +} + +fn validate_extension_identifier(value: &str) -> Result<(), &'static str> { + if value.is_empty() || value == "." || value == ".." 
{ + return Err("identifier is empty or reserved"); + } + if !value + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '.') + { + return Err("identifier must contain only ASCII letters, numbers, '.', '-', or '_'"); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + use std::sync::Mutex; + + use super::*; + + #[derive(Debug)] + struct RecordingExtension { + name: String, + configure_should_fail: bool, + before_should_fail: bool, + calls: Mutex>, + } + + impl RecordingExtension { + fn new(name: &str) -> Arc { + Arc::new(Self { + name: name.to_string(), + configure_should_fail: false, + before_should_fail: false, + calls: Mutex::new(Vec::new()), + }) + } + + fn failing(name: &str) -> Arc { + Arc::new(Self { + name: name.to_string(), + configure_should_fail: false, + before_should_fail: true, + calls: Mutex::new(Vec::new()), + }) + } + + fn configure_failing(name: &str) -> Arc { + Arc::new(Self { + name: name.to_string(), + configure_should_fail: true, + before_should_fail: false, + calls: Mutex::new(Vec::new()), + }) + } + + fn calls(&self) -> Vec { + self.calls.lock().unwrap().clone() + } + } + + #[tonic::async_trait] + impl LifecycleExtension for RecordingExtension { + fn name(&self) -> &str { + &self.name + } + + fn descriptor(&self) -> ExtensionDescriptor { + ExtensionDescriptor { + name: self.name.clone(), + provides: ExtensionCapabilities { + kernel_profiles: vec![format!("profile-{}", self.name)], + guest_init_dropins: vec![format!("50-{}.sh", self.name)], + launch_features: vec!["guest-init-dropins".to_string()], + host_resources: Vec::new(), + }, + required_backends: Vec::new(), + required_backend_features: vec![BackendFeature::GuestInitDropins], + } + } + + async fn configure_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + self.calls + .lock() + .unwrap() + .push(format!("{}:configure_launch", self.name)); + if self.configure_should_fail 
{ + return Err(LifecycleError::new(format!( + "{}: scripted configure_launch failure", + self.name + ))); + } + plan.kernel_profile = Some(format!("profile-{}", self.name)); + plan.guest_init_dropins.push(GuestInitDropin::new( + format!("50-{}.sh", self.name), + b"#!/bin/sh\n".to_vec(), + )); + Ok(()) + } + + async fn before_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + self.calls + .lock() + .unwrap() + .push(format!("{}:before_launch", self.name)); + if self.before_should_fail { + return Err(LifecycleError::new(format!( + "{}: scripted before_launch failure", + self.name + ))); + } + plan.env.push(format!("RECORDING_{}=1", self.name)); + Ok(()) + } + + async fn after_launch_failed( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + reason: LaunchAbortReason, + ) -> LifecycleResult<()> { + self.calls + .lock() + .unwrap() + .push(format!("{}:after_launch_failed:{:?}", self.name, reason)); + Ok(()) + } + + async fn after_delete( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + ) -> LifecycleResult<()> { + self.calls + .lock() + .unwrap() + .push(format!("{}:after_delete", self.name)); + Ok(()) + } + } + + fn sample_plan(backend: VmBackend) -> LaunchPlan { + LaunchPlan { + backend, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: None, + guest_ip: None, + host_ip: None, + vsock_cid: None, + guest_mac: None, + gateway_port: None, + guest_init_dropins: Vec::new(), + env: Vec::new(), + } + } + + fn sample_sandbox() -> Sandbox { + Sandbox { + id: "sandbox-123".to_string(), + name: "sandbox-123".to_string(), + ..Default::default() + } + } + + fn as_extension(extension: &Arc) -> Arc + where + T: LifecycleExtension + 'static, + { + extension.clone() + } + + #[tokio::test] + async fn configure_launch_runs_each_extension_in_order() { + let ext_a = 
RecordingExtension::new("a"); + let ext_b = RecordingExtension::new("b"); + let registry = + LifecycleExtensionRegistry::with(vec![as_extension(&ext_a), as_extension(&ext_b)]); + let mut plan = sample_plan(VmBackend::Qemu); + let sandbox = sample_sandbox(); + + registry + .configure_launch(&sandbox, &PathBuf::from("/tmp/state"), &mut plan) + .await + .expect("configure_launch succeeds"); + + assert_eq!(plan.kernel_profile.as_deref(), Some("profile-b")); + assert_eq!( + plan.guest_init_dropins + .iter() + .map(|dropin| dropin.name.as_str()) + .collect::>(), + vec!["50-a.sh", "50-b.sh"] + ); + assert_eq!(ext_a.calls(), vec!["a:configure_launch"]); + assert_eq!(ext_b.calls(), vec!["b:configure_launch"]); + } + + #[tokio::test] + async fn configure_launch_short_circuits_on_first_failure() { + let ext_a = RecordingExtension::new("a"); + let ext_fail = RecordingExtension::configure_failing("boom"); + let ext_c = RecordingExtension::new("c"); + let registry = LifecycleExtensionRegistry::with(vec![ + as_extension(&ext_a), + as_extension(&ext_fail), + as_extension(&ext_c), + ]); + let mut plan = sample_plan(VmBackend::Libkrun); + let sandbox = sample_sandbox(); + + let err = registry + .configure_launch(&sandbox, &PathBuf::from("/tmp/state"), &mut plan) + .await + .expect_err("scripted failure should propagate"); + assert!( + err.message() + .contains("scripted configure_launch failure") + ); + + assert_eq!(ext_a.calls(), vec!["a:configure_launch"]); + assert_eq!(ext_fail.calls(), vec!["boom:configure_launch"]); + assert!( + ext_c.calls().is_empty(), + "extensions after the failure must not be invoked" + ); + } + + #[tokio::test] + async fn before_launch_runs_each_extension_in_order_and_collects_env() { + let ext_a = RecordingExtension::new("a"); + let ext_b = RecordingExtension::new("b"); + let registry = + LifecycleExtensionRegistry::with(vec![as_extension(&ext_a), as_extension(&ext_b)]); + let mut plan = sample_plan(VmBackend::Qemu); + let sandbox = sample_sandbox(); + + 
registry + .before_launch(&sandbox, &PathBuf::from("/tmp/state"), &mut plan) + .await + .expect("before_launch succeeds"); + + assert_eq!(plan.env, vec!["RECORDING_a=1", "RECORDING_b=1"]); + assert_eq!(ext_a.calls(), vec!["a:before_launch"]); + assert_eq!(ext_b.calls(), vec!["b:before_launch"]); + } + + #[tokio::test] + async fn before_launch_short_circuits_on_first_failure() { + let ext_a = RecordingExtension::new("a"); + let ext_fail = RecordingExtension::failing("boom"); + let ext_c = RecordingExtension::new("c"); + let registry = LifecycleExtensionRegistry::with(vec![ + as_extension(&ext_a), + as_extension(&ext_fail), + as_extension(&ext_c), + ]); + let mut plan = sample_plan(VmBackend::Libkrun); + let sandbox = sample_sandbox(); + + let err = registry + .before_launch(&sandbox, &PathBuf::from("/tmp/state"), &mut plan) + .await + .expect_err("scripted failure should propagate"); + assert!(err.message().contains("scripted before_launch failure")); + + assert_eq!(ext_a.calls(), vec!["a:before_launch"]); + assert_eq!(ext_fail.calls(), vec!["boom:before_launch"]); + assert!( + ext_c.calls().is_empty(), + "extensions after the failure must not be invoked" + ); + } + + #[tokio::test] + async fn after_launch_failed_runs_every_extension_in_reverse_order() { + let ext_a = RecordingExtension::new("a"); + let ext_b = RecordingExtension::new("b"); + let registry = + LifecycleExtensionRegistry::with(vec![as_extension(&ext_a), as_extension(&ext_b)]); + let sandbox = sample_sandbox(); + + registry + .after_launch_failed( + &sandbox, + &PathBuf::from("/tmp/state"), + LaunchAbortReason::LauncherSpawnFailed, + ) + .await; + + assert_eq!( + ext_a.calls(), + vec!["a:after_launch_failed:LauncherSpawnFailed"] + ); + assert_eq!( + ext_b.calls(), + vec!["b:after_launch_failed:LauncherSpawnFailed"] + ); + } + + #[tokio::test] + async fn after_delete_runs_every_extension() { + let ext_a = RecordingExtension::new("a"); + let ext_b = RecordingExtension::new("b"); + let registry = + 
LifecycleExtensionRegistry::with(vec![as_extension(&ext_a), as_extension(&ext_b)]); + let sandbox = sample_sandbox(); + + registry + .after_delete(&sandbox, &PathBuf::from("/tmp/state")) + .await; + + assert_eq!(ext_a.calls(), vec!["a:after_delete"]); + assert_eq!(ext_b.calls(), vec!["b:after_delete"]); + } + + #[test] + fn resource_exhausted_flag_round_trips() { + let err = LifecycleError::resource_exhausted("pool empty"); + assert!(err.is_resource_exhausted()); + assert_eq!(err.message(), "pool empty"); + + let plain = LifecycleError::new("internal"); + assert!(!plain.is_resource_exhausted()); + } + + #[test] + fn extension_state_dir_rejects_path_unsafe_names() { + let base = PathBuf::from("/tmp/sandbox"); + assert_eq!( + extension_state_dir(&base, "vfio").unwrap(), + base.join("extensions").join("vfio") + ); + assert!(extension_state_dir(&base, "../vfio").is_err()); + assert!(extension_state_dir(&base, "").is_err()); + } + + #[test] + fn validate_rejects_duplicate_extension_names() { + let registry = LifecycleExtensionRegistry::with(vec![ + RecordingExtension::new("dup"), + RecordingExtension::new("dup"), + ]); + let err = registry + .validate() + .expect_err("duplicate names should fail"); + assert!(err.message().contains("duplicate")); + } + + #[test] + fn descriptor_tracks_provided_capabilities_and_requirements() { + let ext = RecordingExtension::new("vfio"); + let registry = LifecycleExtensionRegistry::with(vec![ext]); + + let descriptors = registry.descriptors(); + assert_eq!(descriptors.len(), 1); + assert_eq!(descriptors[0].name, "vfio"); + assert!(descriptors[0].required_backends.is_empty()); + assert_eq!( + descriptors[0].required_backend_features, + vec![BackendFeature::GuestInitDropins] + ); + assert_eq!( + descriptors[0].provides.kernel_profiles, + vec!["profile-vfio".to_string()] + ); + assert_eq!( + descriptors[0].provides.guest_init_dropins, + vec!["50-vfio.sh".to_string()] + ); + } +} diff --git a/crates/openshell-driver-vm/src/main.rs 
b/crates/openshell-driver-vm/src/main.rs index 8f662dc76..57db7b64b 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -36,6 +36,9 @@ struct Args { #[arg(long = "vm-image-disk", hide = true)] vm_image_disk: Option, + #[arg(long = "vm-kernel-image", hide = true)] + vm_kernel_image: Option, + #[arg(long, hide = true)] vm_exec: Option, @@ -482,6 +485,7 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result, + pub kernel_image: Option, pub vcpus: u8, pub mem_mib: u32, pub exec_path: String, @@ -126,12 +127,15 @@ fn run_qemu_vm(config: &VmLaunchConfig) -> Result<(), String> { let guest_env = qemu_guest_env_vars(config, host_dns_server()); write_guest_env_file(&config.overlay_disk, &guest_env)?; - let runtime_dir = qemu_runtime_dir()?; let gw_port = config.gateway_port.unwrap_or(0); setup_tap_networking(tap_device, host_ip, gw_port)?; let mut tap_guard = TapGuard::new(tap_device.to_string(), host_ip.to_string(), gw_port); - let vmlinux = runtime_dir.join("vmlinux"); + let vmlinux = if let Some(kernel_image) = &config.kernel_image { + kernel_image.clone() + } else { + qemu_runtime_dir()?.join("vmlinux") + }; if !vmlinux.is_file() { return Err(format!("VM kernel not found: {}", vmlinux.display())); } @@ -648,6 +652,12 @@ fn procguard_kill_children() { } fn run_libkrun_vm(config: &VmLaunchConfig) -> Result<(), String> { + if let Some(kernel_image) = &config.kernel_image { + return Err(format!( + "selected kernel image is not supported by this VM backend: {}", + kernel_image.display() + )); + } if !config.root_disk.is_file() { return Err(format!( "root disk image not found: {}", @@ -1397,6 +1407,7 @@ mod tests { root_disk: PathBuf::from("/rootfs.ext4"), overlay_disk: PathBuf::from("/overlay.ext4"), image_disk: None, + kernel_image: None, vcpus: 2, mem_mib: 2048, exec_path: "/srv/openshell-vm-sandbox-init.sh".to_string(),