From da57ba3aa4cf3cf547dd9a4d680286e29736399d Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 24 Mar 2026 23:11:44 -0700 Subject: [PATCH 01/10] feat(vm): add openshell-vm crate with libkrun microVM gateway --- .gitignore | 3 + Cargo.lock | 116 +- architecture/custom-vm-runtime.md | 140 ++ architecture/gateway-single-node.md | 1 + crates/openshell-bootstrap/src/lib.rs | 8 +- crates/openshell-bootstrap/src/paths.rs | 12 +- crates/openshell-core/src/paths.rs | 13 + crates/openshell-vm/Cargo.toml | 36 + crates/openshell-vm/entitlements.plist | 8 + crates/openshell-vm/runtime/README.md | 169 ++ .../runtime/build-custom-libkrunfw.sh | 385 +++++ .../runtime/kernel/bridge-cni.config | 119 ++ crates/openshell-vm/scripts/api-proxy.py | 132 ++ crates/openshell-vm/scripts/build-rootfs.sh | 626 +++++++ .../scripts/check-vm-capabilities.sh | 234 +++ crates/openshell-vm/scripts/hello-server.py | 49 + .../scripts/openshell-vm-exec-agent.py | 173 ++ .../openshell-vm/scripts/openshell-vm-init.sh | 692 ++++++++ crates/openshell-vm/src/exec.rs | 534 ++++++ crates/openshell-vm/src/ffi.rs | 312 ++++ crates/openshell-vm/src/lib.rs | 1454 +++++++++++++++++ crates/openshell-vm/src/main.rs | 197 +++ .../openshell-vm/tests/gateway_integration.rs | 154 ++ .../helm/openshell/templates/statefulset.yaml | 13 + deploy/helm/openshell/values.yaml | 19 + .../kube/manifests/openshell-helmchart.yaml | 8 +- scripts/bin/openshell-vm | 25 + tasks/rust.toml | 4 +- tasks/scripts/bundle-vm-runtime.sh | 122 ++ tasks/scripts/codesign-openshell-vm.sh | 12 + tasks/scripts/ensure-vm-rootfs.sh | 17 + tasks/scripts/package-openshell-vm-runtime.sh | 27 + tasks/scripts/run-vm.sh | 15 + tasks/scripts/sync-vm-rootfs.sh | 91 ++ tasks/test.toml | 8 +- tasks/vm.toml | 69 + 36 files changed, 5944 insertions(+), 53 deletions(-) create mode 100644 architecture/custom-vm-runtime.md create mode 100644 crates/openshell-vm/Cargo.toml create mode 100644 crates/openshell-vm/entitlements.plist create mode 100644 
crates/openshell-vm/runtime/README.md create mode 100755 crates/openshell-vm/runtime/build-custom-libkrunfw.sh create mode 100644 crates/openshell-vm/runtime/kernel/bridge-cni.config create mode 100644 crates/openshell-vm/scripts/api-proxy.py create mode 100755 crates/openshell-vm/scripts/build-rootfs.sh create mode 100755 crates/openshell-vm/scripts/check-vm-capabilities.sh create mode 100644 crates/openshell-vm/scripts/hello-server.py create mode 100644 crates/openshell-vm/scripts/openshell-vm-exec-agent.py create mode 100755 crates/openshell-vm/scripts/openshell-vm-init.sh create mode 100644 crates/openshell-vm/src/exec.rs create mode 100644 crates/openshell-vm/src/ffi.rs create mode 100644 crates/openshell-vm/src/lib.rs create mode 100644 crates/openshell-vm/src/main.rs create mode 100644 crates/openshell-vm/tests/gateway_integration.rs create mode 100755 scripts/bin/openshell-vm create mode 100755 tasks/scripts/bundle-vm-runtime.sh create mode 100755 tasks/scripts/codesign-openshell-vm.sh create mode 100755 tasks/scripts/ensure-vm-rootfs.sh create mode 100755 tasks/scripts/package-openshell-vm-runtime.sh create mode 100755 tasks/scripts/run-vm.sh create mode 100755 tasks/scripts/sync-vm-rootfs.sh create mode 100644 tasks/vm.toml diff --git a/.gitignore b/.gitignore index 32610f714..145c30695 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,9 @@ kubeconfig # Documentation build output _build/ +# Gateway microVM rootfs build artifacts +rootfs/ + # Docker build artifacts (image tarballs, packaged helm charts) deploy/docker/.build/ diff --git a/Cargo.lock b/Cargo.lock index 9d8247e5d..ec73e4a76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -106,9 +106,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" 
[[package]] name = "anstyle-parse" @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.16.1" +version = "1.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" dependencies = [ "aws-lc-sys", "untrusted 0.7.1", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.38.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" dependencies = [ "cc", "cmake", @@ -488,9 +488,9 @@ dependencies = [ [[package]] name = "bollard" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "227aa051deec8d16bd9c34605e7aaf153f240e35483dd42f6f78903847934738" +checksum = "ee04c4c84f1f811b017f2fbb7dd8815c976e7ca98593de9c1e2afad0f636bff4" dependencies = [ "base64 0.22.1", "bollard-stubs", @@ -584,9 +584,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.56" +version = "1.2.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", "jobserver", @@ -710,9 +710,9 @@ checksum = "5417da527aa9bf6a1e10a781231effd1edd3ee82f27d5f8529ac9b279babce96" [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "compact_str" @@ -1125,9 +1125,9 @@ dependencies = [ 
[[package]] name = "digest" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "285743a676ccb6b3e116bc14cc69319b957867930ae9c4822f8e0f54509d7243" +checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" dependencies = [ "block-buffer 0.12.0", "const-oid 0.10.2", @@ -2112,7 +2112,7 @@ checksum = "fe44f2bbd99fcb302e246e2d6bcf51aeda346d02a365f80296a07a8c711b6da6" dependencies = [ "argon2", "bcrypt-pbkdf", - "digest 0.11.1", + "digest 0.11.2", "ecdsa", "ed25519-dalek", "hex", @@ -2143,9 +2143,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb" dependencies = [ "memchr", "serde", @@ -2201,9 +2201,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jobserver" @@ -2468,6 +2468,16 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libm" version = "0.2.16" @@ -2476,9 +2486,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libredox" -version = "0.1.14" +version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ "bitflags", "libc", @@ -2724,9 +2734,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-integer" @@ -3046,6 +3056,24 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-vm" +version = "0.0.0" +dependencies = [ + "base64 0.22.1", + "clap", + "libc", + "libloading", + "miette", + "nix", + "openshell-bootstrap", + "serde", + "serde_json", + "thiserror 2.0.18", + "tracing", + "tracing-subscriber", +] + [[package]] name = "openssh" version = "0.11.6" @@ -3899,7 +3927,7 @@ dependencies = [ "const-oid 0.10.2", "crypto-bigint 0.7.0-rc.18", "crypto-primes", - "digest 0.11.1", + "digest 0.11.2", "pkcs1 0.8.0-rc.4", "pkcs8 0.11.0-rc.11", "rand_core 0.10.0-rc-3", @@ -4091,9 +4119,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.9" +version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ "ring", "rustls-pki-types", @@ -4373,7 +4401,7 @@ checksum = "3b167252f3c126be0d8926639c4c4706950f01445900c4b3db0fd7e89fcb750a" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.11.1", + "digest 0.11.2", ] [[package]] @@ -4395,7 +4423,7 @@ checksum = "7c5f3b1e2dc8aad28310d8410bd4d7e180eca65fca176c52ab00d364475d0024" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.11.1", + "digest 0.11.2", ] [[package]] @@ -4473,7 +4501,7 @@ version = "3.0.0-rc.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "597a96996ccff7dfa16f052bd995b4cecc72af22c35138738dc029f0ead6608d" dependencies = [ - "digest 0.11.1", + "digest 0.11.2", "rand_core 0.10.0-rc-3", ] @@ -4898,9 +4926,9 @@ dependencies = [ [[package]] name = "tar" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" dependencies = [ "filetime", "libc", @@ -4957,12 +4985,12 @@ dependencies = [ [[package]] name = "terminal_size" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" +checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix 1.1.4", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5067,9 +5095,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" dependencies = [ "tinyvec_macros", ] @@ -5393,9 +5421,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ "matchers", "nu-ansi-term", @@ -5501,9 +5529,9 @@ checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "da36089a805484bcccfffe0739803392c8298778a2d2f09febf76fac5ad9025b" [[package]] name = "unicode-truncate" @@ -6331,18 +6359,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.42" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.42" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" dependencies = [ "proc-macro2", "quote", diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md new file mode 100644 index 000000000..e7a7d73ae --- /dev/null +++ b/architecture/custom-vm-runtime.md @@ -0,0 +1,140 @@ +# Custom libkrunfw VM Runtime + +## Overview + +The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a +lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel +is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. + +The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. + +The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to +the VM kernel, enabling standard Kubernetes networking. 
+ +## Architecture + +``` +Host (macOS/Linux) +├── openshell-vm binary +│ ├── Loads libkrun.dylib (VMM) +│ ├── Preloads libkrunfw.dylib (kernel) +│ └── Logs runtime provenance +├── openshell-vm.runtime/ (sidecar bundle) +│ ├── libkrun.dylib +│ ├── libkrunfw.dylib (stock or custom) +│ ├── gvproxy +│ ├── manifest.json +│ └── provenance.json (custom only) +└── gvproxy (networking) + +Guest VM +├── openshell-vm-init.sh (PID 1) +│ ├── Validates kernel capabilities (fail-fast) +│ ├── Configures bridge CNI +│ ├── Starts openshell-vm-exec-agent.py on vsock port 10777 +│ └── Execs k3s server +├── openshell-vm-exec-agent.py (guest exec agent) +└── check-vm-capabilities.sh (diagnostics) +``` + +## Network Profile + +The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and +netfilter kernel support. The init script validates these capabilities at boot and fails +fast with an actionable error if they are missing. + +### Bridge Profile + +- CNI: bridge plugin with `cni0` interface +- IP masquerade: enabled (iptables-legacy via CNI bridge plugin) +- kube-proxy: enabled (nftables mode) +- Service VIPs: functional (ClusterIP, NodePort) +- hostNetwork workarounds: not required + +## Runtime Provenance + +At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: + +- Library paths and SHA-256 hashes +- Whether the runtime is custom-built or stock +- For custom runtimes: libkrunfw commit, kernel version, build timestamp + +This information is sourced from `provenance.json` (generated by the build script) +and makes it straightforward to correlate VM behavior with a specific runtime artifact. 
+ +## Build Pipeline + +``` +crates/openshell-vm/runtime/ +├── build-custom-libkrunfw.sh # Clones libkrunfw, applies config, builds +├── kernel/ +│ └── bridge-cni.config # Kernel config fragment +└── README.md # Operator documentation + +Output: target/custom-runtime/ +├── libkrunfw.dylib # Custom library +├── provenance.json # Build metadata +├── bridge-cni.config # Config fragment used +└── kernel.config # Full kernel .config +``` + +## Kernel Config Fragment + +The `bridge-cni.config` fragment enables these kernel features on top of the stock +libkrunfw kernel: + +| Feature | Config | Purpose | +|---------|--------|---------| +| Bridge device | `CONFIG_BRIDGE` | cni0 bridge for pod networking | +| Bridge netfilter | `CONFIG_BRIDGE_NETFILTER` | kube-proxy visibility into bridge traffic | +| Netfilter | `CONFIG_NETFILTER` | iptables/nftables framework | +| Connection tracking | `CONFIG_NF_CONNTRACK` | NAT state tracking | +| NAT | `CONFIG_NF_NAT` | Service VIP DNAT/SNAT | +| iptables | `CONFIG_IP_NF_IPTABLES` | CNI bridge masquerade | +| nftables | `CONFIG_NF_TABLES` | kube-proxy nftables mode (primary) | +| veth | `CONFIG_VETH` | Pod network namespace pairs | +| IPVS | `CONFIG_IP_VS` | kube-proxy IPVS mode (optional) | +| Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | +| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | + +## Verification + +One verification tool is provided: + +1. **Capability checker** (`check-vm-capabilities.sh`): Runs inside the VM to verify + kernel capabilities. Produces pass/fail results for each required feature. + +## Running Commands In A Live VM + +The standalone `openshell-vm` binary supports `openshell-vm exec -- ` for a running VM. 
+ +- The host stores local VM runtime state next to the rootfs artifacts +- libkrun maps a per-rootfs host unix socket into the guest on vsock port `10777` +- `openshell-vm-init.sh` starts `openshell-vm-exec-agent.py` during boot +- `openshell-vm exec` connects to the host socket, which libkrun forwards into the guest agent +- The guest agent spawns the command, then streams stdout, stderr, and exit status back + +`openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style +commands work the same way they would inside the VM shell. + +## Rollout Strategy + +1. Custom runtime support is opt-in via `OPENSHELL_VM_RUNTIME_SOURCE_DIR`. +2. The init script validates kernel capabilities at boot and fails fast if missing. +3. Rollback: unset the env var and re-bundle with stock libraries (note: stock + libraries lack bridge/netfilter and pod networking will not work). + +## Related Files + +| File | Purpose | +|------|---------| +| `crates/openshell-vm/src/ffi.rs` | Runtime loading, provenance capture | +| `crates/openshell-vm/src/lib.rs` | VM launch, provenance logging | +| `crates/openshell-vm/src/exec.rs` | Runtime state tracking and host-side exec transport | +| `crates/openshell-vm/scripts/openshell-vm-init.sh` | Guest init, network profile selection | +| `crates/openshell-vm/scripts/openshell-vm-exec-agent.py` | Guest-side exec agent | +| `crates/openshell-vm/scripts/check-vm-capabilities.sh` | Kernel capability checker | +| `crates/openshell-vm/runtime/` | Build pipeline and kernel config | +| `tasks/scripts/bundle-vm-runtime.sh` | Runtime bundling (stock + custom) | +| `tasks/vm.toml` | Mise task definitions | diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 57aebd3a5..d5ed49943 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -49,6 +49,7 @@ All gateway lifecycle commands live under `openshell gateway`: | `openshell status` | Show gateway 
health via gRPC/HTTP | | `openshell doctor logs [--name NAME] [--remote user@host] [--tail N]` | Fetch gateway container logs | | `openshell doctor exec [--name NAME] [--remote user@host] -- ` | Run a command inside the gateway container | +| `gateway exec [--workdir DIR] [--env KEY=VALUE] -- ` | Run a command inside the standalone gateway microVM | | `openshell gateway select ` | Set the active gateway | | `openshell gateway select` | Open an interactive chooser on a TTY, or list all gateways in non-interactive mode | diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 938986757..805c101bf 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -6,12 +6,12 @@ pub mod edge_token; pub mod errors; pub mod image; -mod constants; +pub mod constants; mod docker; mod metadata; -mod mtls; -mod paths; -mod pki; +pub mod mtls; +pub mod paths; +pub mod pki; pub(crate) mod push; mod runtime; diff --git a/crates/openshell-bootstrap/src/paths.rs b/crates/openshell-bootstrap/src/paths.rs index cd3cb7693..529eab87a 100644 --- a/crates/openshell-bootstrap/src/paths.rs +++ b/crates/openshell-bootstrap/src/paths.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use miette::Result; -use openshell_core::paths::xdg_config_dir; +use openshell_core::paths::{xdg_config_dir, xdg_data_dir}; use std::path::PathBuf; /// Path to the file that stores the active gateway name. @@ -26,6 +26,16 @@ pub fn last_sandbox_path(gateway: &str) -> Result { Ok(gateways_dir()?.join(gateway).join("last_sandbox")) } +/// Default rootfs directory for gateway microVMs. +/// +/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/rootfs` +pub fn default_rootfs_dir() -> Result { + Ok(xdg_data_dir()? 
+ .join("openshell") + .join("openshell-vm") + .join("rootfs")) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/openshell-core/src/paths.rs b/crates/openshell-core/src/paths.rs index bd9ce23d4..fd0a141b3 100644 --- a/crates/openshell-core/src/paths.rs +++ b/crates/openshell-core/src/paths.rs @@ -29,6 +29,19 @@ pub fn openshell_config_dir() -> Result { Ok(xdg_config_dir()?.join("openshell")) } +/// Resolve the XDG data base directory. +/// +/// Returns `$XDG_DATA_HOME` if set, otherwise `$HOME/.local/share`. +pub fn xdg_data_dir() -> Result { + if let Ok(path) = std::env::var("XDG_DATA_HOME") { + return Ok(PathBuf::from(path)); + } + let home = std::env::var("HOME") + .into_diagnostic() + .wrap_err("HOME is not set")?; + Ok(PathBuf::from(home).join(".local").join("share")) +} + /// Create a directory (and parents) with owner-only permissions (`0o700`) on /// Unix. On non-Unix platforms, falls back to default permissions. /// diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml new file mode 100644 index 000000000..71800c684 --- /dev/null +++ b/crates/openshell-vm/Cargo.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-vm" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "MicroVM runtime using libkrun for hardware-isolated execution" + +[lib] +name = "openshell_vm" +path = "src/lib.rs" + +[[bin]] +name = "openshell-vm" +path = "src/main.rs" + +[dependencies] +base64 = "0.22" +clap = { workspace = true } +libc = "0.2" +libloading = "0.8" +miette = { workspace = true } +nix = { workspace = true } +openshell-bootstrap = { path = "../openshell-bootstrap" } +serde = { workspace = true } +serde_json = "1" +thiserror = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } + +[lints] +workspace = true diff --git a/crates/openshell-vm/entitlements.plist b/crates/openshell-vm/entitlements.plist new file mode 100644 index 000000000..154f3308e --- /dev/null +++ b/crates/openshell-vm/entitlements.plist @@ -0,0 +1,8 @@ + + + + + com.apple.security.hypervisor + + + diff --git a/crates/openshell-vm/runtime/README.md b/crates/openshell-vm/runtime/README.md new file mode 100644 index 000000000..52bb1382d --- /dev/null +++ b/crates/openshell-vm/runtime/README.md @@ -0,0 +1,169 @@ +# Custom libkrunfw Runtime + +This directory contains the build infrastructure for a custom `libkrunfw` runtime +that enables bridge CNI and netfilter support in the OpenShell gateway VM. + +## Why + +The stock `libkrunfw` (from Homebrew) ships a kernel without bridge, netfilter, +or conntrack support. This means the VM cannot: + +- Create `cni0` bridge interfaces (required by the bridge CNI plugin) +- Run kube-proxy (requires nftables) +- Route service VIP traffic (requires NAT/conntrack) + +The custom runtime builds libkrunfw with an additional kernel config fragment +that enables these networking and sandboxing features. 
+ +## Directory Structure + +``` +runtime/ + build-custom-libkrunfw.sh # Build script for custom libkrunfw + kernel/ + bridge-cni.config # Kernel config fragment (networking + sandboxing) +``` + +## Building + +### Prerequisites + +- Rust toolchain +- make, git, curl +- On macOS: Xcode command line tools and cross-compilation tools for aarch64 + +### Quick Build + +```bash +# Build custom libkrunfw (clones libkrunfw repo, applies config, builds) +./crates/openshell-vm/runtime/build-custom-libkrunfw.sh + +# Or via mise task: +mise run vm:build-custom-runtime +``` + +### Output + +Build artifacts are placed in `target/custom-runtime/`: + +``` +target/custom-runtime/ + libkrunfw.dylib # The custom library + libkrunfw..dylib # Version-suffixed copy + provenance.json # Build metadata (commit, hash, timestamp) + bridge-cni.config # The config fragment used + kernel.config # Full kernel .config (for debugging) +``` + +### Using the Custom Runtime + +```bash +# Point the bundle script at the custom build: +export OPENSHELL_VM_RUNTIME_SOURCE_DIR=target/custom-runtime +mise run vm:bundle-runtime + +# Then boot the VM as usual: +mise run vm +``` + +## Networking + +The VM uses bridge CNI for pod networking with nftables-mode kube-proxy for +service VIP / ClusterIP support. The kernel config fragment enables both +iptables (for CNI bridge masquerade) and nftables (for kube-proxy). + +k3s is started with `--kube-proxy-arg=proxy-mode=nftables` because the +bundled iptables binaries in k3s have revision-negotiation issues with the +libkrun kernel's xt_MARK module. nftables mode uses the kernel's nf_tables +subsystem directly and avoids this entirely. + +## Runtime Provenance + +At VM boot, the openshell-vm binary logs provenance information about the loaded +runtime: + +``` +runtime: /path/to/openshell-vm.runtime + libkrunfw: libkrunfw.dylib + sha256: a1b2c3d4e5f6... 
+ type: custom (OpenShell-built) + libkrunfw-commit: abc1234 + kernel-version: 6.6.30 + build-timestamp: 2026-03-23T10:00:00Z +``` + +For stock runtimes: +``` +runtime: /path/to/openshell-vm.runtime + libkrunfw: libkrunfw.dylib + sha256: f6e5d4c3b2a1... + type: stock (system/homebrew) +``` + +## Verification + +### Capability Check (inside VM) + +```bash +# Run inside the VM to verify kernel capabilities: +/srv/check-vm-capabilities.sh + +# JSON output for CI: +/srv/check-vm-capabilities.sh --json +``` + +### Rollback + +To revert to the stock runtime: + +```bash +# Unset the custom runtime source: +unset OPENSHELL_VM_RUNTIME_SOURCE_DIR + +# Re-bundle with stock libraries: +mise run vm:bundle-runtime + +# Boot — will auto-detect legacy-vm-net profile: +mise run vm +``` + +## Troubleshooting + +### "FailedCreatePodSandBox" bridge errors + +The kernel does not have bridge support. Verify: +```bash +# Inside VM: +ip link add test0 type bridge && echo "bridge OK" && ip link del test0 +``` + +If this fails, you are running the stock runtime. Build and use the custom one. + +### kube-proxy CrashLoopBackOff + +kube-proxy runs in nftables mode. If it crashes, verify nftables support: +```bash +# Inside VM: +nft list ruleset +``` + +If this fails, the kernel may lack `CONFIG_NF_TABLES`. Use the custom runtime. + +Common errors: +- `unknown option "--xor-mark"`: kube-proxy is running in iptables mode instead + of nftables. Verify `--kube-proxy-arg=proxy-mode=nftables` is in the k3s args. + +### Runtime mismatch after upgrade + +If libkrunfw is updated (e.g., via `brew upgrade`), the stock runtime may +change. 
Check provenance: +```bash +# Look for provenance info in VM boot output +grep "runtime:" ~/.local/share/openshell/openshell-vm/console.log +``` + +Re-build the custom runtime if needed: +```bash +mise run vm:build-custom-runtime +mise run vm:bundle-runtime +``` diff --git a/crates/openshell-vm/runtime/build-custom-libkrunfw.sh b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh new file mode 100755 index 000000000..a69fc0c13 --- /dev/null +++ b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh @@ -0,0 +1,385 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build a custom libkrunfw with bridge/netfilter kernel support. +# +# This script clones libkrunfw, applies the OpenShell kernel config +# fragment (bridge CNI, iptables, conntrack), builds the library, and +# stages the artifact with provenance metadata. +# +# Prerequisites: +# - Rust toolchain (cargo) +# - make, git, curl +# - Cross-compilation toolchain for aarch64 (if building on x86_64) +# - On macOS: Xcode command line tools +# +# Usage: +# ./build-custom-libkrunfw.sh [--output-dir DIR] [--libkrunfw-ref REF] +# +# Environment: +# LIBKRUNFW_REF - git ref to check out (default: main) +# LIBKRUNFW_REPO - git repo URL (default: github.com/containers/libkrunfw) +# OPENSHELL_RUNTIME_OUTPUT_DIR - output directory for built artifacts + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" +KERNEL_CONFIG_FRAGMENT="${SCRIPT_DIR}/kernel/bridge-cni.config" + +# Defaults +LIBKRUNFW_REPO="${LIBKRUNFW_REPO:-https://github.com/containers/libkrunfw.git}" +LIBKRUNFW_REF="${LIBKRUNFW_REF:-main}" +OUTPUT_DIR="${OPENSHELL_RUNTIME_OUTPUT_DIR:-${PROJECT_ROOT}/target/custom-runtime}" +BUILD_DIR="${PROJECT_ROOT}/target/libkrunfw-build" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) + OUTPUT_DIR="$2"; shift 2 ;; + --libkrunfw-ref) + LIBKRUNFW_REF="$2"; shift 2 ;; + --help|-h) + echo "Usage: $0 [--output-dir DIR] [--libkrunfw-ref REF]" + echo "" + echo "Build a custom libkrunfw with bridge/netfilter kernel support." + echo "" + echo "Options:" + echo " --output-dir DIR Output directory for built artifacts" + echo " --libkrunfw-ref REF Git ref to check out (default: main)" + echo "" + echo "Environment:" + echo " LIBKRUNFW_REPO Git repo URL" + echo " LIBKRUNFW_REF Git ref (branch/tag/commit)" + echo " OPENSHELL_RUNTIME_OUTPUT_DIR Output directory" + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2; exit 1 ;; + esac +done + +echo "==> Building custom libkrunfw" +echo " Repo: ${LIBKRUNFW_REPO}" +echo " Ref: ${LIBKRUNFW_REF}" +echo " Config fragment: ${KERNEL_CONFIG_FRAGMENT}" +echo " Output: ${OUTPUT_DIR}" +echo "" + +# ── Clone / update libkrunfw ──────────────────────────────────────────── + +if [ -d "${BUILD_DIR}/libkrunfw/.git" ]; then + echo "==> Updating existing libkrunfw checkout..." + git -C "${BUILD_DIR}/libkrunfw" fetch origin + git -C "${BUILD_DIR}/libkrunfw" checkout "${LIBKRUNFW_REF}" + git -C "${BUILD_DIR}/libkrunfw" pull --ff-only 2>/dev/null || true +else + echo "==> Cloning libkrunfw..." 
+ mkdir -p "${BUILD_DIR}" + git clone "${LIBKRUNFW_REPO}" "${BUILD_DIR}/libkrunfw" + git -C "${BUILD_DIR}/libkrunfw" checkout "${LIBKRUNFW_REF}" +fi + +LIBKRUNFW_DIR="${BUILD_DIR}/libkrunfw" +LIBKRUNFW_COMMIT=$(git -C "${LIBKRUNFW_DIR}" rev-parse HEAD) +LIBKRUNFW_SHORT=$(git -C "${LIBKRUNFW_DIR}" rev-parse --short HEAD) + +echo " Commit: ${LIBKRUNFW_COMMIT}" + +# ── Detect the kernel version libkrunfw targets ──────────────────────── + +# libkrunfw's Makefile typically sets KERNEL_VERSION or has it in a +# config file. Try to detect it. +KERNEL_VERSION="" +if [ -f "${LIBKRUNFW_DIR}/Makefile" ]; then + KERNEL_VERSION=$(grep -oE 'KERNEL_VERSION\s*=\s*linux-[^\s]+' "${LIBKRUNFW_DIR}/Makefile" 2>/dev/null | head -1 | sed 's/.*= *//' || true) +fi +if [ -z "$KERNEL_VERSION" ] && [ -f "${LIBKRUNFW_DIR}/kernel_version" ]; then + KERNEL_VERSION=$(cat "${LIBKRUNFW_DIR}/kernel_version") +fi +echo " Kernel version: ${KERNEL_VERSION:-unknown}" + +# ── Apply kernel config fragment ──────────────────────────────────────── + +echo "==> Applying OpenShell kernel config fragment..." + +# libkrunfw builds the kernel with a config generated from its own +# sources. The config merge happens after `make olddefconfig` runs +# on the base config. We use the kernel's scripts/kconfig/merge_config.sh +# when available, otherwise do a simple append+olddefconfig. + +MERGE_HOOK="${LIBKRUNFW_DIR}/openshell-kconfig-hook.sh" +cat > "${MERGE_HOOK}" << 'HOOKEOF' +#!/usr/bin/env bash +# Hook called by the libkrunfw build after extracting the kernel source. +# Merges the OpenShell kernel config fragment into .config. +set -euo pipefail + +KERNEL_DIR="$1" +FRAGMENT="$2" + +if [ ! -d "$KERNEL_DIR" ]; then + echo "ERROR: kernel source dir not found: $KERNEL_DIR" >&2 + exit 1 +fi + +if [ ! 
-f "$FRAGMENT" ]; then + echo "ERROR: config fragment not found: $FRAGMENT" >&2 + exit 1 +fi + +cd "$KERNEL_DIR" + +if [ -f scripts/kconfig/merge_config.sh ]; then + echo " Using kernel merge_config.sh" + KCONFIG_CONFIG=.config ./scripts/kconfig/merge_config.sh -m .config "$FRAGMENT" +else + echo " Appending fragment and running olddefconfig" + cat "$FRAGMENT" >> .config +fi + +make ARCH=arm64 olddefconfig + +# Verify critical configs are set +REQUIRED=( + CONFIG_BRIDGE + CONFIG_BRIDGE_NETFILTER + CONFIG_NETFILTER + CONFIG_NF_CONNTRACK + CONFIG_NF_NAT + CONFIG_IP_NF_IPTABLES + CONFIG_IP_NF_FILTER + CONFIG_IP_NF_NAT + CONFIG_NF_TABLES + CONFIG_NFT_NUMGEN + CONFIG_NFT_FIB_IPV4 + CONFIG_NFT_FIB_IPV6 + CONFIG_NFT_CT + CONFIG_NFT_NAT + CONFIG_NFT_MASQ + CONFIG_VETH + CONFIG_NET_NS +) + +MISSING=() +for cfg in "${REQUIRED[@]}"; do + if ! grep -q "^${cfg}=[ym]" .config; then + MISSING+=("$cfg") + fi +done + +if [ ${#MISSING[@]} -gt 0 ]; then + echo "ERROR: Required kernel configs not set after merge:" >&2 + printf " %s\n" "${MISSING[@]}" >&2 + exit 1 +fi + +echo " All required kernel configs verified." +HOOKEOF +chmod +x "${MERGE_HOOK}" + +# ── Build libkrunfw ──────────────────────────────────────────────────── + +echo "==> Building libkrunfw (this may take 10-30 minutes)..." + +cd "${LIBKRUNFW_DIR}" + +# Detect macOS vs Linux and pick the right library extension / target +if [ "$(uname -s)" = "Darwin" ]; then + LIB_EXT="dylib" +else + LIB_EXT="so" +fi + +# Detect the kernel source directory name from the Makefile +KERNEL_DIR_NAME=$(grep -oE 'KERNEL_VERSION\s*=\s*linux-[^\s]+' Makefile | head -1 | sed 's/KERNEL_VERSION *= *//') +if [ -z "$KERNEL_DIR_NAME" ]; then + echo "ERROR: Could not detect KERNEL_VERSION from Makefile" >&2 + exit 1 +fi +echo " Kernel source dir: ${KERNEL_DIR_NAME}" + +if [ "$(uname -s)" = "Darwin" ]; then + # On macOS, use krunvm to build the kernel inside a lightweight Linux VM. 
+ # This matches the upstream libkrunfw build approach and avoids all the + # issues with Docker emulation and APFS filesystem limitations. + # + # Prerequisites: brew tap slp/krun && brew install krunvm + + if ! command -v krunvm &>/dev/null; then + echo "ERROR: krunvm is required to build the kernel on macOS" >&2 + echo " Install with: brew tap slp/krun && brew install krunvm" >&2 + exit 1 + fi + + echo "==> Building kernel inside krunvm (macOS detected)..." + + VM_NAME="libkrunfw-openshell" + + # Clean up any leftover VM from a previous failed run + krunvm delete "${VM_NAME}" 2>/dev/null || true + + # Copy the config fragment into the libkrunfw tree so the VM can see it. + # The merge hook (MERGE_HOOK) is already written there by the cat above. + cp -f "${KERNEL_CONFIG_FRAGMENT}" "${LIBKRUNFW_DIR}/openshell-bridge-cni.config" + + echo " Creating VM..." + # krunvm may print "The volume has been configured" on first use of a + # volume path and exit non-zero. Retry once if that happens. + if ! krunvm create fedora \ + --name "${VM_NAME}" \ + --cpus 4 \ + --mem 4096 \ + -v "${LIBKRUNFW_DIR}:/work" \ + -w /work; then + echo " Retrying VM creation..." + krunvm create fedora \ + --name "${VM_NAME}" \ + --cpus 4 \ + --mem 4096 \ + -v "${LIBKRUNFW_DIR}:/work" \ + -w /work + fi + + echo " Installing build dependencies..." + krunvm start "${VM_NAME}" /usr/bin/dnf -- install -y \ + 'dnf-command(builddep)' python3-pyelftools + + krunvm start "${VM_NAME}" /usr/bin/dnf -- builddep -y kernel + + # Step 1: prepare kernel sources (download, extract, patch, base config) + echo " Preparing kernel sources..." + krunvm start "${VM_NAME}" /usr/bin/make -- "${KERNEL_DIR_NAME}" + + # Step 2: merge the OpenShell config fragment + echo " Merging OpenShell kernel config fragment..." 
+ krunvm start "${VM_NAME}" /usr/bin/bash -- \ + /work/openshell-kconfig-hook.sh "/work/${KERNEL_DIR_NAME}" /work/openshell-bridge-cni.config + + # Step 3: build the kernel and generate the C bundle + echo " Building kernel (this is the slow part)..." + krunvm start "${VM_NAME}" /usr/bin/make -- -j4 + + echo " Cleaning up VM..." + krunvm delete "${VM_NAME}" + + # Clean up temp files from the libkrunfw tree + rm -f "${LIBKRUNFW_DIR}/openshell-bridge-cni.config" + + if [ ! -f "${LIBKRUNFW_DIR}/kernel.c" ]; then + echo "ERROR: kernel.c was not produced — build failed" >&2 + exit 1 + fi + + # Compile the shared library on the host (uses host cc for a .dylib) + echo "==> Compiling libkrunfw.dylib on host..." + ABI_VERSION=$(grep -oE 'ABI_VERSION\s*=\s*[0-9]+' Makefile | head -1 | sed 's/[^0-9]//g') + cc -fPIC -DABI_VERSION="${ABI_VERSION}" -shared -o "libkrunfw.${ABI_VERSION}.dylib" kernel.c +else + # On Linux, we can do everything natively in three steps: + + # Step 1: prepare kernel sources + echo " Preparing kernel sources..." + make "${KERNEL_DIR_NAME}" + + # Step 2: merge config fragment + echo "==> Merging OpenShell kernel config fragment..." + bash "${MERGE_HOOK}" "${LIBKRUNFW_DIR}/${KERNEL_DIR_NAME}" "${KERNEL_CONFIG_FRAGMENT}" + + # Step 3: build the kernel and shared library + make -j"$(nproc)" "$(grep -oE 'KRUNFW_BINARY_Linux\s*=\s*\S+' Makefile | head -1 | sed 's/[^=]*= *//')" || \ + make -j"$(nproc)" libkrunfw.so +fi + +# ── Stage output artifacts ────────────────────────────────────────────── + +echo "==> Staging artifacts..." +mkdir -p "${OUTPUT_DIR}" + +# Find the built library — check versioned names (e.g. 
libkrunfw.5.dylib) first +BUILT_LIB="" +for candidate in \ + "${LIBKRUNFW_DIR}"/libkrunfw*.${LIB_EXT} \ + "${LIBKRUNFW_DIR}/libkrunfw.${LIB_EXT}" \ + "${LIBKRUNFW_DIR}/target/release/libkrunfw.${LIB_EXT}" \ + "${LIBKRUNFW_DIR}/build/libkrunfw.${LIB_EXT}"; do + if [ -f "$candidate" ]; then + BUILT_LIB="$candidate" + break + fi +done + +if [ -z "$BUILT_LIB" ]; then + echo "ERROR: Could not find built libkrunfw.${LIB_EXT}" >&2 + echo " Searched in ${LIBKRUNFW_DIR}/ for libkrunfw*.${LIB_EXT}" + exit 1 +fi + +echo " Found library: ${BUILT_LIB}" + +# Compute SHA-256 (shasum on macOS, sha256sum on Linux) +if command -v sha256sum &>/dev/null; then + ARTIFACT_HASH=$(sha256sum "${BUILT_LIB}" | cut -d' ' -f1) +else + ARTIFACT_HASH=$(shasum -a 256 "${BUILT_LIB}" | cut -d' ' -f1) +fi +ARTIFACT_HASH_SHORT="${ARTIFACT_HASH:0:12}" + +# Copy the library — always stage as libkrunfw.dylib / libkrunfw.so +# (the base name the runtime loader expects) plus the original name +cp "${BUILT_LIB}" "${OUTPUT_DIR}/libkrunfw.${LIB_EXT}" +BUILT_BASENAME="$(basename "${BUILT_LIB}")" +if [ "${BUILT_BASENAME}" != "libkrunfw.${LIB_EXT}" ]; then + cp "${BUILT_LIB}" "${OUTPUT_DIR}/${BUILT_BASENAME}" +fi + +# Copy the kernel config that was actually used (for reproducibility) +KERNEL_SRC_DIR="" +for candidate in \ + "${LIBKRUNFW_DIR}/linux-"* \ + "${LIBKRUNFW_DIR}/build/linux-"* \ + "${LIBKRUNFW_DIR}/kernel/linux-"*; do + if [ -d "$candidate" ] && [ -f "${candidate}/.config" ]; then + KERNEL_SRC_DIR="$candidate" + break + fi +done + +if [ -n "$KERNEL_SRC_DIR" ] && [ -f "${KERNEL_SRC_DIR}/.config" ]; then + cp "${KERNEL_SRC_DIR}/.config" "${OUTPUT_DIR}/kernel.config" +fi + +# Copy our fragment for reference +cp "${KERNEL_CONFIG_FRAGMENT}" "${OUTPUT_DIR}/bridge-cni.config" + +# ── Write provenance metadata ────────────────────────────────────────── + +cat > "${OUTPUT_DIR}/provenance.json" << EOF +{ + "artifact": "libkrunfw-custom", + "version": "0.1.0-openshell", + "build_timestamp": "$(date -u 
+%Y-%m-%dT%H:%M:%SZ)",
+ "libkrunfw_repo": "${LIBKRUNFW_REPO}",
+ "libkrunfw_ref": "${LIBKRUNFW_REF}",
+ "libkrunfw_commit": "${LIBKRUNFW_COMMIT}",
+ "kernel_version": "${KERNEL_VERSION:-unknown}",
+ "kernel_config_fragment": "bridge-cni.config",
+ "artifact_sha256": "${ARTIFACT_HASH}",
+ "host_os": "$(uname -s)",
+ "host_arch": "$(uname -m)",
+ "builder": "build-custom-libkrunfw.sh"
+}
+EOF
+
+echo ""
+echo "==> Build complete"
+echo " Library: ${OUTPUT_DIR}/libkrunfw.${LIB_EXT}"
+echo " SHA256: ${ARTIFACT_HASH_SHORT}..."
+echo " Provenance: ${OUTPUT_DIR}/provenance.json"
+echo " Commit: ${LIBKRUNFW_SHORT}"
+echo ""
+echo "To use this runtime:"
+echo " export OPENSHELL_VM_RUNTIME_SOURCE_DIR=${OUTPUT_DIR}"
+echo " mise run vm:bundle-runtime"
diff --git a/crates/openshell-vm/runtime/kernel/bridge-cni.config b/crates/openshell-vm/runtime/kernel/bridge-cni.config
new file mode 100644
index 000000000..7b9610e30
--- /dev/null
+++ b/crates/openshell-vm/runtime/kernel/bridge-cni.config
@@ -0,0 +1,119 @@
+# Custom kernel config fragment for libkrunfw (OpenShell VM)
+#
+# This fragment is applied on top of libkrunfw's base kernel config
+# to enable bridge CNI, netfilter/iptables, and conntrack support
+# required for Kubernetes pod networking in the VM.
+#
+# Applied by: runtime/build-custom-libkrunfw.sh (via its kconfig merge hook)
+#
+# See also: check-vm-capabilities.sh for runtime verification.
+ +# ── Network Namespaces (required for pod isolation) ───────────────────── +CONFIG_NET_NS=y +CONFIG_NAMESPACES=y + +# ── Virtual Ethernet (veth pairs for pod networking) ──────────────────── +CONFIG_VETH=y + +# ── Linux Bridge (required for bridge CNI plugin) ────────────────────── +CONFIG_BRIDGE=y +CONFIG_BRIDGE_NETFILTER=y +CONFIG_BRIDGE_IGMP_SNOOPING=y + +# ── Netfilter framework ──────────────────────────────────────────────── +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=y +CONFIG_NETFILTER_NETLINK_QUEUE=y +CONFIG_NETFILTER_NETLINK_LOG=y + +# ── Connection tracking (required for NAT and kube-proxy) ────────────── +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y + +# ── Netfilter xtables match modules (required by kube-proxy & kubelet) ─ +# kube-proxy uses xt_conntrack for stateful rules and xt_comment for +# labeling chains. Without these, iptables fails with: +# "Couldn't load match 'conntrack': No such file or directory" +CONFIG_NETFILTER_XTABLES=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y +CONFIG_NETFILTER_XT_MATCH_RECENT=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y + +# ── NAT (required for service VIP / DNAT / SNAT) ────────────────────── +CONFIG_NF_NAT=y +CONFIG_NF_NAT_MASQUERADE_IPV4=y + +# ── iptables (CNI bridge masquerade + compat) ────────────────────────── +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_REJECT=y + +# ── nftables (kube-proxy nftables mode — primary proxy backend) ───────── +# kube-proxy nftables proxier requires: numgen (random LB), fib 
(local +# address detection), counter, ct, nat, masq, reject, limit, redir. +CONFIG_NF_TABLES=y +CONFIG_NF_TABLES_INET=y +CONFIG_NFT_CT=y +CONFIG_NFT_NAT=y +CONFIG_NFT_MASQ=y +CONFIG_NFT_REJECT=y +CONFIG_NFT_COMPAT=y +CONFIG_NFT_NUMGEN=y +CONFIG_NFT_FIB_IPV4=y +CONFIG_NFT_FIB_IPV6=y +CONFIG_NFT_LIMIT=y +CONFIG_NFT_REDIR=y +CONFIG_NFT_TPROXY=y + +# ── IP forwarding and routing (required for pod-to-pod) ──────────────── +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_NET_IP_TUNNEL=y + +# ── IPVS (optional: kube-proxy IPVS mode) ───────────────────────────── +CONFIG_IP_VS=y +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_RR=y +CONFIG_IP_VS_WRR=y +CONFIG_IP_VS_SH=y +CONFIG_IP_VS_NFCT=y + +# ── Misc networking required by Kubernetes ───────────────────────────── +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_CGROUP=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y + +# ── Dummy interface (fallback networking) ────────────────────────────── +CONFIG_DUMMY=y + +# ── TUN/TAP (used by some CNI plugins) ──────────────────────────────── +CONFIG_TUN=y + +# ── Cgroups (already in base, ensure v2 is available) ────────────────── +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PIDS=y +CONFIG_MEMCG=y + +# ── Security features required by the sandbox runtime ─────────────────── +CONFIG_SECURITY_LANDLOCK=y +CONFIG_SECCOMP_FILTER=y diff --git a/crates/openshell-vm/scripts/api-proxy.py b/crates/openshell-vm/scripts/api-proxy.py new file mode 100644 index 000000000..6da224f13 --- /dev/null +++ b/crates/openshell-vm/scripts/api-proxy.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +TCP proxy that waits for the k3s apiserver to be ready on 127.0.0.1:6444, +then accepts connections on 0.0.0.0:6443 and forwards them to the apiserver. 
+ +This decouples the TSI-exposed port from k3s's internal dynamiclistener, +which has TLS handshake issues when accessed through TSI. +""" + +import os +import socket +import sys +import threading +import time + +LISTEN_HOST = "0.0.0.0" +LISTEN_PORT = int(os.environ.get("PROXY_LISTEN_PORT", "6443")) +UPSTREAM_HOST = "127.0.0.1" +UPSTREAM_PORT = int(os.environ.get("PROXY_UPSTREAM_PORT", "6444")) +BUFFER_SIZE = 65536 + + +def wait_for_upstream(): + """Block until the upstream apiserver completes a TLS handshake. + + A raw TCP connect succeeds as soon as the port is bound, but the TLS + server may not be ready yet. We do a full TLS handshake to confirm. + """ + import ssl + + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + attempt = 0 + while True: + attempt += 1 + try: + sock = socket.create_connection((UPSTREAM_HOST, UPSTREAM_PORT), timeout=5) + ssock = ctx.wrap_socket(sock, server_hostname="localhost") + ssock.close() + print(f"[proxy] upstream TLS ready after {attempt} attempts", flush=True) + return + except ( + ConnectionRefusedError, + ConnectionResetError, + OSError, + ssl.SSLError, + ) as e: + if attempt % 5 == 0: + print( + f"[proxy] waiting for upstream (attempt {attempt}): {e}", flush=True + ) + time.sleep(1) + + +def forward(src, dst, label): + """Forward data between two sockets until one closes.""" + try: + while True: + data = src.recv(BUFFER_SIZE) + if not data: + break + dst.sendall(data) + except (BrokenPipeError, ConnectionResetError, OSError): + pass + finally: + try: + dst.shutdown(socket.SHUT_WR) + except OSError: + pass + + +def handle_client(client_sock, client_addr): + """Connect to upstream and forward bidirectionally.""" + print(f"[proxy] accepted connection from {client_addr}", flush=True) + try: + upstream = socket.create_connection((UPSTREAM_HOST, UPSTREAM_PORT), timeout=5) + print(f"[proxy] connected to upstream for {client_addr}", flush=True) + except OSError as e: + print( + 
f"[proxy] failed to connect to upstream for {client_addr}: {e}", flush=True + ) + client_sock.close() + return + + # Forward in both directions + t1 = threading.Thread( + target=forward, args=(client_sock, upstream, "client->upstream"), daemon=True + ) + t2 = threading.Thread( + target=forward, args=(upstream, client_sock, "upstream->client"), daemon=True + ) + t1.start() + t2.start() + t1.join() + t2.join() + print(f"[proxy] connection closed for {client_addr}", flush=True) + client_sock.close() + upstream.close() + + +def main(): + # Wait for the real apiserver to be ready before accepting connections + print( + f"[proxy] waiting for upstream at {UPSTREAM_HOST}:{UPSTREAM_PORT}...", + flush=True, + ) + wait_for_upstream() + + # Start listening + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind((LISTEN_HOST, LISTEN_PORT)) + server.listen(64) + print( + f"[proxy] listening on {LISTEN_HOST}:{LISTEN_PORT} -> {UPSTREAM_HOST}:{UPSTREAM_PORT}", + flush=True, + ) + + while True: + client_sock, client_addr = server.accept() + threading.Thread( + target=handle_client, args=(client_sock, client_addr), daemon=True + ).start() + + +if __name__ == "__main__": + main() diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh new file mode 100755 index 000000000..50cc13ca4 --- /dev/null +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -0,0 +1,626 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build an aarch64 Ubuntu rootfs for the openshell-vm microVM. +# +# Produces a rootfs with k3s pre-installed, the OpenShell helm chart and +# manifests baked in, container images pre-loaded, AND a fully initialized +# k3s cluster state (database, TLS, images imported, all services deployed). 
+#
+# On first VM boot, k3s resumes from this pre-baked state instead of
+# cold-starting, achieving ~3-5s startup times.
+#
+# Usage:
+# ./crates/openshell-vm/scripts/build-rootfs.sh [output_dir]
+#
+# Requires: Docker (or compatible container runtime), curl, helm, zstd, python3, cargo-zigbuild (with zig)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs"
+ROOTFS_DIR="${1:-${DEFAULT_ROOTFS}}"
+CONTAINER_NAME="krun-rootfs-builder"
+INIT_CONTAINER_NAME="krun-k3s-init"
+BASE_IMAGE_TAG="krun-rootfs:openshell-vm"
+# K3S_VERSION uses the semver "+" form for GitHub releases.
+# The mise env may provide the Docker-tag form with "-" instead of "+";
+# normalise to "+" so the GitHub download URL works.
+K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}"
+K3S_VERSION="${K3S_VERSION//-k3s/+k3s}"
+
+# Project root (three levels up from crates/openshell-vm/scripts/)
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+# Container images to pre-load into k3s (arm64).
+IMAGE_REPO_BASE="${IMAGE_REPO_BASE:-openshell}"
+IMAGE_TAG="${IMAGE_TAG:-dev}"
+SERVER_IMAGE="${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}"
+AGENT_SANDBOX_IMAGE="registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0"
+COMMUNITY_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest"
+
+echo "==> Building openshell-vm rootfs"
+echo " k3s version: ${K3S_VERSION}"
+echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}"
+echo " Output: ${ROOTFS_DIR}"
+
+# ── Check for running VM ──────────────────────────────────────────────
+# If an openshell-vm is using this rootfs via virtio-fs, wiping the rootfs
+# corrupts the VM's filesystem (e.g. /var disappears) causing cascading
+# k3s failures. We use two checks:
+#
+# 1. flock: The Rust openshell-vm process holds an exclusive flock on the lock
+# file for its entire lifetime.
This is the primary guard — it works +# even if the state file was deleted, and the OS releases the lock +# automatically when the process dies (including SIGKILL). +# +# 2. State file: Fallback check for the PID in the state file. This +# catches VMs launched before the flock guard was added. + +VM_LOCK_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm.lock" +if [ -f "${VM_LOCK_FILE}" ]; then + # Try to acquire the lock non-blocking. Use Python's fcntl.flock() + # because the `flock` CLI tool is not available on macOS. + if ! python3 -c " +import fcntl, os, sys +fd = os.open(sys.argv[1], os.O_RDONLY) +try: + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + fcntl.flock(fd, fcntl.LOCK_UN) +except BlockingIOError: + sys.exit(1) +finally: + os.close(fd) +" "${VM_LOCK_FILE}" 2>/dev/null; then + HOLDER_PID=$(cat "${VM_LOCK_FILE}" 2>/dev/null | tr -d '[:space:]') + echo "" + echo "ERROR: An openshell-vm (pid ${HOLDER_PID:-unknown}) holds a lock on this rootfs." + echo " Wiping the rootfs while the VM is running will corrupt its" + echo " filesystem and cause k3s failures." + echo "" + echo " Stop the VM first: kill ${HOLDER_PID:-}" + echo " Then re-run this script." + echo "" + exit 1 + fi +fi + +VM_STATE_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm-state.json" +if [ -f "${VM_STATE_FILE}" ]; then + VM_PID=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['pid'])" "${VM_STATE_FILE}" 2>/dev/null || echo "") + if [ -n "${VM_PID}" ] && kill -0 "${VM_PID}" 2>/dev/null; then + echo "" + echo "ERROR: An openshell-vm is running (pid ${VM_PID}) using this rootfs." + echo " Wiping the rootfs while the VM is running will corrupt its" + echo " filesystem and cause k3s failures." + echo "" + echo " Stop the VM first: kill ${VM_PID}" + echo " Then re-run this script." + echo "" + exit 1 + else + # Stale state file — VM is no longer running. Clean it up. 
+ rm -f "${VM_STATE_FILE}" + fi +fi + +# ── Download k3s binary (outside Docker — much faster) ───────────────── + +K3S_BIN="/tmp/k3s-arm64-${K3S_VERSION}" +if [ -f "${K3S_BIN}" ]; then + echo "==> Using cached k3s binary: ${K3S_BIN}" +else + echo "==> Downloading k3s ${K3S_VERSION} for arm64..." + curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s-arm64" \ + -o "${K3S_BIN}" + chmod +x "${K3S_BIN}" +fi + +# ── Build base image with dependencies ───────────────────────────────── + +# Clean up any previous run +docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true +docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true + +echo "==> Building base image..." +docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" -f - . <<'DOCKERFILE' +FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + zstd \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. +RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +DOCKERFILE + +# Create a container and export the filesystem +echo "==> Creating container..." +docker create --platform linux/arm64 --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true + +echo "==> Exporting filesystem..." +# Previous builds may leave overlayfs work/ dirs with permissions that +# prevent rm on macOS. Force-fix permissions before removing. +if [ -d "${ROOTFS_DIR}" ]; then + chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true + rm -rf "${ROOTFS_DIR}" +fi +mkdir -p "${ROOTFS_DIR}" +docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - + +docker rm "${CONTAINER_NAME}" + +# ── Inject k3s binary ──────────────────────────────────────────────── + +echo "==> Injecting k3s binary..." 
+cp "${K3S_BIN}" "${ROOTFS_DIR}/usr/local/bin/k3s" +chmod +x "${ROOTFS_DIR}/usr/local/bin/k3s" +ln -sf /usr/local/bin/k3s "${ROOTFS_DIR}/usr/local/bin/kubectl" + +# k3s self-extracts runtime binaries (containerd, runc, CNI plugins, +# coreutils, etc.) into a versioned data directory the first time it +# runs. On the pre-initialized rootfs these were extracted during the +# Docker build or VM pre-init phase. docker export and macOS virtio-fs +# can strip execute bits from Linux ELF binaries, so fix them here. +echo " Fixing execute permissions on k3s data binaries..." +chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/* 2>/dev/null || true +chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/aux/* 2>/dev/null || true + +# ── Inject scripts ──────────────────────────────────────────────────── + +echo "==> Injecting openshell-vm-init.sh..." +mkdir -p "${ROOTFS_DIR}/srv" +cp "${SCRIPT_DIR}/openshell-vm-init.sh" "${ROOTFS_DIR}/srv/openshell-vm-init.sh" +chmod +x "${ROOTFS_DIR}/srv/openshell-vm-init.sh" + +# Keep the hello server around for debugging +cp "${SCRIPT_DIR}/hello-server.py" "${ROOTFS_DIR}/srv/hello-server.py" +chmod +x "${ROOTFS_DIR}/srv/hello-server.py" + +# Inject VM capability checker for runtime diagnostics. +cp "${SCRIPT_DIR}/check-vm-capabilities.sh" "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" +chmod +x "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" + +# Inject the openshell-vm exec agent used by `openshell-vm exec`. +cp "${SCRIPT_DIR}/openshell-vm-exec-agent.py" "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" +chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" + +# ── Build and inject openshell-sandbox supervisor binary ───────────── +# The supervisor binary runs inside every sandbox pod. It is side-loaded +# from the node filesystem via a read-only hostPath volume mount at +# /opt/openshell/bin. 
In the Docker-based gateway this is built in the +# Dockerfile.cluster supervisor-builder stage; here we cross-compile +# from the host using cargo-zigbuild. + +SUPERVISOR_TARGET="aarch64-unknown-linux-gnu" +SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" + +echo "==> Building openshell-sandbox supervisor binary (${SUPERVISOR_TARGET})..." +if ! command -v cargo-zigbuild >/dev/null 2>&1; then + echo "ERROR: cargo-zigbuild is not installed." + echo " Install it with: cargo install cargo-zigbuild" + echo " Also requires: zig (brew install zig)" + exit 1 +fi + +cargo zigbuild --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" \ + --manifest-path "${PROJECT_ROOT}/Cargo.toml" 2>&1 | tail -5 + +if [ ! -f "${SUPERVISOR_BIN}" ]; then + echo "ERROR: supervisor binary not found at ${SUPERVISOR_BIN}" + exit 1 +fi + +echo " Injecting supervisor binary into rootfs..." +mkdir -p "${ROOTFS_DIR}/opt/openshell/bin" +cp "${SUPERVISOR_BIN}" "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" +chmod +x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" +echo " Size: $(du -h "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" | cut -f1)" + +# ── Package and inject helm chart ──────────────────────────────────── + +HELM_CHART_DIR="${PROJECT_ROOT}/deploy/helm/openshell" +CHART_DEST="${ROOTFS_DIR}/var/lib/rancher/k3s/server/static/charts" + +if [ -d "${HELM_CHART_DIR}" ]; then + echo "==> Packaging helm chart..." + mkdir -p "${CHART_DEST}" + helm package "${HELM_CHART_DIR}" -d "${CHART_DEST}" + echo " $(ls "${CHART_DEST}"/*.tgz 2>/dev/null | xargs -I{} basename {})" + # Also stage to /opt/openshell/charts/ so the init script can + # restore them after a --reset wipes server/static/charts/. 
+ mkdir -p "${ROOTFS_DIR}/opt/openshell/charts" + cp "${CHART_DEST}"/*.tgz "${ROOTFS_DIR}/opt/openshell/charts/" +else + echo "WARNING: Helm chart not found at ${HELM_CHART_DIR}, skipping" +fi + +# ── Inject Kubernetes manifests ────────────────────────────────────── +# These are copied to /opt/openshell/manifests/ (staging). openshell-vm-init.sh +# moves them to /var/lib/rancher/k3s/server/manifests/ at boot so the +# k3s Helm Controller auto-deploys them. + +MANIFEST_SRC="${PROJECT_ROOT}/deploy/kube/manifests" +MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests" + +echo "==> Injecting Kubernetes manifests..." +mkdir -p "${MANIFEST_DEST}" + +for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do + if [ -f "${MANIFEST_SRC}/${manifest}" ]; then + cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/" + echo " ${manifest}" + else + echo "WARNING: ${manifest} not found in ${MANIFEST_SRC}" + fi +done + +# ── Pre-load container images ──────────────────────────────────────── +# Pull arm64 images and save as tarballs in the k3s airgap images +# directory. k3s auto-imports from /var/lib/rancher/k3s/agent/images/ +# on startup, so no internet access is needed at boot time. +# +# Tarballs are cached in a persistent directory outside the rootfs so +# they survive rebuilds. This avoids re-pulling and re-saving ~1 GiB +# of images each time. + +IMAGES_DIR="${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" +IMAGE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/openshell/openshell-vm/images" +mkdir -p "${IMAGES_DIR}" "${IMAGE_CACHE_DIR}" + +echo "==> Pre-loading container images (arm64)..." + +pull_and_save() { + local image="$1" + local output="$2" + local cache="${IMAGE_CACHE_DIR}/$(basename "${output}")" + + # Use cached tarball if available. 
+ if [ -f "${cache}" ]; then + echo " cached: $(basename "${output}")" + cp "${cache}" "${output}" + return 0 + fi + + # Try to pull; if the registry is unavailable, fall back to the + # local Docker image cache (image may exist from a previous pull). + echo " pulling: ${image}..." + if ! docker pull --platform linux/arm64 "${image}" --quiet 2>/dev/null; then + echo " pull failed, checking local Docker cache..." + if ! docker image inspect "${image}" >/dev/null 2>&1; then + echo "ERROR: image ${image} not available locally or from registry" + exit 1 + fi + echo " using locally cached image" + fi + + echo " saving: $(basename "${output}")..." + # Pipe through zstd for faster decompression and smaller tarballs. + # k3s auto-imports .tar.zst files from the airgap images directory. + # -T0 uses all CPU cores; -3 is a good speed/ratio tradeoff. + docker save "${image}" | zstd -T0 -3 -o "${output}" + # Cache for next rebuild. + cp "${output}" "${cache}" +} + +pull_and_save "${SERVER_IMAGE}" "${IMAGES_DIR}/openshell-server.tar.zst" +pull_and_save "${AGENT_SANDBOX_IMAGE}" "${IMAGES_DIR}/agent-sandbox-controller.tar.zst" +pull_and_save "${COMMUNITY_SANDBOX_IMAGE}" "${IMAGES_DIR}/community-sandbox-base.tar.zst" + +# ── Pre-initialize k3s cluster state ───────────────────────────────── +# Boot k3s inside a Docker container using the rootfs we just built. +# Wait for it to fully initialize (import images, deploy manifests, +# create database), then capture the state back into the rootfs. +# +# This eliminates cold-start latency: on VM boot, k3s finds existing +# state and resumes in ~3-5 seconds instead of 30-60s. + +echo "" +echo "==> Pre-initializing k3s cluster state..." +echo " This boots k3s in a container, waits for full readiness," +echo " then captures the initialized state into the rootfs." + +# Patch the HelmChart manifest for the init container (same patches +# openshell-vm-init.sh applies at runtime). 
+INIT_MANIFESTS="${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests" +mkdir -p "${INIT_MANIFESTS}" + +# Copy manifests from staging to the k3s manifest directory. +for manifest in "${MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" +done + +# Patch HelmChart for local images and VM settings. +HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" +if [ -f "$HELMCHART" ]; then + # Use local images — explicitly imported into containerd. + sed -i '' 's|__IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" + sed -i '' 's|__SANDBOX_IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__SANDBOX_IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" + # Use the locally imported image references. + sed -i '' -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${SERVER_IMAGE%:*}|" "$HELMCHART" 2>/dev/null \ + || sed -i -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${SERVER_IMAGE%:*}|" "$HELMCHART" + sed -i '' -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" 2>/dev/null \ + || sed -i -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" + # Bridge CNI: pods use normal pod networking, not hostNetwork. + # This must match what openshell-vm-init.sh applies at runtime so the + # HelmChart manifest is unchanged at boot — preventing a helm + # upgrade job that would cycle the pre-baked pod. + sed -i '' 's|__HOST_NETWORK__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__HOST_NETWORK__|false|g' "$HELMCHART" + # Enable SA token automount for bridge CNI mode. Must match + # openshell-vm-init.sh runtime value to avoid manifest delta. + sed -i '' 's|__AUTOMOUNT_SA_TOKEN__|true|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__AUTOMOUNT_SA_TOKEN__|true|g' "$HELMCHART" + # Disable persistence — use /tmp for the SQLite database. PVC mounts + # are unreliable on virtiofs. 
+ sed -i '' 's|__PERSISTENCE_ENABLED__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__PERSISTENCE_ENABLED__|false|g' "$HELMCHART" + sed -i '' 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" + # Clear SSH gateway placeholders. + sed -i '' 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" + sed -i '' 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" + SSH_HANDSHAKE_SECRET="$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')" + sed -i '' "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" 2>/dev/null \ + || sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" + sed -i '' 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" + sed -i '' 's|__DISABLE_TLS__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__DISABLE_TLS__|false|g' "$HELMCHART" + sed -i '' 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" + sed -i '' '/__CHART_CHECKSUM__/d' "$HELMCHART" 2>/dev/null \ + || sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" +fi + +# Patch agent-sandbox manifest for VM networking constraints. +AGENT_MANIFEST="${INIT_MANIFESTS}/agent-sandbox.yaml" +if [ -f "$AGENT_MANIFEST" ]; then + # Keep agent-sandbox on pod networking to avoid host port clashes. + # Point in-cluster client traffic at the API server node IP because + # kube-proxy is disabled in VM mode. 
+ sed -i '' '/hostNetwork: true/d' "$AGENT_MANIFEST" 2>/dev/null \ + || sed -i '/hostNetwork: true/d' "$AGENT_MANIFEST" + sed -i '' '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" 2>/dev/null \ + || sed -i '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" + sed -i '' 's|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0\ + args:\ + - -metrics-bind-address=:8082\ + env:\ + - name: KUBERNETES_SERVICE_HOST\ + value: 192.168.127.2\ + - name: KUBERNETES_SERVICE_PORT\ + value: "6443"|g' "$AGENT_MANIFEST" 2>/dev/null \ + || sed -i 's|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0\ + args:\ + - -metrics-bind-address=:8082\ + env:\ + - name: KUBERNETES_SERVICE_HOST\ + value: 192.168.127.2\ + - name: KUBERNETES_SERVICE_PORT\ + value: "6443"|g' "$AGENT_MANIFEST" + if grep -q 'hostNetwork: true' "$AGENT_MANIFEST" \ + || grep -q 'ClusterFirstWithHostNet' "$AGENT_MANIFEST" \ + || ! grep -q 'KUBERNETES_SERVICE_HOST' "$AGENT_MANIFEST" \ + || ! grep -q 'metrics-bind-address=:8082' "$AGENT_MANIFEST"; then + echo "ERROR: failed to patch agent-sandbox manifest for VM networking constraints: $AGENT_MANIFEST" >&2 + exit 1 + fi +fi + +# local-storage implies local-path-provisioner, which requires CNI bridge +# networking that is unavailable in the VM kernel. +rm -f "${INIT_MANIFESTS}/local-storage.yaml" 2>/dev/null || true + +# ── Pre-initialize using the actual libkrun VM ────────────────────────── +# Boot the real VM with the rootfs we just built. This uses the same +# kernel, networking, and kube-proxy config as production — eliminating +# Docker IP mismatches, snapshotter mismatches, and the Docker volume +# copy-back dance. The VM writes state directly into the rootfs via +# virtio-fs. +# +# Requirements: the openshell-vm binary must be built and codesigned. 
+# mise run vm:build:binary handles this. + +GATEWAY_BIN="${PROJECT_ROOT}/target/debug/openshell-vm" +RUNTIME_DIR="${PROJECT_ROOT}/target/debug/openshell-vm.runtime" + +if [ ! -x "${GATEWAY_BIN}" ]; then + echo "ERROR: openshell-vm binary not found at ${GATEWAY_BIN}" + echo " Run: mise run vm:build:binary" + exit 1 +fi + +if [ ! -d "${RUNTIME_DIR}" ]; then + echo "ERROR: VM runtime bundle not found at ${RUNTIME_DIR}" + echo " Run: mise run vm:bundle-runtime" + exit 1 +fi + +# Helper: run a command inside the VM via the exec agent. +vm_exec() { + DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" \ + "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" exec -- "$@" 2>&1 +} + +# Ensure no stale VM is using this rootfs. +echo " Starting VM for pre-initialization..." +export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" +"${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" --reset & +VM_PID=$! + +# Ensure the VM is cleaned up on script exit. +cleanup_vm() { + if kill -0 "${VM_PID}" 2>/dev/null; then + echo " Stopping VM (pid ${VM_PID})..." + kill "${VM_PID}" 2>/dev/null || true + wait "${VM_PID}" 2>/dev/null || true + fi +} +trap cleanup_vm EXIT + +# Wait for the exec agent to become reachable. +echo " Waiting for VM exec agent..." +for i in $(seq 1 120); do + if vm_exec true >/dev/null 2>&1; then + echo " Exec agent ready (${i}s)" + break + fi + if [ "$i" -eq 120 ]; then + echo "ERROR: VM exec agent did not become reachable in 120s" + exit 1 + fi + sleep 1 +done + +# Wait for containerd to be ready. +echo " Waiting for containerd..." +for i in $(seq 1 60); do + if vm_exec k3s ctr version >/dev/null 2>&1; then + echo " Containerd ready (${i}s)" + break + fi + if [ "$i" -eq 60 ]; then + echo "ERROR: containerd did not become ready in 60s" + exit 1 + fi + sleep 1 +done + +# Wait for the openshell namespace (Helm controller creates it). +echo " Waiting for openshell namespace..." 
+for i in $(seq 1 180); do + if vm_exec kubectl get namespace openshell -o name 2>/dev/null | grep -q openshell; then + echo " Namespace ready (${i}s)" + break + fi + if [ "$i" -eq 180 ]; then + echo "ERROR: openshell namespace did not appear in 180s" + exit 1 + fi + sleep 1 +done + +# Wait for the openshell StatefulSet to have a ready replica. +# The VM init script generates PKI and writes TLS secrets manifests +# automatically — no host-side PKI generation needed. +echo " Waiting for openshell pod to be ready..." +for i in $(seq 1 180); do + ready=$(vm_exec kubectl -n openshell get statefulset openshell \ + -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0") + if [ "$ready" = "1" ]; then + echo " OpenShell pod ready (${i}s)" + break + fi + if [ "$i" -eq 180 ]; then + echo "WARNING: openshell pod not ready after 180s, continuing anyway" + vm_exec kubectl -n openshell get pods 2>/dev/null | sed 's/^/ /' || true + break + fi + sleep 1 +done + +# Pre-unpack container images so the overlayfs snapshotter has ready-to-use +# snapshots on first boot. Without this, the first container create for each +# image triggers a full layer extraction which can take minutes. +echo " Pre-unpacking container images..." +for img in \ + "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" \ + "ghcr.io/nvidia/openshell/gateway:latest"; do + if vm_exec k3s ctr -n k8s.io images ls -q 2>/dev/null | grep -qF "$img"; then + echo " unpacking: $img" + vm_exec k3s ctr -n k8s.io run --rm "$img" "pre-unpack-$(date +%s)" true 2>/dev/null || true + fi +done +echo " Image pre-unpack complete." + +# Stop the VM so the kine SQLite DB is flushed. +echo " Stopping VM..." +kill "${VM_PID}" 2>/dev/null || true +wait "${VM_PID}" 2>/dev/null || true + +# Surgically clean the kine SQLite DB. Runtime objects (pods, events, +# leases) would cause the VM's kubelet to reconcile against an empty +# containerd on next boot. +echo " Cleaning runtime objects from kine DB..." 
+DB="${ROOTFS_DIR}/var/lib/rancher/k3s/server/db/state.db" +if [ -f "$DB" ]; then + echo " Before: $(sqlite3 "$DB" "SELECT COUNT(*) FROM kine;") kine records" + sqlite3 "$DB" <<'EOSQL' +DELETE FROM kine WHERE name LIKE '/registry/pods/%'; +DELETE FROM kine WHERE name LIKE '/registry/events/%'; +DELETE FROM kine WHERE name LIKE '/registry/leases/%'; +DELETE FROM kine WHERE name LIKE '/registry/endpointslices/%'; +DELETE FROM kine WHERE name LIKE '/registry/masterleases/%'; +PRAGMA wal_checkpoint(TRUNCATE); +VACUUM; +EOSQL + echo " After: $(sqlite3 "$DB" "SELECT COUNT(*) FROM kine;") kine records" +else + echo "WARNING: state.db not found at ${DB}" +fi + +# Clean up runtime artifacts that shouldn't persist. +echo " Cleaning runtime artifacts..." +rm -rf "${ROOTFS_DIR}/var/lib/rancher/k3s/server/tls/temporary-certs" 2>/dev/null || true +rm -f "${ROOTFS_DIR}/var/lib/rancher/k3s/server/kine.sock" 2>/dev/null || true +find "${ROOTFS_DIR}/var/lib/rancher/k3s" -name '*.sock' -delete 2>/dev/null || true +find "${ROOTFS_DIR}/run" -name '*.sock' -delete 2>/dev/null || true + +# Write sentinel file so openshell-vm-init.sh and the host-side bootstrap +# know this rootfs has pre-initialized state. +echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "${ROOTFS_DIR}/opt/openshell/.initialized" + +echo " Pre-initialization complete." + +# ── Verify ──────────────────────────────────────────────────────────── + +if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then + echo "ERROR: k3s binary not found in rootfs. Something went wrong." + exit 1 +fi + +if [ ! -f "${ROOTFS_DIR}/opt/openshell/.initialized" ]; then + echo "WARNING: Pre-initialization sentinel not found. Cold starts will be slow." +fi + +if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then + echo "ERROR: openshell-sandbox supervisor binary not found in rootfs." + echo " Sandbox pods will fail with CreateContainerError." 
+ exit 1 +fi + +echo "" +echo "==> Rootfs ready at: ${ROOTFS_DIR}" +echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" +echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" + +# Show k3s data size +K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" +if [ -d "${K3S_DATA}" ]; then + echo " k3s state: $(du -sh "${K3S_DATA}" | cut -f1)" +fi + +# PKI is generated at first VM boot by the init script — not baked. + +# Show supervisor binary +if [ -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then + echo " Supervisor: $(du -h "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" | cut -f1)" +fi + +echo "" +echo "Next steps:" +echo " 1. Run: openshell-vm" +echo " Expected startup time: ~3-5 seconds (pre-initialized)" diff --git a/crates/openshell-vm/scripts/check-vm-capabilities.sh b/crates/openshell-vm/scripts/check-vm-capabilities.sh new file mode 100755 index 000000000..2e758f5e0 --- /dev/null +++ b/crates/openshell-vm/scripts/check-vm-capabilities.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# VM Kernel Capability Checker +# +# Runs inside the guest VM (or a container with the same rootfs) to +# verify that the kernel has the capabilities required for bridge CNI +# networking, kube-proxy, and Kubernetes pod networking. 
+#
+# Usage:
+#   ./check-vm-capabilities.sh [--json]
+#
+# Exit codes:
+#   0 = all required capabilities present
+#   1 = one or more required capabilities missing
+#   2 = script error
+
+set -euo pipefail
+
+JSON_OUTPUT=false
+if [ "${1:-}" = "--json" ]; then
+  JSON_OUTPUT=true
+fi
+
+PASS=0
+FAIL=0
+WARN=0
+RESULTS=()  # per-check JSON fragments, emitted verbatim in --json mode
+
+# ── Helpers ─────────────────────────────────────────────────────────────
+
+check() {  # record one capability probe: check NAME CATEGORY required|optional DESCRIPTION CMD...
+  local name="$1"
+  local category="$2"
+  local required="$3"  # "required" or "optional"
+  local description="$4"
+  shift 4
+  local cmd=("$@")  # probe command; eval'd below so it can use && / || and helper functions
+
+  if eval "${cmd[@]}" >/dev/null 2>&1; then
+    RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"pass\",\"required\":\"$required\",\"description\":\"$description\"}")  # NOTE(review): description is spliced into JSON unescaped — keep it free of quotes/backslashes
+    PASS=$((PASS + 1))
+    if [ "$JSON_OUTPUT" = false ]; then
+      printf " ✓ %-40s %s\n" "$name" "$description"
+    fi
+  else
+    if [ "$required" = "required" ]; then
+      RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"fail\",\"required\":\"$required\",\"description\":\"$description\"}")
+      FAIL=$((FAIL + 1))
+      if [ "$JSON_OUTPUT" = false ]; then
+        printf " ✗ %-40s %s (REQUIRED)\n" "$name" "$description"
+      fi
+    else
+      RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"warn\",\"required\":\"$required\",\"description\":\"$description\"}")
+      WARN=$((WARN + 1))
+      if [ "$JSON_OUTPUT" = false ]; then
+        printf " ~ %-40s %s (optional)\n" "$name" "$description"
+      fi
+    fi
+  fi
+}
+
+check_module() {  # true if MODULE is loaded (sysfs/proc) or compiled in (=y/=m in /proc/config.gz)
+  local module="$1"
+  # Check /proc/modules (loaded), /proc/config.gz (builtin), or /sys/module
+  if [ -d "/sys/module/$module" ]; then
+    return 0
+  fi
+  if grep -q "^${module} " /proc/modules 2>/dev/null; then
+    return 0
+  fi
+  # Check if compiled in via /proc/config.gz or /boot/config
+  local config_key
+  config_key="CONFIG_$(echo "$module" | tr '[:lower:]-' '[:upper:]_')"  # e.g. br-netfilter -> CONFIG_BR_NETFILTER
+  if [ -f /proc/config.gz ]; then
+    if zcat /proc/config.gz 2>/dev/null | grep -q "^${config_key}=[ym]"; then
+      return 0
+    fi
+  fi
+  return 1
+}
+
+#
── Capability Checks ──────────────────────────────────────────────────
+
+if [ "$JSON_OUTPUT" = false ]; then
+  echo "VM Kernel Capability Check"
+  echo "=========================="
+  echo ""
+  echo "Kernel: $(uname -r)"
+  echo ""
+fi
+
+# --- Network Namespaces ---
+if [ "$JSON_OUTPUT" = false ]; then echo "[Network Namespaces]"; fi
+
+check "net_namespace" "netns" "required" \
+  "network namespace support (CONFIG_NET_NS)" \
+  "test -d /proc/self/ns && ls /proc/self/ns/net"  # probe strings are eval'd in-shell by check(), so they may call check_module
+
+check "veth_pair" "netns" "required" \
+  "veth pair creation (CONFIG_VETH)" \
+  "ip link add _chk0 type veth peer name _chk1 && ip link del _chk0"  # create-and-delete a scratch veth pair
+
+# --- Linux Bridge ---
+if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Linux Bridge]"; fi
+
+check "bridge_module" "bridge" "required" \
+  "bridge device support (CONFIG_BRIDGE)" \
+  "ip link add _chkbr0 type bridge && ip link del _chkbr0"
+
+check "bridge_nf_call" "bridge" "required" \
+  "bridge netfilter (CONFIG_BRIDGE_NETFILTER)" \
+  "check_module bridge && test -f /proc/sys/net/bridge/bridge-nf-call-iptables 2>/dev/null || check_module br_netfilter"
+
+# --- Netfilter / iptables ---
+if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Netfilter / iptables]"; fi
+
+check "netfilter" "netfilter" "required" \
+  "netfilter framework (CONFIG_NETFILTER)" \
+  "check_module nf_conntrack || check_module ip_tables || test -d /proc/sys/net/netfilter"
+
+check "nf_conntrack" "netfilter" "required" \
+  "connection tracking (CONFIG_NF_CONNTRACK)" \
+  "check_module nf_conntrack"
+
+check "nf_nat" "netfilter" "required" \
+  "NAT support (CONFIG_NF_NAT)" \
+  "check_module nf_nat"
+
+check "iptables_filter" "netfilter" "required" \
+  "iptables filter (CONFIG_IP_NF_FILTER)" \
+  "check_module ip_tables || iptables -L -n >/dev/null 2>&1"  # fall back to probing the live iptables binary
+
+check "iptables_nat" "netfilter" "required" \
+  "iptables NAT (CONFIG_IP_NF_NAT)" \
+  "check_module iptable_nat || iptables -t nat -L -n >/dev/null 2>&1"
+
+check "iptables_mangle" "netfilter" "optional" \
+  "iptables mangle (CONFIG_IP_NF_MANGLE)" \
+  "check_module iptable_mangle || iptables -t mangle -L -n >/dev/null 2>&1"
+
+check "nf_conntrack_netlink" "netfilter" "optional" \
+  "conntrack netlink (CONFIG_NF_CT_NETLINK)" \
+  "check_module nf_conntrack_netlink"
+
+check "nftables" "netfilter" "optional" \
+  "nftables (CONFIG_NF_TABLES)" \
+  "check_module nf_tables || nft list ruleset >/dev/null 2>&1"
+
+# --- IP Forwarding / Routing ---
+if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[IP Forwarding]"; fi
+
+check "ip_forward" "routing" "required" \
+  "IP forwarding (sysctl)" \
+  "test -f /proc/sys/net/ipv4/ip_forward"
+
+check "ip_route" "routing" "required" \
+  "IP routing" \
+  "ip route show >/dev/null 2>&1"
+
+# --- CNI Plugin Dependencies ---
+if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[CNI Plugins]"; fi
+
+check "cni_bridge_bin" "cni" "required" \
+  "bridge CNI plugin binary" \
+  "test -x /opt/cni/bin/bridge || find /var/lib/rancher/k3s/data -name bridge -type f 2>/dev/null | head -1 | grep -q ."  # k3s ships CNI plugins under its data dir
+
+check "cni_host_local_bin" "cni" "required" \
+  "host-local IPAM plugin binary" \
+  "test -x /opt/cni/bin/host-local || find /var/lib/rancher/k3s/data -name host-local -type f 2>/dev/null | head -1 | grep -q ."
+
+check "cni_loopback_bin" "cni" "required" \
+  "loopback CNI plugin binary" \
+  "test -x /opt/cni/bin/loopback || find /var/lib/rancher/k3s/data -name loopback -type f 2>/dev/null | head -1 | grep -q ."
+
+check "cni_portmap_bin" "cni" "optional" \
+  "portmap CNI plugin binary (needs iptables)" \
+  "test -x /opt/cni/bin/portmap || find /var/lib/rancher/k3s/data -name portmap -type f 2>/dev/null | head -1 | grep -q ."
+
+# --- Userspace Tools ---
+if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Userspace Tools]"; fi
+
+check "iptables_bin" "userspace" "required" \
+  "iptables binary" \
+  "command -v iptables"
+
+check "conntrack_bin" "userspace" "optional" \
+  "conntrack binary" \
+  "command -v conntrack"
+
+check "ip_bin" "userspace" "required" \
+  "iproute2 (ip command)" \
+  "command -v ip"
+
+# ── Summary ────────────────────────────────────────────────────────────
+
+if [ "$JSON_OUTPUT" = true ]; then  # machine-readable summary for tooling
+  echo "{"
+  echo " \"kernel\": \"$(uname -r)\","
+  echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
+  echo " \"pass\": $PASS,"
+  echo " \"fail\": $FAIL,"
+  echo " \"warn\": $WARN,"
+  echo " \"results\": ["
+  local_first=true  # comma separator state for the JSON array
+  for r in "${RESULTS[@]}"; do
+    if [ "$local_first" = true ]; then
+      local_first=false
+    else
+      echo ","
+    fi
+    printf " %s" "$r"
+  done
+  echo ""
+  echo " ]"
+  echo "}"; [ "$FAIL" -eq 0 ] || exit 1  # FIX: honor documented exit codes in --json mode too (was: always exited 0)
+else
+  echo ""
+  echo "─────────────────────────────────────────"
+  printf "Results: %d passed, %d failed, %d warnings\n" "$PASS" "$FAIL" "$WARN"
+
+  if [ "$FAIL" -gt 0 ]; then
+    echo ""
+    echo "FAIL: $FAIL required capabilities missing."
+    echo "The VM kernel needs to be rebuilt with the missing features."
+    echo "See: crates/openshell-vm/runtime/kernel/README.md"
+    exit 1
+  else
+    echo ""
+    echo "PASS: All required capabilities present."
+    exit 0
+  fi
+fi
diff --git a/crates/openshell-vm/scripts/hello-server.py b/crates/openshell-vm/scripts/hello-server.py
new file mode 100644
index 000000000..f02d7d72e
--- /dev/null
+++ b/crates/openshell-vm/scripts/hello-server.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Minimal HTTP server that responds with 'Hello from libkrun VM!'
on port 8080.""" + +import json +import os +import platform +from http.server import HTTPServer, BaseHTTPRequestHandler + + +class HelloHandler(BaseHTTPRequestHandler): + def do_GET(self): + body = json.dumps( + { + "message": "Hello from libkrun VM!", + "hostname": platform.node(), + "platform": platform.platform(), + "arch": platform.machine(), + "pid": os.getpid(), + "path": self.path, + }, + indent=2, + ) + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body.encode()) + + def log_message(self, format, *args): + print(f"[hello-server] {args[0]}") + + +def main(): + host = "0.0.0.0" + port = 8080 + server = HTTPServer((host, port), HelloHandler) + print(f"Hello server listening on {host}:{port}") + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/crates/openshell-vm/scripts/openshell-vm-exec-agent.py b/crates/openshell-vm/scripts/openshell-vm-exec-agent.py new file mode 100644 index 000000000..d7ffd81df --- /dev/null +++ b/crates/openshell-vm/scripts/openshell-vm-exec-agent.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import json
+import os
+import socket
+import subprocess
+import sys
+import threading
+
+
+PORT = 10777  # vsock port the exec agent listens on
+
+
+def recv_line(sock_file):  # read one newline-delimited JSON frame; None on EOF
+    line = sock_file.readline()
+    if not line:
+        return None
+    return json.loads(line.decode("utf-8"))
+
+
+def send_frame(sock_file, lock, frame):  # write one frame as NDJSON under the shared write lock
+    data = (json.dumps(frame, separators=(",", ":")) + "\n").encode("utf-8")
+    with lock:
+        sock_file.write(data)
+        sock_file.flush()
+
+
+def validate_env(env_items):  # "KEY=VALUE" items -> dict; keys must match [A-Za-z_][A-Za-z0-9_]*
+    env = {}
+    for item in env_items:
+        if "=" not in item:
+            raise ValueError(f"invalid env item: {item}")
+        key, value = item.split("=", 1)
+        if not key or not (key[0] == "_" or key[0].isalpha()):
+            raise ValueError(f"invalid env key: {key}")
+        if not all(ch == "_" or ch.isalnum() for ch in key):
+            raise ValueError(f"invalid env key: {key}")
+        env[key] = value
+    return env
+
+
+def stream_reader(pipe, frame_type, sock_file, lock):  # pump subprocess output to the client as base64 frames
+    try:
+        while True:
+            chunk = pipe.read(8192)
+            if not chunk:
+                break
+            send_frame(
+                sock_file,
+                lock,
+                {"type": frame_type, "data": base64.b64encode(chunk).decode("ascii")},
+            )
+    finally:
+        pipe.close()
+
+
+def stdin_writer(proc, sock_file, sock, lock):  # NOTE(review): `sock` is never used in the body — confirm and drop
+    """Forward stdin frames from the client to the subprocess.
+
+    When the client sends ``stdin_close`` (or the connection drops), we
+    close the subprocess's stdin pipe so it sees EOF. We must NOT
+    terminate the subprocess or shut down the socket here — the main
+    thread needs the process to finish naturally and the stdout/stderr
+    reader threads still need to flush their data back to the client.
+ """ + try: + while True: + frame = recv_line(sock_file) + if frame is None: + break + kind = frame.get("type") + if kind == "stdin": + payload = base64.b64decode(frame.get("data", "")) + if proc.stdin is not None: + proc.stdin.write(payload) + proc.stdin.flush() + elif kind == "stdin_close": + break + else: + send_frame( + sock_file, + lock, + {"type": "error", "message": f"unknown frame type: {kind}"}, + ) + break + except BrokenPipeError: + pass + finally: + try: + if proc.stdin is not None: + proc.stdin.close() + except OSError: + pass + + +def handle_client(conn): + sock_file = conn.makefile("rwb", buffering=0) + lock = threading.Lock() + try: + request = recv_line(sock_file) + if request is None: + return + + argv = request.get("argv") or ["sh"] + cwd = request.get("cwd") + env = os.environ.copy() + env.update(validate_env(request.get("env") or [])) + + proc = subprocess.Popen( + argv, + cwd=cwd or "/", + env=env, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + stdout_thread = threading.Thread( + target=stream_reader, + args=(proc.stdout, "stdout", sock_file, lock), + daemon=True, + ) + stderr_thread = threading.Thread( + target=stream_reader, + args=(proc.stderr, "stderr", sock_file, lock), + daemon=True, + ) + stdin_thread = threading.Thread( + target=stdin_writer, args=(proc, sock_file, conn, lock), daemon=True + ) + + stdout_thread.start() + stderr_thread.start() + stdin_thread.start() + + code = proc.wait() + stdout_thread.join() + stderr_thread.join() + send_frame(sock_file, lock, {"type": "exit", "code": code}) + except Exception as exc: + try: + send_frame(sock_file, lock, {"type": "error", "message": str(exc)}) + except Exception: + pass + finally: + try: + sock_file.close() + except Exception: + pass + conn.close() + + +def main(): + if not hasattr(socket, "AF_VSOCK"): + print("AF_VSOCK is not available", file=sys.stderr) + return 1 + + server = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) + 
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind((socket.VMADDR_CID_ANY, PORT)) + server.listen(16) + + while True: + conn, _addr = server.accept() + thread = threading.Thread(target=handle_client, args=(conn,), daemon=True) + thread.start() + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh new file mode 100755 index 000000000..c4e89cab0 --- /dev/null +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -0,0 +1,692 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Init script for the openshell-vm microVM. Runs as PID 1 inside the libkrun VM. +# +# Mounts essential virtual filesystems, configures networking, then execs +# k3s server. If the rootfs was pre-initialized by build-rootfs.sh (sentinel +# at /opt/openshell/.initialized), the full manifest setup is skipped and +# k3s resumes from its persisted state (~3-5s startup). + +set -e + +BOOT_START=$(date +%s%3N 2>/dev/null || date +%s) + +ts() { + local now + now=$(date +%s%3N 2>/dev/null || date +%s) + local elapsed=$(( (now - BOOT_START) )) + printf "[%d.%03ds] %s\n" $((elapsed / 1000)) $((elapsed % 1000)) "$*" +} + +PRE_INITIALIZED=false +if [ -f /opt/openshell/.initialized ]; then + PRE_INITIALIZED=true + ts "pre-initialized rootfs detected (fast path)" +fi + +# ── Mount essential filesystems (parallel) ────────────────────────────── +# These are independent; mount them concurrently. + +mount -t proc proc /proc 2>/dev/null & +mount -t sysfs sysfs /sys 2>/dev/null & +mount -t tmpfs tmpfs /tmp 2>/dev/null & +mount -t tmpfs tmpfs /run 2>/dev/null & +mount -t devtmpfs devtmpfs /dev 2>/dev/null & +wait + +# These depend on /dev being mounted. 
+mkdir -p /dev/pts /dev/shm +mount -t devpts devpts /dev/pts 2>/dev/null & +mount -t tmpfs tmpfs /dev/shm 2>/dev/null & + +# cgroup2 (unified hierarchy) — required by k3s/containerd. +mkdir -p /sys/fs/cgroup +mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & +wait + +ts "filesystems mounted" + +# ── Networking ────────────────────────────────────────────────────────── + +hostname openshell-vm 2>/dev/null || true + +# Ensure loopback is up (k3s binds to 127.0.0.1). +ip link set lo up 2>/dev/null || true + +# Detect whether we have a real network interface (gvproxy) or need a +# dummy interface (TSI / no networking). +if ip link show eth0 >/dev/null 2>&1; then + # gvproxy networking — bring up eth0 and get an IP via DHCP. + # gvproxy has a built-in DHCP server that assigns 192.168.127.2/24 + # with gateway 192.168.127.1 and configures ARP properly. + ts "detected eth0 (gvproxy networking)" + ip link set eth0 up 2>/dev/null || true + + # Use DHCP to get IP and configure routes. gvproxy's DHCP server + # handles ARP resolution which static config does not. + if command -v udhcpc >/dev/null 2>&1; then + # udhcpc needs a script to apply the lease. Use the busybox + # default script if available, otherwise write a minimal one. + UDHCPC_SCRIPT="/usr/share/udhcpc/default.script" + if [ ! 
-f "$UDHCPC_SCRIPT" ]; then + mkdir -p /usr/share/udhcpc + cat > "$UDHCPC_SCRIPT" << 'DHCP_SCRIPT' +#!/bin/sh +case "$1" in + bound|renew) + ip addr flush dev "$interface" + ip addr add "$ip/$mask" dev "$interface" + if [ -n "$router" ]; then + ip route add default via $router dev "$interface" + fi + if [ -n "$dns" ]; then + echo -n > /etc/resolv.conf + for d in $dns; do + echo "nameserver $d" >> /etc/resolv.conf + done + fi + ;; +esac +DHCP_SCRIPT + chmod +x "$UDHCPC_SCRIPT" + fi + # -f: stay in foreground, -q: quit after obtaining lease, + # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries + # -A 1: wait 1s before first retry (aggressive for local gvproxy) + udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1 || true + else + # Fallback to static config if no DHCP client available. + ts "no DHCP client, using static config" + ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true + ip route add default via 192.168.127.1 2>/dev/null || true + fi + + # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, + # but if it didn't (or static fallback was used), provide a default. + if [ ! -s /etc/resolv.conf ]; then + echo "nameserver 8.8.8.8" > /etc/resolv.conf + echo "nameserver 8.8.4.4" >> /etc/resolv.conf + fi + + # Read back the IP we got (from DHCP or static). + NODE_IP=$(ip -4 addr show eth0 | grep -oP 'inet \K[^/]+' || echo "192.168.127.2") + ts "eth0 IP: $NODE_IP" +else + # TSI or no networking — create a dummy interface for k3s. 
+ ts "no eth0 found, using dummy interface (TSI mode)" + ip link add dummy0 type dummy 2>/dev/null || true + ip addr add 10.0.2.15/24 dev dummy0 2>/dev/null || true + ip link set dummy0 up 2>/dev/null || true + ip route add default dev dummy0 2>/dev/null || true + + NODE_IP="10.0.2.15" +fi + +# ── k3s data directories ─────────────────────────────────────────────── + +mkdir -p /var/lib/rancher/k3s +mkdir -p /etc/rancher/k3s + +# Clean stale runtime artifacts from previous boots (virtio-fs persists +# the rootfs between VM restarts). +rm -rf /var/lib/rancher/k3s/server/tls/temporary-certs 2>/dev/null || true +rm -f /var/lib/rancher/k3s/server/kine.sock 2>/dev/null || true +# Clean stale node password so k3s doesn't fail validation on reboot. +# Each k3s start generates a new random node password; the old hash in +# the database will not match. Removing the local password file forces +# k3s to re-register with a fresh one. +rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true +# Also clean any stale pid files and unix sockets +find /var/lib/rancher/k3s -name '*.sock' -delete 2>/dev/null || true +find /run -name '*.sock' -delete 2>/dev/null || true + +# Clean stale containerd runtime state from previous boots. +# +# The rootfs persists across VM restarts via virtio-fs. The overlayfs +# snapshotter is backed by tmpfs (see below), so snapshot layer data is +# wiped on every boot. We must also delete meta.db because it contains +# snapshot metadata (parent chain references) that become invalid once +# the tmpfs is remounted. If meta.db survives but the snapshot dirs +# don't, containerd fails every pod with: +# "missing parent bucket: not found" +# because it tries to look up snapshot parents that no longer exist. +# +# Deleting meta.db is safe: containerd rebuilds it on startup by +# re-importing the pre-baked image tarballs from +# /var/lib/rancher/k3s/agent/images/ (adds ~3s to boot). 
The content +# store blobs on virtio-fs are preserved so no network pulls are needed. +# +# The kine (SQLite) DB cleanup in build-rootfs.sh already removes stale +# pod/sandbox records from k3s etcd, preventing kubelet from reconciling +# against stale sandboxes. +CONTAINERD_DIR="/var/lib/rancher/k3s/agent/containerd" +if [ -d "$CONTAINERD_DIR" ]; then + # Remove runtime task state (stale shim PIDs, sockets from dead processes). + rm -rf "${CONTAINERD_DIR}/io.containerd.runtime.v2.task" 2>/dev/null || true + # Remove sandbox controller shim state. Stale sandbox records cause + # containerd to reuse network namespaces from previous boots, which + # already have routes configured. The CNI bridge plugin then fails + # with "file exists" when adding the default route on retry. + rm -rf "${CONTAINERD_DIR}/io.containerd.sandbox.controller.v1.shim" 2>/dev/null || true + # Clean stale ingest temp files from the content store. + rm -rf "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" 2>/dev/null || true + mkdir -p "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" + # Delete meta.db — snapshot metadata references are invalidated by + # the tmpfs remount below. containerd will rebuild it from the + # pre-baked image tarballs on startup. + rm -f "${CONTAINERD_DIR}/io.containerd.metadata.v1.bolt/meta.db" 2>/dev/null || true + ts "cleaned containerd runtime state (reset meta.db + content store preserved)" +fi +rm -rf /run/k3s 2>/dev/null || true + +# Mount tmpfs for the overlayfs snapshotter upper/work directories. +# The overlayfs snapshotter on virtio-fs fails with "network dropped +# connection on reset" when runc tries to create bind mount targets +# inside the overlay. This is because virtio-fs (FUSE) doesn't fully +# support the file operations overlayfs needs in the upper layer. +# Using tmpfs (backed by RAM) for the snapshotter directory avoids +# this issue entirely. 
With 8GB VM RAM, this leaves ~6GB for image +# layers which is sufficient for typical sandbox workloads. +OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" +mkdir -p "$OVERLAYFS_DIR" +mount -t tmpfs -o size=4g tmpfs "$OVERLAYFS_DIR" +ts "mounted tmpfs for overlayfs snapshotter (4GB)" + +ts "stale artifacts cleaned" + +# ── Clean stale CNI / pod networking state ────────────────────────────── +# The rootfs persists across VM restarts via virtio-fs. Previous pod +# sandboxes leave behind veth pairs, bridge routes, host-local IPAM +# allocations, and network namespaces. If not cleaned, the bridge CNI +# plugin fails with: +# "failed to add route ... file exists" +# because the default route via cni0 already exists from the prior boot, +# or a stale network namespace already has the route configured. + +# Tear down the CNI bridge and its associated routes. +if ip link show cni0 >/dev/null 2>&1; then + ip link set cni0 down 2>/dev/null || true + ip link delete cni0 2>/dev/null || true + ts "deleted stale cni0 bridge" +fi + +# Remove any leftover veth pairs (CNI bridge plugin creates vethXXXX). +for veth in $(ip -o link show type veth 2>/dev/null | awk -F': ' '{print $2}' | cut -d'@' -f1); do + ip link delete "$veth" 2>/dev/null || true +done + +# Flush host-local IPAM allocations so IPs can be reassigned cleanly. +rm -rf /var/lib/cni/networks 2>/dev/null || true +rm -rf /var/lib/cni/results 2>/dev/null || true + +# Flush any stale CNI-added routes for the pod CIDR. These can conflict +# with routes the bridge plugin tries to add on the next boot. +ip route flush 10.42.0.0/24 2>/dev/null || true + +# Clean up stale pod network namespaces from previous boots. Containerd +# creates named netns under /var/run/netns/ for each pod sandbox. If +# these persist across VM restarts, the CNI bridge plugin fails when +# adding routes because the stale netns already has the default route +# configured from the prior boot. 
Removing all named network namespaces +# forces containerd to create fresh ones. +if [ -d /var/run/netns ]; then + for ns in $(ip netns list 2>/dev/null | awk '{print $1}'); do + ip netns delete "$ns" 2>/dev/null || true + done +fi +# Also clean the netns bind-mount directory used by containerd/CRI. +# Containerd may use /run/netns/ or /var/run/netns/ (same via tmpfs). +rm -rf /run/netns/* 2>/dev/null || true +rm -rf /var/run/netns/* 2>/dev/null || true + +ts "stale CNI networking state cleaned" + +# ── Network profile detection ─────────────────────────────────────────── +# Detect early so manifest patching and k3s flags both use the same value. +# +# "bridge" is the only supported profile. It requires a custom libkrunfw +# with CONFIG_BRIDGE, CONFIG_NETFILTER, CONFIG_NF_NAT built in. If the +# kernel lacks these capabilities the VM cannot run pod networking and we +# fail fast with an actionable error. + +NET_PROFILE="bridge" + +ts "network profile: ${NET_PROFILE}" + +# Validate that the kernel actually has the required capabilities. +_caps_ok=true +if ! ip link add _cap_br0 type bridge 2>/dev/null; then + echo "ERROR: kernel lacks bridge support (CONFIG_BRIDGE). Use a custom libkrunfw." >&2 + _caps_ok=false +else + ip link del _cap_br0 2>/dev/null || true +fi +if [ ! -d /proc/sys/net/netfilter ] && [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + echo "ERROR: kernel lacks netfilter support (CONFIG_NETFILTER). Use a custom libkrunfw." >&2 + _caps_ok=false +fi +if [ "$_caps_ok" = false ]; then + echo "FATAL: required kernel capabilities missing — cannot configure pod networking." >&2 + echo "See: architecture/custom-vm-runtime.md for build instructions." >&2 + exit 1 +fi + +# ── Deploy bundled manifests (cold boot only) ─────────────────────────── +# On pre-initialized rootfs, manifests are already in place from the +# build-time k3s boot. Skip this entirely for fast startup. 
+ +K3S_MANIFESTS="/var/lib/rancher/k3s/server/manifests" +BUNDLED_MANIFESTS="/opt/openshell/manifests" + +if [ "$PRE_INITIALIZED" = false ]; then + + mkdir -p "$K3S_MANIFESTS" + + if [ -d "$BUNDLED_MANIFESTS" ]; then + ts "deploying bundled manifests (cold boot)..." + for manifest in "$BUNDLED_MANIFESTS"/*.yaml; do + [ ! -f "$manifest" ] && continue + cp "$manifest" "$K3S_MANIFESTS/" + done + + # Remove stale OpenShell-managed manifests from previous boots. + for existing in "$K3S_MANIFESTS"/openshell-*.yaml \ + "$K3S_MANIFESTS"/agent-*.yaml; do + [ ! -f "$existing" ] && continue + basename=$(basename "$existing") + if [ ! -f "$BUNDLED_MANIFESTS/$basename" ]; then + rm -f "$existing" + fi + done + fi + + # Restore helm chart tarballs from staging. A --reset wipes + # server/static/charts/ but the bundled charts survive in + # /opt/openshell/charts/. + BUNDLED_CHARTS="/opt/openshell/charts" + K3S_CHARTS="/var/lib/rancher/k3s/server/static/charts" + if [ -d "$BUNDLED_CHARTS" ]; then + mkdir -p "$K3S_CHARTS" + cp "$BUNDLED_CHARTS"/*.tgz "$K3S_CHARTS/" 2>/dev/null || true + ts "helm charts restored from staging" + fi + + ts "manifests deployed" +else + ts "skipping manifest deploy (pre-initialized)" +fi + +# Patch manifests for VM deployment constraints. +HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" +if [ -f "$HELMCHART" ]; then + # Use pre-loaded images — don't pull from registry. + sed -i 's|__IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" + sed -i 's|__SANDBOX_IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" + + # Bridge CNI: pods use normal pod networking, not hostNetwork. + # The pre-init in build-rootfs.sh replaces __HOST_NETWORK__ with "true" + # for Docker container networking. At VM boot with bridge CNI we need + # to override it back to "false" so pods use the CNI bridge network. 
+ sed -i 's|hostNetwork: true|hostNetwork: false|g' "$HELMCHART" + sed -i 's|__HOST_NETWORK__|false|g' "$HELMCHART" + sed -i 's|__AUTOMOUNT_SA_TOKEN__|true|g' "$HELMCHART" + + sed -i 's|__PERSISTENCE_ENABLED__|false|g' "$HELMCHART" + sed -i 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" + # Clear SSH gateway placeholders (default 127.0.0.1 is correct for local VM). + sed -i 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" + sed -i 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" + # Generate a random SSH handshake secret for this boot. + SSH_SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n') + sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_SECRET}|g" "$HELMCHART" + sed -i 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" + sed -i 's|__DISABLE_TLS__|false|g' "$HELMCHART" + sed -i 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" + sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" +fi + +AGENT_MANIFEST="$K3S_MANIFESTS/agent-sandbox.yaml" +if [ -f "$AGENT_MANIFEST" ]; then + # Bridge CNI: agent-sandbox uses normal pod networking. + # kube-proxy is enabled so kubernetes.default.svc is reachable + # via ClusterIP — no need for KUBERNETES_SERVICE_HOST override. + sed -i '/hostNetwork: true/d' "$AGENT_MANIFEST" + sed -i '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" + ts "agent-sandbox: using pod networking (bridge profile)" +fi + +# ── CNI configuration (bridge) ────────────────────────────────────────── +# Uses the bridge CNI plugin with iptables masquerade. Requires +# CONFIG_BRIDGE, CONFIG_NETFILTER, CONFIG_NF_NAT in the VM kernel +# (validated above at boot). kube-proxy uses nftables mode for service +# VIP routing. + +CNI_CONF_DIR="/etc/cni/net.d" +CNI_BIN_DIR="/opt/cni/bin" +mkdir -p "$CNI_CONF_DIR" "$CNI_BIN_DIR" + +# Enable IP forwarding (required for masquerade). 
+echo 1 > /proc/sys/net/ipv4/ip_forward 2>/dev/null || true + +# Enable bridge netfilter call (required for CNI bridge masquerade to +# see bridged traffic). +if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + echo 1 > /proc/sys/net/bridge/bridge-nf-call-iptables 2>/dev/null || true +fi + +cat > "$CNI_CONF_DIR/10-bridge.conflist" << 'CNICFG' +{ + "cniVersion": "1.0.0", + "name": "bridge", + "plugins": [ + { + "type": "bridge", + "bridge": "cni0", + "isGateway": true, + "isDefaultGateway": true, + "ipMasq": true, + "hairpinMode": true, + "ipam": { + "type": "host-local", + "ranges": [[{ "subnet": "10.42.0.0/24" }]] + } + }, + { + "type": "portmap", + "capabilities": { "portMappings": true }, + "snat": true + }, + { + "type": "loopback" + } + ] +} +CNICFG + +# Remove any stale legacy ptp config. +rm -f "$CNI_CONF_DIR/10-ptp.conflist" 2>/dev/null || true + +ts "bridge CNI configured (cni0 + iptables masquerade)" + +# Start the local exec agent before k3s so `openshell-vm exec` works as soon as +# the VM has booted. It only listens on vsock, not on the guest network. +if command -v python3 >/dev/null 2>&1; then + ts "starting openshell-vm exec agent" + mkdir -p /run/openshell + setsid python3 /srv/openshell-vm-exec-agent.py >/run/openshell/openshell-vm-exec-agent.log 2>&1 & +else + ts "WARNING: python3 missing, openshell-vm exec agent disabled" +fi + +# Symlink k3s-bundled CNI binaries to the default containerd bin path. +# k3s extracts its tools to /var/lib/rancher/k3s/data//bin/ at startup. +# On cold boot this directory doesn't exist yet (k3s hasn't run), so we +# first try synchronously, then fall back to a background watcher that +# polls until k3s extracts the binaries and creates the symlinks before +# any pods can schedule. +link_cni_binaries() { + local data_bin="$1" + # Ensure execute permissions on all binaries. The rootfs may have + # been built on macOS where virtio-fs or docker export can strip + # execute bits from Linux ELF binaries. 
+ chmod +x "$data_bin"/* 2>/dev/null || true + if [ -d "$data_bin/aux" ]; then + chmod +x "$data_bin/aux"/* 2>/dev/null || true + fi + for plugin in bridge host-local loopback bandwidth portmap; do + [ -e "$data_bin/$plugin" ] && ln -sf "$data_bin/$plugin" "$CNI_BIN_DIR/$plugin" + done +} + +# Find the k3s data bin dir, excluding temporary extraction directories +# (k3s extracts to -tmp/ then renames to /). +find_k3s_data_bin() { + find /var/lib/rancher/k3s/data -maxdepth 2 -name bin -type d 2>/dev/null \ + | grep -v '\-tmp/' | head -1 +} + +K3S_DATA_BIN=$(find_k3s_data_bin) +if [ -n "$K3S_DATA_BIN" ]; then + link_cni_binaries "$K3S_DATA_BIN" + ts "CNI binaries linked from $K3S_DATA_BIN" +else + # Cold boot: k3s hasn't extracted binaries yet. Launch a background + # watcher that polls until the data dir appears (k3s creates it in + # the first ~2s of startup) and then symlinks the CNI plugins. + # We exclude -tmp directories to avoid symlinking to the transient + # extraction path that k3s renames once extraction completes. + ts "CNI binaries not yet available, starting background watcher" + setsid sh -c ' + CNI_BIN_DIR="/opt/cni/bin" + for i in $(seq 1 60); do + K3S_DATA_BIN=$(find /var/lib/rancher/k3s/data -maxdepth 2 -name bin -type d 2>/dev/null \ + | grep -v "\-tmp/" | head -1) + if [ -n "$K3S_DATA_BIN" ]; then + chmod +x "$K3S_DATA_BIN"/* 2>/dev/null || true + if [ -d "$K3S_DATA_BIN/aux" ]; then + chmod +x "$K3S_DATA_BIN/aux"/* 2>/dev/null || true + fi + for plugin in bridge host-local loopback bandwidth portmap; do + [ -e "$K3S_DATA_BIN/$plugin" ] && ln -sf "$K3S_DATA_BIN/$plugin" "$CNI_BIN_DIR/$plugin" + done + echo "[cni-watcher] CNI binaries linked from $K3S_DATA_BIN after ${i}s" + exit 0 + fi + sleep 1 + done + echo "[cni-watcher] ERROR: k3s data bin dir not found after 60s" + ' & +fi + +# Also clean up any flannel config from the k3s-specific CNI directory +# (pre-baked state from the Docker build used host-gw flannel). 
+rm -f "/var/lib/rancher/k3s/agent/etc/cni/net.d/10-flannel.conflist" 2>/dev/null || true + +# ── PKI: generate once, write TLS secrets manifest every boot ────────── +# Certs are generated on first boot and stored at /opt/openshell/pki/. +# They survive --reset (which only wipes k3s server/agent state). +# The host-side bootstrap reads them from the rootfs via virtio-fs and +# copies the client certs to ~/.config/openshell/gateways//mtls/. + +PKI_DIR="/opt/openshell/pki" +if [ ! -f "$PKI_DIR/ca.crt" ]; then + ts "generating PKI (first boot)..." + mkdir -p "$PKI_DIR" + + # CA + openssl req -x509 -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 \ + -keyout "$PKI_DIR/ca.key" -out "$PKI_DIR/ca.crt" \ + -days 3650 -nodes -subj "/O=openshell/CN=openshell-ca" 2>/dev/null + + # Server cert with SANs + cat > "$PKI_DIR/server.cnf" </dev/null + openssl x509 -req -in "$PKI_DIR/server.csr" \ + -CA "$PKI_DIR/ca.crt" -CAkey "$PKI_DIR/ca.key" -CAcreateserial \ + -out "$PKI_DIR/server.crt" -days 3650 \ + -extensions v3_req -extfile "$PKI_DIR/server.cnf" 2>/dev/null + + # Client cert (must be v3 — rustls rejects v1) + cat > "$PKI_DIR/client.cnf" </dev/null + openssl x509 -req -in "$PKI_DIR/client.csr" \ + -CA "$PKI_DIR/ca.crt" -CAkey "$PKI_DIR/ca.key" -CAcreateserial \ + -out "$PKI_DIR/client.crt" -days 3650 \ + -extensions v3_client -extfile "$PKI_DIR/client.cnf" 2>/dev/null + + # Clean up CSRs + rm -f "$PKI_DIR"/*.csr "$PKI_DIR"/*.cnf "$PKI_DIR"/*.srl + + ts "PKI generated" +else + ts "existing PKI found, skipping generation" +fi + +# Write TLS secrets as a k3s auto-deploy manifest. k3s applies any YAML +# in server/manifests/ on startup. We write this on every boot so that +# a --reset (which wipes the kine DB) gets the secrets re-applied. +ts "writing TLS secrets manifest..." 
+mkdir -p "$K3S_MANIFESTS" +CA_CRT_B64=$(base64 -w0 < "$PKI_DIR/ca.crt") +SERVER_CRT_B64=$(base64 -w0 < "$PKI_DIR/server.crt") +SERVER_KEY_B64=$(base64 -w0 < "$PKI_DIR/server.key") +CLIENT_CRT_B64=$(base64 -w0 < "$PKI_DIR/client.crt") +CLIENT_KEY_B64=$(base64 -w0 < "$PKI_DIR/client.key") + +cat > "$K3S_MANIFESTS/openshell-tls-secrets.yaml" < "$DIAG" + exit 1 + fi + { + echo "=== [DIAG $(date +%s)] nft binary: $NFT ===" + echo "=== [DIAG] nft list tables ===" + "$NFT" list tables 2>&1 + echo "=== [DIAG] nft list ruleset (kube-proxy) ===" + "$NFT" list ruleset 2>&1 + echo "=== [DIAG] ss -tlnp ===" + ss -tlnp 2>&1 || busybox netstat -tlnp 2>&1 || echo "ss/netstat not available" + echo "=== [DIAG] ip addr ===" + ip addr 2>&1 + echo "=== [DIAG] ip route ===" + ip route 2>&1 + echo "=== [DIAG] iptables -t nat -L -n -v ===" + iptables -t nat -L -n -v 2>&1 + echo "=== [DIAG] kube-proxy healthz ===" + wget -q -O - http://127.0.0.1:10256/healthz 2>&1 || echo "healthz failed" + echo "=== [DIAG] conntrack -L ===" + conntrack -L 2>&1 || echo "conntrack not available" + echo "=== [DIAG] done ===" + } > "$DIAG" 2>&1 +' & + +exec /usr/local/bin/k3s server "${K3S_ARGS[@]}" diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs new file mode 100644 index 000000000..34a9d5043 --- /dev/null +++ b/crates/openshell-vm/src/exec.rs @@ -0,0 +1,534 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::fs::{self, File}; +use std::io::{BufRead, BufReader, Read, Write}; +use std::os::unix::net::UnixStream; +use std::path::{Path, PathBuf}; +use std::thread; +use std::time::{SystemTime, UNIX_EPOCH}; + +use base64::Engine as _; +use serde::{Deserialize, Serialize}; + +use crate::VmError; + +pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; + +const VM_EXEC_SOCKET_NAME: &str = "openshell-vm-exec.sock"; +const VM_STATE_NAME: &str = "vm-state.json"; +const VM_LOCK_NAME: &str = "vm.lock"; +const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; + +#[derive(Debug, Clone)] +pub struct VmExecOptions { + pub rootfs: Option, + pub command: Vec, + pub workdir: Option, + pub env: Vec, + pub tty: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmRuntimeState { + pub pid: i32, + pub exec_vsock_port: u32, + pub socket_path: PathBuf, + pub rootfs: PathBuf, + pub console_log: PathBuf, + pub started_at_ms: u128, +} + +#[derive(Debug, Serialize)] +struct ExecRequest { + argv: Vec, + env: Vec, + cwd: Option, + tty: bool, +} + +#[derive(Debug, Serialize)] +#[serde(tag = "type", rename_all = "snake_case")] +enum ClientFrame { + Stdin { data: String }, + StdinClose, +} + +#[derive(Debug, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +enum ServerFrame { + Stdout { data: String }, + Stderr { data: String }, + Exit { code: i32 }, + Error { message: String }, +} + +pub fn vm_exec_socket_path(rootfs: &Path) -> PathBuf { + vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_EXEC_SOCKET_NAME)) +} + +pub fn write_vm_runtime_state(rootfs: &Path, pid: i32, console_log: &Path) -> Result<(), VmError> { + let state = VmRuntimeState { + pid, + exec_vsock_port: VM_EXEC_VSOCK_PORT, + socket_path: vm_exec_socket_path(rootfs), + rootfs: rootfs.to_path_buf(), + console_log: console_log.to_path_buf(), + started_at_ms: now_ms()?, + }; + let path = vm_state_path(rootfs); + let bytes = 
serde_json::to_vec_pretty(&state)
+ .map_err(|e| VmError::RuntimeState(format!("serialize VM runtime state: {e}")))?;
+ fs::create_dir_all(vm_run_dir(rootfs))
+ .map_err(|e| VmError::RuntimeState(format!("create VM runtime dir: {e}")))?;
+ fs::write(&path, bytes)
+ .map_err(|e| VmError::RuntimeState(format!("write {}: {e}", path.display())))?;
+ Ok(())
+}
+
+pub fn clear_vm_runtime_state(rootfs: &Path) {
+ let state_path = vm_state_path(rootfs);
+ let socket_path = vm_exec_socket_path(rootfs);
+ let _ = fs::remove_file(state_path);
+ let _ = fs::remove_file(socket_path);
+}
+
+/// Wipe k3s and container runtime state from the rootfs (full reset).
+///
+/// After a crash or unclean shutdown, containerd and kubelet can retain
+/// references to pod sandboxes and containers that no longer exist. This
+/// causes `ContainerCreating` → `context deadline exceeded` loops because
+/// containerd blocks trying to clean up orphaned resources.
+///
+/// This function removes:
+/// - all k3s server state (kine DB, TLS certs, deployed manifests, tokens)
+/// - all k3s agent containerd state (images, snapshots, metadata)
+/// - kubelet pod state (volume mounts, pod status)
+/// - CNI state (stale network namespace references from dead pods)
+/// - runtime state under var/run (PIDs, sockets, containerd socket)
+///
+/// It also removes the `.initialized` sentinel (so the init script takes
+/// the cold-start path and redeploys manifests from staging), rotates the
+/// VM-side PKI, and wipes the host-side mTLS credentials so bootstrap
+/// copies the regenerated certs down on the next boot.
+pub fn reset_runtime_state(rootfs: &Path) -> Result<(), VmError> {
+ // Full reset: wipe all k3s state so the VM cold-starts from scratch.
+ // The init script will re-import airgap images, deploy manifests,
+ // and generate fresh cluster state. This is slower (~30-60s) but
+ // guarantees no stale state from previous runs.
+ let dirs_to_remove = [ + // All k3s server state: kine DB, TLS certs, manifests, tokens + rootfs.join("var/lib/rancher/k3s/server"), + // All k3s agent state: containerd images, snapshots, metadata + rootfs.join("var/lib/rancher/k3s/agent/containerd"), + // Stale pod volume mounts and projected secrets + rootfs.join("var/lib/kubelet/pods"), + // CNI state: stale network namespace references from dead pods + rootfs.join("var/lib/cni"), + // Runtime state (PIDs, sockets, containerd socket) + rootfs.join("var/run"), + ]; + + let mut cleaned = 0usize; + for dir in &dirs_to_remove { + if dir.is_dir() { + fs::remove_dir_all(dir).map_err(|e| { + VmError::RuntimeState(format!("reset: remove {}: {e}", dir.display())) + })?; + cleaned += 1; + } + } + + // Remove the pre-initialized sentinel so the init script knows + // this is a cold start and deploys manifests from staging. + // We write a marker file so ensure-vm-rootfs.sh still sees the + // rootfs as built (avoiding a full rebuild) while the init script + // detects the cold start via the missing .initialized sentinel. + let sentinel = rootfs.join("opt/openshell/.initialized"); + let reset_marker = rootfs.join("opt/openshell/.reset"); + if sentinel.exists() { + fs::remove_file(&sentinel).ok(); + fs::write(&reset_marker, "").ok(); + cleaned += 1; + } + + // Rotate PKI: wipe VM-side certs so the init script regenerates + // them on next boot, and wipe host-side mTLS creds so + // bootstrap_gateway() takes the first-boot path and copies the + // new certs down. + let pki_dir = rootfs.join("opt/openshell/pki"); + if pki_dir.is_dir() { + fs::remove_dir_all(&pki_dir).ok(); + cleaned += 1; + eprintln!("Reset: rotated PKI (will regenerate on next boot)"); + } + + // Wipe host-side mTLS credentials so bootstrap picks up the new certs. 
+ if let Ok(home) = std::env::var("HOME") { + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + let mtls_dir = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(super::GATEWAY_CLUSTER_NAME) + .join("mtls"); + if mtls_dir.is_dir() { + fs::remove_dir_all(&mtls_dir).ok(); + } + // Also remove metadata so is_warm_boot() returns false. + let metadata = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(super::GATEWAY_CLUSTER_NAME) + .join("metadata.json"); + if metadata.is_file() { + fs::remove_file(&metadata).ok(); + } + } + + eprintln!("Reset: cleaned {cleaned} state directories (full reset)"); + Ok(()) +} + +/// Acquire an exclusive lock on the rootfs lock file. +/// +/// The lock is held for the lifetime of the returned `File` handle. When +/// the process exits (even via SIGKILL), the OS releases the lock +/// automatically. This provides a reliable guard against two VM processes +/// sharing the same rootfs — even if the state file is deleted. +/// +/// Returns `Ok(File)` on success. The caller must keep the `File` alive +/// for as long as the VM is running. +pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { + let lock_path = vm_lock_path(rootfs); + fs::create_dir_all(vm_run_dir(rootfs)) + .map_err(|e| VmError::RuntimeState(format!("create VM runtime dir: {e}")))?; + + // Open (or create) the lock file without truncating so we can read + // the holder's PID for the error message if the lock is held. + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(false) + .open(&lock_path) + .map_err(|e| { + VmError::RuntimeState(format!("open lock file {}: {e}", lock_path.display())) + })?; + + // Try non-blocking exclusive lock. 
+ let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file); + let rc = unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }; + if rc != 0 { + let err = std::io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EWOULDBLOCK) { + // Another process holds the lock — read its PID for diagnostics. + let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); + let holder_pid = holder_pid.trim(); + return Err(VmError::RuntimeState(format!( + "another process (pid {holder_pid}) is using rootfs {}. \ + Stop the running VM first", + rootfs.display() + ))); + } + return Err(VmError::RuntimeState(format!( + "lock rootfs {}: {err}", + lock_path.display() + ))); + } + + // Lock acquired — write our PID (truncate first, then write). + // This is informational only; the flock is the real guard. + let _ = file.set_len(0); + { + let mut f = &file; + let _ = write!(f, "{}", std::process::id()); + } + + Ok(file) +} + +/// Check whether the rootfs lock file is currently held by another process. +/// +/// Returns `Ok(())` if the lock is free (or can be acquired), and an +/// `Err` if another process holds it. Does NOT acquire the lock — use +/// [`acquire_rootfs_lock`] for that. +fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { + let lock_path = vm_lock_path(rootfs); + if !lock_path.exists() { + return Ok(()); + } + + let Ok(file) = File::open(&lock_path) else { + return Ok(()); // Can't open → treat as free + }; + + let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file); + let rc = unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }; + if rc != 0 { + let err = std::io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EWOULDBLOCK) { + let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); + let holder_pid = holder_pid.trim(); + return Err(VmError::RuntimeState(format!( + "another process (pid {holder_pid}) is using rootfs {}. 
\ + Stop the running VM first", + rootfs.display() + ))); + } + } else { + // We acquired the lock — release it immediately since we're only probing. + unsafe { libc::flock(fd, libc::LOCK_UN) }; + } + + Ok(()) +} + +pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { + // Primary guard: check the flock. This works even if the state file + // has been deleted, because the kernel holds the lock until the + // owning process exits. + check_rootfs_lock_free(rootfs)?; + + // Secondary guard: check the state file for any stale state. + match load_vm_runtime_state(Some(rootfs)) { + Ok(state) => Err(VmError::RuntimeState(format!( + "VM is already running (pid {}) with exec socket {}", + state.pid, + state.socket_path.display() + ))), + Err(VmError::RuntimeState(message)) + if message.starts_with("read VM runtime state") + || message.starts_with("VM is not running") => + { + clear_vm_runtime_state(rootfs); + Ok(()) + } + Err(err) => Err(err), + } +} + +pub fn exec_running_vm(options: VmExecOptions) -> Result { + let state = load_vm_runtime_state(options.rootfs.as_deref())?; + let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { + VmError::Exec(format!( + "connect to VM exec socket {}: {e}", + state.socket_path.display() + )) + })?; + let mut writer = stream + .try_clone() + .map_err(|e| VmError::Exec(format!("clone VM exec socket: {e}")))?; + + let mut env = options.env; + validate_env_vars(&env)?; + if !env.iter().any(|item| item.starts_with("KUBECONFIG=")) { + env.push(KUBECONFIG_ENV.to_string()); + } + + let request = ExecRequest { + argv: options.command, + env, + cwd: options.workdir, + tty: options.tty, + }; + send_json_line(&mut writer, &request)?; + + let stdin_writer = writer; + thread::spawn(move || { + let _ = pump_stdin(stdin_writer); + }); + + let mut reader = BufReader::new(&mut stream); + let mut line = String::new(); + let stdout = std::io::stdout(); + let stderr = std::io::stderr(); + let mut stdout = stdout.lock(); + let 
mut stderr = stderr.lock(); + let mut exit_code = None; + + loop { + line.clear(); + let bytes = reader + .read_line(&mut line) + .map_err(|e| VmError::Exec(format!("read VM exec response from guest agent: {e}")))?; + if bytes == 0 { + break; + } + + let frame: ServerFrame = serde_json::from_str(line.trim_end()) + .map_err(|e| VmError::Exec(format!("decode VM exec response frame: {e}")))?; + + match frame { + ServerFrame::Stdout { data } => { + let bytes = decode_payload(&data)?; + stdout + .write_all(&bytes) + .map_err(|e| VmError::Exec(format!("write guest stdout: {e}")))?; + stdout + .flush() + .map_err(|e| VmError::Exec(format!("flush guest stdout: {e}")))?; + } + ServerFrame::Stderr { data } => { + let bytes = decode_payload(&data)?; + stderr + .write_all(&bytes) + .map_err(|e| VmError::Exec(format!("write guest stderr: {e}")))?; + stderr + .flush() + .map_err(|e| VmError::Exec(format!("flush guest stderr: {e}")))?; + } + ServerFrame::Exit { code } => { + exit_code = Some(code); + break; + } + ServerFrame::Error { message } => { + return Err(VmError::Exec(message)); + } + } + } + + exit_code.ok_or_else(|| { + VmError::Exec("VM exec agent disconnected before returning an exit code".to_string()) + }) +} + +fn vm_run_dir(rootfs: &Path) -> PathBuf { + rootfs.parent().unwrap_or(rootfs).to_path_buf() +} + +fn vm_state_path(rootfs: &Path) -> PathBuf { + vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_STATE_NAME)) +} + +fn vm_lock_path(rootfs: &Path) -> PathBuf { + vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_LOCK_NAME)) +} + +fn rootfs_key(rootfs: &Path) -> String { + let name = rootfs + .file_name() + .and_then(|part| part.to_str()) + .unwrap_or("openshell-vm"); + let mut out = String::with_capacity(name.len()); + for ch in name.chars() { + if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { + out.push(ch); + } else { + out.push('_'); + } + } + if out.is_empty() { + "openshell-vm".to_string() + } else { + out + } +} + +fn 
default_rootfs() -> Result { + openshell_bootstrap::paths::default_rootfs_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve default VM rootfs: {e}"))) +} + +fn load_vm_runtime_state(rootfs: Option<&Path>) -> Result { + let rootfs = match rootfs { + Some(rootfs) => rootfs.to_path_buf(), + None => default_rootfs()?, + }; + let path = vm_state_path(&rootfs); + let bytes = fs::read(&path).map_err(|e| { + VmError::RuntimeState(format!( + "read VM runtime state {}: {e}. Start the VM with `openshell-vm` first", + path.display() + )) + })?; + let state: VmRuntimeState = serde_json::from_slice(&bytes) + .map_err(|e| VmError::RuntimeState(format!("decode VM runtime state: {e}")))?; + + if !process_alive(state.pid) { + clear_vm_runtime_state(&state.rootfs); + return Err(VmError::RuntimeState(format!( + "VM is not running (stale pid {})", + state.pid + ))); + } + + if !state.socket_path.exists() { + return Err(VmError::RuntimeState(format!( + "VM exec socket is not ready: {}", + state.socket_path.display() + ))); + } + + Ok(state) +} + +fn validate_env_vars(items: &[String]) -> Result<(), VmError> { + for item in items { + let (key, _value) = item.split_once('=').ok_or_else(|| { + VmError::Exec(format!( + "invalid environment variable `{item}`; expected KEY=VALUE" + )) + })?; + if key.is_empty() + || !key.chars().enumerate().all(|(idx, ch)| { + ch == '_' || ch.is_ascii_alphanumeric() && (idx > 0 || !ch.is_ascii_digit()) + }) + { + return Err(VmError::Exec(format!( + "invalid environment variable name `{key}`" + ))); + } + } + Ok(()) +} + +fn send_json_line(writer: &mut UnixStream, value: &T) -> Result<(), VmError> { + let mut bytes = serde_json::to_vec(value) + .map_err(|e| VmError::Exec(format!("encode VM exec request: {e}")))?; + bytes.push(b'\n'); + writer + .write_all(&bytes) + .map_err(|e| VmError::Exec(format!("write VM exec request: {e}"))) +} + +fn pump_stdin(mut writer: UnixStream) -> Result<(), VmError> { + let stdin = std::io::stdin(); + let mut stdin = 
stdin.lock(); + let mut buf = [0u8; 8192]; + + loop { + let read = stdin + .read(&mut buf) + .map_err(|e| VmError::Exec(format!("read local stdin: {e}")))?; + if read == 0 { + break; + } + let frame = ClientFrame::Stdin { + data: base64::engine::general_purpose::STANDARD.encode(&buf[..read]), + }; + send_json_line(&mut writer, &frame)?; + } + + send_json_line(&mut writer, &ClientFrame::StdinClose) +} + +fn decode_payload(data: &str) -> Result, VmError> { + base64::engine::general_purpose::STANDARD + .decode(data) + .map_err(|e| VmError::Exec(format!("decode VM exec payload: {e}"))) +} + +fn process_alive(pid: i32) -> bool { + let rc = unsafe { libc::kill(pid, 0) }; + if rc == 0 { + return true; + } + std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM) +} + +fn now_ms() -> Result { + let duration = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| VmError::RuntimeState(format!("read system clock: {e}")))?; + Ok(duration.as_millis()) +} diff --git a/crates/openshell-vm/src/ffi.rs b/crates/openshell-vm/src/ffi.rs new file mode 100644 index 000000000..1ae7651cd --- /dev/null +++ b/crates/openshell-vm/src/ffi.rs @@ -0,0 +1,312 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal runtime-loaded bindings for the libkrun C API. +//! +//! We intentionally do not link libkrun at build time. Instead, the +//! `openshell-vm` binary loads `libkrun` from the staged `openshell-vm.runtime/` +//! sidecar bundle on first use. + +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; + +use libc::c_char; +use libloading::Library; + +use crate::VmError; + +/// Runtime provenance information extracted from the bundle. +#[derive(Debug, Clone)] +pub struct RuntimeProvenance { + /// Path to the libkrun library that was loaded. + pub libkrun_path: PathBuf, + /// Paths to all libkrunfw libraries that were preloaded. 
+ pub libkrunfw_paths: Vec, + /// SHA-256 hash of the primary libkrunfw artifact (if computable). + pub libkrunfw_sha256: Option, + /// Contents of provenance.json if present in the runtime bundle. + pub provenance_json: Option, + /// Whether this is a custom (OpenShell-built) runtime. + pub is_custom: bool, +} + +pub const KRUN_LOG_TARGET_DEFAULT: i32 = -1; +pub const KRUN_LOG_LEVEL_OFF: u32 = 0; +pub const KRUN_LOG_LEVEL_ERROR: u32 = 1; +pub const KRUN_LOG_LEVEL_WARN: u32 = 2; +pub const KRUN_LOG_LEVEL_INFO: u32 = 3; +pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4; +pub const KRUN_LOG_LEVEL_TRACE: u32 = 5; +pub const KRUN_LOG_STYLE_AUTO: u32 = 0; +pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1; + +type KrunInitLog = + unsafe extern "C" fn(target_fd: i32, level: u32, style: u32, options: u32) -> i32; +type KrunCreateCtx = unsafe extern "C" fn() -> i32; +type KrunFreeCtx = unsafe extern "C" fn(ctx_id: u32) -> i32; +type KrunSetVmConfig = unsafe extern "C" fn(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32; +type KrunSetRoot = unsafe extern "C" fn(ctx_id: u32, root_path: *const c_char) -> i32; +type KrunSetWorkdir = unsafe extern "C" fn(ctx_id: u32, workdir_path: *const c_char) -> i32; +type KrunSetExec = unsafe extern "C" fn( + ctx_id: u32, + exec_path: *const c_char, + argv: *const *const c_char, + envp: *const *const c_char, +) -> i32; +type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32; +type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32; +type KrunAddVsockPort2 = + unsafe extern "C" fn(ctx_id: u32, port: u32, c_filepath: *const c_char, listen: bool) -> i32; +type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32; +type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32; +type KrunAddVsock = unsafe extern "C" fn(ctx_id: u32, tsi_features: u32) -> i32; +type KrunAddNetUnixgram = unsafe extern "C" fn( + ctx_id: u32, + c_path: *const c_char, + fd: i32, + c_mac: 
*const u8, + features: u32, + flags: u32, +) -> i32; + +pub struct LibKrun { + pub krun_init_log: KrunInitLog, + pub krun_create_ctx: KrunCreateCtx, + pub krun_free_ctx: KrunFreeCtx, + pub krun_set_vm_config: KrunSetVmConfig, + pub krun_set_root: KrunSetRoot, + pub krun_set_workdir: KrunSetWorkdir, + pub krun_set_exec: KrunSetExec, + pub krun_set_port_map: KrunSetPortMap, + pub krun_set_console_output: KrunSetConsoleOutput, + pub krun_add_vsock_port2: KrunAddVsockPort2, + pub krun_start_enter: KrunStartEnter, + pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, + pub krun_add_vsock: KrunAddVsock, + pub krun_add_net_unixgram: KrunAddNetUnixgram, +} + +static LIBKRUN: OnceLock = OnceLock::new(); +static RUNTIME_PROVENANCE: OnceLock = OnceLock::new(); + +pub fn libkrun() -> Result<&'static LibKrun, VmError> { + if let Some(lib) = LIBKRUN.get() { + return Ok(lib); + } + + let loaded = LibKrun::load()?; + let _ = LIBKRUN.set(loaded); + Ok(LIBKRUN.get().expect("libkrun should be initialized")) +} + +/// Return the provenance information for the loaded runtime. +/// +/// Only available after [`libkrun()`] has been called successfully. +pub fn runtime_provenance() -> Option<&'static RuntimeProvenance> { + RUNTIME_PROVENANCE.get() +} + +impl LibKrun { + fn load() -> Result { + let path = runtime_libkrun_path()?; + let runtime_dir = path.parent().ok_or_else(|| { + VmError::HostSetup(format!("libkrun has no parent dir: {}", path.display())) + })?; + let krunfw_paths = preload_runtime_support_libraries(runtime_dir)?; + + // Build and store provenance information. 
+ let provenance_json_path = runtime_dir.join("provenance.json"); + let provenance_json = fs::read_to_string(&provenance_json_path).ok(); + let is_custom = provenance_json.is_some(); + + let libkrunfw_sha256 = krunfw_paths.first().and_then(|p| compute_sha256(p).ok()); + + let provenance = RuntimeProvenance { + libkrun_path: path.clone(), + libkrunfw_paths: krunfw_paths, + libkrunfw_sha256, + provenance_json, + is_custom, + }; + let _ = RUNTIME_PROVENANCE.set(provenance); + + let library = Box::leak(Box::new(unsafe { + Library::new(&path).map_err(|e| { + VmError::HostSetup(format!("load libkrun from {}: {e}", path.display())) + })? + })); + + Ok(Self { + krun_init_log: load_symbol(library, b"krun_init_log\0", &path)?, + krun_create_ctx: load_symbol(library, b"krun_create_ctx\0", &path)?, + krun_free_ctx: load_symbol(library, b"krun_free_ctx\0", &path)?, + krun_set_vm_config: load_symbol(library, b"krun_set_vm_config\0", &path)?, + krun_set_root: load_symbol(library, b"krun_set_root\0", &path)?, + krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &path)?, + krun_set_exec: load_symbol(library, b"krun_set_exec\0", &path)?, + krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &path)?, + krun_set_console_output: load_symbol(library, b"krun_set_console_output\0", &path)?, + krun_add_vsock_port2: load_symbol(library, b"krun_add_vsock_port2\0", &path)?, + krun_start_enter: load_symbol(library, b"krun_start_enter\0", &path)?, + krun_disable_implicit_vsock: load_symbol( + library, + b"krun_disable_implicit_vsock\0", + &path, + )?, + krun_add_vsock: load_symbol(library, b"krun_add_vsock\0", &path)?, + krun_add_net_unixgram: load_symbol(library, b"krun_add_net_unixgram\0", &path)?, + }) + } +} + +fn runtime_libkrun_path() -> Result { + Ok(crate::configured_runtime_dir()?.join(required_runtime_lib_name())) +} + +fn preload_runtime_support_libraries(runtime_dir: &Path) -> Result, VmError> { + let entries = fs::read_dir(runtime_dir) + .map_err(|e| 
VmError::HostSetup(format!("read {}: {e}", runtime_dir.display())))?; + + let mut support_libs: Vec = entries + .filter_map(Result::ok) + .map(|entry| entry.path()) + .filter(|path| { + path.file_name() + .and_then(|name| name.to_str()) + .map(|name| { + #[cfg(target_os = "macos")] + { + name.starts_with("libkrunfw") && name.ends_with(".dylib") + } + #[cfg(not(target_os = "macos"))] + { + name.starts_with("libkrunfw") && name.contains(".so") + } + }) + .unwrap_or(false) + }) + .collect(); + + support_libs.sort(); + + for path in &support_libs { + let path_cstr = std::ffi::CString::new(path.to_string_lossy().as_bytes()).map_err(|e| { + VmError::HostSetup(format!( + "invalid support library path {}: {e}", + path.display() + )) + })?; + let handle = + unsafe { libc::dlopen(path_cstr.as_ptr(), libc::RTLD_NOW | libc::RTLD_GLOBAL) }; + if handle.is_null() { + let error = unsafe { + let err = libc::dlerror(); + if err.is_null() { + "unknown dlopen error".to_string() + } else { + std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned() + } + }; + return Err(VmError::HostSetup(format!( + "preload runtime support library {}: {error}", + path.display() + ))); + } + } + + Ok(support_libs) +} + +fn required_runtime_lib_name() -> &'static str { + #[cfg(target_os = "macos")] + { + "libkrun.dylib" + } + #[cfg(not(target_os = "macos"))] + { + "libkrun.so" + } +} + +/// Compute SHA-256 hash of a file, returning hex string. +fn compute_sha256(path: &Path) -> Result { + use std::io::Read; + let mut file = fs::File::open(path)?; + let mut hasher = sha2_hasher(); + let mut buf = [0u8; 8192]; + loop { + let n = file.read(&mut buf)?; + if n == 0 { + break; + } + hasher_update(&mut hasher, &buf[..n]); + } + Ok(hasher_finalize(hasher)) +} + +// Minimal SHA-256 using the sha2 crate if available, otherwise shell out. +// We attempt a runtime `shasum` call to avoid adding a crate dependency. 
+fn sha2_hasher() -> Sha256State { + Sha256State { + data: Vec::with_capacity(1024 * 1024), + } +} + +struct Sha256State { + data: Vec, +} + +fn hasher_update(state: &mut Sha256State, bytes: &[u8]) { + state.data.extend_from_slice(bytes); +} + +fn hasher_finalize(state: Sha256State) -> String { + // Use shasum via process for simplicity — avoids adding a crypto dependency. + use std::io::Write; + use std::process::{Command, Stdio}; + + let mut child = match Command::new("shasum") + .args(["-a", "256"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn() + { + Ok(c) => c, + Err(_) => return "unknown".to_string(), + }; + + if let Some(mut stdin) = child.stdin.take() { + let _ = stdin.write_all(&state.data); + } + + match child.wait_with_output() { + Ok(output) if output.status.success() => { + let stdout = String::from_utf8_lossy(&output.stdout); + stdout + .split_whitespace() + .next() + .unwrap_or("unknown") + .to_string() + } + _ => "unknown".to_string(), + } +} + +fn load_symbol( + library: &'static Library, + symbol: &[u8], + path: &Path, +) -> Result { + let loaded = unsafe { + library.get::(symbol).map_err(|e| { + VmError::HostSetup(format!( + "resolve {} from {}: {e}", + String::from_utf8_lossy(symbol).trim_end_matches('\0'), + path.display() + )) + })? + }; + Ok(*loaded) +} diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs new file mode 100644 index 000000000..a1dfedb8c --- /dev/null +++ b/crates/openshell-vm/src/lib.rs @@ -0,0 +1,1454 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! `MicroVM` runtime using libkrun for hardware-isolated execution. +//! +//! This crate provides a thin wrapper around the libkrun C API to boot +//! lightweight VMs backed by virtio-fs root filesystems. On macOS ARM64, +//! it uses Apple's Hypervisor.framework; on Linux it uses KVM. +//! +//! 
# Codesigning (macOS) +//! +//! The calling binary must be codesigned with the +//! `com.apple.security.hypervisor` entitlement. See `entitlements.plist`. + +#![allow(unsafe_code)] + +mod exec; +mod ffi; + +use std::ffi::CString; +use std::path::{Path, PathBuf}; +use std::ptr; +use std::time::Instant; + +pub use exec::{ + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, + ensure_vm_not_running, exec_running_vm, reset_runtime_state, vm_exec_socket_path, + write_vm_runtime_state, +}; + +// ── Error type ───────────────────────────────────────────────────────── + +/// Errors that can occur when configuring or launching a microVM. +#[derive(Debug, thiserror::Error, miette::Diagnostic)] +pub enum VmError { + /// A libkrun FFI call returned a negative error code. + #[error("{func} failed with error code {code}")] + Krun { func: &'static str, code: i32 }, + + /// The rootfs directory does not exist. + #[error( + "rootfs directory not found: {path}\nRun: ./crates/openshell-vm/scripts/build-rootfs.sh" + )] + RootfsNotFound { path: String }, + + /// A path contained invalid UTF-8. + #[error("path is not valid UTF-8: {0}")] + InvalidPath(String), + + /// `CString::new` failed (embedded NUL byte). + #[error("invalid C string: {0}")] + CString(#[from] std::ffi::NulError), + + /// A required host binary was not found. + #[error("required binary not found: {path}\n{hint}")] + BinaryNotFound { path: String, hint: String }, + + /// Host-side VM setup failed before boot. + #[error("host setup failed: {0}")] + HostSetup(String), + + /// `fork()` failed. + #[error("fork() failed: {0}")] + Fork(String), + + /// Post-boot bootstrap failed. + #[error("bootstrap failed: {0}")] + Bootstrap(String), + + /// Local VM runtime state could not be read or written. + #[error("VM runtime state error: {0}")] + RuntimeState(String), + + /// Exec operation against a running VM failed. 
+ #[error("VM exec failed: {0}")] + Exec(String), +} + +/// Check a libkrun return code; negative values are errors. +fn check(ret: i32, func: &'static str) -> Result<(), VmError> { + if ret < 0 { + Err(VmError::Krun { func, code: ret }) + } else { + Ok(()) + } +} + +// ── Configuration ────────────────────────────────────────────────────── + +/// Networking backend for the microVM. +#[derive(Debug, Clone)] +pub enum NetBackend { + /// TSI (Transparent Socket Impersonation) — default libkrun networking. + /// Simple but intercepts guest loopback connections, breaking k3s. + Tsi, + + /// No networking — disable vsock/TSI entirely. For debugging only. + None, + + /// gvproxy (vfkit mode) — real `eth0` interface via virtio-net. + /// Requires gvproxy binary on the host. Port forwarding is done + /// through gvproxy's HTTP API. + Gvproxy { + /// Path to the gvproxy binary. + binary: PathBuf, + }, +} + +/// Host Unix socket bridged into the guest as a vsock port. +#[derive(Debug, Clone)] +pub struct VsockPort { + pub port: u32, + pub socket_path: PathBuf, + pub listen: bool, +} + +/// Configuration for a libkrun microVM. +pub struct VmConfig { + /// Path to the extracted rootfs directory (aarch64 Linux). + pub rootfs: PathBuf, + + /// Number of virtual CPUs. + pub vcpus: u8, + + /// RAM in MiB. + pub mem_mib: u32, + + /// Executable path inside the VM. + pub exec_path: String, + + /// Arguments to the executable (argv, excluding argv\[0\]). + pub args: Vec, + + /// Environment variables in `KEY=VALUE` form. + /// If empty, a minimal default set is used. + pub env: Vec, + + /// Working directory inside the VM. + pub workdir: String, + + /// TCP port mappings in `"host_port:guest_port"` form. + /// Only used with TSI networking. + pub port_map: Vec, + + /// Optional host Unix sockets exposed to the guest over vsock. + pub vsock_ports: Vec, + + /// libkrun log level (0=Off .. 5=Trace). + pub log_level: u32, + + /// Optional file path for VM console output. 
If `None`, console output + /// goes to the parent directory of the rootfs as `console.log`. + pub console_output: Option, + + /// Networking backend. + pub net: NetBackend, + + /// Wipe all runtime state (containerd tasks/sandboxes, kubelet pods) + /// before booting. Recovers from corrupted state after a crash. + pub reset: bool, +} + +impl VmConfig { + /// Default gateway configuration: boots k3s server inside the VM. + /// + /// Runs `/srv/openshell-vm-init.sh` which mounts essential filesystems, + /// deploys the OpenShell helm chart, and execs `k3s server`. + /// Exposes the OpenShell gateway on port 30051. + pub fn gateway(rootfs: PathBuf) -> Self { + Self { + vsock_ports: vec![VsockPort { + port: VM_EXEC_VSOCK_PORT, + socket_path: vm_exec_socket_path(&rootfs), + listen: true, + }], + rootfs, + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".to_string(), + args: vec![], + env: vec![ + "HOME=/root".to_string(), + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + "TERM=xterm".to_string(), + ], + workdir: "/".to_string(), + port_map: vec![ + // OpenShell server — with bridge CNI the pod listens on + // 8080 inside its own network namespace (10.42.0.x), not + // on the VM's root namespace. The NodePort service + // (kube-proxy nftables) forwards VM:30051 → pod:8080. + // gvproxy maps host:30051 → VM:30051 to complete the path. + "30051:30051".to_string(), + ], + log_level: 3, // Info — for debugging + console_output: None, + net: NetBackend::Gvproxy { + binary: default_runtime_gvproxy_path(), + }, + reset: false, + } + } +} + +// ── Helpers ───────────────────────────────────────────────────────────── + +/// Build a null-terminated C string array from a slice of strings. +/// +/// Returns both the `CString` owners (to keep them alive) and the pointer array. 
+fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { + let owned: Vec = strings + .iter() + .map(|s| CString::new(*s)) + .collect::, _>>()?; + let mut ptrs: Vec<*const libc::c_char> = owned.iter().map(|c| c.as_ptr()).collect(); + ptrs.push(ptr::null()); // null terminator + Ok((owned, ptrs)) +} + +const VM_RUNTIME_DIR_NAME: &str = "openshell-vm.runtime"; +const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; + +pub(crate) fn configured_runtime_dir() -> Result { + if let Some(path) = std::env::var_os(VM_RUNTIME_DIR_ENV) { + return Ok(PathBuf::from(path)); + } + + let exe = std::env::current_exe().map_err(|e| VmError::HostSetup(e.to_string()))?; + let exe_dir = exe.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "executable has no parent directory: {}", + exe.display() + )) + })?; + Ok(exe_dir.join(VM_RUNTIME_DIR_NAME)) +} + +fn required_runtime_lib_name() -> &'static str { + #[cfg(target_os = "macos")] + { + "libkrun.dylib" + } + #[cfg(not(target_os = "macos"))] + { + "libkrun.so" + } +} + +fn validate_runtime_dir(dir: &Path) -> Result { + if !dir.is_dir() { + return Err(VmError::BinaryNotFound { + path: dir.display().to_string(), + hint: format!( + "stage the VM runtime bundle with `mise run vm:bundle-runtime` or set {VM_RUNTIME_DIR_ENV}" + ), + }); + } + + let libkrun = dir.join(required_runtime_lib_name()); + if !libkrun.is_file() { + return Err(VmError::BinaryNotFound { + path: libkrun.display().to_string(), + hint: "runtime bundle is incomplete: missing libkrun".to_string(), + }); + } + + let has_krunfw = std::fs::read_dir(dir) + .map_err(|e| VmError::HostSetup(format!("read {}: {e}", dir.display())))? 
+ .filter_map(Result::ok) + .any(|entry| { + entry + .file_name() + .to_string_lossy() + .starts_with("libkrunfw.") + }); + if !has_krunfw { + return Err(VmError::BinaryNotFound { + path: dir.display().to_string(), + hint: "runtime bundle is incomplete: missing libkrunfw".to_string(), + }); + } + + let gvproxy = dir.join("gvproxy"); + if !gvproxy.is_file() { + return Err(VmError::BinaryNotFound { + path: gvproxy.display().to_string(), + hint: "runtime bundle is incomplete: missing gvproxy".to_string(), + }); + } + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt as _; + + let mode = std::fs::metadata(&gvproxy) + .map_err(|e| VmError::HostSetup(format!("stat {}: {e}", gvproxy.display())))? + .permissions() + .mode(); + if mode & 0o111 == 0 { + return Err(VmError::HostSetup(format!( + "gvproxy is not executable: {}", + gvproxy.display() + ))); + } + } + + // Validate manifest.json if present — warn but don't fail if files + // listed in the manifest are missing (backwards compatibility). + let manifest_path = dir.join("manifest.json"); + if manifest_path.is_file() { + if let Ok(contents) = std::fs::read_to_string(&manifest_path) { + // Simple check: verify all listed files exist. + // The manifest lists files as JSON strings in a "files" array. 
+ for line in contents.lines() { + let trimmed = line.trim().trim_matches(|c| c == '"' || c == ','); + if !trimmed.is_empty() + && !trimmed.starts_with('{') + && !trimmed.starts_with('}') + && !trimmed.starts_with('[') + && !trimmed.starts_with(']') + && !trimmed.contains(':') + { + let file_path = dir.join(trimmed); + if !file_path.exists() { + eprintln!( + "warning: manifest.json references missing file: {}", + trimmed + ); + } + } + } + } + } + + Ok(gvproxy) +} + +fn resolve_runtime_bundle() -> Result { + let runtime_dir = configured_runtime_dir()?; + validate_runtime_dir(&runtime_dir) +} + +pub fn default_runtime_gvproxy_path() -> PathBuf { + configured_runtime_dir() + .unwrap_or_else(|_| PathBuf::from(VM_RUNTIME_DIR_NAME)) + .join("gvproxy") +} + +#[cfg(target_os = "macos")] +fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), VmError> { + let existing = std::env::var_os("DYLD_FALLBACK_LIBRARY_PATH"); + let mut paths = vec![runtime_dir.to_path_buf()]; + if let Some(existing) = existing { + paths.extend(std::env::split_paths(&existing)); + } + let joined = std::env::join_paths(paths) + .map_err(|e| VmError::HostSetup(format!("join DYLD_FALLBACK_LIBRARY_PATH: {e}")))?; + unsafe { + std::env::set_var("DYLD_FALLBACK_LIBRARY_PATH", joined); + } + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn configure_runtime_loader_env(_runtime_dir: &Path) -> Result<(), VmError> { + Ok(()) +} + +fn raise_nofile_limit() { + #[cfg(unix)] + unsafe { + let mut rlim = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + if libc::getrlimit(libc::RLIMIT_NOFILE, &raw mut rlim) == 0 { + rlim.rlim_cur = rlim.rlim_max; + let _ = libc::setrlimit(libc::RLIMIT_NOFILE, &rlim); + } + } +} + +/// Log runtime provenance information for diagnostics. +/// +/// Prints the libkrun/libkrunfw versions, artifact hashes, and whether +/// a custom runtime is in use. This makes it easy to correlate VM issues +/// with the specific runtime bundle. 
+fn log_runtime_provenance(runtime_dir: &Path) { + if let Some(prov) = ffi::runtime_provenance() { + eprintln!("runtime: {}", runtime_dir.display()); + eprintln!(" libkrun: {}", prov.libkrun_path.display()); + for krunfw in &prov.libkrunfw_paths { + let name = krunfw + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown".to_string()); + eprintln!(" libkrunfw: {name}"); + } + if let Some(ref sha) = prov.libkrunfw_sha256 { + let short = if sha.len() > 12 { &sha[..12] } else { sha }; + eprintln!(" sha256: {short}..."); + } + if prov.is_custom { + eprintln!(" type: custom (OpenShell-built)"); + // Parse provenance.json for additional details. + if let Some(ref json) = prov.provenance_json { + // Extract key fields without pulling in serde_json for this. + for key in &["libkrunfw_commit", "kernel_version", "build_timestamp"] { + if let Some(val) = extract_json_string(json, key) { + eprintln!(" {}: {}", key.replace('_', "-"), val); + } + } + } + } else { + eprintln!(" type: stock (system/homebrew)"); + } + } +} + +/// Simple JSON string value extractor (avoids serde_json dependency +/// for this single use case). 
+fn extract_json_string(json: &str, key: &str) -> Option { + let pattern = format!("\"{}\"", key); + let idx = json.find(&pattern)?; + let after_key = &json[idx + pattern.len()..]; + // Skip whitespace and colon + let after_colon = after_key.trim_start().strip_prefix(':')?; + let after_ws = after_colon.trim_start(); + if after_ws.starts_with('"') { + let value_start = &after_ws[1..]; + let end = value_start.find('"')?; + Some(value_start[..end].to_string()) + } else { + None + } +} + +fn clamp_log_level(level: u32) -> u32 { + match level { + 0 => ffi::KRUN_LOG_LEVEL_OFF, + 1 => ffi::KRUN_LOG_LEVEL_ERROR, + 2 => ffi::KRUN_LOG_LEVEL_WARN, + 3 => ffi::KRUN_LOG_LEVEL_INFO, + 4 => ffi::KRUN_LOG_LEVEL_DEBUG, + _ => ffi::KRUN_LOG_LEVEL_TRACE, + } +} + +struct VmContext { + krun: &'static ffi::LibKrun, + ctx_id: u32, +} + +impl VmContext { + fn create(log_level: u32) -> Result { + let krun = ffi::libkrun()?; + unsafe { + check( + (krun.krun_init_log)( + ffi::KRUN_LOG_TARGET_DEFAULT, + clamp_log_level(log_level), + ffi::KRUN_LOG_STYLE_AUTO, + ffi::KRUN_LOG_OPTION_NO_ENV, + ), + "krun_init_log", + )?; + } + + let ctx_id = unsafe { (krun.krun_create_ctx)() }; + if ctx_id < 0 { + return Err(VmError::Krun { + func: "krun_create_ctx", + code: ctx_id, + }); + } + + Ok(Self { + krun, + ctx_id: ctx_id as u32, + }) + } + + fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), + "krun_set_vm_config", + ) + } + } + + fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { + let rootfs_c = path_to_cstring(rootfs)?; + unsafe { + check( + (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), + "krun_set_root", + ) + } + } + + fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { + let workdir_c = CString::new(workdir)?; + unsafe { + check( + (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), + "krun_set_workdir", + ) + } + } + + fn 
disable_implicit_vsock(&self) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_disable_implicit_vsock)(self.ctx_id), + "krun_disable_implicit_vsock", + ) + } + } + + fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), + "krun_add_vsock", + ) + } + } + + fn add_net_unixgram( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + flags: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixgram)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + flags, + ), + "krun_add_net_unixgram", + ) + } + } + + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { + let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); + let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; + unsafe { + check( + (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), + "krun_set_port_map", + ) + } + } + + fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { + let socket_c = path_to_cstring(&port.socket_path)?; + unsafe { + check( + (self.krun.krun_add_vsock_port2)( + self.ctx_id, + port.port, + socket_c.as_ptr(), + port.listen, + ), + "krun_add_vsock_port2", + ) + } + } + + fn set_console_output(&self, path: &Path) -> Result<(), VmError> { + let console_c = path_to_cstring(path)?; + unsafe { + check( + (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), + "krun_set_console_output", + ) + } + } + + fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { + let exec_c = CString::new(exec_path)?; + let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); + let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; + let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); + let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; + + unsafe { + 
check( + (self.krun.krun_set_exec)( + self.ctx_id, + exec_c.as_ptr(), + argv_ptrs.as_ptr(), + env_ptrs.as_ptr(), + ), + "krun_set_exec", + ) + } + } + + fn start_enter(&self) -> i32 { + unsafe { (self.krun.krun_start_enter)(self.ctx_id) } + } +} + +impl Drop for VmContext { + fn drop(&mut self) { + unsafe { + let _ = (self.krun.krun_free_ctx)(self.ctx_id); + } + } +} + +/// Issue a gvproxy expose call via its HTTP API (unix socket). +/// +/// Sends a raw HTTP/1.1 POST request over the unix socket to avoid +/// depending on `curl` being installed on the host. +fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { + use std::io::{Read, Write}; + use std::os::unix::net::UnixStream; + + let mut stream = + UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?; + + let request = format!( + "POST /services/forwarder/expose HTTP/1.1\r\n\ + Host: localhost\r\n\ + Content-Type: application/json\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n\ + {}", + body.len(), + body, + ); + + stream + .write_all(request.as_bytes()) + .map_err(|e| format!("write to gvproxy API: {e}"))?; + + // Read just enough of the response to get the status line. + let mut buf = [0u8; 1024]; + let n = stream + .read(&mut buf) + .map_err(|e| format!("read from gvproxy API: {e}"))?; + let response = String::from_utf8_lossy(&buf[..n]); + + // Parse the HTTP status code from the first line (e.g. "HTTP/1.1 200 OK"). + let status = response + .lines() + .next() + .and_then(|line| line.split_whitespace().nth(1)) + .unwrap_or("0"); + + match status { + "200" | "204" => Ok(()), + _ => { + let first_line = response.lines().next().unwrap_or(""); + Err(format!("gvproxy API: {first_line}")) + } + } +} + +/// Kill any stale gvproxy process from a previous openshell-vm run. +/// +/// If the CLI crashes or is killed before cleanup, gvproxy keeps running +/// and holds port 2222. 
A new gvproxy instance then fails with +/// "bind: address already in use". +fn kill_stale_gvproxy() { + let output = std::process::Command::new("pkill") + .args(["-x", "gvproxy"]) + .output(); + if let Ok(o) = output { + if o.status.success() { + eprintln!("Killed stale gvproxy process"); + // Brief pause for the port to be released. + std::thread::sleep(std::time::Duration::from_millis(200)); + } + } +} + +fn path_to_cstring(path: &Path) -> Result { + let s = path + .to_str() + .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; + Ok(CString::new(s)?) +} + +// ── Launch ────────────────────────────────────────────────────────────── + +/// Configure and launch a libkrun microVM. +/// +/// This forks the process. The child enters the VM (never returns); the +/// parent blocks until the VM exits or a signal is received. +/// +/// Returns the VM exit code (from `waitpid`). +#[allow(clippy::similar_names)] +pub fn launch(config: &VmConfig) -> Result { + // Validate rootfs + if !config.rootfs.is_dir() { + return Err(VmError::RootfsNotFound { + path: config.rootfs.display().to_string(), + }); + } + if config.exec_path == "/srv/openshell-vm-init.sh" { + ensure_vm_not_running(&config.rootfs)?; + } + + // Acquire an exclusive flock on the rootfs lock file. This is held + // by the parent process for the VM's entire lifetime. If this process + // is killed (even SIGKILL), the OS releases the lock automatically. + // This prevents a second launch or rootfs rebuild from corrupting a + // running VM's filesystem via virtio-fs. + let _rootfs_lock = if config.exec_path == "/srv/openshell-vm-init.sh" { + Some(acquire_rootfs_lock(&config.rootfs)?) + } else { + None + }; + + // Wipe stale containerd/kubelet runtime state if requested. + // This must happen after the lock (to confirm no other VM is using + // the rootfs) but before booting (so the new VM starts clean). 
+ if config.reset { + reset_runtime_state(&config.rootfs)?; + } + + let launch_start = Instant::now(); + eprintln!("rootfs: {}", config.rootfs.display()); + eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); + + // The runtime must already be staged as a sidecar bundle next to the + // binary (or explicitly pointed to via OPENSHELL_VM_RUNTIME_DIR). + let runtime_gvproxy = resolve_runtime_bundle()?; + let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "runtime bundle file has no parent directory: {}", + runtime_gvproxy.display() + )) + })?; + configure_runtime_loader_env(runtime_dir)?; + raise_nofile_limit(); + + // ── Log runtime provenance ───────────────────────────────────── + // After configuring the loader, trigger library loading so that + // provenance is captured before we proceed with VM configuration. + let _ = ffi::libkrun()?; + log_runtime_provenance(runtime_dir); + + // ── Configure the microVM ────────────────────────────────────── + + let vm = VmContext::create(config.log_level)?; + vm.set_vm_config(config.vcpus, config.mem_mib)?; + vm.set_root(&config.rootfs)?; + vm.set_workdir(&config.workdir)?; + + // Networking setup + let mut gvproxy_child: Option = None; + let mut gvproxy_api_sock: Option = None; + + match &config.net { + NetBackend::Tsi => { + // Default TSI — no special setup needed. 
+ } + NetBackend::None => { + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + eprintln!("Networking: disabled (no TSI, no virtio-net)"); + } + NetBackend::Gvproxy { binary } => { + if !binary.exists() { + return Err(VmError::BinaryNotFound { + path: binary.display().to_string(), + hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), + }); + } + + // Create temp socket paths + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let vfkit_sock = run_dir.join("gvproxy-vfkit.sock"); + let api_sock = run_dir.join("gvproxy-api.sock"); + + // Kill any stale gvproxy process from a previous run. + // If gvproxy is still holding port 2222, the new instance + // will fail with "bind: address already in use". + kill_stale_gvproxy(); + + // Clean stale sockets (including the -krun.sock file that + // libkrun creates as its datagram endpoint). + let _ = std::fs::remove_file(&vfkit_sock); + let _ = std::fs::remove_file(&api_sock); + let krun_sock = run_dir.join("gvproxy-vfkit.sock-krun.sock"); + let _ = std::fs::remove_file(&krun_sock); + + // Start gvproxy + eprintln!("Starting gvproxy: {}", binary.display()); + let gvproxy_log = run_dir.join("gvproxy.log"); + let gvproxy_log_file = std::fs::File::create(&gvproxy_log) + .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + let child = std::process::Command::new(binary) + .arg("-listen-vfkit") + .arg(format!("unixgram://{}", vfkit_sock.display())) + .arg("-listen") + .arg(format!("unix://{}", api_sock.display())) + .stdout(std::process::Stdio::null()) + .stderr(gvproxy_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; + + eprintln!( + "gvproxy started (pid {}) [{:.1}s]", + child.id(), + launch_start.elapsed().as_secs_f64() + ); + + // Wait for the socket to appear (exponential backoff: 5ms → 100ms). 
+ { + let deadline = Instant::now() + std::time::Duration::from_secs(5); + let mut interval = std::time::Duration::from_millis(5); + while !vfkit_sock.exists() { + if Instant::now() >= deadline { + return Err(VmError::Fork( + "gvproxy socket did not appear within 5s".to_string(), + )); + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(100)); + } + } + + // Disable implicit TSI and add virtio-net via gvproxy + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + // This MAC matches gvproxy's default static DHCP lease for + // 192.168.127.2. Using a different MAC can cause the gVisor + // network stack to misroute or drop packets. + let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; + + // COMPAT_NET_FEATURES from libkrun.h + const NET_FEATURE_CSUM: u32 = 1 << 0; + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + const NET_FLAG_VFKIT: u32 = 1 << 0; + + vm.add_net_unixgram(&vfkit_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; + + eprintln!( + "Networking: gvproxy (virtio-net) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + gvproxy_child = Some(child); + gvproxy_api_sock = Some(api_sock); + } + } + + // Port mapping (TSI only) + if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { + vm.set_port_map(&config.port_map)?; + } + + for vsock_port in &config.vsock_ports { + vm.add_vsock_port(vsock_port)?; + } + + // Console output + let console_log = config.console_output.clone().unwrap_or_else(|| { + config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .join("console.log") + }); + vm.set_console_output(&console_log)?; + + 
// envp: use provided env or minimal defaults
+ let env: Vec<String> = if config.env.is_empty() {
+ vec![
+ "HOME=/root",
+ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+ "TERM=xterm",
+ ]
+ .into_iter()
+ .map(ToOwned::to_owned)
+ .collect()
+ } else {
+ config.env.clone()
+ };
+ vm.set_exec(&config.exec_path, &config.args, &env)?;
+
+ // ── Fork and enter the VM ──────────────────────────────────────
+ //
+ // krun_start_enter() never returns — it calls exit() when the guest
+ // process exits. We fork so the parent can monitor and report.
+
+ let boot_start = Instant::now();
+ eprintln!("Booting microVM...");
+
+ let pid = unsafe { libc::fork() };
+ match pid {
+ -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())),
+ 0 => {
+ // Child process: enter the VM (never returns on success)
+ let ret = vm.start_enter();
+ eprintln!("krun_start_enter failed: {ret}");
+ std::process::exit(1);
+ }
+ _ => {
+ // Parent: wait for child
+ if config.exec_path == "/srv/openshell-vm-init.sh" {
+ if let Err(err) = write_vm_runtime_state(&config.rootfs, pid, &console_log) {
+ unsafe {
+ libc::kill(pid, libc::SIGTERM);
+ }
+ if let Some(mut child) = gvproxy_child {
+ let _ = child.kill();
+ let _ = child.wait();
+ }
+ clear_vm_runtime_state(&config.rootfs);
+ return Err(err);
+ }
+ }
+ eprintln!(
+ "VM started (child pid {pid}) [{:.1}s]",
+ boot_start.elapsed().as_secs_f64()
+ );
+ for pm in &config.port_map {
+ let host_port = pm.split(':').next().unwrap_or(pm);
+ eprintln!(" port {pm} -> http://localhost:{host_port}");
+ }
+ eprintln!("Console output: {}", console_log.display());
+
+ // Set up gvproxy port forwarding via its HTTP API.
+ // The port_map entries use the same "host:guest" format
+ // as TSI, but here we translate them into gvproxy expose
+ // calls targeting the guest IP (192.168.127.2).
+ //
+ // Instead of a fixed 500ms sleep, poll the API socket with
+ // exponential backoff (5ms → 200ms, ~1s total budget).
+ if let Some(ref api_sock) = gvproxy_api_sock { + let fwd_start = Instant::now(); + // Wait for the API socket to appear (it lags slightly + // behind the vfkit data socket). + { + let deadline = Instant::now() + std::time::Duration::from_secs(2); + let mut interval = std::time::Duration::from_millis(5); + while !api_sock.exists() { + if Instant::now() >= deadline { + eprintln!( + "warning: gvproxy API socket not ready after 2s, attempting anyway" + ); + break; + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(200)); + } + } + + let guest_ip = "192.168.127.2"; + + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + let (host_port, guest_port) = match parts.len() { + 2 => (parts[0], parts[1]), + 1 => (parts[0], parts[0]), + _ => { + eprintln!(" skipping invalid port mapping: {pm}"); + continue; + } + }; + + let expose_body = format!( + r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# + ); + + match gvproxy_expose(api_sock, &expose_body) { + Ok(()) => { + eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); + } + Err(e) => { + eprintln!(" port {host_port}: {e}"); + } + } + } + eprintln!( + "Port forwarding ready [{:.1}s]", + fwd_start.elapsed().as_secs_f64() + ); + } + + // Bootstrap the OpenShell control plane and wait for the + // service to be reachable. Only for the gateway preset. + if config.exec_path == "/srv/openshell-vm-init.sh" { + // Bootstrap stores host-side metadata and mTLS creds. + // With pre-baked rootfs (Path 1) this reads PKI directly + // from virtio-fs — no kubectl or port forwarding needed. + // Cold boot (Path 2) writes secret manifests into the + // k3s auto-deploy directory via virtio-fs. 
+ if let Err(e) = bootstrap_gateway(&config.rootfs) { + eprintln!("Bootstrap failed: {e}"); + eprintln!(" The VM is running but OpenShell may not be fully operational."); + } + + // Wait for the gRPC service to be reachable via TCP + // probe on host:30051. This confirms the full path + // (gvproxy → kube-proxy nftables → pod:8080) is working. + wait_for_gateway_service(); + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Forward signals to child + unsafe { + libc::signal( + libc::SIGINT, + forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + forward_signal as *const () as libc::sighandler_t, + ); + CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); + } + + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(pid, &raw mut status, 0); + } + + // Clean up gvproxy + if config.exec_path == "/srv/openshell-vm-init.sh" { + clear_vm_runtime_state(&config.rootfs); + } + if let Some(mut child) = gvproxy_child { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("gvproxy stopped"); + } + + if libc::WIFEXITED(status) { + let code = libc::WEXITSTATUS(status); + eprintln!("VM exited with code {code}"); + return Ok(code); + } else if libc::WIFSIGNALED(status) { + let sig = libc::WTERMSIG(status); + eprintln!("VM killed by signal {sig}"); + return Ok(128 + sig); + } + + Ok(status) + } + } +} + +// ── Post-boot bootstrap ──────────────────────────────────────────────── + +/// Cluster name used for metadata and mTLS storage. +const GATEWAY_CLUSTER_NAME: &str = "openshell-vm"; + +/// Gateway port: the host port mapped to the OpenShell `NodePort` (30051). +const GATEWAY_PORT: u16 = 30051; + +/// Bootstrap the OpenShell control plane after k3s is ready. +/// +/// All operations use the virtio-fs rootfs — no kubectl or API server +/// port forwarding required. This avoids exposing port 6443 outside the +/// VM. 
+///
+/// Three paths, in priority order:
+///
+/// 1. **Pre-baked rootfs** (from `build-rootfs.sh`): PKI files at
+/// `rootfs/opt/openshell/pki/`. TLS secrets already exist in the k3s
+/// database. Reads certs from the filesystem and stores metadata on the
+/// host.
+///
+/// 2. **Warm boot**: host-side metadata + mTLS certs survive across VM
+/// restarts. Nothing to do — service readiness is confirmed by the TCP
+/// probe in `wait_for_gateway_service()`.
+///
+/// 3. **First boot / post-reset**: the VM generates PKI on first boot
+/// (via openshell-vm-init.sh) and writes certs to
+/// `/opt/openshell/pki/` on the rootfs. This function polls the
+/// rootfs for `/opt/openshell/pki/ca.crt` (written by the VM init
+/// script), then copies client certs to the host before storing
+/// metadata and mTLS creds.
+fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> {
+ let bootstrap_start = Instant::now();
+
+ let metadata = openshell_bootstrap::GatewayMetadata {
+ name: GATEWAY_CLUSTER_NAME.to_string(),
+ gateway_endpoint: format!("https://127.0.0.1:{GATEWAY_PORT}"),
+ is_remote: false,
+ gateway_port: GATEWAY_PORT,
+ remote_host: None,
+ resolved_host: None,
+ auth_mode: None,
+ edge_team_domain: None,
+ edge_auth_url: None,
+ };
+
+ // ── Warm boot: host already has certs ──────────────────────────
+ if is_warm_boot() {
+ // Always (re-)store metadata so port/endpoint changes are picked up.
+ openshell_bootstrap::store_gateway_metadata(GATEWAY_CLUSTER_NAME, &metadata)
+ .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?;
+ openshell_bootstrap::save_active_gateway(GATEWAY_CLUSTER_NAME)
+ .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?;
+
+ // Verify host certs match the rootfs PKI. If they diverge (e.g.
+ // PKI was regenerated out-of-band, or the rootfs was replaced),
+ // re-sync the host certs from the authoritative rootfs copy.
+ let pki_dir = rootfs.join("opt/openshell/pki"); + if pki_dir.join("ca.crt").is_file() { + if let Err(e) = sync_host_certs_if_stale(&pki_dir) { + eprintln!("Warning: cert sync check failed: {e}"); + } + } + + eprintln!( + "Warm boot [{:.1}s]", + bootstrap_start.elapsed().as_secs_f64() + ); + eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}"); + eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}"); + eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/"); + return Ok(()); + } + + // ── First boot / post-reset: wait for VM to generate PKI ────── + // + // The VM init script generates certs at /opt/openshell/pki/ on + // first boot. We poll the rootfs (visible via virtio-fs) until + // the CA cert appears, then copy client certs to the host. + eprintln!("Waiting for VM to generate PKI..."); + let pki_dir = rootfs.join("opt/openshell/pki"); + let ca_cert_path = pki_dir.join("ca.crt"); + let poll_timeout = std::time::Duration::from_secs(120); + let poll_start = Instant::now(); + + loop { + if ca_cert_path.is_file() { + // Verify the file has content (not a partial write). 
+ if let Ok(m) = std::fs::metadata(&ca_cert_path) {
+ if m.len() > 0 {
+ break;
+ }
+ }
+ }
+ if poll_start.elapsed() >= poll_timeout {
+ return Err(VmError::Bootstrap(
+ "VM did not generate PKI within 120s".to_string(),
+ ));
+ }
+ std::thread::sleep(std::time::Duration::from_secs(1));
+ }
+
+ eprintln!("PKI ready — copying client certs to host...");
+
+ let read = |name: &str| -> Result<String, VmError> {
+ std::fs::read_to_string(pki_dir.join(name))
+ .map_err(|e| VmError::Bootstrap(format!("failed to read {name}: {e}")))
+ };
+
+ let pki_bundle = openshell_bootstrap::pki::PkiBundle {
+ ca_cert_pem: read("ca.crt")?,
+ ca_key_pem: read("ca.key")?,
+ server_cert_pem: read("server.crt")?,
+ server_key_pem: read("server.key")?,
+ client_cert_pem: read("client.crt")?,
+ client_key_pem: read("client.key")?,
+ };
+
+ openshell_bootstrap::store_gateway_metadata(GATEWAY_CLUSTER_NAME, &metadata)
+ .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?;
+
+ openshell_bootstrap::mtls::store_pki_bundle(GATEWAY_CLUSTER_NAME, &pki_bundle)
+ .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?;
+
+ openshell_bootstrap::save_active_gateway(GATEWAY_CLUSTER_NAME)
+ .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?;
+
+ eprintln!(
+ "Bootstrap complete [{:.1}s]",
+ bootstrap_start.elapsed().as_secs_f64()
+ );
+ eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}");
+ eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}");
+ eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/");
+
+ Ok(())
+}
+
+/// Check whether a previous bootstrap left valid state on disk.
+/// +/// A warm boot is detected when both: +/// - Cluster metadata exists: `$XDG_CONFIG_HOME/openshell/gateways/openshell-vm/metadata.json` +/// - mTLS certs exist: `$XDG_CONFIG_HOME/openshell/gateways/openshell-vm/mtls/{ca.crt,tls.crt,tls.key}` +/// +/// When true, the host-side bootstrap (PKI generation, secret manifest writing, +/// metadata storage) can be skipped because the virtio-fs rootfs persists k3s +/// state (TLS certs, kine/sqlite, containerd images, helm releases) across VM +/// restarts. +fn is_warm_boot() -> bool { + let Ok(home) = std::env::var("HOME") else { + return false; + }; + + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + + let config_dir = PathBuf::from(&config_base) + .join("openshell") + .join("gateways"); + + // Check metadata file. + let metadata_path = config_dir.join(GATEWAY_CLUSTER_NAME).join("metadata.json"); + if !metadata_path.is_file() { + return false; + } + + // Check mTLS cert files. + let mtls_dir = config_dir.join(GATEWAY_CLUSTER_NAME).join("mtls"); + for name in &["ca.crt", "tls.crt", "tls.key"] { + let path = mtls_dir.join(name); + match std::fs::metadata(&path) { + Ok(m) if m.is_file() && m.len() > 0 => {} + _ => return false, + } + } + + true +} + +/// Compare the CA cert on the rootfs (authoritative source) against the +/// host-side copy. If they differ, re-copy all client certs from the rootfs. +/// +/// This catches cases where PKI was regenerated (e.g. rootfs rebuilt, +/// manual reset) but host-side certs survived from a previous boot cycle. 
+fn sync_host_certs_if_stale(pki_dir: &Path) -> Result<(), VmError> {
+ let Ok(home) = std::env::var("HOME") else {
+ return Ok(());
+ };
+ let config_base =
+ std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config"));
+ let host_ca = PathBuf::from(&config_base)
+ .join("openshell/gateways")
+ .join(GATEWAY_CLUSTER_NAME)
+ .join("mtls/ca.crt");
+
+ let rootfs_ca = std::fs::read_to_string(pki_dir.join("ca.crt"))
+ .map_err(|e| VmError::Bootstrap(format!("failed to read rootfs ca.crt: {e}")))?;
+
+ let host_ca_contents = std::fs::read_to_string(&host_ca)
+ .map_err(|e| VmError::Bootstrap(format!("failed to read host ca.crt: {e}")))?;
+
+ if rootfs_ca.trim() == host_ca_contents.trim() {
+ return Ok(());
+ }
+
+ eprintln!("Cert drift detected — re-syncing mTLS certs from rootfs...");
+
+ let read = |name: &str| -> Result<String, VmError> {
+ std::fs::read_to_string(pki_dir.join(name))
+ .map_err(|e| VmError::Bootstrap(format!("failed to read {name}: {e}")))
+ };
+
+ let pki_bundle = openshell_bootstrap::pki::PkiBundle {
+ ca_cert_pem: read("ca.crt")?,
+ ca_key_pem: read("ca.key")?,
+ server_cert_pem: read("server.crt")?,
+ server_key_pem: read("server.key")?,
+ client_cert_pem: read("client.crt")?,
+ client_key_pem: read("client.key")?,
+ };
+
+ openshell_bootstrap::mtls::store_pki_bundle(GATEWAY_CLUSTER_NAME, &pki_bundle)
+ .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?;
+
+ eprintln!(" mTLS certs re-synced from rootfs");
+ Ok(())
+}
+
+/// Notes on guest-side state relevant to the readiness wait below.
+/// (The host never queries the API server; it only runs a TCP probe.)
+///
+/// Stale pod/lease records are cleaned from the kine DB at build time
+/// (see `build-rootfs.sh`). Containerd metadata (meta.db) is preserved
+/// across boots so the native snapshotter doesn't re-extract image layers.
+/// Runtime task state is cleaned by `openshell-vm-init.sh` on each boot.
+/// +/// Wait for the OpenShell gRPC service to be reachable from the host. +/// +/// Polls `host_tcp_probe()` on `127.0.0.1:30051` with 1s intervals. +/// The probe confirms the full networking path: gvproxy → kube-proxy +/// nftables → pod:8080. A successful probe means the pod is running, +/// the NodePort service is routing, and the server is accepting +/// connections. No kubectl or API server access required. +fn wait_for_gateway_service() { + let start = Instant::now(); + let timeout = std::time::Duration::from_secs(90); + let poll_interval = std::time::Duration::from_secs(1); + + eprintln!("Waiting for gateway service..."); + + loop { + if host_tcp_probe() { + eprintln!("Service healthy [{:.1}s]", start.elapsed().as_secs_f64()); + return; + } + + if start.elapsed() >= timeout { + eprintln!( + " gateway service not ready after {:.0}s, continuing anyway", + timeout.as_secs_f64() + ); + return; + } + + std::thread::sleep(poll_interval); + } +} + +/// Probe `127.0.0.1:30051` from the host to verify the full +/// gvproxy → VM → pod path is working. +/// +/// gvproxy accepts TCP connections even when the guest port is closed, +/// but those connections are immediately reset. A server that is truly +/// listening will hold the connection open (waiting for a TLS +/// ClientHello). We exploit this: connect, then try a short read. If +/// the read **times out** the server is alive; if it returns an error +/// (reset/EOF) the server is down. +fn host_tcp_probe() -> bool { + use std::io::Read; + use std::net::{SocketAddr, TcpStream}; + use std::time::Duration; + + let addr: SocketAddr = ([127, 0, 0, 1], GATEWAY_PORT).into(); + let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(2)) else { + return false; + }; + + // A short read timeout: if the server is alive it will wait for us + // to send a TLS ClientHello, so the read will time out (= good). + // If the connection resets or closes, the server is dead. 
+ stream + .set_read_timeout(Some(Duration::from_millis(200))) + .ok(); + let mut buf = [0u8; 1]; + match stream.read(&mut buf) { + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + true // Timeout = server alive, waiting for ClientHello. + } + _ => false, // Reset, EOF, or unexpected data = not healthy. + } +} + +static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); + +extern "C" fn forward_signal(_sig: libc::c_int) { + let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); + if pid > 0 { + unsafe { + libc::kill(pid, libc::SIGTERM); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_runtime_dir() -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time went backwards") + .as_nanos(); + std::env::temp_dir().join(format!( + "openshell-vm-runtime-{}-{nanos}", + std::process::id() + )) + } + + fn write_runtime_file(path: &Path) { + fs::write(path, b"test").expect("failed to write runtime file"); + } + + #[test] + fn validate_runtime_dir_accepts_minimal_bundle() { + let dir = temp_runtime_dir(); + fs::create_dir_all(&dir).expect("failed to create runtime dir"); + + write_runtime_file(&dir.join(required_runtime_lib_name())); + write_runtime_file(&dir.join("libkrunfw.test")); + let gvproxy = dir.join("gvproxy"); + write_runtime_file(&gvproxy); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt as _; + + let mut perms = fs::metadata(&gvproxy).expect("stat gvproxy").permissions(); + perms.set_mode(0o755); + fs::set_permissions(&gvproxy, perms).expect("chmod gvproxy"); + } + + let resolved_gvproxy = validate_runtime_dir(&dir).expect("runtime bundle should validate"); + assert_eq!(resolved_gvproxy, gvproxy); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn validate_runtime_dir_requires_gvproxy() { + let dir = temp_runtime_dir(); + 
fs::create_dir_all(&dir).expect("failed to create runtime dir");
+
+ write_runtime_file(&dir.join(required_runtime_lib_name()));
+ write_runtime_file(&dir.join("libkrunfw.test"));
+
+ let err = validate_runtime_dir(&dir).expect_err("missing gvproxy should fail");
+ match err {
+ VmError::BinaryNotFound { hint, .. } => {
+ assert!(hint.contains("missing gvproxy"));
+ }
+ other => panic!("unexpected error: {other:?}"),
+ }
+
+ let _ = fs::remove_dir_all(&dir);
+ }
+}
diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs
new file mode 100644
index 000000000..ec35f53ea
--- /dev/null
+++ b/crates/openshell-vm/src/main.rs
@@ -0,0 +1,197 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Standalone openshell-vm binary.
+//!
+//! Boots a libkrun microVM running the OpenShell control plane (k3s +
+//! openshell-server). By default it uses the pre-built rootfs at
+//! `~/.local/share/openshell/openshell-vm/rootfs`.
+//!
+//! # Codesigning (macOS)
+//!
+//! This binary must be codesigned with the `com.apple.security.hypervisor`
+//! entitlement. See `entitlements.plist` in this crate.
+//!
+//! ```sh
+//! codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm
+//! ```
+
+use std::io::IsTerminal;
+use std::path::PathBuf;
+
+use clap::{Parser, Subcommand, ValueHint};
+
+/// Boot the OpenShell gateway microVM.
+///
+/// Starts a libkrun microVM running a k3s Kubernetes cluster with the
+/// OpenShell control plane. Use `--exec` to run a custom process instead.
+#[derive(Parser)]
+#[command(name = "openshell-vm", version)]
+struct Cli {
+ #[command(subcommand)]
+ command: Option<GatewayCommand>,
+
+ /// Path to the rootfs directory (aarch64 Linux).
+ /// Defaults to `~/.local/share/openshell/openshell-vm/rootfs`.
+ #[arg(long, value_hint = ValueHint::DirPath)]
+ rootfs: Option<PathBuf>,
+
+ /// Executable path inside the VM.
When set, runs this instead of
+ /// the default k3s server.
+ #[arg(long)]
+ exec: Option<String>,
+
+ /// Arguments to the executable (requires `--exec`).
+ #[arg(long, num_args = 1..)]
+ args: Vec<String>,
+
+ /// Environment variables in `KEY=VALUE` form (requires `--exec`).
+ #[arg(long, num_args = 1..)]
+ env: Vec<String>,
+
+ /// Working directory inside the VM.
+ #[arg(long, default_value = "/")]
+ workdir: String,
+
+ /// Port mappings (`host_port:guest_port`).
+ #[arg(long, short, num_args = 1..)]
+ port: Vec<String>,
+
+ /// Number of virtual CPUs (default: 4 for openshell-vm, 2 for --exec).
+ #[arg(long)]
+ vcpus: Option<u8>,
+
+ /// RAM in MiB (default: 8192 for openshell-vm, 2048 for --exec).
+ #[arg(long)]
+ mem: Option<u32>,
+
+ /// libkrun log level (0=Off .. 5=Trace).
+ #[arg(long, default_value_t = 1)]
+ krun_log_level: u32,
+
+ /// Networking backend: "gvproxy" (default), "tsi", or "none".
+ #[arg(long, default_value = "gvproxy")]
+ net: String,
+
+ /// Wipe all runtime state (containerd, kubelet, k3s) before booting.
+ /// Use this to recover from a corrupted state after a crash or
+ /// unclean shutdown.
+ #[arg(long)]
+ reset: bool,
+}
+
+#[derive(Subcommand)]
+enum GatewayCommand {
+ /// Execute a command inside a running openshell-vm VM.
+ Exec {
+ /// Working directory inside the VM.
+ #[arg(long)]
+ workdir: Option<String>,
+
+ /// Environment variables in `KEY=VALUE` form.
+ #[arg(long, num_args = 1..)]
+ env: Vec<String>,
+
+ /// Command and arguments to run inside the VM.
+ #[arg(trailing_var_arg = true)]
+ command: Vec<String>,
+ },
+}
+
+fn main() {
+ tracing_subscriber::fmt::init();
+
+ let cli = Cli::parse();
+
+ let code = match run(cli) {
+ Ok(code) => code,
+ Err(e) => {
+ eprintln!("Error: {e}");
+ 1
+ }
+ };
+
+ if code != 0 {
+ std::process::exit(code);
+ }
+}
+
+fn run(cli: Cli) -> Result<i32, Box<dyn std::error::Error>> {
+ if let Some(GatewayCommand::Exec {
+ workdir,
+ env,
+ mut command,
+ }) = cli.command
+ {
+ let effective_tty = std::io::stdin().is_terminal();
+ if command.is_empty() {
+ if effective_tty {
+ command.push("sh".to_string());
+ } else {
+ return Err("openshell-vm exec requires a command when stdin is not a TTY".into());
+ }
+ }
+ return Ok(openshell_vm::exec_running_vm(
+ openshell_vm::VmExecOptions {
+ rootfs: cli.rootfs,
+ command,
+ workdir,
+ env,
+ tty: effective_tty,
+ },
+ )?);
+ }
+
+ let net_backend = match cli.net.as_str() {
+ "tsi" => openshell_vm::NetBackend::Tsi,
+ "none" => openshell_vm::NetBackend::None,
+ "gvproxy" => openshell_vm::NetBackend::Gvproxy {
+ binary: openshell_vm::default_runtime_gvproxy_path(),
+ },
+ other => {
+ return Err(
+ format!("unknown --net backend: {other} (expected: gvproxy, tsi, none)").into(),
+ );
+ }
+ };
+
+ let rootfs = match cli.rootfs {
+ Some(p) => p,
+ None => openshell_bootstrap::paths::default_rootfs_dir()?,
+ };
+
+ let mut config = if let Some(exec_path) = cli.exec {
+ openshell_vm::VmConfig {
+ rootfs,
+ vcpus: cli.vcpus.unwrap_or(2),
+ mem_mib: cli.mem.unwrap_or(2048),
+ exec_path,
+ args: cli.args,
+ env: cli.env,
+ workdir: cli.workdir,
+ port_map: cli.port,
+ vsock_ports: vec![],
+ log_level: cli.krun_log_level,
+ console_output: None,
+ net: net_backend.clone(),
+ reset: cli.reset,
+ }
+ } else {
+ let mut c = openshell_vm::VmConfig::gateway(rootfs);
+ if !cli.port.is_empty() {
+ c.port_map = cli.port;
+ }
+ if let Some(v) = cli.vcpus {
+ c.vcpus = v;
+ }
+ if let Some(m) = cli.mem {
+ c.mem_mib = m;
+ }
+ c.net = net_backend;
+ c.reset = cli.reset;
+ c
+ };
+ config.log_level =
cli.krun_log_level; + + Ok(openshell_vm::launch(&config)?) +} diff --git a/crates/openshell-vm/tests/gateway_integration.rs b/crates/openshell-vm/tests/gateway_integration.rs new file mode 100644 index 000000000..347d05532 --- /dev/null +++ b/crates/openshell-vm/tests/gateway_integration.rs @@ -0,0 +1,154 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for the standalone `openshell-vm` binary. +//! +//! These tests require: +//! - libkrun installed (e.g. `brew tap slp/krun && brew install libkrun`) +//! - macOS ARM64 with Apple Hypervisor.framework +//! - A pre-built rootfs at `~/.local/share/openshell/openshell-vm/rootfs` +//! +//! All tests are `#[ignore]` — run them explicitly: +//! +//! ```sh +//! cargo test -p openshell-vm --test gateway_integration -- --ignored +//! ``` + +#![allow(unsafe_code)] + +use std::net::{SocketAddr, TcpStream}; +use std::process::{Command, Stdio}; +use std::time::{Duration, Instant}; + +/// Path to the built `openshell-vm` binary (resolved by Cargo at compile time). +const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); + +// ── Helpers ──────────────────────────────────────────────────────────── + +/// Codesign the binary on macOS so it can access Hypervisor.framework. 
+fn codesign_if_needed() { + if cfg!(target_os = "macos") { + let entitlements = format!("{}/entitlements.plist", env!("CARGO_MANIFEST_DIR")); + let status = Command::new("codesign") + .args([ + "--entitlements", + &entitlements, + "--force", + "-s", + "-", + GATEWAY, + ]) + .status() + .expect("codesign command failed to execute"); + assert!(status.success(), "failed to codesign openshell-vm binary"); + } +} + +fn assert_runtime_bundle_staged() { + let bundle_dir = std::path::Path::new(GATEWAY) + .parent() + .expect("openshell-vm binary has no parent") + .join("openshell-vm.runtime"); + assert!( + bundle_dir.is_dir(), + "openshell-vm.runtime is missing next to the test binary: {}. Run `mise run vm:bundle-runtime` first.", + bundle_dir.display() + ); +} + +// ── Tests ────────────────────────────────────────────────────────────── + +/// Boot the full OpenShell gateway and verify the gRPC service becomes +/// reachable on port 30051. +#[test] +#[ignore] // requires libkrun + rootfs +fn gateway_boots_and_service_becomes_reachable() { + codesign_if_needed(); + assert_runtime_bundle_staged(); + + let mut cmd = Command::new(GATEWAY); + cmd.stdout(Stdio::null()).stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to start openshell-vm"); + + // Poll for the OpenShell gRPC service. + let addr: SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + // Tear down regardless of result. 
+ let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "openshell-vm service on port 30051 not reachable within {timeout:?}" + ); +} + +/// Run a trivial command inside the VM via `--exec` and verify it exits +/// successfully, proving the VM boots and can execute guest processes. +#[test] +#[ignore] // requires libkrun + rootfs +fn gateway_exec_runs_guest_command() { + codesign_if_needed(); + assert_runtime_bundle_staged(); + + let mut cmd = Command::new(GATEWAY); + cmd.args(["--exec", "/bin/true"]); + + let output = cmd.output().expect("failed to run openshell-vm --exec"); + + assert!( + output.status.success(), + "openshell-vm --exec /bin/true failed with status {:?}\nstderr: {}", + output.status, + String::from_utf8_lossy(&output.stderr), + ); +} + +/// Boot the VM, then use `openshell-vm exec` against the running instance. +#[test] +#[ignore] // requires libkrun + rootfs +fn gateway_exec_attaches_to_running_vm() { + codesign_if_needed(); + assert_runtime_bundle_staged(); + + let mut boot = Command::new(GATEWAY); + boot.stdout(Stdio::null()).stderr(Stdio::piped()); + let mut child = boot.spawn().expect("failed to start openshell-vm VM"); + + let addr: SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = Instant::now(); + while start.elapsed() < timeout { + if TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let output = Command::new(GATEWAY) + .args(["exec", "--", "/bin/true"]) + .output() + .expect("failed to run openshell-vm exec"); + + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + output.status.success(), + "openshell-vm exec -- /bin/true failed with status {:?}\nstderr: {}", + output.status, + String::from_utf8_lossy(&output.stderr), + ); +} diff --git a/deploy/helm/openshell/templates/statefulset.yaml 
b/deploy/helm/openshell/templates/statefulset.yaml index 1be8f14ab..55d698a8c 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -26,11 +26,16 @@ spec: {{- end }} spec: terminationGracePeriodSeconds: {{ .Values.podLifecycle.terminationGracePeriodSeconds }} + {{- if .Values.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} {{- end }} serviceAccountName: {{ include "openshell.serviceAccountName" . }} + automountServiceAccountToken: {{ .Values.automountServiceAccountToken }} {{- if .Values.server.hostGatewayIP }} hostAliases: - ip: {{ .Values.server.hostGatewayIP | quote }} @@ -96,8 +101,10 @@ spec: {{- end }} {{- end }} volumeMounts: + {{- if .Values.persistence.enabled }} - name: openshell-data mountPath: /var/openshell + {{- end }} {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -140,6 +147,10 @@ spec: - name: tls-client-ca secret: secretName: {{ .Values.server.tls.clientCaSecretName }} + {{- if not .Values.persistence.enabled }} + - name: openshell-data + emptyDir: {} + {{- end }} {{- end }} {{- with .Values.nodeSelector }} nodeSelector: @@ -153,6 +164,7 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + {{- if .Values.persistence.enabled }} volumeClaimTemplates: - metadata: name: openshell-data @@ -161,3 +173,4 @@ spec: resources: requests: storage: 1Gi + {{- end }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index ccc8d1ffa..1b835202b 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -19,6 +19,12 @@ serviceAccount: annotations: {} name: "" +# Whether to auto-mount the ServiceAccount token into the pod. 
Disabled +# in microVM gateway mode because the projected volume mount at +# /var/run/secrets/kubernetes.io/serviceaccount hits a containerd +# native-snapshotter + virtiofs incompatibility on sandbox re-creation. +automountServiceAccountToken: true + podAnnotations: {} podLabels: {} @@ -60,6 +66,19 @@ probes: resources: {} +# Persistent storage for the OpenShell database. When disabled, an +# emptyDir volume is used instead of a PVC. This is useful in microVM +# environments where overlayfs-on-virtiofs doesn't support PVC mounts +# reliably. +persistence: + enabled: true + +# Run the pod directly on the host network. Useful in microVM +# environments where kube-proxy is unavailable (no iptables). +# When true, the pod binds to the VM's eth0 and NodePort is +# unnecessary — gvproxy forwards host ports to the pod directly. +hostNetwork: false + nodeSelector: {} tolerations: [] diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 2245c72ed..b3b420171 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -27,12 +27,18 @@ spec: image: repository: ghcr.io/nvidia/openshell/gateway tag: latest - pullPolicy: Always + pullPolicy: __IMAGE_PULL_POLICY__ + hostNetwork: __HOST_NETWORK__ + automountServiceAccountToken: __AUTOMOUNT_SA_TOKEN__ + persistence: + enabled: __PERSISTENCE_ENABLED__ server: sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest + sandboxImagePullPolicy: __SANDBOX_IMAGE_PULL_POLICY__ sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ + dbUrl: __DB_URL__ grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080" hostGatewayIP: __HOST_GATEWAY_IP__ disableGatewayAuth: __DISABLE_GATEWAY_AUTH__ diff --git a/scripts/bin/openshell-vm b/scripts/bin/openshell-vm new file mode 100755 index 000000000..d9d667417 --- /dev/null +++ 
b/scripts/bin/openshell-vm @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +BINARY="$PROJECT_ROOT/target/debug/openshell-vm" + +cargo build --package openshell-vm --bin openshell-vm --quiet + +# On macOS, codesign with the hypervisor entitlement so libkrun can use +# Apple's Hypervisor.framework. Re-sign after every build. +ENTITLEMENTS="$PROJECT_ROOT/crates/openshell-vm/entitlements.plist" +if [[ "$(uname)" == "Darwin" ]] && [[ -f "$ENTITLEMENTS" ]]; then + codesign --entitlements "$ENTITLEMENTS" --force -s - "$BINARY" 2>/dev/null +fi + +# Ensure libkrunfw is discoverable by libkrun's dlopen on macOS. +# dyld only reads DYLD_FALLBACK_LIBRARY_PATH at process startup, so we +# set it here before exec. +if [[ "$(uname)" == "Darwin" ]]; then + HOMEBREW_LIB="$(brew --prefix 2>/dev/null || echo /opt/homebrew)/lib" + export DYLD_FALLBACK_LIBRARY_PATH="${HOMEBREW_LIB}${DYLD_FALLBACK_LIBRARY_PATH:+:$DYLD_FALLBACK_LIBRARY_PATH}" +fi + +exec "$BINARY" "$@" diff --git a/tasks/rust.toml b/tasks/rust.toml index 69214ce7f..dfa4068f4 100644 --- a/tasks/rust.toml +++ b/tasks/rust.toml @@ -5,12 +5,12 @@ ["rust:check"] description = "Check all Rust crates for errors" -run = "cargo check --workspace" +run = "cargo check --workspace --exclude openshell-vm" hide = true ["rust:lint"] description = "Lint Rust code with Clippy" -run = "cargo clippy --workspace --all-targets" +run = "cargo clippy --workspace --all-targets --exclude openshell-vm" hide = true ["rust:format"] diff --git a/tasks/scripts/bundle-vm-runtime.sh b/tasks/scripts/bundle-vm-runtime.sh new file mode 100755 index 000000000..ac2711c63 --- /dev/null +++ b/tasks/scripts/bundle-vm-runtime.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +if [ "$(uname -s)" != "Darwin" ]; then + echo "vm:bundle-runtime currently supports macOS only" >&2 + exit 1 +fi + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +LIB_DIR="${OPENSHELL_VM_RUNTIME_SOURCE_DIR:-}" +GVPROXY_BIN="${OPENSHELL_VM_GVPROXY:-}" + +if [ -z "$LIB_DIR" ]; then + # Prefer the custom runtime (has bridge/netfilter kernel support) over + # the stock Homebrew libkrunfw which lacks these capabilities. + CUSTOM_RUNTIME_DIR="${ROOT}/target/custom-runtime" + if [ -f "${CUSTOM_RUNTIME_DIR}/provenance.json" ] && [ -e "${CUSTOM_RUNTIME_DIR}/libkrunfw.dylib" ]; then + LIB_DIR="${CUSTOM_RUNTIME_DIR}" + echo "using custom runtime at ${LIB_DIR}" + else + BREW_PREFIX="$(brew --prefix 2>/dev/null || true)" + if [ -n "$BREW_PREFIX" ]; then + LIB_DIR="${BREW_PREFIX}/lib" + else + LIB_DIR="/opt/homebrew/lib" + fi + fi +fi + +if [ -z "$GVPROXY_BIN" ]; then + if command -v gvproxy >/dev/null 2>&1; then + GVPROXY_BIN="$(command -v gvproxy)" + elif [ -x /opt/homebrew/bin/gvproxy ]; then + GVPROXY_BIN="/opt/homebrew/bin/gvproxy" + elif [ -x /opt/podman/bin/gvproxy ]; then + GVPROXY_BIN="/opt/podman/bin/gvproxy" + else + echo "gvproxy not found; set OPENSHELL_VM_GVPROXY or install gvproxy" >&2 + exit 1 + fi +fi + +# libkrun.dylib: prefer the custom runtime dir, fall back to Homebrew. +# libkrun is the VMM and does not need a custom build; only libkrunfw +# carries the custom kernel. +LIBKRUN="${LIB_DIR}/libkrun.dylib" +if [ ! 
-e "$LIBKRUN" ]; then + BREW_PREFIX="${BREW_PREFIX:-$(brew --prefix 2>/dev/null || true)}" + if [ -n "$BREW_PREFIX" ] && [ -e "${BREW_PREFIX}/lib/libkrun.dylib" ]; then + LIBKRUN="${BREW_PREFIX}/lib/libkrun.dylib" + echo "using Homebrew libkrun at ${LIBKRUN}" + else + echo "libkrun not found at ${LIB_DIR}/libkrun.dylib or Homebrew; install libkrun or set OPENSHELL_VM_RUNTIME_SOURCE_DIR" >&2 + exit 1 + fi +fi + +KRUNFW_FILES=() +while IFS= read -r line; do + KRUNFW_FILES+=("$line") +done < <(find "$LIB_DIR" -maxdepth 1 \( -type f -o -type l \) \( -name 'libkrunfw.dylib' -o -name 'libkrunfw.*.dylib' \) | sort -u) + +if [ "${#KRUNFW_FILES[@]}" -eq 0 ]; then + echo "libkrunfw not found under ${LIB_DIR}; set OPENSHELL_VM_RUNTIME_SOURCE_DIR" >&2 + exit 1 +fi + +# Check for provenance.json (custom runtime indicator) +PROVENANCE_FILE="${LIB_DIR}/provenance.json" +IS_CUSTOM="false" +if [ -f "$PROVENANCE_FILE" ]; then + IS_CUSTOM="true" + echo "custom runtime detected (provenance.json present)" +fi + +TARGETS=( + "${ROOT}/target/debug" + "${ROOT}/target/release" + "${ROOT}/target/aarch64-apple-darwin/debug" + "${ROOT}/target/aarch64-apple-darwin/release" +) + +for target_dir in "${TARGETS[@]}"; do + runtime_dir="${target_dir}/openshell-vm.runtime" + mkdir -p "$runtime_dir" + + install -m 0644 "$LIBKRUN" "${runtime_dir}/libkrun.dylib" + install -m 0755 "$GVPROXY_BIN" "${runtime_dir}/gvproxy" + for krunfw in "${KRUNFW_FILES[@]}"; do + install -m 0644 "$krunfw" "${runtime_dir}/$(basename "$krunfw")" + done + + # Copy provenance.json if this is a custom runtime. 
+ if [ "$IS_CUSTOM" = "true" ] && [ -f "$PROVENANCE_FILE" ]; then + install -m 0644 "$PROVENANCE_FILE" "${runtime_dir}/provenance.json" + fi + + manifest_entries=() + manifest_entries+=(' "libkrun.dylib"') + manifest_entries+=(' "gvproxy"') + for krunfw in "${KRUNFW_FILES[@]}"; do + manifest_entries+=(" \"$(basename "$krunfw")\"") + done + if [ "$IS_CUSTOM" = "true" ]; then + manifest_entries+=(' "provenance.json"') + fi + + cat > "${runtime_dir}/manifest.json" <&2 + exit 1 +fi + +if [ ! -d "${TARGET_DIR}/openshell-vm.runtime" ]; then + echo "target/release/openshell-vm.runtime not found; run mise run vm:bundle-runtime first" >&2 + exit 1 +fi + +mkdir -p "${ARTIFACT_DIR}" +tar -czf "${ARTIFACT_DIR}/openshell-vm-aarch64-apple-darwin.tar.gz" \ + -C "${TARGET_DIR}" \ + openshell-vm \ + openshell-vm.runtime + +ls -lh "${ARTIFACT_DIR}/openshell-vm-aarch64-apple-darwin.tar.gz" diff --git a/tasks/scripts/run-vm.sh b/tasks/scripts/run-vm.sh new file mode 100755 index 000000000..9b9506017 --- /dev/null +++ b/tasks/scripts/run-vm.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +RUNTIME_DIR="${ROOT}/target/debug/openshell-vm.runtime" +GATEWAY_BIN="${ROOT}/target/debug/openshell-vm" + +if [ "$(uname -s)" = "Darwin" ]; then + export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" +fi + +exec "${GATEWAY_BIN}" "$@" diff --git a/tasks/scripts/sync-vm-rootfs.sh b/tasks/scripts/sync-vm-rootfs.sh new file mode 100755 index 000000000..053b3d99f --- /dev/null +++ b/tasks/scripts/sync-vm-rootfs.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Sync mutable development artifacts into the existing VM rootfs. +# Runs on every `mise run vm` so that script changes, helm chart +# updates, manifest changes, and supervisor binary rebuilds are +# picked up without a full rootfs rebuild. +# +# This is fast (<1s) — it only copies files, no Docker or VM boot. + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOTFS_DIR="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" +SCRIPT_DIR="${ROOT}/crates/openshell-vm/scripts" + +if [ ! -d "${ROOTFS_DIR}/srv" ]; then + # Rootfs doesn't exist yet — nothing to sync. ensure-vm-rootfs.sh + # or build-rootfs.sh will create it. + exit 0 +fi + +echo "Syncing development artifacts into rootfs..." + +# ── Init scripts and utilities ───────────────────────────────────────── +for script in openshell-vm-init.sh openshell-vm-exec-agent.py check-vm-capabilities.sh hello-server.py; do + src="${SCRIPT_DIR}/${script}" + dst="${ROOTFS_DIR}/srv/${script}" + if [ -f "$src" ]; then + if ! cmp -s "$src" "$dst" 2>/dev/null; then + cp "$src" "$dst" + chmod +x "$dst" + echo " updated: /srv/${script}" + fi + fi +done + +# ── Helm chart ───────────────────────────────────────────────────────── +HELM_CHART_DIR="${ROOT}/deploy/helm/openshell" +CHART_STAGING="${ROOTFS_DIR}/opt/openshell/charts" +if [ -d "${HELM_CHART_DIR}" ]; then + mkdir -p "${CHART_STAGING}" + # Package into a temp dir and compare — only update if changed. + TMP_CHART=$(mktemp -d) + helm package "${HELM_CHART_DIR}" -d "${TMP_CHART}" >/dev/null 2>&1 + for tgz in "${TMP_CHART}"/*.tgz; do + [ -f "$tgz" ] || continue + base=$(basename "$tgz") + if ! 
cmp -s "$tgz" "${CHART_STAGING}/${base}" 2>/dev/null; then + cp "$tgz" "${CHART_STAGING}/${base}" + echo " updated: /opt/openshell/charts/${base}" + fi + done + rm -rf "${TMP_CHART}" +fi + +# ── Kubernetes manifests ─────────────────────────────────────────────── +MANIFEST_SRC="${ROOT}/deploy/k8s" +MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/manifests" +if [ -d "${MANIFEST_SRC}" ]; then + mkdir -p "${MANIFEST_DST}" + for manifest in "${MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + base=$(basename "$manifest") + if ! cmp -s "$manifest" "${MANIFEST_DST}/${base}" 2>/dev/null; then + cp "$manifest" "${MANIFEST_DST}/${base}" + echo " updated: /opt/openshell/manifests/${base}" + fi + done +fi + +# ── Supervisor binary ───────────────────────────────────────────────── +SUPERVISOR_TARGET="aarch64-unknown-linux-gnu" +SUPERVISOR_BIN="${ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" +SUPERVISOR_DST="${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" +if [ -f "${SUPERVISOR_BIN}" ]; then + mkdir -p "$(dirname "${SUPERVISOR_DST}")" + if ! cmp -s "${SUPERVISOR_BIN}" "${SUPERVISOR_DST}" 2>/dev/null; then + cp "${SUPERVISOR_BIN}" "${SUPERVISOR_DST}" + chmod +x "${SUPERVISOR_DST}" + echo " updated: /opt/openshell/bin/openshell-sandbox" + fi +fi + +# ── Fix execute permissions on k3s data binaries ────────────────────── +# docker export and macOS virtio-fs can strip execute bits. +chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/* 2>/dev/null || true +chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/aux/* 2>/dev/null || true + +echo "Sync complete." 
diff --git a/tasks/test.toml b/tasks/test.toml index 6231c21e7..c514fe382 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -17,7 +17,7 @@ depends = ["e2e:python:gpu"] ["test:rust"] description = "Run Rust tests" -run = "cargo test --workspace" +run = "cargo test --workspace --exclude openshell-vm" hide = true ["test:python"] @@ -46,3 +46,9 @@ description = "Run Python GPU e2e tests" depends = ["python:proto", "CLUSTER_GPU=1 cluster"] env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" + +["e2e:vm"] +description = "Run e2e tests against a gateway VM (macOS ARM64)" +depends = ["python:proto"] +env = { UV_NO_SYNC = "1", PYTHONPATH = "python", OPENSHELL_GATEWAY = "gateway" } +run = "uv run pytest -o python_files='test_*.py' e2e/python" diff --git a/tasks/vm.toml b/tasks/vm.toml new file mode 100644 index 000000000..812455df4 --- /dev/null +++ b/tasks/vm.toml @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# openshell-vm development helpers + +[vm] +description = "Build and run the standalone openshell-vm microVM" +run = [ + "mise run vm:build:binary", + "tasks/scripts/codesign-openshell-vm.sh", + "tasks/scripts/bundle-vm-runtime.sh", + "tasks/scripts/ensure-vm-rootfs.sh", + "tasks/scripts/sync-vm-rootfs.sh", + "tasks/scripts/run-vm.sh", +] +hide = false + +["vm:build"] +description = "Force a fresh openshell-vm rebuild, including the rootfs" +run = [ + "mise run vm:build:binary", + "tasks/scripts/codesign-openshell-vm.sh", + "tasks/scripts/bundle-vm-runtime.sh", + "OPENSHELL_VM_FORCE_ROOTFS_REBUILD=1 tasks/scripts/ensure-vm-rootfs.sh", + "tasks/scripts/sync-vm-rootfs.sh", +] +hide = false + +["vm:build:binary"] +description = "Build the standalone openshell-vm binary" +run = "cargo build -p openshell-vm" +hide = true + +["vm:build:release"] +description = "Build the standalone openshell-vm binary in release mode" +run = "cargo build -p openshell-vm --release" +hide = true + +["vm:rootfs"] +description = "Build the default openshell-vm rootfs if needed" +run = "tasks/scripts/ensure-vm-rootfs.sh" +hide = true + +["vm:codesign"] +description = "Codesign the openshell-vm binary for Hypervisor.framework access on macOS" +depends = ["vm:build:binary"] +run = "tasks/scripts/codesign-openshell-vm.sh" +hide = true + +["vm:bundle-runtime"] +description = "Stage the openshell-vm sidecar runtime bundle next to local build outputs" +run = "tasks/scripts/bundle-vm-runtime.sh" +hide = false + +["vm:build-custom-runtime"] +description = "Build a custom libkrunfw with bridge/netfilter kernel support" +run = "crates/openshell-vm/runtime/build-custom-libkrunfw.sh" +hide = false + +["vm:check-capabilities"] +description = "Check VM kernel capabilities (run inside the VM)" +run = "echo 'This script must be run inside the VM. 
Copy it to the rootfs or exec into a running VM.'" +hide = false + +["vm:package:openshell-vm"] +description = "Package the openshell-vm binary with its sidecar runtime bundle" +run = "tasks/scripts/package-openshell-vm-runtime.sh" +depends = ["vm:build:release", "vm:bundle-runtime"] +hide = false From f881471ae898107df64da9b25f2456c028e811a1 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 24 Mar 2026 23:36:16 -0700 Subject: [PATCH 02/10] fix(vm): address review findings in openshell-vm crate Fix targeted gvproxy kill to use tracked PID from runtime state instead of pkill, gate diagnostic dump behind OPENSHELL_VM_DIAG env var, stream SHA-256 hashing to avoid buffering entire files, clarify operator precedence in env var validation, replace hand-rolled JSON parser with serde_json, deduplicate required_runtime_lib_name(), and add openshell-vm to AGENTS.md architecture table. --- AGENTS.md | 1 + .../openshell-vm/scripts/openshell-vm-init.sh | 3 + crates/openshell-vm/src/exec.rs | 15 +++- crates/openshell-vm/src/ffi.rs | 86 ++++++++----------- crates/openshell-vm/src/lib.rs | 78 ++++++++--------- 5 files changed, 85 insertions(+), 98 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 79dc29d1b..8364e7c13 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -38,6 +38,7 @@ These pipelines connect skills into end-to-end workflows. 
Individual skill files | `crates/openshell-core/` | Shared core | Common types, configuration, error handling | | `crates/openshell-providers/` | Provider management | Credential provider backends | | `crates/openshell-tui/` | Terminal UI | Ratatui-based dashboard for monitoring | +| `crates/openshell-vm/` | MicroVM runtime | libkrun-based hardware-isolated VM execution | | `python/openshell/` | Python SDK | Python bindings and CLI packaging | | `proto/` | Protobuf definitions | gRPC service contracts | | `deploy/` | Docker, Helm, K8s | Dockerfiles, Helm chart, manifests | diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index c4e89cab0..71b464bb4 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -655,6 +655,8 @@ ts "starting k3s server (bridge CNI + nftables kube-proxy)" # readable from the host at rootfs/opt/openshell/diag.txt. # The subshell runs detached with its own session (setsid) so it survives # the exec that replaces this shell with k3s as PID 1. +# Only runs when OPENSHELL_VM_DIAG=1 is set. +if [ "${OPENSHELL_VM_DIAG:-0}" = "1" ]; then DIAG_FILE="/opt/openshell/diag.txt" setsid sh -c ' sleep 60 @@ -688,5 +690,6 @@ setsid sh -c ' echo "=== [DIAG] done ===" } > "$DIAG" 2>&1 ' & +fi exec /usr/local/bin/k3s server "${K3S_ARGS[@]}" diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 34a9d5043..ec01f6096 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -37,6 +37,9 @@ pub struct VmRuntimeState { pub rootfs: PathBuf, pub console_log: PathBuf, pub started_at_ms: u128, + /// PID of the gvproxy process (if networking uses gvproxy). 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub gvproxy_pid: Option, } #[derive(Debug, Serialize)] @@ -67,7 +70,12 @@ pub fn vm_exec_socket_path(rootfs: &Path) -> PathBuf { vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_EXEC_SOCKET_NAME)) } -pub fn write_vm_runtime_state(rootfs: &Path, pid: i32, console_log: &Path) -> Result<(), VmError> { +pub fn write_vm_runtime_state( + rootfs: &Path, + pid: i32, + console_log: &Path, + gvproxy_pid: Option, +) -> Result<(), VmError> { let state = VmRuntimeState { pid, exec_vsock_port: VM_EXEC_VSOCK_PORT, @@ -75,6 +83,7 @@ pub fn write_vm_runtime_state(rootfs: &Path, pid: i32, console_log: &Path) -> Re rootfs: rootfs.to_path_buf(), console_log: console_log.to_path_buf(), started_at_ms: now_ms()?, + gvproxy_pid, }; let path = vm_state_path(rootfs); let bytes = serde_json::to_vec_pretty(&state) @@ -396,7 +405,7 @@ fn vm_run_dir(rootfs: &Path) -> PathBuf { rootfs.parent().unwrap_or(rootfs).to_path_buf() } -fn vm_state_path(rootfs: &Path) -> PathBuf { +pub fn vm_state_path(rootfs: &Path) -> PathBuf { vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_STATE_NAME)) } @@ -471,7 +480,7 @@ fn validate_env_vars(items: &[String]) -> Result<(), VmError> { })?; if key.is_empty() || !key.chars().enumerate().all(|(idx, ch)| { - ch == '_' || ch.is_ascii_alphanumeric() && (idx > 0 || !ch.is_ascii_digit()) + ch == '_' || (ch.is_ascii_alphanumeric() && (idx > 0 || !ch.is_ascii_digit())) }) { return Err(VmError::Exec(format!( diff --git a/crates/openshell-vm/src/ffi.rs b/crates/openshell-vm/src/ffi.rs index 1ae7651cd..6b661a88d 100644 --- a/crates/openshell-vm/src/ffi.rs +++ b/crates/openshell-vm/src/ffi.rs @@ -218,7 +218,7 @@ fn preload_runtime_support_libraries(runtime_dir: &Path) -> Result, Ok(support_libs) } -fn required_runtime_lib_name() -> &'static str { +pub fn required_runtime_lib_name() -> &'static str { #[cfg(target_os = "macos")] { "libkrun.dylib" @@ -230,67 +230,49 @@ fn 
required_runtime_lib_name() -> &'static str { } /// Compute SHA-256 hash of a file, returning hex string. +/// +/// Streams the file contents directly to `shasum -a 256` via a pipe, +/// avoiding buffering the entire file in memory. fn compute_sha256(path: &Path) -> Result { - use std::io::Read; - let mut file = fs::File::open(path)?; - let mut hasher = sha2_hasher(); - let mut buf = [0u8; 8192]; - loop { - let n = file.read(&mut buf)?; - if n == 0 { - break; - } - hasher_update(&mut hasher, &buf[..n]); - } - Ok(hasher_finalize(hasher)) -} - -// Minimal SHA-256 using the sha2 crate if available, otherwise shell out. -// We attempt a runtime `shasum` call to avoid adding a crate dependency. -fn sha2_hasher() -> Sha256State { - Sha256State { - data: Vec::with_capacity(1024 * 1024), - } -} - -struct Sha256State { - data: Vec, -} - -fn hasher_update(state: &mut Sha256State, bytes: &[u8]) { - state.data.extend_from_slice(bytes); -} - -fn hasher_finalize(state: Sha256State) -> String { - // Use shasum via process for simplicity — avoids adding a crypto dependency. - use std::io::Write; + use std::io::{Read, Write}; use std::process::{Command, Stdio}; - let mut child = match Command::new("shasum") + let mut file = fs::File::open(path)?; + + let mut child = Command::new("shasum") .args(["-a", "256"]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::null()) - .spawn() - { - Ok(c) => c, - Err(_) => return "unknown".to_string(), - }; + .spawn()?; - if let Some(mut stdin) = child.stdin.take() { - let _ = stdin.write_all(&state.data); + // Stream file contents directly to shasum's stdin in 8KB chunks. + { + let mut stdin = child + .stdin + .take() + .ok_or_else(|| std::io::Error::other("failed to open shasum stdin"))?; + let mut buf = [0u8; 8192]; + loop { + let n = file.read(&mut buf)?; + if n == 0 { + break; + } + stdin.write_all(&buf[..n])?; + } + // stdin is dropped here, closing the pipe so shasum can finish. 
} - match child.wait_with_output() { - Ok(output) if output.status.success() => { - let stdout = String::from_utf8_lossy(&output.stdout); - stdout - .split_whitespace() - .next() - .unwrap_or("unknown") - .to_string() - } - _ => "unknown".to_string(), + let output = child.wait_with_output()?; + if output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + Ok(stdout + .split_whitespace() + .next() + .unwrap_or("unknown") + .to_string()) + } else { + Ok("unknown".to_string()) } } diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index a1dfedb8c..1dd57ab4b 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -23,9 +23,9 @@ use std::ptr; use std::time::Instant; pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_running_vm, reset_runtime_state, vm_exec_socket_path, - write_vm_runtime_state, + acquire_rootfs_lock, clear_vm_runtime_state, ensure_vm_not_running, exec_running_vm, + reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, VmExecOptions, + VmRuntimeState, VM_EXEC_VSOCK_PORT, }; // ── Error type ───────────────────────────────────────────────────────── @@ -235,17 +235,6 @@ pub(crate) fn configured_runtime_dir() -> Result { Ok(exe_dir.join(VM_RUNTIME_DIR_NAME)) } -fn required_runtime_lib_name() -> &'static str { - #[cfg(target_os = "macos")] - { - "libkrun.dylib" - } - #[cfg(not(target_os = "macos"))] - { - "libkrun.so" - } -} - fn validate_runtime_dir(dir: &Path) -> Result { if !dir.is_dir() { return Err(VmError::BinaryNotFound { @@ -256,7 +245,7 @@ fn validate_runtime_dir(dir: &Path) -> Result { }); } - let libkrun = dir.join(required_runtime_lib_name()); + let libkrun = dir.join(ffi::required_runtime_lib_name()); if !libkrun.is_file() { return Err(VmError::BinaryNotFound { path: libkrun.display().to_string(), @@ -404,7 +393,7 @@ fn 
log_runtime_provenance(runtime_dir: &Path) { eprintln!(" type: custom (OpenShell-built)"); // Parse provenance.json for additional details. if let Some(ref json) = prov.provenance_json { - // Extract key fields without pulling in serde_json for this. + // Extract key fields from provenance metadata. for key in &["libkrunfw_commit", "kernel_version", "build_timestamp"] { if let Some(val) = extract_json_string(json, key) { eprintln!(" {}: {}", key.replace('_', "-"), val); @@ -417,22 +406,10 @@ fn log_runtime_provenance(runtime_dir: &Path) { } } -/// Simple JSON string value extractor (avoids serde_json dependency -/// for this single use case). +/// Extract a string value from a JSON object by key. fn extract_json_string(json: &str, key: &str) -> Option { - let pattern = format!("\"{}\"", key); - let idx = json.find(&pattern)?; - let after_key = &json[idx + pattern.len()..]; - // Skip whitespace and colon - let after_colon = after_key.trim_start().strip_prefix(':')?; - let after_ws = after_colon.trim_start(); - if after_ws.starts_with('"') { - let value_start = &after_ws[1..]; - let end = value_start.find('"')?; - Some(value_start[..end].to_string()) - } else { - None - } + let map: serde_json::Map = serde_json::from_str(json).ok()?; + map.get(key)?.as_str().map(ToOwned::to_owned) } fn clamp_log_level(level: u32) -> u32 { @@ -669,18 +646,30 @@ fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { } } -/// Kill any stale gvproxy process from a previous openshell-vm run. +/// Kill a stale gvproxy process from a previous openshell-vm run. /// /// If the CLI crashes or is killed before cleanup, gvproxy keeps running /// and holds port 2222. A new gvproxy instance then fails with /// "bind: address already in use". 
-fn kill_stale_gvproxy() { - let output = std::process::Command::new("pkill") - .args(["-x", "gvproxy"]) - .output(); - if let Ok(o) = output { - if o.status.success() { - eprintln!("Killed stale gvproxy process"); +/// +/// We only kill the specific gvproxy PID recorded in the VM runtime state +/// to avoid disrupting unrelated gvproxy instances (e.g. Podman Desktop). +fn kill_stale_gvproxy(rootfs: &Path) { + let state_path = vm_state_path(rootfs); + let pid = std::fs::read(&state_path) + .ok() + .and_then(|bytes| serde_json::from_slice::(&bytes).ok()) + .and_then(|state| state.gvproxy_pid); + + if let Some(gvproxy_pid) = pid { + // Verify the process is still alive before killing it. + let pid_i32 = gvproxy_pid as libc::pid_t; + let is_alive = unsafe { libc::kill(pid_i32, 0) } == 0; + if is_alive { + unsafe { + libc::kill(pid_i32, libc::SIGTERM); + } + eprintln!("Killed stale gvproxy process (pid {gvproxy_pid})"); // Brief pause for the port to be released. std::thread::sleep(std::time::Duration::from_millis(200)); } @@ -794,7 +783,7 @@ pub fn launch(config: &VmConfig) -> Result { // Kill any stale gvproxy process from a previous run. // If gvproxy is still holding port 2222, the new instance // will fail with "bind: address already in use". - kill_stale_gvproxy(); + kill_stale_gvproxy(&config.rootfs); // Clean stale sockets (including the -krun.sock file that // libkrun creates as its datagram endpoint). 
@@ -927,7 +916,10 @@ pub fn launch(config: &VmConfig) -> Result { _ => { // Parent: wait for child if config.exec_path == "/srv/openshell-vm-init.sh" { - if let Err(err) = write_vm_runtime_state(&config.rootfs, pid, &console_log) { + let gvproxy_pid = gvproxy_child.as_ref().map(std::process::Child::id); + if let Err(err) = + write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) + { unsafe { libc::kill(pid, libc::SIGTERM); } @@ -1414,7 +1406,7 @@ mod tests { let dir = temp_runtime_dir(); fs::create_dir_all(&dir).expect("failed to create runtime dir"); - write_runtime_file(&dir.join(required_runtime_lib_name())); + write_runtime_file(&dir.join(ffi::required_runtime_lib_name())); write_runtime_file(&dir.join("libkrunfw.test")); let gvproxy = dir.join("gvproxy"); write_runtime_file(&gvproxy); @@ -1438,7 +1430,7 @@ mod tests { let dir = temp_runtime_dir(); fs::create_dir_all(&dir).expect("failed to create runtime dir"); - write_runtime_file(&dir.join(required_runtime_lib_name())); + write_runtime_file(&dir.join(ffi::required_runtime_lib_name())); write_runtime_file(&dir.join("libkrunfw.test")); let err = validate_runtime_dir(&dir).expect_err("missing gvproxy should fail"); From f721839c453c0373c45534dfe1aae7f5354d9f8e Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 24 Mar 2026 23:42:36 -0700 Subject: [PATCH 03/10] wip --- deploy/helm/openshell/templates/statefulset.yaml | 4 +--- tasks/rust.toml | 4 ++-- tasks/scripts/sync-vm-rootfs.sh | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 55d698a8c..0afd0cffd 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -101,10 +101,8 @@ spec: {{- end }} {{- end }} volumeMounts: - {{- if .Values.persistence.enabled }} - name: openshell-data mountPath: /var/openshell - {{- end }} {{- if not .Values.server.disableTls }} - 
name: tls-cert mountPath: /etc/openshell-tls/server @@ -147,11 +145,11 @@ spec: - name: tls-client-ca secret: secretName: {{ .Values.server.tls.clientCaSecretName }} + {{- end }} {{- if not .Values.persistence.enabled }} - name: openshell-data emptyDir: {} {{- end }} - {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/tasks/rust.toml b/tasks/rust.toml index dfa4068f4..69214ce7f 100644 --- a/tasks/rust.toml +++ b/tasks/rust.toml @@ -5,12 +5,12 @@ ["rust:check"] description = "Check all Rust crates for errors" -run = "cargo check --workspace --exclude openshell-vm" +run = "cargo check --workspace" hide = true ["rust:lint"] description = "Lint Rust code with Clippy" -run = "cargo clippy --workspace --all-targets --exclude openshell-vm" +run = "cargo clippy --workspace --all-targets" hide = true ["rust:format"] diff --git a/tasks/scripts/sync-vm-rootfs.sh b/tasks/scripts/sync-vm-rootfs.sh index 053b3d99f..edbc2f9b8 100755 --- a/tasks/scripts/sync-vm-rootfs.sh +++ b/tasks/scripts/sync-vm-rootfs.sh @@ -56,7 +56,7 @@ if [ -d "${HELM_CHART_DIR}" ]; then fi # ── Kubernetes manifests ─────────────────────────────────────────────── -MANIFEST_SRC="${ROOT}/deploy/k8s" +MANIFEST_SRC="${ROOT}/deploy/kube/manifests" MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/manifests" if [ -d "${MANIFEST_SRC}" ]; then mkdir -p "${MANIFEST_DST}" From e014513c33aac56582690d41f600d5795744cbf0 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 25 Mar 2026 11:40:02 -0700 Subject: [PATCH 04/10] wip --- crates/openshell-vm/src/exec.rs | 24 ++- crates/openshell-vm/src/lib.rs | 276 +++++++++++++++++++++++++++----- crates/openshell-vm/src/main.rs | 26 ++- e2e/rust/tests/smoke.rs | 97 +++++++++++ tasks/scripts/e2e-vm.sh | 131 +++++++++++++++ tasks/test.toml | 10 +- 6 files changed, 510 insertions(+), 54 deletions(-) create mode 100644 e2e/rust/tests/smoke.rs create mode 100755 tasks/scripts/e2e-vm.sh diff --git a/crates/openshell-vm/src/exec.rs 
b/crates/openshell-vm/src/exec.rs index ec01f6096..961b5d7cf 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -15,7 +15,6 @@ use crate::VmError; pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; -const VM_EXEC_SOCKET_NAME: &str = "openshell-vm-exec.sock"; const VM_STATE_NAME: &str = "vm-state.json"; const VM_LOCK_NAME: &str = "vm.lock"; const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; @@ -67,7 +66,22 @@ enum ServerFrame { } pub fn vm_exec_socket_path(rootfs: &Path) -> PathBuf { - vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_EXEC_SOCKET_NAME)) + let mut base = PathBuf::from("/tmp"); + if !base.is_dir() { + base = std::env::temp_dir(); + } + let dir = base.join("ovm-exec"); + let id = hash_path_id(rootfs); + dir.join(format!("{id}.sock")) +} + +fn hash_path_id(path: &Path) -> String { + let mut hash: u64 = 0xcbf29ce484222325; + for byte in path.to_string_lossy().as_bytes() { + hash ^= u64::from(*byte); + hash = hash.wrapping_mul(0x100000001b3); + } + format!("{:012x}", hash & 0x0000_ffff_ffff_ffff) } pub fn write_vm_runtime_state( @@ -121,7 +135,7 @@ pub fn clear_vm_runtime_state(rootfs: &Path) { /// - containerd snapshots (no re-extract needed) /// - containerd metadata database (meta.db — image/snapshot tracking) /// - k3s server state (kine/sqlite, TLS certs, manifests) -pub fn reset_runtime_state(rootfs: &Path) -> Result<(), VmError> { +pub fn reset_runtime_state(rootfs: &Path, gateway_name: &str) -> Result<(), VmError> { // Full reset: wipe all k3s state so the VM cold-starts from scratch. // The init script will re-import airgap images, deploy manifests, // and generate fresh cluster state. 
This is slower (~30-60s) but @@ -179,7 +193,7 @@ pub fn reset_runtime_state(rootfs: &Path) -> Result<(), VmError> { std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); let mtls_dir = PathBuf::from(&config_base) .join("openshell/gateways") - .join(super::GATEWAY_CLUSTER_NAME) + .join(gateway_name) .join("mtls"); if mtls_dir.is_dir() { fs::remove_dir_all(&mtls_dir).ok(); @@ -187,7 +201,7 @@ pub fn reset_runtime_state(rootfs: &Path) -> Result<(), VmError> { // Also remove metadata so is_warm_boot() returns false. let metadata = PathBuf::from(&config_base) .join("openshell/gateways") - .join(super::GATEWAY_CLUSTER_NAME) + .join(gateway_name) .join("metadata.json"); if metadata.is_file() { fs::remove_file(&metadata).ok(); diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 1dd57ab4b..993972d14 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -158,6 +158,9 @@ pub struct VmConfig { /// Wipe all runtime state (containerd tasks/sandboxes, kubelet pods) /// before booting. Recovers from corrupted state after a crash. pub reset: bool, + + /// Gateway metadata name used for host-side config and mTLS material. + pub gateway_name: String, } impl VmConfig { @@ -198,8 +201,124 @@ impl VmConfig { binary: default_runtime_gvproxy_path(), }, reset: false, + gateway_name: DEFAULT_GATEWAY_NAME.to_string(), + } + } +} + +/// Default gateway metadata name used by the legacy single-instance layout. +pub const DEFAULT_GATEWAY_NAME: &str = "openshell-vm"; + +/// Resolve the gateway metadata name for an optional instance name. +pub fn gateway_name(instance_name: Option<&str>) -> Result { + match instance_name { + Some(name) => Ok(format!( + "{DEFAULT_GATEWAY_NAME}-{}", + sanitize_instance_name(name)? + )), + None => Ok(DEFAULT_GATEWAY_NAME.to_string()), + } +} + +/// Resolve the rootfs path for a named instance. 
+pub fn named_rootfs_dir(instance_name: &str) -> Result { + let name = sanitize_instance_name(instance_name)?; + let base = openshell_bootstrap::paths::default_rootfs_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve default VM rootfs: {e}")))?; + let parent = base.parent().ok_or_else(|| { + VmError::RuntimeState(format!("default rootfs has no parent: {}", base.display())) + })?; + Ok(parent.join("instances").join(name).join("rootfs")) +} + +/// Ensure a named instance rootfs exists, cloning from the default rootfs +/// on first use. +pub fn ensure_named_rootfs(instance_name: &str) -> Result { + let instance_rootfs = named_rootfs_dir(instance_name)?; + if instance_rootfs.is_dir() { + return Ok(instance_rootfs); + } + + let default_rootfs = openshell_bootstrap::paths::default_rootfs_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve default VM rootfs: {e}")))?; + if !default_rootfs.is_dir() { + return Err(VmError::RootfsNotFound { + path: default_rootfs.display().to_string(), + }); + } + + clone_rootfs(&default_rootfs, &instance_rootfs)?; + Ok(instance_rootfs) +} + +fn sanitize_instance_name(name: &str) -> Result { + let trimmed = name.trim(); + if trimmed.is_empty() { + return Err(VmError::RuntimeState( + "instance name cannot be empty".to_string(), + )); + } + + let mut out = String::with_capacity(trimmed.len()); + for ch in trimmed.chars() { + if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { + out.push(ch); + } else { + return Err(VmError::RuntimeState(format!( + "invalid instance name '{trimmed}': only [A-Za-z0-9_-] are allowed" + ))); + } + } + + Ok(out) +} + +fn clone_rootfs(source: &Path, dest: &Path) -> Result<(), VmError> { + let parent = dest.parent().ok_or_else(|| { + VmError::RuntimeState(format!("instance rootfs has no parent: {}", dest.display())) + })?; + std::fs::create_dir_all(parent).map_err(|e| { + VmError::RuntimeState(format!( + "create instance parent dir {}: {e}", + parent.display() + )) + })?; + + let status = if 
cfg!(target_os = "macos") { + let clone_status = std::process::Command::new("cp") + .args(["-c", "-R"]) + .arg(source) + .arg(dest) + .status() + .map_err(|e| VmError::RuntimeState(format!("clone rootfs with cp failed: {e}")))?; + if clone_status.success() { + clone_status + } else { + std::process::Command::new("cp") + .args(["-R"]) + .arg(source) + .arg(dest) + .status() + .map_err(|e| VmError::RuntimeState(format!("copy rootfs with cp failed: {e}")))? } + } else { + std::process::Command::new("cp") + .args(["-a", "--reflink=auto"]) + .arg(source) + .arg(dest) + .status() + .map_err(|e| VmError::RuntimeState(format!("clone rootfs with cp failed: {e}")))? + }; + + if !status.success() { + return Err(VmError::RuntimeState(format!( + "failed to clone rootfs {} -> {}", + source.display(), + dest.display() + ))); } + + Ok(()) } // ── Helpers ───────────────────────────────────────────────────────────── @@ -676,6 +795,70 @@ fn kill_stale_gvproxy(rootfs: &Path) { } } +fn vm_rootfs_key(rootfs: &Path) -> String { + let name = rootfs + .file_name() + .and_then(|part| part.to_str()) + .unwrap_or("openshell-vm"); + let mut out = String::with_capacity(name.len()); + for ch in name.chars() { + if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { + out.push(ch); + } else { + out.push('_'); + } + } + if out.is_empty() { + "openshell-vm".to_string() + } else { + out + } +} + +fn hash_path_id(path: &Path) -> String { + let mut hash: u64 = 0xcbf29ce484222325; + for byte in path.to_string_lossy().as_bytes() { + hash ^= u64::from(*byte); + hash = hash.wrapping_mul(0x100000001b3); + } + format!("{:012x}", hash & 0x0000_ffff_ffff_ffff) +} + +fn gvproxy_socket_dir(rootfs: &Path) -> Result { + let mut base = PathBuf::from("/tmp"); + if !base.is_dir() { + base = std::env::temp_dir(); + } + let dir = base.join("ovm-gv"); + std::fs::create_dir_all(&dir).map_err(|e| { + VmError::HostSetup(format!("create gvproxy socket dir {}: {e}", dir.display())) + })?; + + // macOS unix socket path 
limit is tight (~104 bytes). Keep paths very short. + let id = hash_path_id(rootfs); + Ok(dir.join(id)) +} + +fn gateway_host_port(config: &VmConfig) -> u16 { + config + .port_map + .first() + .and_then(|pm| pm.split(':').next()) + .and_then(|port| port.parse::().ok()) + .unwrap_or(DEFAULT_GATEWAY_PORT) +} + +fn pick_gvproxy_ssh_port() -> Result { + let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) + .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?; + let port = listener + .local_addr() + .map_err(|e| VmError::HostSetup(format!("read gvproxy ssh port: {e}")))? + .port(); + drop(listener); + Ok(port) +} + fn path_to_cstring(path: &Path) -> Result { let s = path .to_str() @@ -718,7 +901,7 @@ pub fn launch(config: &VmConfig) -> Result { // This must happen after the lock (to confirm no other VM is using // the rootfs) but before booting (so the new VM starts clean). if config.reset { - reset_runtime_state(&config.rootfs)?; + reset_runtime_state(&config.rootfs, &config.gateway_name)?; } let launch_start = Instant::now(); @@ -777,8 +960,10 @@ pub fn launch(config: &VmConfig) -> Result { .parent() .unwrap_or(&config.rootfs) .to_path_buf(); - let vfkit_sock = run_dir.join("gvproxy-vfkit.sock"); - let api_sock = run_dir.join("gvproxy-api.sock"); + let rootfs_key = vm_rootfs_key(&config.rootfs); + let sock_base = gvproxy_socket_dir(&config.rootfs)?; + let vfkit_sock = sock_base.with_extension("v"); + let api_sock = sock_base.with_extension("a"); // Kill any stale gvproxy process from a previous run. // If gvproxy is still holding port 2222, the new instance @@ -789,12 +974,13 @@ pub fn launch(config: &VmConfig) -> Result { // libkrun creates as its datagram endpoint). 
let _ = std::fs::remove_file(&vfkit_sock); let _ = std::fs::remove_file(&api_sock); - let krun_sock = run_dir.join("gvproxy-vfkit.sock-krun.sock"); + let krun_sock = sock_base.with_extension("v-krun.sock"); let _ = std::fs::remove_file(&krun_sock); // Start gvproxy eprintln!("Starting gvproxy: {}", binary.display()); - let gvproxy_log = run_dir.join("gvproxy.log"); + let ssh_port = pick_gvproxy_ssh_port()?; + let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); let gvproxy_log_file = std::fs::File::create(&gvproxy_log) .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; let child = std::process::Command::new(binary) @@ -802,14 +988,17 @@ pub fn launch(config: &VmConfig) -> Result { .arg(format!("unixgram://{}", vfkit_sock.display())) .arg("-listen") .arg(format!("unix://{}", api_sock.display())) + .arg("-ssh-port") + .arg(ssh_port.to_string()) .stdout(std::process::Stdio::null()) .stderr(gvproxy_log_file) .spawn() .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; eprintln!( - "gvproxy started (pid {}) [{:.1}s]", + "gvproxy started (pid {}, ssh port {}) [{:.1}s]", child.id(), + ssh_port, launch_start.elapsed().as_secs_f64() ); @@ -868,6 +1057,11 @@ pub fn launch(config: &VmConfig) -> Result { } for vsock_port in &config.vsock_ports { + if let Some(parent) = vsock_port.socket_path.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + })?; + } vm.add_vsock_port(vsock_port)?; } @@ -877,7 +1071,7 @@ pub fn launch(config: &VmConfig) -> Result { .rootfs .parent() .unwrap_or(&config.rootfs) - .join("console.log") + .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) }); vm.set_console_output(&console_log)?; @@ -1007,7 +1201,10 @@ pub fn launch(config: &VmConfig) -> Result { // from virtio-fs — no kubectl or port forwarding needed. 
// Cold boot (Path 2) writes secret manifests into the // k3s auto-deploy directory via virtio-fs. - if let Err(e) = bootstrap_gateway(&config.rootfs) { + let gateway_port = gateway_host_port(config); + if let Err(e) = + bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) + { eprintln!("Bootstrap failed: {e}"); eprintln!(" The VM is running but OpenShell may not be fully operational."); } @@ -1015,7 +1212,7 @@ pub fn launch(config: &VmConfig) -> Result { // Wait for the gRPC service to be reachable via TCP // probe on host:30051. This confirms the full path // (gvproxy → kube-proxy nftables → pod:8080) is working. - wait_for_gateway_service(); + wait_for_gateway_service(gateway_port); } eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); @@ -1066,11 +1263,8 @@ pub fn launch(config: &VmConfig) -> Result { // ── Post-boot bootstrap ──────────────────────────────────────────────── -/// Cluster name used for metadata and mTLS storage. -const GATEWAY_CLUSTER_NAME: &str = "openshell-vm"; - -/// Gateway port: the host port mapped to the OpenShell `NodePort` (30051). -const GATEWAY_PORT: u16 = 30051; +/// Default gateway port: host port mapped to the OpenShell `NodePort` (30051). +const DEFAULT_GATEWAY_PORT: u16 = 30051; /// Bootstrap the OpenShell control plane after k3s is ready. /// @@ -1095,14 +1289,14 @@ const GATEWAY_PORT: u16 = 30051; /// 1. **Warm boot**: host already has certs at `~/.config/.../mtls/` — skip. /// 2. **First boot / post-reset**: polls the rootfs for `/opt/openshell/pki/ca.crt` /// (written by the VM init script), then copies client certs to the host. 
-fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { +fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { let bootstrap_start = Instant::now(); let metadata = openshell_bootstrap::GatewayMetadata { - name: GATEWAY_CLUSTER_NAME.to_string(), - gateway_endpoint: format!("https://127.0.0.1:{GATEWAY_PORT}"), + name: gateway_name.to_string(), + gateway_endpoint: format!("https://127.0.0.1:{gateway_port}"), is_remote: false, - gateway_port: GATEWAY_PORT, + gateway_port, remote_host: None, resolved_host: None, auth_mode: None, @@ -1111,11 +1305,11 @@ fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { }; // ── Warm boot: host already has certs ────────────────────────── - if is_warm_boot() { + if is_warm_boot(gateway_name) { // Always (re-)store metadata so port/endpoint changes are picked up. - openshell_bootstrap::store_gateway_metadata(GATEWAY_CLUSTER_NAME, &metadata) + openshell_bootstrap::store_gateway_metadata(gateway_name, &metadata) .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?; - openshell_bootstrap::save_active_gateway(GATEWAY_CLUSTER_NAME) + openshell_bootstrap::save_active_gateway(gateway_name) .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?; // Verify host certs match the rootfs PKI. If they diverge (e.g. @@ -1123,7 +1317,7 @@ fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { // re-sync the host certs from the authoritative rootfs copy. 
let pki_dir = rootfs.join("opt/openshell/pki"); if pki_dir.join("ca.crt").is_file() { - if let Err(e) = sync_host_certs_if_stale(&pki_dir) { + if let Err(e) = sync_host_certs_if_stale(&pki_dir, gateway_name) { eprintln!("Warning: cert sync check failed: {e}"); } } @@ -1132,9 +1326,9 @@ fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { "Warm boot [{:.1}s]", bootstrap_start.elapsed().as_secs_f64() ); - eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}"); - eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}"); - eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/"); + eprintln!(" Cluster: {gateway_name}"); + eprintln!(" Gateway: https://127.0.0.1:{gateway_port}"); + eprintln!(" mTLS: ~/.config/openshell/gateways/{gateway_name}/mtls/"); return Ok(()); } @@ -1182,22 +1376,22 @@ fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { client_key_pem: read("client.key")?, }; - openshell_bootstrap::store_gateway_metadata(GATEWAY_CLUSTER_NAME, &metadata) + openshell_bootstrap::store_gateway_metadata(gateway_name, &metadata) .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?; - openshell_bootstrap::mtls::store_pki_bundle(GATEWAY_CLUSTER_NAME, &pki_bundle) + openshell_bootstrap::mtls::store_pki_bundle(gateway_name, &pki_bundle) .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?; - openshell_bootstrap::save_active_gateway(GATEWAY_CLUSTER_NAME) + openshell_bootstrap::save_active_gateway(gateway_name) .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?; eprintln!( "Bootstrap complete [{:.1}s]", bootstrap_start.elapsed().as_secs_f64() ); - eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}"); - eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}"); - eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/"); + eprintln!(" Cluster: {gateway_name}"); + eprintln!(" Gateway: https://127.0.0.1:{gateway_port}"); + eprintln!(" mTLS: 
~/.config/openshell/gateways/{gateway_name}/mtls/"); Ok(()) } @@ -1212,7 +1406,7 @@ fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { /// metadata storage) can be skipped because the virtio-fs rootfs persists k3s /// state (TLS certs, kine/sqlite, containerd images, helm releases) across VM /// restarts. -fn is_warm_boot() -> bool { +fn is_warm_boot(gateway_name: &str) -> bool { let Ok(home) = std::env::var("HOME") else { return false; }; @@ -1225,13 +1419,13 @@ fn is_warm_boot() -> bool { .join("gateways"); // Check metadata file. - let metadata_path = config_dir.join(GATEWAY_CLUSTER_NAME).join("metadata.json"); + let metadata_path = config_dir.join(gateway_name).join("metadata.json"); if !metadata_path.is_file() { return false; } // Check mTLS cert files. - let mtls_dir = config_dir.join(GATEWAY_CLUSTER_NAME).join("mtls"); + let mtls_dir = config_dir.join(gateway_name).join("mtls"); for name in &["ca.crt", "tls.crt", "tls.key"] { let path = mtls_dir.join(name); match std::fs::metadata(&path) { @@ -1248,7 +1442,7 @@ fn is_warm_boot() -> bool { /// /// This catches cases where PKI was regenerated (e.g. rootfs rebuilt, /// manual reset) but host-side certs survived from a previous boot cycle. 
-fn sync_host_certs_if_stale(pki_dir: &Path) -> Result<(), VmError> { +fn sync_host_certs_if_stale(pki_dir: &Path, gateway_name: &str) -> Result<(), VmError> { let Ok(home) = std::env::var("HOME") else { return Ok(()); }; @@ -1256,7 +1450,7 @@ fn sync_host_certs_if_stale(pki_dir: &Path) -> Result<(), VmError> { std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); let host_ca = PathBuf::from(&config_base) .join("openshell/gateways") - .join(GATEWAY_CLUSTER_NAME) + .join(gateway_name) .join("mtls/ca.crt"); let rootfs_ca = std::fs::read_to_string(pki_dir.join("ca.crt")) @@ -1285,7 +1479,7 @@ fn sync_host_certs_if_stale(pki_dir: &Path) -> Result<(), VmError> { client_key_pem: read("client.key")?, }; - openshell_bootstrap::mtls::store_pki_bundle(GATEWAY_CLUSTER_NAME, &pki_bundle) + openshell_bootstrap::mtls::store_pki_bundle(gateway_name, &pki_bundle) .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?; eprintln!(" mTLS certs re-synced from rootfs"); @@ -1307,7 +1501,7 @@ fn sync_host_certs_if_stale(pki_dir: &Path) -> Result<(), VmError> { /// nftables → pod:8080. A successful probe means the pod is running, /// the NodePort service is routing, and the server is accepting /// connections. No kubectl or API server access required. -fn wait_for_gateway_service() { +fn wait_for_gateway_service(gateway_port: u16) { let start = Instant::now(); let timeout = std::time::Duration::from_secs(90); let poll_interval = std::time::Duration::from_secs(1); @@ -1315,7 +1509,7 @@ fn wait_for_gateway_service() { eprintln!("Waiting for gateway service..."); loop { - if host_tcp_probe() { + if host_tcp_probe(gateway_port) { eprintln!("Service healthy [{:.1}s]", start.elapsed().as_secs_f64()); return; } @@ -1341,12 +1535,12 @@ fn wait_for_gateway_service() { /// ClientHello). We exploit this: connect, then try a short read. If /// the read **times out** the server is alive; if it returns an error /// (reset/EOF) the server is down. 
-fn host_tcp_probe() -> bool { +fn host_tcp_probe(gateway_port: u16) -> bool { use std::io::Read; use std::net::{SocketAddr, TcpStream}; use std::time::Duration; - let addr: SocketAddr = ([127, 0, 0, 1], GATEWAY_PORT).into(); + let addr: SocketAddr = ([127, 0, 0, 1], gateway_port).into(); let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(2)) else { return false; }; diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index ec35f53ea..e97648ad6 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -36,6 +36,15 @@ struct Cli { #[arg(long, value_hint = ValueHint::DirPath)] rootfs: Option, + /// Named VM instance. + /// + /// When set, the rootfs resolves to + /// `~/.local/share/openshell/openshell-vm/instances//rootfs`. + /// For launch mode, the instance rootfs is cloned from the default + /// rootfs on first use. + #[arg(long, conflicts_with = "rootfs")] + name: Option, + /// Executable path inside the VM. When set, runs this instead of /// the default k3s server. 
#[arg(long)] @@ -133,7 +142,11 @@ fn run(cli: Cli) -> Result> { } return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: cli.rootfs, + rootfs: match (cli.rootfs, cli.name.as_deref()) { + (Some(path), _) => Some(path), + (None, Some(name)) => Some(openshell_vm::named_rootfs_dir(name)?), + (None, None) => None, + }, command, workdir, env, @@ -155,11 +168,14 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = match cli.rootfs { - Some(p) => p, - None => openshell_bootstrap::paths::default_rootfs_dir()?, + let rootfs = match (cli.rootfs, cli.name.as_deref()) { + (Some(path), _) => path, + (None, Some(name)) => openshell_vm::ensure_named_rootfs(name)?, + (None, None) => openshell_bootstrap::paths::default_rootfs_dir()?, }; + let gateway_name = openshell_vm::gateway_name(cli.name.as_deref())?; + let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { rootfs, @@ -175,6 +191,7 @@ fn run(cli: Cli) -> Result> { console_output: None, net: net_backend.clone(), reset: cli.reset, + gateway_name, } } else { let mut c = openshell_vm::VmConfig::gateway(rootfs); @@ -189,6 +206,7 @@ fn run(cli: Cli) -> Result> { } c.net = net_backend; c.reset = cli.reset; + c.gateway_name = gateway_name; c }; config.log_level = cli.krun_log_level; diff --git a/e2e/rust/tests/smoke.rs b/e2e/rust/tests/smoke.rs new file mode 100644 index 000000000..c380efc8c --- /dev/null +++ b/e2e/rust/tests/smoke.rs @@ -0,0 +1,97 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! Smoke test: verify the gateway is healthy, create a sandbox, exec a +//! command inside it, and tear it down. +//! +//! This test is cluster-agnostic — it works against any running gateway +//! (Docker-based cluster or openshell-vm microVM). The `e2e:vm` mise +//! task uses it to validate the VM gateway after boot. 
+ +use std::process::Stdio; +use std::time::Duration; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; + +/// End-to-end smoke test: status → create → exec → list → delete. +#[tokio::test] +async fn gateway_smoke() { + // ── 1. Gateway must be reachable ────────────────────────────────── + let mut clean_status = String::new(); + let mut status_ok = false; + for _ in 0..15 { + let mut status_cmd = openshell_cmd(); + status_cmd + .arg("status") + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let status_out = status_cmd + .output() + .await + .expect("failed to run openshell status"); + + let status_text = format!( + "{}{}", + String::from_utf8_lossy(&status_out.stdout), + String::from_utf8_lossy(&status_out.stderr), + ); + clean_status = strip_ansi(&status_text); + + if status_out.status.success() && clean_status.contains("Connected") { + status_ok = true; + break; + } + + tokio::time::sleep(Duration::from_secs(2)).await; + } + + assert!( + status_ok, + "openshell status never became healthy:\n{clean_status}", + ); + + // ── 2. Create a sandbox and exec a command ─────────────────────── + // Default behaviour keeps the sandbox alive after the command exits, + // so we can verify it in the list before cleaning up. + let mut sb = SandboxGuard::create(&["--", "echo", "smoke-ok"]) + .await + .expect("sandbox create should succeed"); + + assert!( + sb.create_output.contains("smoke-ok"), + "expected 'smoke-ok' in sandbox output:\n{}", + sb.create_output, + ); + + // ── 3. 
Verify the sandbox appeared in the list ─────────────────── + let mut list_cmd = openshell_cmd(); + list_cmd + .args(["sandbox", "list", "--names"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let list_out = list_cmd + .output() + .await + .expect("failed to run openshell sandbox list"); + + let list_text = strip_ansi(&format!( + "{}{}", + String::from_utf8_lossy(&list_out.stdout), + String::from_utf8_lossy(&list_out.stderr), + )); + + assert!( + list_text.contains(&sb.name), + "sandbox '{}' should appear in list output:\n{list_text}", + sb.name, + ); + + // ── 4. Cleanup ─────────────────────────────────────────────────── + sb.cleanup().await; +} diff --git a/tasks/scripts/e2e-vm.sh b/tasks/scripts/e2e-vm.sh new file mode 100755 index 000000000..a929c7c84 --- /dev/null +++ b/tasks/scripts/e2e-vm.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run the Rust e2e smoke test against an openshell-vm gateway. +# +# Usage: +# mise run e2e:vm # start new named VM on random port +# mise run e2e:vm -- --vm-port=30051 # reuse existing VM on port 30051 +# mise run e2e:vm -- --vm-port=30051 --vm-name=my-vm # reuse existing named VM and run exec check +# +# Options: +# --vm-port=PORT Skip VM startup and test against this port. +# --vm-name=NAME VM instance name. Auto-generated for fresh VMs. +# +# When --vm-port is omitted: +# 1. Picks a random free host port +# 2. Starts the VM with --name --port :30051 +# 3. Waits for the gRPC port to become reachable +# 4. Verifies `openshell-vm exec` works +# 5. Runs the Rust smoke test +# 6. Tears down the VM +# +# When --vm-port is given the script assumes the VM is already running +# on that port and runs the smoke test. The VM exec check runs only when +# --vm-name is provided (so the script can target the correct instance). 
+# +# Prerequisites (when starting a new VM): `mise run vm:build:binary`, +# codesign, bundle-runtime, ensure-rootfs, and sync-rootfs must already +# be done (the e2e:vm mise task handles these via depends). + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +RUNTIME_DIR="${ROOT}/target/debug/openshell-vm.runtime" +GATEWAY_BIN="${ROOT}/target/debug/openshell-vm" +GUEST_PORT=30051 +TIMEOUT=180 + +# ── Parse arguments ────────────────────────────────────────────────── +VM_PORT="" +VM_NAME="" +for arg in "$@"; do + case "$arg" in + --vm-port=*) VM_PORT="${arg#--vm-port=}" ;; + --vm-name=*) VM_NAME="${arg#--vm-name=}" ;; + *) echo "Unknown argument: $arg"; exit 1 ;; + esac +done + +# ── Determine mode ─────────────────────────────────────────────────── +if [ -n "${VM_PORT}" ]; then + # Point at an already-running VM. + HOST_PORT="${VM_PORT}" + echo "Using existing VM on port ${HOST_PORT}." +else + # Pick a random free port and start a new VM. + HOST_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()') + if [ -z "${VM_NAME}" ]; then + VM_NAME="e2e-${HOST_PORT}-$$" + fi + + cleanup() { + if [ -n "${VM_PID:-}" ] && kill -0 "$VM_PID" 2>/dev/null; then + echo "Stopping openshell-vm (pid ${VM_PID})..." + kill "$VM_PID" 2>/dev/null || true + wait "$VM_PID" 2>/dev/null || true + fi + } + trap cleanup EXIT + + echo "Starting openshell-vm '${VM_NAME}' on port ${HOST_PORT}..." + if [ "$(uname -s)" = "Darwin" ]; then + export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" + fi + + "${GATEWAY_BIN}" --name "${VM_NAME}" --port "${HOST_PORT}:${GUEST_PORT}" & + VM_PID=$! + + # ── Wait for gRPC port ───────────────────────────────────────────── + echo "Waiting for gRPC port ${HOST_PORT} (timeout ${TIMEOUT}s)..." + elapsed=0 + while ! nc -z 127.0.0.1 "${HOST_PORT}" 2>/dev/null; do + if ! 
kill -0 "$VM_PID" 2>/dev/null; then + echo "ERROR: openshell-vm exited before gRPC port became reachable" + exit 1 + fi + if [ "$elapsed" -ge "$TIMEOUT" ]; then + echo "ERROR: openshell-vm gRPC port not reachable after ${TIMEOUT}s" + exit 1 + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "Gateway is ready (${elapsed}s)." +fi + +# ── Exec into the VM (when instance name is known) ─────────────────── +if [ -n "${VM_NAME}" ]; then + echo "Verifying openshell-vm exec for '${VM_NAME}'..." + exec_elapsed=0 + exec_timeout=60 + until "${GATEWAY_BIN}" --name "${VM_NAME}" exec -- /bin/true; do + if [ "$exec_elapsed" -ge "$exec_timeout" ]; then + echo "ERROR: openshell-vm exec did not become ready after ${exec_timeout}s" + exit 1 + fi + sleep 2 + exec_elapsed=$((exec_elapsed + 2)) + done + echo "VM exec succeeded." +else + echo "Skipping openshell-vm exec check (provide --vm-name for existing VMs)." +fi + +# ── Run the smoke test ─────────────────────────────────────────────── +# The openshell CLI reads OPENSHELL_GATEWAY_ENDPOINT to connect to the +# gateway directly, and OPENSHELL_GATEWAY to resolve mTLS certs from +# ~/.config/openshell/gateways//mtls/. +export OPENSHELL_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" +if [ -n "${VM_NAME}" ]; then + export OPENSHELL_GATEWAY="openshell-vm-${VM_NAME}" +else + export OPENSHELL_GATEWAY="openshell-vm" +fi + +echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})..." +cargo build -p openshell-cli --features openshell-core/dev-settings +cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture + +echo "Smoke test passed." 
diff --git a/tasks/test.toml b/tasks/test.toml index c514fe382..7bd88708d 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -48,7 +48,9 @@ env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" ["e2e:vm"] -description = "Run e2e tests against a gateway VM (macOS ARM64)" -depends = ["python:proto"] -env = { UV_NO_SYNC = "1", PYTHONPATH = "python", OPENSHELL_GATEWAY = "gateway" } -run = "uv run pytest -o python_files='test_*.py' e2e/python" +description = "Boot openshell-vm and run smoke e2e (macOS ARM64; pass -- --vm-port=N [--vm-name=NAME] to reuse)" +depends = ["vm:build:binary", "vm:codesign", "vm:bundle-runtime", "vm:rootfs"] +run = [ + "tasks/scripts/sync-vm-rootfs.sh", + "tasks/scripts/e2e-vm.sh", +] From 7ae955da919d68f1c2219552236a92c8a558e979 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 25 Mar 2026 11:49:30 -0700 Subject: [PATCH 05/10] wip --- .../helm/openshell/templates/statefulset.yaml | 11 ----------- deploy/helm/openshell/values.yaml | 19 ------------------- 2 files changed, 30 deletions(-) diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 0afd0cffd..1be8f14ab 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -26,16 +26,11 @@ spec: {{- end }} spec: terminationGracePeriodSeconds: {{ .Values.podLifecycle.terminationGracePeriodSeconds }} - {{- if .Values.hostNetwork }} - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} {{- end }} serviceAccountName: {{ include "openshell.serviceAccountName" . 
}} - automountServiceAccountToken: {{ .Values.automountServiceAccountToken }} {{- if .Values.server.hostGatewayIP }} hostAliases: - ip: {{ .Values.server.hostGatewayIP | quote }} @@ -146,10 +141,6 @@ spec: secret: secretName: {{ .Values.server.tls.clientCaSecretName }} {{- end }} - {{- if not .Values.persistence.enabled }} - - name: openshell-data - emptyDir: {} - {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} @@ -162,7 +153,6 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} - {{- if .Values.persistence.enabled }} volumeClaimTemplates: - metadata: name: openshell-data @@ -171,4 +161,3 @@ spec: resources: requests: storage: 1Gi - {{- end }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 1b835202b..ccc8d1ffa 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -19,12 +19,6 @@ serviceAccount: annotations: {} name: "" -# Whether to auto-mount the ServiceAccount token into the pod. Disabled -# in microVM gateway mode because the projected volume mount at -# /var/run/secrets/kubernetes.io/serviceaccount hits a containerd -# native-snapshotter + virtiofs incompatibility on sandbox re-creation. -automountServiceAccountToken: true - podAnnotations: {} podLabels: {} @@ -66,19 +60,6 @@ probes: resources: {} -# Persistent storage for the OpenShell database. When disabled, an -# emptyDir volume is used instead of a PVC. This is useful in microVM -# environments where overlayfs-on-virtiofs doesn't support PVC mounts -# reliably. -persistence: - enabled: true - -# Run the pod directly on the host network. Useful in microVM -# environments where kube-proxy is unavailable (no iptables). -# When true, the pod binds to the VM's eth0 and NodePort is -# unnecessary — gvproxy forwards host ports to the pod directly. 
-hostNetwork: false - nodeSelector: {} tolerations: [] From d5fdd0147b9c4a17ce4b64a8e485e86ea5f1b915 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 30 Mar 2026 17:40:39 -0700 Subject: [PATCH 06/10] wip --- AGENTS.md | 2 +- Cargo.lock | 37 ++ architecture/custom-vm-runtime.md | 97 +++- architecture/gateway-single-node.md | 1 - crates/openshell-bootstrap/src/paths.rs | 14 +- crates/openshell-vm/Cargo.toml | 14 + crates/openshell-vm/build.rs | 131 ++++++ crates/openshell-vm/pins.env | 39 ++ crates/openshell-vm/runtime/README.md | 6 +- .../runtime/build-custom-libkrunfw.sh | 23 +- .../{bridge-cni.config => openshell.kconfig} | 2 + .../scripts/build-rootfs-minimal.sh | 232 ++++++++++ crates/openshell-vm/scripts/build-rootfs.sh | 26 +- crates/openshell-vm/src/embedded.rs | 415 ++++++++++++++++++ crates/openshell-vm/src/health.rs | 200 +++++++++ crates/openshell-vm/src/lib.rs | 205 +++------ tasks/scripts/build-libkrun-macos.sh | 281 ++++++++++++ tasks/scripts/build-libkrun.sh | 148 +++++++ tasks/scripts/build-rootfs-tarball.sh | 116 +++++ tasks/scripts/bundle-vm-runtime.sh | 122 ----- tasks/scripts/compress-vm-runtime.sh | 289 ++++++++++++ tasks/scripts/package-openshell-vm-runtime.sh | 27 -- tasks/vm.toml | 95 ++-- 23 files changed, 2176 insertions(+), 346 deletions(-) create mode 100644 crates/openshell-vm/build.rs create mode 100644 crates/openshell-vm/pins.env rename crates/openshell-vm/runtime/kernel/{bridge-cni.config => openshell.kconfig} (98%) create mode 100755 crates/openshell-vm/scripts/build-rootfs-minimal.sh create mode 100644 crates/openshell-vm/src/embedded.rs create mode 100644 crates/openshell-vm/src/health.rs create mode 100755 tasks/scripts/build-libkrun-macos.sh create mode 100755 tasks/scripts/build-libkrun.sh create mode 100755 tasks/scripts/build-rootfs-tarball.sh delete mode 100755 tasks/scripts/bundle-vm-runtime.sh create mode 100755 tasks/scripts/compress-vm-runtime.sh delete mode 100755 tasks/scripts/package-openshell-vm-runtime.sh 
diff --git a/AGENTS.md b/AGENTS.md index 8364e7c13..0972d1d6a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -38,7 +38,7 @@ These pipelines connect skills into end-to-end workflows. Individual skill files | `crates/openshell-core/` | Shared core | Common types, configuration, error handling | | `crates/openshell-providers/` | Provider management | Credential provider backends | | `crates/openshell-tui/` | Terminal UI | Ratatui-based dashboard for monitoring | -| `crates/openshell-vm/` | MicroVM runtime | libkrun-based hardware-isolated VM execution | +| `crates/openshell-vm/` | MicroVM runtime | Experimental, work-in-progress libkrun-based VM execution | | `python/openshell/` | Python SDK | Python bindings and CLI packaging | | `proto/` | Protobuf definitions | gRPC service contracts | | `deploy/` | Docker, Helm, K8s | Dockerfiles, Helm chart, manifests | diff --git a/Cargo.lock b/Cargo.lock index ec73e4a76..60273d191 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3062,16 +3062,25 @@ version = "0.0.0" dependencies = [ "base64 0.22.1", "clap", + "indicatif", "libc", "libloading", "miette", "nix", "openshell-bootstrap", + "openshell-core", + "rustls", + "rustls-pemfile", "serde", "serde_json", + "tar", "thiserror 2.0.18", + "tokio", + "tokio-rustls", + "tonic", "tracing", "tracing-subscriber", + "zstd", ] [[package]] @@ -6456,3 +6465,31 @@ name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + 
+[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index e7a7d73ae..5b0b4e287 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,5 +1,7 @@ # Custom libkrunfw VM Runtime +> Status: Experimental and work in progress (WIP). VM support is under active development and may change. + ## Overview The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a @@ -17,16 +19,13 @@ the VM kernel, enabling standard Kubernetes networking. ``` Host (macOS/Linux) ├── openshell-vm binary -│ ├── Loads libkrun.dylib (VMM) -│ ├── Preloads libkrunfw.dylib (kernel) +│ ├── Embedded runtime (zstd-compressed) +│ │ ├── libkrun.{dylib,so} +│ │ ├── libkrunfw.{dylib,so} (kernel) +│ │ └── gvproxy +│ ├── Extracts to ~/.local/share/openshell/vm-runtime/{version}/ │ └── Logs runtime provenance -├── openshell-vm.runtime/ (sidecar bundle) -│ ├── libkrun.dylib -│ ├── libkrunfw.dylib (stock or custom) -│ ├── gvproxy -│ ├── manifest.json -│ └── provenance.json (custom only) -└── gvproxy (networking) +└── gvproxy (networking proxy) Guest VM ├── openshell-vm-init.sh (PID 1) @@ -38,6 +37,44 @@ Guest VM └── check-vm-capabilities.sh (diagnostics) ``` +## Embedded Runtime + +The openshell-vm binary is fully self-contained, embedding both the VM runtime libraries +and a minimal rootfs as zstd-compressed byte arrays. 
On first use, the binary extracts +these to XDG cache directories with progress bars: + +``` +~/.local/share/openshell/vm-runtime/{version}/ +├── libkrun.{dylib,so} +├── libkrunfw.{5.dylib,.so.5} +└── gvproxy + +~/.local/share/openshell/openshell-vm/{version}/rootfs/ +├── usr/local/bin/k3s +├── opt/openshell/bin/openshell-sandbox +├── opt/openshell/manifests/ +└── ... +``` + +This eliminates the need for separate bundles or downloads - a single ~120MB binary +provides everything needed to run the VM. Old cache versions are automatically +cleaned up when a new version is extracted. + +### Hybrid Approach + +The embedded rootfs uses a "minimal" configuration: +- Includes: Base Ubuntu, k3s binary, supervisor binary, helm charts, manifests +- Excludes: Pre-loaded container images (~1GB savings) + +Container images are pulled on demand when sandboxes are created. First boot takes +~30-60s as k3s initializes; subsequent boots use cached state for ~3-5s startup. + +For fully air-gapped environments requiring pre-loaded images, build with: +```bash +mise run vm:build:rootfs-tarball # Full rootfs (~2GB, includes images) +mise run vm:build:embedded:quick # Rebuild binary with full rootfs +``` + ## Network Profile The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and @@ -69,19 +106,19 @@ and makes it straightforward to correlate VM behavior with a specific runtime ar crates/openshell-vm/runtime/ ├── build-custom-libkrunfw.sh # Clones libkrunfw, applies config, builds ├── kernel/ -│ └── bridge-cni.config # Kernel config fragment +│ └── openshell.kconfig # Kernel config fragment └── README.md # Operator documentation Output: target/custom-runtime/ ├── libkrunfw.dylib # Custom library ├── provenance.json # Build metadata -├── bridge-cni.config # Config fragment used +├── openshell.kconfig # Config fragment used └── kernel.config # Full kernel .config ``` ## Kernel Config Fragment -The `bridge-cni.config` fragment enables these kernel features on 
top of the stock +The `openshell.kconfig` fragment enables these kernel features on top of the stock libkrunfw kernel: | Feature | Config | Purpose | @@ -118,23 +155,51 @@ The standalone `openshell-vm` binary supports `openshell-vm exec -- `openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style commands work the same way they would inside the VM shell. +## Build Commands + +```bash +# Build embedded binary with minimal rootfs (~120MB, recommended) +mise run vm:build:rootfs-tarball:minimal # Build minimal rootfs tarball +mise run vm:build:embedded # Build binary with embedded rootfs + +# Quick rebuild (uses cached artifacts, skips rootfs rebuild) +mise run vm:build:embedded:quick + +# Build with full rootfs (air-gapped, ~2GB+) +mise run vm:build:rootfs-tarball # Build full rootfs tarball +mise run vm:build:embedded:quick # Rebuild binary + +# With custom kernel (optional, adds ~20 min) +mise run vm:runtime:build-libkrunfw # Build custom libkrunfw +mise run vm:build:embedded # Then build embedded binary + +# For Linux (first time only) +mise run vm:runtime:build-libkrun # Build libkrun/libkrunfw from source +mise run vm:build:embedded # Then build embedded binary +``` + ## Rollout Strategy -1. Custom runtime support is opt-in via `OPENSHELL_VM_RUNTIME_SOURCE_DIR`. +1. Custom runtime is embedded by default when building with `mise run vm:build:embedded`. 2. The init script validates kernel capabilities at boot and fails fast if missing. -3. Rollback: unset the env var and re-bundle with stock libraries (note: stock - libraries lack bridge/netfilter and pod networking will not work). +3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local directory. 
## Related Files | File | Purpose | |------|---------| +| `crates/openshell-vm/src/embedded.rs` | Embedded resource extraction and caching | | `crates/openshell-vm/src/ffi.rs` | Runtime loading, provenance capture | | `crates/openshell-vm/src/lib.rs` | VM launch, provenance logging | | `crates/openshell-vm/src/exec.rs` | Runtime state tracking and host-side exec transport | +| `crates/openshell-vm/build.rs` | Build script for embedding compressed artifacts | | `crates/openshell-vm/scripts/openshell-vm-init.sh` | Guest init, network profile selection | | `crates/openshell-vm/scripts/openshell-vm-exec-agent.py` | Guest-side exec agent | | `crates/openshell-vm/scripts/check-vm-capabilities.sh` | Kernel capability checker | | `crates/openshell-vm/runtime/` | Build pipeline and kernel config | -| `tasks/scripts/bundle-vm-runtime.sh` | Runtime bundling (stock + custom) | +| `tasks/scripts/compress-vm-runtime.sh` | Gather and compress runtime artifacts | +| `tasks/scripts/build-rootfs-tarball.sh` | Build and compress rootfs tarball | +| `tasks/scripts/build-libkrun.sh` | Build libkrun from source (Linux) | +| `crates/openshell-vm/scripts/build-rootfs.sh` | Build full rootfs with pre-loaded images | +| `crates/openshell-vm/scripts/build-rootfs-minimal.sh` | Build minimal rootfs without images | | `tasks/vm.toml` | Mise task definitions | diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index d5ed49943..57aebd3a5 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -49,7 +49,6 @@ All gateway lifecycle commands live under `openshell gateway`: | `openshell status` | Show gateway health via gRPC/HTTP | | `openshell doctor logs [--name NAME] [--remote user@host] [--tail N]` | Fetch gateway container logs | | `openshell doctor exec [--name NAME] [--remote user@host] -- ` | Run a command inside the gateway container | -| `gateway exec [--workdir DIR] [--env KEY=VALUE] -- ` | Run a command inside the 
standalone gateway microVM | | `openshell gateway select ` | Set the active gateway | | `openshell gateway select` | Open an interactive chooser on a TTY, or list all gateways in non-interactive mode | diff --git a/crates/openshell-bootstrap/src/paths.rs b/crates/openshell-bootstrap/src/paths.rs index 529eab87a..b8028b4b4 100644 --- a/crates/openshell-bootstrap/src/paths.rs +++ b/crates/openshell-bootstrap/src/paths.rs @@ -28,14 +28,26 @@ pub fn last_sandbox_path(gateway: &str) -> Result { /// Default rootfs directory for gateway microVMs. /// -/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/rootfs` +/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/rootfs` +/// +/// The version is taken from the CARGO_PKG_VERSION at build time, allowing +/// multiple versions to coexist and enabling clean upgrades. pub fn default_rootfs_dir() -> Result { + const VERSION: &str = env!("CARGO_PKG_VERSION"); Ok(xdg_data_dir()? .join("openshell") .join("openshell-vm") + .join(VERSION) .join("rootfs")) } +/// Base directory for openshell-vm data (without version). 
+/// +/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/` +pub fn openshell_vm_base_dir() -> Result { + Ok(xdg_data_dir()?.join("openshell").join("openshell-vm")) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 71800c684..7d74b3139 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -21,16 +21,30 @@ path = "src/main.rs" [dependencies] base64 = "0.22" clap = { workspace = true } +indicatif = "0.17" libc = "0.2" libloading = "0.8" miette = { workspace = true } nix = { workspace = true } openshell-bootstrap = { path = "../openshell-bootstrap" } +openshell-core = { path = "../openshell-core" } serde = { workspace = true } serde_json = "1" +tar = "0.4" thiserror = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +zstd = "0.13" + +# Async runtime and gRPC for health check +tokio = { workspace = true } +tonic = { workspace = true, features = ["tls", "tls-native-roots"] } +rustls = { workspace = true } +rustls-pemfile = { workspace = true } +tokio-rustls = { workspace = true } + +[build-dependencies] +zstd = "0.13" [lints] workspace = true diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs new file mode 100644 index 000000000..437f4f9be --- /dev/null +++ b/crates/openshell-vm/build.rs @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Build script for openshell-vm. +//! +//! This script copies pre-compressed VM runtime artifacts (libkrun, libkrunfw, +//! gvproxy) to OUT_DIR for embedding via `include_bytes!()`. +//! +//! The compressed artifacts are expected to be prepared by: +//! `mise run vm:runtime:compress` +//! +//! Environment: +//! 
OPENSHELL_VM_RUNTIME_COMPRESSED_DIR - Path to compressed artifacts + +use std::path::PathBuf; +use std::{env, fs}; + +fn main() { + println!("cargo:rerun-if-env-changed=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR"); + + let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set")); + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); + + // Determine platform-specific file names + let (libkrun_name, libkrunfw_name) = match target_os.as_str() { + "macos" => ("libkrun.dylib", "libkrunfw.5.dylib"), + "linux" => ("libkrun.so", "libkrunfw.so.5"), + _ => { + println!( + "cargo:warning=VM runtime not available for {}-{}", + target_os, target_arch + ); + generate_stub_resources(&out_dir); + return; + } + }; + + // Check for pre-compressed artifacts from mise task + let compressed_dir = match env::var("OPENSHELL_VM_RUNTIME_COMPRESSED_DIR") { + Ok(dir) => PathBuf::from(dir), + Err(_) => { + println!("cargo:warning=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR not set"); + println!("cargo:warning=Run: mise run vm:runtime:compress"); + generate_stub_resources(&out_dir); + return; + } + }; + + if !compressed_dir.is_dir() { + println!( + "cargo:warning=Compressed runtime dir not found: {}", + compressed_dir.display() + ); + println!("cargo:warning=Run: mise run vm:runtime:compress"); + generate_stub_resources(&out_dir); + return; + } + + // Copy compressed files to OUT_DIR + let files = [ + (format!("{libkrun_name}.zst"), format!("{libkrun_name}.zst")), + ( + format!("{libkrunfw_name}.zst"), + format!("{libkrunfw_name}.zst"), + ), + ("gvproxy.zst".to_string(), "gvproxy.zst".to_string()), + ("rootfs.tar.zst".to_string(), "rootfs.tar.zst".to_string()), + ]; + + let mut all_found = true; + for (src_name, dst_name) in &files { + let src_path = compressed_dir.join(src_name); + let dst_path = out_dir.join(dst_name); + + if src_path.exists() { + // Remove existing file first (may be read-only from 
previous build) + if dst_path.exists() { + let _ = fs::remove_file(&dst_path); + } + fs::copy(&src_path, &dst_path).unwrap_or_else(|e| { + panic!( + "Failed to copy {} to {}: {}", + src_path.display(), + dst_path.display(), + e + ) + }); + let size = fs::metadata(&dst_path).map(|m| m.len()).unwrap_or(0); + println!("cargo:warning=Embedded {}: {} bytes", src_name, size); + } else { + println!( + "cargo:warning=Missing compressed artifact: {}", + src_path.display() + ); + all_found = false; + } + } + + if !all_found { + println!("cargo:warning=Some artifacts missing. Run: mise run vm:runtime:compress"); + generate_stub_resources(&out_dir); + } +} + +/// Generate stub (empty) resource files so the build can complete. +/// The embedded module will fail at runtime if these stubs are used. +fn generate_stub_resources(out_dir: &PathBuf) { + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + + let (libkrun_name, libkrunfw_name) = match target_os.as_str() { + "macos" => ("libkrun.dylib", "libkrunfw.5.dylib"), + _ => ("libkrun.so", "libkrunfw.so.5"), + }; + + let stubs = [ + format!("{libkrun_name}.zst"), + format!("{libkrunfw_name}.zst"), + "gvproxy.zst".to_string(), + "rootfs.tar.zst".to_string(), + ]; + + for name in &stubs { + let path = out_dir.join(name); + if !path.exists() { + // Write an empty file as a stub + fs::write(&path, b"") + .unwrap_or_else(|e| panic!("Failed to write stub {}: {}", path.display(), e)); + } + } +} diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env new file mode 100644 index 000000000..a6c774dcc --- /dev/null +++ b/crates/openshell-vm/pins.env @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Pinned dependency versions for openshell-vm builds. +# +# This file is sourced by build-rootfs.sh, build-rootfs-minimal.sh, and +# build-custom-libkrunfw.sh. 
It centralises version pins and content-addressed +# digests so that builds are reproducible and auditable. +# +# Environment variables override these defaults — CI and local dev workflows +# can still set IMAGE_TAG, K3S_VERSION, etc. as before. +# +# To update a dependency: +# 1. Change the version/digest below. +# 2. Run the relevant build script to verify. +# 3. Commit pins.env alongside any script changes. + +# ── k3s binary (arm64) ───────────────────────────────────────────────── +K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" +K3S_ARM64_SHA256="${K3S_ARM64_SHA256:-228809a7ef47d25c1bdbe746944931ec2fd2edf842b9cf50f1dd4f9ec2505b0e}" + +# ── Base Docker image (digest-pinned) ────────────────────────────────── +# Tag: nvcr.io/nvidia/base/ubuntu:noble-20251013 +VM_BASE_IMAGE="${VM_BASE_IMAGE:-nvcr.io/nvidia/base/ubuntu@sha256:43fa5063e80fbbc533892af3ccca190868ce48db5a8928b19d7815c40436af8e}" + +# ── Container images for rootfs pre-loading (digest-pinned) ──────────── +# Tag: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0 +AGENT_SANDBOX_IMAGE="${AGENT_SANDBOX_IMAGE:-registry.k8s.io/agent-sandbox/agent-sandbox-controller@sha256:b536762a159b121af18bc004741235160605075ce826f16f95a2103afe2ef4db}" +# Tag: ghcr.io/nvidia/openshell-community/sandboxes/base:latest +COMMUNITY_SANDBOX_IMAGE="${COMMUNITY_SANDBOX_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base@sha256:d446c17105e7448e602238a8a5a4ddd0233c071082406522f81c31f8b1309525}" + +# SERVER_IMAGE is intentionally NOT pinned here — it changes frequently +# during local development. Override via IMAGE_REPO_BASE and IMAGE_TAG +# environment variables (defaults: openshell/gateway:dev). 
+ +# ── libkrunfw upstream (commit-pinned) ───────────────────────────────── +# Repo: https://github.com/containers/libkrunfw +# Pinned: 2026-03-27 (main branch HEAD at time of pinning) +LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" diff --git a/crates/openshell-vm/runtime/README.md b/crates/openshell-vm/runtime/README.md index 52bb1382d..d0c2f72fa 100644 --- a/crates/openshell-vm/runtime/README.md +++ b/crates/openshell-vm/runtime/README.md @@ -1,5 +1,7 @@ # Custom libkrunfw Runtime +> Status: Experimental and work in progress (WIP). VM support is under active development and may change. + This directory contains the build infrastructure for a custom `libkrunfw` runtime that enables bridge CNI and netfilter support in the OpenShell gateway VM. @@ -21,7 +23,7 @@ that enables these networking and sandboxing features. runtime/ build-custom-libkrunfw.sh # Build script for custom libkrunfw kernel/ - bridge-cni.config # Kernel config fragment (networking + sandboxing) + openshell.kconfig # Kernel config fragment (networking + sandboxing) ``` ## Building @@ -51,7 +53,7 @@ target/custom-runtime/ libkrunfw.dylib # The custom library libkrunfw..dylib # Version-suffixed copy provenance.json # Build metadata (commit, hash, timestamp) - bridge-cni.config # The config fragment used + openshell.kconfig # The config fragment used kernel.config # Full kernel .config (for debugging) ``` diff --git a/crates/openshell-vm/runtime/build-custom-libkrunfw.sh b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh index a69fc0c13..2bfbdbe56 100755 --- a/crates/openshell-vm/runtime/build-custom-libkrunfw.sh +++ b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh @@ -26,9 +26,18 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" -KERNEL_CONFIG_FRAGMENT="${SCRIPT_DIR}/kernel/bridge-cni.config" +KERNEL_CONFIG_FRAGMENT="${SCRIPT_DIR}/kernel/openshell.kconfig" + +# Source pinned dependency versions (digests, checksums, commit SHAs). +# Environment variables override pins — see pins.env for details. +PINS_FILE="${SCRIPT_DIR}/../pins.env" +if [ -f "$PINS_FILE" ]; then + # shellcheck source=../pins.env + source "$PINS_FILE" +fi -# Defaults +# Defaults (LIBKRUNFW_REF is commit-pinned in pins.env; falls back to main +# only if pins.env is missing and no env var is set). LIBKRUNFW_REPO="${LIBKRUNFW_REPO:-https://github.com/containers/libkrunfw.git}" LIBKRUNFW_REF="${LIBKRUNFW_REF:-main}" OUTPUT_DIR="${OPENSHELL_RUNTIME_OUTPUT_DIR:-${PROJECT_ROOT}/target/custom-runtime}" @@ -223,7 +232,7 @@ if [ "$(uname -s)" = "Darwin" ]; then # Copy the config fragment into the libkrunfw tree so the VM can see it. # The merge hook (MERGE_HOOK) is already written there by the cat above. - cp -f "${KERNEL_CONFIG_FRAGMENT}" "${LIBKRUNFW_DIR}/openshell-bridge-cni.config" + cp -f "${KERNEL_CONFIG_FRAGMENT}" "${LIBKRUNFW_DIR}/openshell.kconfig" echo " Creating VM..." # krunvm may print "The volume has been configured" on first use of a @@ -256,7 +265,7 @@ if [ "$(uname -s)" = "Darwin" ]; then # Step 2: merge the OpenShell config fragment echo " Merging OpenShell kernel config fragment..." krunvm start "${VM_NAME}" /usr/bin/bash -- \ - /work/openshell-kconfig-hook.sh "/work/${KERNEL_DIR_NAME}" /work/openshell-bridge-cni.config + /work/openshell-kconfig-hook.sh "/work/${KERNEL_DIR_NAME}" /work/openshell.kconfig # Step 3: build the kernel and generate the C bundle echo " Building kernel (this is the slow part)..." @@ -266,7 +275,7 @@ if [ "$(uname -s)" = "Darwin" ]; then krunvm delete "${VM_NAME}" # Clean up temp files from the libkrunfw tree - rm -f "${LIBKRUNFW_DIR}/openshell-bridge-cni.config" + rm -f "${LIBKRUNFW_DIR}/openshell.kconfig" if [ ! 
-f "${LIBKRUNFW_DIR}/kernel.c" ]; then echo "ERROR: kernel.c was not produced — build failed" >&2 @@ -352,7 +361,7 @@ if [ -n "$KERNEL_SRC_DIR" ] && [ -f "${KERNEL_SRC_DIR}/.config" ]; then fi # Copy our fragment for reference -cp "${KERNEL_CONFIG_FRAGMENT}" "${OUTPUT_DIR}/bridge-cni.config" +cp "${KERNEL_CONFIG_FRAGMENT}" "${OUTPUT_DIR}/openshell.kconfig" # ── Write provenance metadata ────────────────────────────────────────── @@ -365,7 +374,7 @@ cat > "${OUTPUT_DIR}/provenance.json" << EOF "libkrunfw_ref": "${LIBKRUNFW_REF}", "libkrunfw_commit": "${LIBKRUNFW_COMMIT}", "kernel_version": "${KERNEL_VERSION:-unknown}", - "kernel_config_fragment": "bridge-cni.config", + "kernel_config_fragment": "openshell.kconfig", "artifact_sha256": "${ARTIFACT_HASH}", "host_os": "$(uname -s)", "host_arch": "$(uname -m)", diff --git a/crates/openshell-vm/runtime/kernel/bridge-cni.config b/crates/openshell-vm/runtime/kernel/openshell.kconfig similarity index 98% rename from crates/openshell-vm/runtime/kernel/bridge-cni.config rename to crates/openshell-vm/runtime/kernel/openshell.kconfig index 7b9610e30..cc219f50d 100644 --- a/crates/openshell-vm/runtime/kernel/bridge-cni.config +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -47,6 +47,8 @@ CONFIG_NETFILTER_XT_MATCH_MARK=y CONFIG_NETFILTER_XT_MATCH_STATISTIC=y CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y CONFIG_NETFILTER_XT_MATCH_RECENT=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_TARGET_LOG=y CONFIG_NETFILTER_XT_TARGET_MARK=y CONFIG_NETFILTER_XT_TARGET_CONNMARK=y CONFIG_NETFILTER_XT_MATCH_CONNMARK=y diff --git a/crates/openshell-vm/scripts/build-rootfs-minimal.sh b/crates/openshell-vm/scripts/build-rootfs-minimal.sh new file mode 100755 index 000000000..d73aeb221 --- /dev/null +++ b/crates/openshell-vm/scripts/build-rootfs-minimal.sh @@ -0,0 +1,232 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Build a minimal aarch64 Ubuntu rootfs for embedding in openshell-vm. +# +# This produces a lightweight rootfs (~200-300MB) with: +# - Base Ubuntu with k3s binary +# - OpenShell supervisor binary +# - Helm charts and Kubernetes manifests +# - NO pre-loaded container images (pulled on demand) +# - NO pre-initialized k3s state (cold start on first boot) +# +# First boot will be slower (~30-60s) as k3s initializes and pulls images, +# but subsequent boots use cached state. +# +# Usage: +# ./build-rootfs-minimal.sh [output_dir] +# +# Requires: Docker, curl, helm + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Source pinned dependency versions (digests, checksums, commit SHAs). +# Environment variables override pins — see pins.env for details. +PINS_FILE="${SCRIPT_DIR}/../pins.env" +if [ -f "$PINS_FILE" ]; then + # shellcheck source=../pins.env + source "$PINS_FILE" +fi +DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" +ROOTFS_DIR="${1:-${DEFAULT_ROOTFS}}" +CONTAINER_NAME="krun-rootfs-minimal-builder" +BASE_IMAGE_TAG="krun-rootfs:openshell-vm-minimal" +K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" +K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" + +# Project root (two levels up from crates/openshell-vm/scripts/) +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + +echo "==> Building minimal openshell-vm rootfs" +echo " k3s version: ${K3S_VERSION}" +echo " Output: ${ROOTFS_DIR}" +echo " Mode: minimal (no pre-loaded images, cold start)" +echo "" + +# ── Check for running VM ──────────────────────────────────────────────── +VM_LOCK_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm.lock" +if [ -f "${VM_LOCK_FILE}" ]; then + if ! 
python3 -c " +import fcntl, os, sys +fd = os.open(sys.argv[1], os.O_RDONLY) +try: + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + fcntl.flock(fd, fcntl.LOCK_UN) +except BlockingIOError: + sys.exit(1) +finally: + os.close(fd) +" "${VM_LOCK_FILE}" 2>/dev/null; then + HOLDER_PID=$(cat "${VM_LOCK_FILE}" 2>/dev/null | tr -d '[:space:]') + echo "ERROR: An openshell-vm (pid ${HOLDER_PID:-unknown}) holds a lock on this rootfs." + echo " Stop the VM first, then re-run this script." + exit 1 + fi +fi + +VM_STATE_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm-state.json" +if [ -f "${VM_STATE_FILE}" ]; then + VM_PID=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['pid'])" "${VM_STATE_FILE}" 2>/dev/null || echo "") + if [ -n "${VM_PID}" ] && kill -0 "${VM_PID}" 2>/dev/null; then + echo "ERROR: An openshell-vm is running (pid ${VM_PID}) using this rootfs." + echo " Stop the VM first, then re-run this script." + exit 1 + else + rm -f "${VM_STATE_FILE}" + fi +fi + +# ── Download k3s binary ───────────────────────────────────────────────── +K3S_BIN="/tmp/k3s-arm64-${K3S_VERSION}" +if [ -f "${K3S_BIN}" ]; then + echo "==> Using cached k3s binary: ${K3S_BIN}" +else + echo "==> Downloading k3s ${K3S_VERSION} for arm64..." + curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s-arm64" \ + -o "${K3S_BIN}" + chmod +x "${K3S_BIN}" +fi + +# Verify k3s binary integrity. +if [ -n "${K3S_ARM64_SHA256:-}" ]; then + echo "==> Verifying k3s binary checksum..." + echo "${K3S_ARM64_SHA256} ${K3S_BIN}" | shasum -a 256 -c - +else + echo "WARNING: K3S_ARM64_SHA256 not set, skipping checksum verification" +fi + +# ── Build base image ─────────────────────────────────────────────────── +docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true + +echo "==> Building base image..." +docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . 
<<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + zstd \ + && rm -rf /var/lib/apt/lists/* +RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +DOCKERFILE + +# Create container and export filesystem +echo "==> Creating container..." +docker create --platform linux/arm64 --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true + +echo "==> Exporting filesystem..." +if [ -d "${ROOTFS_DIR}" ]; then + chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true + rm -rf "${ROOTFS_DIR}" +fi +mkdir -p "${ROOTFS_DIR}" +docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - +docker rm "${CONTAINER_NAME}" + +# ── Inject k3s binary ────────────────────────────────────────────────── +echo "==> Injecting k3s binary..." +cp "${K3S_BIN}" "${ROOTFS_DIR}/usr/local/bin/k3s" +chmod +x "${ROOTFS_DIR}/usr/local/bin/k3s" +ln -sf /usr/local/bin/k3s "${ROOTFS_DIR}/usr/local/bin/kubectl" + +# ── Inject scripts ───────────────────────────────────────────────────── +echo "==> Injecting scripts..." 
+mkdir -p "${ROOTFS_DIR}/srv" +cp "${SCRIPT_DIR}/openshell-vm-init.sh" "${ROOTFS_DIR}/srv/openshell-vm-init.sh" +chmod +x "${ROOTFS_DIR}/srv/openshell-vm-init.sh" + +cp "${SCRIPT_DIR}/hello-server.py" "${ROOTFS_DIR}/srv/hello-server.py" +chmod +x "${ROOTFS_DIR}/srv/hello-server.py" + +cp "${SCRIPT_DIR}/check-vm-capabilities.sh" "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" +chmod +x "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" + +cp "${SCRIPT_DIR}/openshell-vm-exec-agent.py" "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" +chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" + +# ── Build and inject supervisor binary ───────────────────────────────── +SUPERVISOR_TARGET="aarch64-unknown-linux-gnu" +SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" + +echo "==> Building openshell-sandbox supervisor binary (${SUPERVISOR_TARGET})..." +if ! command -v cargo-zigbuild >/dev/null 2>&1; then + echo "ERROR: cargo-zigbuild is not installed." + echo " Install it with: cargo install cargo-zigbuild" + exit 1 +fi + +cargo zigbuild --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" \ + --manifest-path "${PROJECT_ROOT}/Cargo.toml" 2>&1 | tail -5 + +if [ ! -f "${SUPERVISOR_BIN}" ]; then + echo "ERROR: supervisor binary not found at ${SUPERVISOR_BIN}" + exit 1 +fi + +echo " Injecting supervisor binary into rootfs..." +mkdir -p "${ROOTFS_DIR}/opt/openshell/bin" +cp "${SUPERVISOR_BIN}" "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" +chmod +x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" + +# ── Package and inject helm chart ────────────────────────────────────── +HELM_CHART_DIR="${PROJECT_ROOT}/deploy/helm/openshell" +CHART_DEST="${ROOTFS_DIR}/var/lib/rancher/k3s/server/static/charts" + +if [ -d "${HELM_CHART_DIR}" ]; then + echo "==> Packaging helm chart..." 
+ mkdir -p "${CHART_DEST}" + helm package "${HELM_CHART_DIR}" -d "${CHART_DEST}" + mkdir -p "${ROOTFS_DIR}/opt/openshell/charts" + cp "${CHART_DEST}"/*.tgz "${ROOTFS_DIR}/opt/openshell/charts/" +fi + +# ── Inject Kubernetes manifests ──────────────────────────────────────── +MANIFEST_SRC="${PROJECT_ROOT}/deploy/kube/manifests" +MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests" + +echo "==> Injecting Kubernetes manifests..." +mkdir -p "${MANIFEST_DEST}" + +for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do + if [ -f "${MANIFEST_SRC}/${manifest}" ]; then + cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/" + echo " ${manifest}" + fi +done + +# ── Create empty images directory ────────────────────────────────────── +# k3s expects this directory to exist for airgap image loading. +mkdir -p "${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" + +# ── Mark as minimal (not pre-initialized) ────────────────────────────── +# The init script checks for this file to determine if cold start is expected. +echo "minimal" > "${ROOTFS_DIR}/opt/openshell/.rootfs-type" + +# ── Verify ───────────────────────────────────────────────────────────── +if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then + echo "ERROR: k3s binary not found in rootfs." + exit 1 +fi + +if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then + echo "ERROR: openshell-sandbox supervisor binary not found in rootfs." + exit 1 +fi + +echo "" +echo "==> Minimal rootfs ready at: ${ROOTFS_DIR}" +echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" +echo " Type: minimal (cold start, images pulled on demand)" +echo "" +echo "Note: First boot will take ~30-60s as k3s initializes." +echo " Container images will be pulled from registries on first use." 
diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index 50cc13ca4..d423ce5ab 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -19,6 +19,14 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Source pinned dependency versions (digests, checksums, commit SHAs). +# Environment variables override pins — see pins.env for details. +PINS_FILE="${SCRIPT_DIR}/../pins.env" +if [ -f "$PINS_FILE" ]; then + # shellcheck source=../pins.env + source "$PINS_FILE" +fi DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" ROOTFS_DIR="${1:-${DEFAULT_ROOTFS}}" CONTAINER_NAME="krun-rootfs-builder" @@ -34,11 +42,11 @@ K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" # Container images to pre-load into k3s (arm64). +# AGENT_SANDBOX_IMAGE and COMMUNITY_SANDBOX_IMAGE are digest-pinned in pins.env. +# SERVER_IMAGE is intentionally unpinned (local dev artifact). IMAGE_REPO_BASE="${IMAGE_REPO_BASE:-openshell}" IMAGE_TAG="${IMAGE_TAG:-dev}" SERVER_IMAGE="${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" -AGENT_SANDBOX_IMAGE="registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0" -COMMUNITY_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" echo "==> Building openshell-vm rootfs" echo " k3s version: ${K3S_VERSION}" @@ -117,6 +125,14 @@ else chmod +x "${K3S_BIN}" fi +# Verify k3s binary integrity. +if [ -n "${K3S_ARM64_SHA256:-}" ]; then + echo "==> Verifying k3s binary checksum..." 
+ echo "${K3S_ARM64_SHA256} ${K3S_BIN}" | shasum -a 256 -c - +else + echo "WARNING: K3S_ARM64_SHA256 not set, skipping checksum verification" +fi + # ── Build base image with dependencies ───────────────────────────────── # Clean up any previous run @@ -124,8 +140,10 @@ docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" -f - . <<'DOCKERFILE' -FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 +docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs new file mode 100644 index 000000000..d898c269a --- /dev/null +++ b/crates/openshell-vm/src/embedded.rs @@ -0,0 +1,415 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Embedded VM runtime resources. +//! +//! Native libraries (libkrun, libkrunfw, gvproxy) and the rootfs are embedded as +//! zstd-compressed byte arrays and extracted to XDG cache directories on first use. +//! +//! Cache locations: +//! - Runtime: `~/.local/share/openshell/vm-runtime/{version}/` +//! 
- Rootfs: `~/.local/share/openshell/openshell-vm/{version}/rootfs/` + +use std::fs; +use std::io::{Read, Write}; +use std::path::{Path, PathBuf}; + +use indicatif::{ProgressBar, ProgressStyle}; + +use crate::VmError; + +// ── Platform-specific embedded resources ─────────────────────────────────── + +#[cfg(all(target_os = "macos", target_arch = "aarch64"))] +mod resources { + pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.dylib.zst")); + pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.5.dylib.zst")); + pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); + pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const LIBKRUN_NAME: &str = "libkrun.dylib"; + pub const LIBKRUNFW_NAME: &str = "libkrunfw.5.dylib"; +} + +#[cfg(all(target_os = "linux", target_arch = "aarch64"))] +mod resources { + pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.so.zst")); + pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); + pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); + pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const LIBKRUN_NAME: &str = "libkrun.so"; + pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; +} + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +mod resources { + pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.so.zst")); + pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); + pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); + pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const LIBKRUN_NAME: &str = "libkrun.so"; + pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; +} + +// Fallback for unsupported platforms (will fail at 
runtime) +#[cfg(not(any( + all(target_os = "macos", target_arch = "aarch64"), + all(target_os = "linux", target_arch = "aarch64"), + all(target_os = "linux", target_arch = "x86_64"), +)))] +mod resources { + pub const LIBKRUN: &[u8] = &[]; + pub const LIBKRUNFW: &[u8] = &[]; + pub const GVPROXY: &[u8] = &[]; + pub const ROOTFS: &[u8] = &[]; + pub const LIBKRUN_NAME: &str = "libkrun"; + pub const LIBKRUNFW_NAME: &str = "libkrunfw"; +} + +const VERSION: &str = env!("CARGO_PKG_VERSION"); + +// ── Public API ───────────────────────────────────────────────────────────── + +/// Ensures the embedded VM runtime is extracted to the cache directory. +/// +/// Returns the path to the runtime directory containing: +/// - libkrun.{dylib,so} +/// - libkrunfw.{5.dylib,.so.5} +/// - gvproxy +/// +/// On first call, this extracts the compressed embedded resources to the cache. +/// Subsequent calls return the cached path if valid. +pub fn ensure_runtime_extracted() -> Result { + // Check if embedded resources are available (non-empty) + if resources::LIBKRUN.is_empty() { + return Err(VmError::HostSetup( + "VM runtime not embedded for this platform. 
\ + Supported: macOS ARM64, Linux ARM64, Linux x86_64" + .to_string(), + )); + } + + let cache_dir = runtime_cache_dir()?; + let version_marker = cache_dir.join(".version"); + + // Check if already extracted with correct version + if version_marker.exists() { + if let Ok(cached_version) = fs::read_to_string(&version_marker) { + if cached_version.trim() == VERSION { + // Validate files exist + if validate_runtime_dir(&cache_dir).is_ok() { + tracing::debug!( + path = %cache_dir.display(), + "Using cached VM runtime" + ); + return Ok(cache_dir); + } + } + } + } + + // Clean up old versions before extracting new one + cleanup_old_versions(&cache_dir)?; + + // Create fresh directory + if cache_dir.exists() { + fs::remove_dir_all(&cache_dir) + .map_err(|e| VmError::HostSetup(format!("remove old cache: {e}")))?; + } + fs::create_dir_all(&cache_dir) + .map_err(|e| VmError::HostSetup(format!("create cache dir: {e}")))?; + + tracing::info!( + path = %cache_dir.display(), + version = VERSION, + "Extracting embedded VM runtime" + ); + + // Extract all resources + extract_resource(resources::LIBKRUN, &cache_dir.join(resources::LIBKRUN_NAME))?; + extract_resource( + resources::LIBKRUNFW, + &cache_dir.join(resources::LIBKRUNFW_NAME), + )?; + extract_resource(resources::GVPROXY, &cache_dir.join("gvproxy"))?; + + // Make gvproxy executable + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + fs::set_permissions(cache_dir.join("gvproxy"), fs::Permissions::from_mode(0o755)) + .map_err(|e| VmError::HostSetup(format!("chmod gvproxy: {e}")))?; + } + + // Write version marker + fs::write(&version_marker, VERSION) + .map_err(|e| VmError::HostSetup(format!("write version marker: {e}")))?; + + tracing::info!( + path = %cache_dir.display(), + "VM runtime extracted successfully" + ); + + Ok(cache_dir) +} + +/// Returns the path where the runtime would be cached (without extracting). 
+pub fn runtime_cache_path() -> Result { + runtime_cache_dir() +} + +/// Ensures the embedded rootfs is extracted to the cache directory. +/// +/// Returns the path to the rootfs directory. +/// +/// On first call, this extracts the compressed embedded rootfs tarball to the cache. +/// Subsequent calls return the cached path if valid. +pub fn ensure_rootfs_extracted() -> Result { + // Check if embedded rootfs is available (non-empty) + if resources::ROOTFS.is_empty() { + return Err(VmError::HostSetup( + "Rootfs not embedded. Build with: mise run vm:build:embedded".to_string(), + )); + } + + let rootfs_dir = rootfs_cache_dir()?; + let version_marker = rootfs_dir.join(".version"); + + // Check if already extracted with correct version + if version_marker.exists() { + if let Ok(cached_version) = fs::read_to_string(&version_marker) { + if cached_version.trim() == VERSION { + tracing::debug!( + path = %rootfs_dir.display(), + "Using cached rootfs" + ); + return Ok(rootfs_dir); + } + } + } + + // Clean up old versions before extracting new one + cleanup_old_rootfs_versions(&rootfs_dir)?; + + // Remove existing if present (version mismatch) + if rootfs_dir.exists() { + eprintln!("Removing outdated rootfs..."); + fs::remove_dir_all(&rootfs_dir) + .map_err(|e| VmError::HostSetup(format!("remove old rootfs: {e}")))?; + } + + // Extract with progress bar + extract_rootfs_with_progress(resources::ROOTFS, &rootfs_dir)?; + + // Write version marker + fs::write(&version_marker, VERSION) + .map_err(|e| VmError::HostSetup(format!("write rootfs version marker: {e}")))?; + + Ok(rootfs_dir) +} + +/// Check if the rootfs is embedded (non-empty). 
+pub fn has_embedded_rootfs() -> bool { + !resources::ROOTFS.is_empty() +} + +// ── Internal helpers ─────────────────────────────────────────────────────── + +fn runtime_cache_dir() -> Result { + let base = openshell_core::paths::xdg_data_dir() + .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; + Ok(base.join("openshell").join("vm-runtime").join(VERSION)) +} + +fn runtime_cache_base() -> Result { + let base = openshell_core::paths::xdg_data_dir() + .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; + Ok(base.join("openshell").join("vm-runtime")) +} + +fn rootfs_cache_dir() -> Result { + let base = openshell_core::paths::xdg_data_dir() + .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; + Ok(base + .join("openshell") + .join("openshell-vm") + .join(VERSION) + .join("rootfs")) +} + +fn rootfs_cache_base() -> Result { + let base = openshell_core::paths::xdg_data_dir() + .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; + Ok(base.join("openshell").join("openshell-vm")) +} + +fn cleanup_old_versions(current_dir: &Path) -> Result<(), VmError> { + cleanup_old_versions_in_base(&runtime_cache_base()?, current_dir) +} + +fn cleanup_old_rootfs_versions(current_dir: &Path) -> Result<(), VmError> { + cleanup_old_versions_in_base(&rootfs_cache_base()?, current_dir) +} + +fn cleanup_old_versions_in_base(base: &Path, current_dir: &Path) -> Result<(), VmError> { + if !base.exists() { + return Ok(()); + } + + let entries = match fs::read_dir(base) { + Ok(e) => e, + Err(_) => return Ok(()), // Can't read, skip cleanup + }; + + for entry in entries.filter_map(Result::ok) { + let path = entry.path(); + // Skip if this is the current version directory or a parent of it + if path.is_dir() && !current_dir.starts_with(&path) && path != current_dir { + tracing::debug!( + path = %path.display(), + "Cleaning up old version" + ); + if let Err(e) = fs::remove_dir_all(&path) { + tracing::warn!( + path = 
%path.display(), + error = %e, + "Failed to clean up old version" + ); + } + } + } + + Ok(()) +} + +fn extract_resource(compressed: &[u8], dest: &Path) -> Result<(), VmError> { + if compressed.is_empty() { + return Err(VmError::HostSetup(format!( + "embedded resource is empty: {}", + dest.display() + ))); + } + + let decompressed = zstd::decode_all(compressed) + .map_err(|e| VmError::HostSetup(format!("decompress {}: {e}", dest.display())))?; + + let mut file = fs::File::create(dest) + .map_err(|e| VmError::HostSetup(format!("create {}: {e}", dest.display())))?; + + file.write_all(&decompressed) + .map_err(|e| VmError::HostSetup(format!("write {}: {e}", dest.display())))?; + + tracing::debug!( + path = %dest.display(), + compressed_size = compressed.len(), + decompressed_size = decompressed.len(), + "Extracted resource" + ); + + Ok(()) +} + +fn extract_rootfs_with_progress(compressed: &[u8], dest: &Path) -> Result<(), VmError> { + eprintln!("Extracting VM environment (first run)..."); + + // Create progress bar for decompression + let pb = ProgressBar::new(compressed.len() as u64); + pb.set_style( + ProgressStyle::default_bar() + .template(" Decompressing [{bar:40.cyan/blue}] {bytes}/{total_bytes}") + .unwrap() + .progress_chars("=>-"), + ); + + // Wrap the compressed data in a progress reader + let reader = ProgressReader::new(std::io::Cursor::new(compressed), pb.clone()); + + // Decompress zstd stream + let decoder = zstd::Decoder::new(reader) + .map_err(|e| VmError::HostSetup(format!("create zstd decoder: {e}")))?; + + pb.finish_and_clear(); + + // Create destination directory + fs::create_dir_all(dest).map_err(|e| VmError::HostSetup(format!("create rootfs dir: {e}")))?; + + // Extract tar archive with progress + eprintln!(" Extracting rootfs..."); + let mut archive = tar::Archive::new(decoder); + archive + .unpack(dest) + .map_err(|e| VmError::HostSetup(format!("extract rootfs tarball: {e}")))?; + + eprintln!(" Rootfs extracted to {}", dest.display()); + + 
Ok(()) +} + +/// A reader wrapper that updates a progress bar as data is read. +struct ProgressReader { + inner: R, + progress: ProgressBar, +} + +impl ProgressReader { + fn new(inner: R, progress: ProgressBar) -> Self { + Self { inner, progress } + } +} + +impl Read for ProgressReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let n = self.inner.read(buf)?; + self.progress.inc(n as u64); + Ok(n) + } +} + +fn validate_runtime_dir(dir: &Path) -> Result<(), VmError> { + let libkrun = dir.join(resources::LIBKRUN_NAME); + let libkrunfw = dir.join(resources::LIBKRUNFW_NAME); + let gvproxy = dir.join("gvproxy"); + + for path in [&libkrun, &libkrunfw, &gvproxy] { + if !path.exists() { + return Err(VmError::HostSetup(format!( + "missing runtime file: {}", + path.display() + ))); + } + + // Check file is not empty (would indicate a stub) + let size = fs::metadata(path).map(|m| m.len()).unwrap_or(0); + if size == 0 { + return Err(VmError::HostSetup(format!( + "runtime file is empty (stub): {}", + path.display() + ))); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resources_not_empty() { + // On supported platforms, resources should be non-empty + #[cfg(any( + all(target_os = "macos", target_arch = "aarch64"), + all(target_os = "linux", target_arch = "aarch64"), + all(target_os = "linux", target_arch = "x86_64"), + ))] + { + // Note: This test only passes if `mise run vm:runtime:compress` was run + // before building. In CI without compressed artifacts, resources will be + // empty stubs. + if !resources::LIBKRUN.is_empty() { + assert!(!resources::LIBKRUNFW.is_empty()); + assert!(!resources::GVPROXY.is_empty()); + } + } + } +} diff --git a/crates/openshell-vm/src/health.rs b/crates/openshell-vm/src/health.rs new file mode 100644 index 000000000..89e7c7810 --- /dev/null +++ b/crates/openshell-vm/src/health.rs @@ -0,0 +1,200 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! gRPC health check for verifying the gateway is fully ready. +//! +//! This module provides a proper gRPC health check that verifies the gateway +//! service is not just accepting TCP connections, but is actually responding +//! to gRPC requests. This ensures we don't mark the server as ready before +//! it has fully booted. + +use openshell_core::proto::{HealthRequest, ServiceStatus, open_shell_client::OpenShellClient}; +use std::path::PathBuf; +use std::time::Duration; +use tonic::transport::{Certificate, ClientTlsConfig, Endpoint, Identity}; + +/// Load mTLS materials from the gateway's cert directory. +fn load_mtls_materials(gateway_name: &str) -> Result<(Vec, Vec, Vec), String> { + let home = std::env::var("HOME").map_err(|_| "HOME not set")?; + let mtls_dir = PathBuf::from(home) + .join(".config/openshell/gateways") + .join(gateway_name) + .join("mtls"); + + let ca = std::fs::read(mtls_dir.join("ca.crt")) + .map_err(|e| format!("failed to read ca.crt: {e}"))?; + let cert = std::fs::read(mtls_dir.join("tls.crt")) + .map_err(|e| format!("failed to read tls.crt: {e}"))?; + let key = std::fs::read(mtls_dir.join("tls.key")) + .map_err(|e| format!("failed to read tls.key: {e}"))?; + + Ok((ca, cert, key)) +} + +/// Build a tonic TLS config from mTLS materials. +fn build_tls_config(ca: Vec, cert: Vec, key: Vec) -> ClientTlsConfig { + let ca_cert = Certificate::from_pem(ca); + let identity = Identity::from_pem(cert, key); + ClientTlsConfig::new() + .ca_certificate(ca_cert) + .identity(identity) +} + +/// Perform a gRPC health check against the gateway. +/// +/// Returns `Ok(())` if the health check succeeds (service reports healthy), +/// or an error describing why the check failed. 
+async fn grpc_health_check(gateway_port: u16, gateway_name: &str) -> Result<(), String> { + // Load mTLS materials + let (ca, cert, key) = load_mtls_materials(gateway_name)?; + let tls_config = build_tls_config(ca, cert, key); + + // Build the channel with TLS + let endpoint = format!("https://127.0.0.1:{gateway_port}"); + let channel = Endpoint::from_shared(endpoint.clone()) + .map_err(|e| format!("invalid endpoint: {e}"))? + .connect_timeout(Duration::from_secs(5)) + .tls_config(tls_config) + .map_err(|e| format!("TLS config error: {e}"))? + .connect() + .await + .map_err(|e| format!("connection failed: {e}"))?; + + // Create client and call health + let mut client = OpenShellClient::new(channel); + let response = client + .health(HealthRequest {}) + .await + .map_err(|e| format!("health RPC failed: {e}"))?; + + let health = response.into_inner(); + if health.status == ServiceStatus::Healthy as i32 { + Ok(()) + } else { + Err(format!("service not healthy: status={}", health.status)) + } +} + +/// Wait for the gateway service to be fully ready by polling the gRPC health endpoint. +/// +/// This replaces the TCP-only probe with a proper gRPC health check that verifies +/// the service is actually responding to requests, not just accepting connections. +/// +/// Falls back to TCP probe if mTLS materials aren't available yet. 
+pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) { + let start = std::time::Instant::now(); + let timeout = Duration::from_secs(90); + let poll_interval = Duration::from_secs(1); + + eprintln!("Waiting for gateway gRPC health check..."); + + // Create a runtime for async health checks + let rt = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(rt) => rt, + Err(e) => { + eprintln!(" failed to create tokio runtime: {e}, falling back to TCP probe"); + wait_for_tcp_only(gateway_port, timeout, poll_interval); + return; + } + }; + + loop { + // Try gRPC health check + let result = rt.block_on(async { + tokio::time::timeout( + Duration::from_secs(5), + grpc_health_check(gateway_port, gateway_name), + ) + .await + }); + + match result { + Ok(Ok(())) => { + eprintln!("Gateway healthy [{:.1}s]", start.elapsed().as_secs_f64()); + return; + } + Ok(Err(e)) => { + // gRPC call completed but failed + if start.elapsed() >= timeout { + eprintln!( + " gateway health check failed after {:.0}s: {e}", + timeout.as_secs_f64() + ); + eprintln!(" continuing anyway - gateway may not be fully operational"); + return; + } + } + Err(_) => { + // Timeout on the health check itself + if start.elapsed() >= timeout { + eprintln!( + " gateway health check timed out after {:.0}s", + timeout.as_secs_f64() + ); + eprintln!(" continuing anyway - gateway may not be fully operational"); + return; + } + } + } + + std::thread::sleep(poll_interval); + } +} + +/// Fallback TCP-only probe when gRPC health check can't be performed. 
+fn wait_for_tcp_only(gateway_port: u16, timeout: Duration, poll_interval: Duration) { + let start = std::time::Instant::now(); + + loop { + if host_tcp_probe(gateway_port) { + eprintln!( + "Service reachable (TCP) [{:.1}s]", + start.elapsed().as_secs_f64() + ); + return; + } + + if start.elapsed() >= timeout { + eprintln!( + " gateway TCP probe failed after {:.0}s, continuing anyway", + timeout.as_secs_f64() + ); + return; + } + + std::thread::sleep(poll_interval); + } +} + +/// Probe `127.0.0.1:port` from the host to verify the TCP path is working. +/// +/// This is a fallback when gRPC health check isn't available. +fn host_tcp_probe(gateway_port: u16) -> bool { + use std::io::Read; + use std::net::{SocketAddr, TcpStream}; + + let addr: SocketAddr = ([127, 0, 0, 1], gateway_port).into(); + let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(2)) else { + return false; + }; + + // A short read timeout: if the server is alive it will wait for us + // to send a TLS ClientHello, so the read will time out (= good). + // If the connection resets or closes, the server is dead. + stream + .set_read_timeout(Some(Duration::from_millis(200))) + .ok(); + let mut buf = [0u8; 1]; + match stream.read(&mut buf) { + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + true // Timeout = server alive, waiting for ClientHello. + } + _ => false, // Reset, EOF, or unexpected data = not healthy. 
+ } +} diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 993972d14..e3e5f312a 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -14,8 +14,10 @@ #![allow(unsafe_code)] +mod embedded; mod exec; mod ffi; +mod health; use std::ffi::CString; use std::path::{Path, PathBuf}; @@ -23,9 +25,9 @@ use std::ptr; use std::time::Instant; pub use exec::{ - acquire_rootfs_lock, clear_vm_runtime_state, ensure_vm_not_running, exec_running_vm, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, VmExecOptions, - VmRuntimeState, VM_EXEC_VSOCK_PORT, + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, + ensure_vm_not_running, exec_running_vm, reset_runtime_state, vm_exec_socket_path, + vm_state_path, write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -336,30 +338,29 @@ fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_ Ok((owned, ptrs)) } -const VM_RUNTIME_DIR_NAME: &str = "openshell-vm.runtime"; const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; pub(crate) fn configured_runtime_dir() -> Result { + // Allow override for development if let Some(path) = std::env::var_os(VM_RUNTIME_DIR_ENV) { - return Ok(PathBuf::from(path)); + let path = PathBuf::from(path); + tracing::debug!( + path = %path.display(), + "Using runtime from OPENSHELL_VM_RUNTIME_DIR" + ); + return Ok(path); } - let exe = std::env::current_exe().map_err(|e| VmError::HostSetup(e.to_string()))?; - let exe_dir = exe.parent().ok_or_else(|| { - VmError::HostSetup(format!( - "executable has no parent directory: {}", - exe.display() - )) - })?; - Ok(exe_dir.join(VM_RUNTIME_DIR_NAME)) + // Use embedded runtime (extracts on first use) + embedded::ensure_runtime_extracted() } -fn validate_runtime_dir(dir: &Path) -> Result { +fn validate_runtime_dir(dir: &Path) -> Result<(), VmError> { if !dir.is_dir() { return 
Err(VmError::BinaryNotFound { path: dir.display().to_string(), hint: format!( - "stage the VM runtime bundle with `mise run vm:bundle-runtime` or set {VM_RUNTIME_DIR_ENV}" + "VM runtime not found. Run `mise run vm:build:embedded` or set {VM_RUNTIME_DIR_ENV}" ), }); } @@ -368,7 +369,7 @@ fn validate_runtime_dir(dir: &Path) -> Result { if !libkrun.is_file() { return Err(VmError::BinaryNotFound { path: libkrun.display().to_string(), - hint: "runtime bundle is incomplete: missing libkrun".to_string(), + hint: "runtime is incomplete: missing libkrun".to_string(), }); } @@ -384,7 +385,7 @@ fn validate_runtime_dir(dir: &Path) -> Result { if !has_krunfw { return Err(VmError::BinaryNotFound { path: dir.display().to_string(), - hint: "runtime bundle is incomplete: missing libkrunfw".to_string(), + hint: "runtime is incomplete: missing libkrunfw".to_string(), }); } @@ -392,7 +393,7 @@ fn validate_runtime_dir(dir: &Path) -> Result { if !gvproxy.is_file() { return Err(VmError::BinaryNotFound { path: gvproxy.display().to_string(), - hint: "runtime bundle is incomplete: missing gvproxy".to_string(), + hint: "runtime is incomplete: missing gvproxy".to_string(), }); } @@ -412,48 +413,29 @@ fn validate_runtime_dir(dir: &Path) -> Result { } } - // Validate manifest.json if present — warn but don't fail if files - // listed in the manifest are missing (backwards compatibility). - let manifest_path = dir.join("manifest.json"); - if manifest_path.is_file() { - if let Ok(contents) = std::fs::read_to_string(&manifest_path) { - // Simple check: verify all listed files exist. - // The manifest lists files as JSON strings in a "files" array. 
- for line in contents.lines() { - let trimmed = line.trim().trim_matches(|c| c == '"' || c == ','); - if !trimmed.is_empty() - && !trimmed.starts_with('{') - && !trimmed.starts_with('}') - && !trimmed.starts_with('[') - && !trimmed.starts_with(']') - && !trimmed.contains(':') - { - let file_path = dir.join(trimmed); - if !file_path.exists() { - eprintln!( - "warning: manifest.json references missing file: {}", - trimmed - ); - } - } - } - } - } - - Ok(gvproxy) + Ok(()) } fn resolve_runtime_bundle() -> Result { let runtime_dir = configured_runtime_dir()?; - validate_runtime_dir(&runtime_dir) + // Validate the directory has required files + validate_runtime_dir(&runtime_dir)?; + Ok(runtime_dir.join("gvproxy")) } pub fn default_runtime_gvproxy_path() -> PathBuf { configured_runtime_dir() - .unwrap_or_else(|_| PathBuf::from(VM_RUNTIME_DIR_NAME)) + .or_else(|_| embedded::runtime_cache_path()) + .unwrap_or_else(|_| PathBuf::from("gvproxy")) .join("gvproxy") } +/// Check if the given path matches the expected default rootfs location. +fn is_default_rootfs_path(path: &Path) -> bool { + // Check if path matches the pattern: ~/.local/share/openshell/openshell-vm/.../rootfs + path.to_string_lossy().contains("openshell/openshell-vm") && path.ends_with("rootfs") +} + #[cfg(target_os = "macos")] fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), VmError> { let existing = std::env::var_os("DYLD_FALLBACK_LIBRARY_PATH"); @@ -876,6 +858,13 @@ fn path_to_cstring(path: &Path) -> Result { /// Returns the VM exit code (from `waitpid`). 
#[allow(clippy::similar_names)] pub fn launch(config: &VmConfig) -> Result { + // Auto-extract embedded rootfs if using default path and it doesn't exist + if !config.rootfs.is_dir() { + if is_default_rootfs_path(&config.rootfs) && embedded::has_embedded_rootfs() { + embedded::ensure_rootfs_extracted()?; + } + } + // Validate rootfs if !config.rootfs.is_dir() { return Err(VmError::RootfsNotFound { @@ -908,8 +897,8 @@ pub fn launch(config: &VmConfig) -> Result { eprintln!("rootfs: {}", config.rootfs.display()); eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - // The runtime must already be staged as a sidecar bundle next to the - // binary (or explicitly pointed to via OPENSHELL_VM_RUNTIME_DIR). + // The runtime is embedded in the binary and extracted on first use. + // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. let runtime_gvproxy = resolve_runtime_bundle()?; let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { VmError::HostSetup(format!( @@ -1209,10 +1198,12 @@ pub fn launch(config: &VmConfig) -> Result { eprintln!(" The VM is running but OpenShell may not be fully operational."); } - // Wait for the gRPC service to be reachable via TCP - // probe on host:30051. This confirms the full path - // (gvproxy → kube-proxy nftables → pod:8080) is working. - wait_for_gateway_service(gateway_port); + // Wait for the gRPC health check to pass. This ensures + // the service is fully operational, not just accepting + // TCP connections. The health check confirms the full + // path (gvproxy → kube-proxy nftables → pod:8080) and + // that the gRPC service is responding to requests. + health::wait_for_gateway_ready(gateway_port, &config.gateway_name); } eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); @@ -1280,8 +1271,8 @@ const DEFAULT_GATEWAY_PORT: u16 = 30051; /// host. /// /// 2. **Warm boot**: host-side metadata + mTLS certs survive across VM -/// restarts. 
Nothing to do — service readiness is confirmed by the TCP -/// probe in `wait_for_gateway_service()`. +/// restarts. Nothing to do — service readiness is confirmed by the gRPC +/// health check in `health::wait_for_gateway_ready()`. /// /// The VM generates PKI on first boot (via openshell-vm-init.sh) and /// writes certs to `/opt/openshell/pki/` on the rootfs. This function: @@ -1315,8 +1306,25 @@ fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Re // Verify host certs match the rootfs PKI. If they diverge (e.g. // PKI was regenerated out-of-band, or the rootfs was replaced), // re-sync the host certs from the authoritative rootfs copy. + // + // Wait for the PKI to be available — the VM may be generating it + // at boot time (non-pre-initialized rootfs). Without this wait, + // the sync check runs before the VM writes its certs, causing + // mTLS mismatch errors on connection. let pki_dir = rootfs.join("opt/openshell/pki"); - if pki_dir.join("ca.crt").is_file() { + let ca_cert_path = pki_dir.join("ca.crt"); + let pki_wait_timeout = std::time::Duration::from_secs(30); + let pki_wait_start = Instant::now(); + while !ca_cert_path.is_file() + || std::fs::metadata(&ca_cert_path).map_or(true, |m| m.len() == 0) + { + if pki_wait_start.elapsed() >= pki_wait_timeout { + eprintln!("Warning: PKI not available after 30s, skipping cert sync"); + break; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + } + if ca_cert_path.is_file() { if let Err(e) = sync_host_certs_if_stale(&pki_dir, gateway_name) { eprintln!("Warning: cert sync check failed: {e}"); } @@ -1486,83 +1494,6 @@ fn sync_host_certs_if_stale(pki_dir: &Path, gateway_name: &str) -> Result<(), Vm Ok(()) } -/// Wait for the openshell pod to become Ready inside the k3s cluster -/// and verify the gRPC service is reachable from the host. -/// -/// Stale pod/lease records are cleaned from the kine DB at build time -/// (see `build-rootfs.sh`). 
Containerd metadata (meta.db) is preserved -/// across boots so the native snapshotter doesn't re-extract image layers. -/// Runtime task state is cleaned by `openshell-vm-init.sh` on each boot. -/// -/// Wait for the OpenShell gRPC service to be reachable from the host. -/// -/// Polls `host_tcp_probe()` on `127.0.0.1:30051` with 1s intervals. -/// The probe confirms the full networking path: gvproxy → kube-proxy -/// nftables → pod:8080. A successful probe means the pod is running, -/// the NodePort service is routing, and the server is accepting -/// connections. No kubectl or API server access required. -fn wait_for_gateway_service(gateway_port: u16) { - let start = Instant::now(); - let timeout = std::time::Duration::from_secs(90); - let poll_interval = std::time::Duration::from_secs(1); - - eprintln!("Waiting for gateway service..."); - - loop { - if host_tcp_probe(gateway_port) { - eprintln!("Service healthy [{:.1}s]", start.elapsed().as_secs_f64()); - return; - } - - if start.elapsed() >= timeout { - eprintln!( - " gateway service not ready after {:.0}s, continuing anyway", - timeout.as_secs_f64() - ); - return; - } - - std::thread::sleep(poll_interval); - } -} - -/// Probe `127.0.0.1:30051` from the host to verify the full -/// gvproxy → VM → pod path is working. -/// -/// gvproxy accepts TCP connections even when the guest port is closed, -/// but those connections are immediately reset. A server that is truly -/// listening will hold the connection open (waiting for a TLS -/// ClientHello). We exploit this: connect, then try a short read. If -/// the read **times out** the server is alive; if it returns an error -/// (reset/EOF) the server is down. 
-fn host_tcp_probe(gateway_port: u16) -> bool { - use std::io::Read; - use std::net::{SocketAddr, TcpStream}; - use std::time::Duration; - - let addr: SocketAddr = ([127, 0, 0, 1], gateway_port).into(); - let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(2)) else { - return false; - }; - - // A short read timeout: if the server is alive it will wait for us - // to send a TLS ClientHello, so the read will time out (= good). - // If the connection resets or closes, the server is dead. - stream - .set_read_timeout(Some(Duration::from_millis(200))) - .ok(); - let mut buf = [0u8; 1]; - match stream.read(&mut buf) { - Err(e) - if e.kind() == std::io::ErrorKind::WouldBlock - || e.kind() == std::io::ErrorKind::TimedOut => - { - true // Timeout = server alive, waiting for ClientHello. - } - _ => false, // Reset, EOF, or unexpected data = not healthy. - } -} - static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); extern "C" fn forward_signal(_sig: libc::c_int) { @@ -1613,8 +1544,8 @@ mod tests { fs::set_permissions(&gvproxy, perms).expect("chmod gvproxy"); } - let resolved_gvproxy = validate_runtime_dir(&dir).expect("runtime bundle should validate"); - assert_eq!(resolved_gvproxy, gvproxy); + validate_runtime_dir(&dir).expect("runtime bundle should validate"); + assert!(gvproxy.exists()); let _ = fs::remove_dir_all(&dir); } diff --git a/tasks/scripts/build-libkrun-macos.sh b/tasks/scripts/build-libkrun-macos.sh new file mode 100755 index 000000000..19972cf86 --- /dev/null +++ b/tasks/scripts/build-libkrun-macos.sh @@ -0,0 +1,281 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build libkrun from source on macOS with portable rpath. 
+# +# This script builds libkrun WITHOUT GPU support (no virglrenderer/libepoxy/MoltenVK +# dependencies), making the resulting binary fully portable and self-contained. +# +# For openshell-vm, we run headless k3s clusters, so GPU passthrough is not needed. +# +# Prerequisites: +# - macOS ARM64 (Apple Silicon) +# - Xcode Command Line Tools +# - Homebrew: brew install rust lld dtc xz libkrunfw +# +# Usage: +# ./build-libkrun-macos.sh +# +# Output: +# target/libkrun-build/libkrun.dylib - portable dylib with @loader_path rpath + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +BUILD_DIR="${ROOT}/target/libkrun-build" +OUTPUT_DIR="${BUILD_DIR}" + +if [ "$(uname -s)" != "Darwin" ]; then + echo "Error: This script only runs on macOS" >&2 + exit 1 +fi + +if [ "$(uname -m)" != "arm64" ]; then + echo "Error: libkrun on macOS only supports ARM64 (Apple Silicon)" >&2 + exit 1 +fi + +ARCH="$(uname -m)" +echo "==> Building libkrun for macOS ${ARCH} (no GPU support)" +echo " Build directory: ${BUILD_DIR}" +echo "" + +# ── Check dependencies ────────────────────────────────────────────────── + +check_deps() { + echo "==> Checking build dependencies..." + + MISSING="" + + # Check for Rust + if ! command -v cargo &>/dev/null; then + MISSING="$MISSING rust" + fi + + # Check for lld (LLVM linker) + if ! command -v ld.lld &>/dev/null && ! [ -x "$(brew --prefix llvm 2>/dev/null)/bin/ld.lld" ]; then + MISSING="$MISSING lld" + fi + + # Check for dtc (device tree compiler) + if ! command -v dtc &>/dev/null; then + MISSING="$MISSING dtc" + fi + + # Check for libkrunfw + BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" + CUSTOM_RUNTIME="${ROOT}/target/custom-runtime" + if [ ! -f "${BREW_PREFIX}/lib/libkrunfw.dylib" ] && \ + [ ! -f "${BREW_PREFIX}/lib/libkrunfw.5.dylib" ] && \ + [ ! 
-f "${CUSTOM_RUNTIME}/libkrunfw.dylib" ]; then + MISSING="$MISSING libkrunfw" + fi + + if [ -n "$MISSING" ]; then + echo "Error: Missing dependencies:$MISSING" >&2 + echo "" >&2 + echo "Install with: brew install$MISSING" >&2 + exit 1 + fi + + echo " All dependencies found" +} + +check_deps + +# ── Setup build directory ─────────────────────────────────────────────── + +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +# ── Clone libkrun ─────────────────────────────────────────────────────── + +LIBKRUN_REF="${LIBKRUN_REF:-v1.17.4}" + +if [ -d libkrun ]; then + echo "==> Updating existing libkrun checkout..." + cd libkrun + git fetch origin --tags + git checkout "${LIBKRUN_REF}" 2>/dev/null || git checkout "tags/${LIBKRUN_REF}" 2>/dev/null || { + echo "Error: Could not checkout ${LIBKRUN_REF}" >&2 + exit 1 + } + cd .. +else + echo "==> Cloning libkrun (${LIBKRUN_REF})..." + git clone https://github.com/containers/libkrun.git + cd libkrun + git fetch --tags + git checkout "${LIBKRUN_REF}" 2>/dev/null || git checkout "tags/${LIBKRUN_REF}" 2>/dev/null || { + echo "Error: Could not checkout ${LIBKRUN_REF}" >&2 + exit 1 + } + cd .. +fi + +LIBKRUN_COMMIT=$(git -C libkrun rev-parse HEAD) +echo " Commit: ${LIBKRUN_COMMIT}" + +cd libkrun + +# ── Build libkrun ─────────────────────────────────────────────────────── + +echo "" +echo "==> Building libkrun with NET=1 BLK=1 (no GPU)..." 
+ +# Find libkrunfw - prefer custom build with bridge support +BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" +CUSTOM_RUNTIME="${ROOT}/target/custom-runtime" + +if [ -f "${CUSTOM_RUNTIME}/provenance.json" ] && [ -f "${CUSTOM_RUNTIME}/libkrunfw.dylib" ]; then + LIBKRUNFW_DIR="${CUSTOM_RUNTIME}" + echo " Using custom libkrunfw from ${LIBKRUNFW_DIR}" +else + LIBKRUNFW_DIR="${BREW_PREFIX}/lib" + echo " Using Homebrew libkrunfw from ${LIBKRUNFW_DIR}" +fi + +# Set library search paths for build +export LIBRARY_PATH="${LIBKRUNFW_DIR}:${BREW_PREFIX}/lib:${LIBRARY_PATH:-}" +export DYLD_LIBRARY_PATH="${LIBKRUNFW_DIR}:${BREW_PREFIX}/lib:${DYLD_LIBRARY_PATH:-}" + +# Set up LLVM/clang for bindgen (required by krun_display/krun_input if they get compiled) +# Note: DYLD_LIBRARY_PATH is needed at runtime for the build scripts that use libclang +LLVM_PREFIX="$(brew --prefix llvm 2>/dev/null || echo /opt/homebrew/opt/llvm)" +if [ -d "$LLVM_PREFIX" ]; then + export LIBCLANG_PATH="${LLVM_PREFIX}/lib" + export DYLD_LIBRARY_PATH="${LLVM_PREFIX}/lib:${DYLD_LIBRARY_PATH:-}" +fi + +# Build with BLK and NET features only (no GPU) +# This avoids the virglrenderer → libepoxy → MoltenVK dependency chain +make clean 2>/dev/null || true +make BLK=1 NET=1 -j"$(sysctl -n hw.ncpu)" + +# ── Rewrite dylib paths for portability ───────────────────────────────── + +echo "" +echo "==> Making dylib portable with @loader_path..." + +DYLIB="target/release/libkrun.dylib" +if [ ! 
-f "$DYLIB" ]; then + echo "Error: Build did not produce $DYLIB" >&2 + exit 1 +fi + +# Copy to output +cp "$DYLIB" "${OUTPUT_DIR}/libkrun.dylib" +DYLIB="${OUTPUT_DIR}/libkrun.dylib" + +# Show current dependencies +echo " Original dependencies:" +otool -L "$DYLIB" | grep -v "^/" | sed 's/^/ /' + +# Rewrite the install name to use @loader_path (makes it relocatable) +install_name_tool -id "@loader_path/libkrun.dylib" "$DYLIB" + +# Rewrite libkrunfw path to @loader_path (will be bundled alongside) +# Find what libkrunfw path is currently referenced +# Note: grep may not find anything (libkrunfw is loaded via dlopen), so we use || true +KRUNFW_PATH=$(otool -L "$DYLIB" | grep libkrunfw | awk '{print $1}' || true) +if [ -n "$KRUNFW_PATH" ]; then + install_name_tool -change "$KRUNFW_PATH" "@loader_path/libkrunfw.dylib" "$DYLIB" + echo " Rewrote: $KRUNFW_PATH → @loader_path/libkrunfw.dylib" +fi + +# Re-codesign after modifications (required on macOS) +codesign -f -s - "$DYLIB" + +# Show final dependencies +echo "" +echo " Final dependencies:" +otool -L "$DYLIB" | grep -v "^/" | sed 's/^/ /' + +# Verify no hardcoded homebrew paths remain +if otool -L "$DYLIB" | grep -q "/opt/homebrew"; then + echo "" + echo "Warning: Homebrew paths still present in dylib!" >&2 + otool -L "$DYLIB" | grep "/opt/homebrew" | sed 's/^/ /' +else + echo "" + echo " ✓ No hardcoded Homebrew paths" +fi + +# ── Copy libkrunfw to output ──────────────────────────────────────────── + +echo "" +echo "==> Bundling libkrunfw..." 
+ +# Find and copy libkrunfw +KRUNFW_SRC="" +for candidate in \ + "${CUSTOM_RUNTIME}/libkrunfw.dylib" \ + "${CUSTOM_RUNTIME}/libkrunfw.5.dylib" \ + "${BREW_PREFIX}/lib/libkrunfw.dylib" \ + "${BREW_PREFIX}/lib/libkrunfw.5.dylib"; do + if [ -f "$candidate" ]; then + # Resolve symlinks + if [ -L "$candidate" ]; then + KRUNFW_SRC=$(readlink -f "$candidate" 2>/dev/null || readlink "$candidate") + if [[ "$KRUNFW_SRC" != /* ]]; then + KRUNFW_SRC="$(dirname "$candidate")/${KRUNFW_SRC}" + fi + else + KRUNFW_SRC="$candidate" + fi + break + fi +done + +if [ -z "$KRUNFW_SRC" ]; then + echo "Error: Could not find libkrunfw.dylib" >&2 + exit 1 +fi + +cp "$KRUNFW_SRC" "${OUTPUT_DIR}/libkrunfw.dylib" +echo " Copied: $KRUNFW_SRC" + +# Make libkrunfw portable too +install_name_tool -id "@loader_path/libkrunfw.dylib" "${OUTPUT_DIR}/libkrunfw.dylib" +codesign -f -s - "${OUTPUT_DIR}/libkrunfw.dylib" + +# Check libkrunfw dependencies +echo " libkrunfw dependencies:" +otool -L "${OUTPUT_DIR}/libkrunfw.dylib" | grep -v "^/" | sed 's/^/ /' + +# ── Summary ───────────────────────────────────────────────────────────── + +cd "$BUILD_DIR" + +echo "" +echo "==> Build complete!" +echo " Output directory: ${OUTPUT_DIR}" +echo "" +echo " Artifacts:" +ls -lah "${OUTPUT_DIR}"/*.dylib + +# Verify portability +echo "" +echo "==> Verifying portability..." +ALL_GOOD=true + +for lib in "${OUTPUT_DIR}"/*.dylib; do + if otool -L "$lib" | grep -q "/opt/homebrew"; then + echo " ✗ $(basename "$lib") has hardcoded paths" + ALL_GOOD=false + else + echo " ✓ $(basename "$lib") is portable" + fi +done + +if $ALL_GOOD; then + echo "" + echo "All libraries are portable!" 
+ echo "" + echo "Next step: mise run vm:runtime:compress" +else + echo "" + echo "Warning: Some libraries have non-portable paths" + echo "They may not work on machines without Homebrew" +fi diff --git a/tasks/scripts/build-libkrun.sh b/tasks/scripts/build-libkrun.sh new file mode 100755 index 000000000..99eb140b7 --- /dev/null +++ b/tasks/scripts/build-libkrun.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build libkrun and libkrunfw from source on Linux. +# +# This script builds libkrun (VMM) and libkrunfw (kernel firmware) from source +# with OpenShell's custom kernel configuration for bridge/netfilter support. +# +# Prerequisites: +# - Linux (aarch64 or x86_64) +# - Build tools: make, git, gcc, flex, bison, bc +# - Python 3 with pyelftools +# - Rust toolchain +# +# Usage: +# ./build-libkrun.sh +# +# The script will install missing dependencies on Debian/Ubuntu and Fedora. + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +BUILD_DIR="${ROOT}/target/libkrun-build" +OUTPUT_DIR="${BUILD_DIR}" +KERNEL_CONFIG="${ROOT}/crates/openshell-vm/runtime/kernel/openshell.kconfig" + +if [ "$(uname -s)" != "Linux" ]; then + echo "Error: This script only runs on Linux" >&2 + exit 1 +fi + +ARCH="$(uname -m)" +echo "==> Building libkrun for Linux ${ARCH}" +echo " Build directory: ${BUILD_DIR}" +echo " Kernel config: ${KERNEL_CONFIG}" +echo "" + +# ── Install dependencies ──────────────────────────────────────────────── + +install_deps() { + echo "==> Checking/installing build dependencies..." + + if command -v apt-get &>/dev/null; then + # Debian/Ubuntu + DEPS="build-essential git python3 python3-pyelftools flex bison libelf-dev libssl-dev bc curl" + MISSING="" + for dep in $DEPS; do + if ! 
dpkg -s "$dep" &>/dev/null; then + MISSING="$MISSING $dep" + fi + done + if [ -n "$MISSING" ]; then + echo " Installing:$MISSING" + sudo apt-get update + sudo apt-get install -y $MISSING + else + echo " All dependencies installed" + fi + + elif command -v dnf &>/dev/null; then + # Fedora/RHEL + DEPS="make git python3 python3-pyelftools gcc flex bison elfutils-libelf-devel openssl-devel bc glibc-static curl" + echo " Installing dependencies via dnf..." + sudo dnf install -y $DEPS + + else + echo "Warning: Unknown package manager. Please install manually:" >&2 + echo " build-essential git python3 python3-pyelftools flex bison" >&2 + echo " libelf-dev libssl-dev bc curl" >&2 + fi +} + +install_deps + +# ── Setup build directory ─────────────────────────────────────────────── + +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +# ── Build libkrunfw (kernel firmware) ─────────────────────────────────── + +echo "" +echo "==> Building libkrunfw with custom kernel config..." + +if [ ! -d libkrunfw ]; then + echo " Cloning libkrunfw..." + git clone --depth 1 https://github.com/containers/libkrunfw.git +fi + +cd libkrunfw + +# Copy custom kernel config +if [ -f "$KERNEL_CONFIG" ]; then + cp "$KERNEL_CONFIG" openshell.kconfig + echo " Applied custom kernel config: openshell.kconfig" +else + echo "Warning: Custom kernel config not found at ${KERNEL_CONFIG}" >&2 + echo " Building with default config (k3s networking may not work)" >&2 +fi + +# Build libkrunfw +echo " Building kernel and libkrunfw (this may take 15-20 minutes)..." +if [ -f openshell.kconfig ]; then + make KCONFIG_FRAGMENT=openshell.kconfig -j"$(nproc)" +else + make -j"$(nproc)" +fi + +# Copy output +cp libkrunfw.so* "$OUTPUT_DIR/" +echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" + +cd "$BUILD_DIR" + +# ── Build libkrun (VMM) ───────────────────────────────────────────────── + +echo "" +echo "==> Building libkrun..." + +if [ ! -d libkrun ]; then + echo " Cloning libkrun..." 
+ git clone --depth 1 https://github.com/containers/libkrun.git +fi + +cd libkrun + +# Build with NET support for gvproxy networking +echo " Building libkrun with NET=1..." +make NET=1 -j"$(nproc)" + +# Copy output +cp target/release/libkrun.so "$OUTPUT_DIR/" +echo " Built: libkrun.so" + +cd "$BUILD_DIR" + +# ── Summary ───────────────────────────────────────────────────────────── + +echo "" +echo "==> Build complete!" +echo " Output directory: ${OUTPUT_DIR}" +echo "" +echo " Artifacts:" +ls -lah "$OUTPUT_DIR"/*.so* + +echo "" +echo "Next step: mise run vm:runtime:compress" diff --git a/tasks/scripts/build-rootfs-tarball.sh b/tasks/scripts/build-rootfs-tarball.sh new file mode 100755 index 000000000..2e4b1f05e --- /dev/null +++ b/tasks/scripts/build-rootfs-tarball.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build rootfs and compress to tarball for embedding in openshell-vm binary. +# +# This script: +# 1. Builds the rootfs using build-rootfs.sh or build-rootfs-minimal.sh +# 2. Compresses it to a zstd tarball for embedding +# +# Usage: +# ./build-rootfs-tarball.sh [--minimal] +# +# Options: +# --minimal Build a minimal rootfs (~200-300MB) without pre-loaded images. +# First boot will be slower but binary size is much smaller. +# Default: full rootfs with pre-loaded images (~2GB+). +# +# The resulting tarball is placed at target/vm-runtime-compressed/rootfs.tar.zst +# for inclusion in the embedded binary build. + +set -euo pipefail + +ROOT="$(git rev-parse --show-toplevel 2>/dev/null)" || ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +ROOTFS_BUILD_DIR="${ROOT}/target/rootfs-build" +OUTPUT_DIR="${ROOT}/target/vm-runtime-compressed" +OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" + +# Parse arguments +MINIMAL=false +for arg in "$@"; do + case "$arg" in + --minimal) + MINIMAL=true + ;; + --help|-h) + echo "Usage: $0 [--minimal]" + echo "" + echo "Options:" + echo " --minimal Build minimal rootfs (~200-300MB) without pre-loaded images" + echo " First boot will be slower but binary size is much smaller" + exit 0 + ;; + *) + echo "Unknown option: $arg" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Check for Docker +if ! command -v docker &>/dev/null; then + echo "Error: Docker is required to build the rootfs" >&2 + echo "Please install Docker and try again" >&2 + exit 1 +fi + +# Check if Docker daemon is running +if ! docker info &>/dev/null; then + echo "Error: Docker daemon is not running" >&2 + echo "Please start Docker and try again" >&2 + exit 1 +fi + +if [ "$MINIMAL" = true ]; then + echo "==> Building MINIMAL rootfs for embedding" + echo " Build dir: ${ROOTFS_BUILD_DIR}" + echo " Output: ${OUTPUT}" + echo " Mode: minimal (no pre-loaded images, ~200-300MB)" + echo "" + + # Build minimal rootfs + echo "==> Step 1/2: Building minimal rootfs..." + "${ROOT}/crates/openshell-vm/scripts/build-rootfs-minimal.sh" "${ROOTFS_BUILD_DIR}" +else + echo "==> Building FULL rootfs for embedding" + echo " Build dir: ${ROOTFS_BUILD_DIR}" + echo " Output: ${OUTPUT}" + echo " Mode: full (pre-loaded images, pre-initialized, ~2GB+)" + echo "" + + # Build full rootfs + echo "==> Step 1/2: Building full rootfs (this may take 10-15 minutes)..." + "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_BUILD_DIR}" +fi + +# Compress to tarball +echo "" +echo "==> Step 2/2: Compressing rootfs to tarball..." 
+mkdir -p "${OUTPUT_DIR}" + +# Remove existing tarball if present +rm -f "${OUTPUT}" + +# Get uncompressed size for display +echo " Uncompressed size: $(du -sh "${ROOTFS_BUILD_DIR}" | cut -f1)" + +# Create tarball with zstd compression +# -19 = high compression (slower but smaller) +# -T0 = use all available threads +echo " Compressing with zstd (level 19, this may take a few minutes)..." +tar -C "${ROOTFS_BUILD_DIR}" -cf - . | zstd -19 -T0 -o "${OUTPUT}" + +# Report results +echo "" +echo "==> Rootfs tarball created successfully!" +echo " Output: ${OUTPUT}" +echo " Compressed: $(du -sh "${OUTPUT}" | cut -f1)" +if [ "$MINIMAL" = true ]; then + echo " Type: minimal (first boot ~30-60s, images pulled on demand)" +else + echo " Type: full (first boot ~3-5s, images pre-loaded)" +fi +echo "" +echo "Next step: mise run vm:build:embedded:quick" diff --git a/tasks/scripts/bundle-vm-runtime.sh b/tasks/scripts/bundle-vm-runtime.sh deleted file mode 100755 index ac2711c63..000000000 --- a/tasks/scripts/bundle-vm-runtime.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -if [ "$(uname -s)" != "Darwin" ]; then - echo "vm:bundle-runtime currently supports macOS only" >&2 - exit 1 -fi - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -LIB_DIR="${OPENSHELL_VM_RUNTIME_SOURCE_DIR:-}" -GVPROXY_BIN="${OPENSHELL_VM_GVPROXY:-}" - -if [ -z "$LIB_DIR" ]; then - # Prefer the custom runtime (has bridge/netfilter kernel support) over - # the stock Homebrew libkrunfw which lacks these capabilities. 
- CUSTOM_RUNTIME_DIR="${ROOT}/target/custom-runtime" - if [ -f "${CUSTOM_RUNTIME_DIR}/provenance.json" ] && [ -e "${CUSTOM_RUNTIME_DIR}/libkrunfw.dylib" ]; then - LIB_DIR="${CUSTOM_RUNTIME_DIR}" - echo "using custom runtime at ${LIB_DIR}" - else - BREW_PREFIX="$(brew --prefix 2>/dev/null || true)" - if [ -n "$BREW_PREFIX" ]; then - LIB_DIR="${BREW_PREFIX}/lib" - else - LIB_DIR="/opt/homebrew/lib" - fi - fi -fi - -if [ -z "$GVPROXY_BIN" ]; then - if command -v gvproxy >/dev/null 2>&1; then - GVPROXY_BIN="$(command -v gvproxy)" - elif [ -x /opt/homebrew/bin/gvproxy ]; then - GVPROXY_BIN="/opt/homebrew/bin/gvproxy" - elif [ -x /opt/podman/bin/gvproxy ]; then - GVPROXY_BIN="/opt/podman/bin/gvproxy" - else - echo "gvproxy not found; set OPENSHELL_VM_GVPROXY or install gvproxy" >&2 - exit 1 - fi -fi - -# libkrun.dylib: prefer the custom runtime dir, fall back to Homebrew. -# libkrun is the VMM and does not need a custom build; only libkrunfw -# carries the custom kernel. -LIBKRUN="${LIB_DIR}/libkrun.dylib" -if [ ! 
-e "$LIBKRUN" ]; then - BREW_PREFIX="${BREW_PREFIX:-$(brew --prefix 2>/dev/null || true)}" - if [ -n "$BREW_PREFIX" ] && [ -e "${BREW_PREFIX}/lib/libkrun.dylib" ]; then - LIBKRUN="${BREW_PREFIX}/lib/libkrun.dylib" - echo "using Homebrew libkrun at ${LIBKRUN}" - else - echo "libkrun not found at ${LIB_DIR}/libkrun.dylib or Homebrew; install libkrun or set OPENSHELL_VM_RUNTIME_SOURCE_DIR" >&2 - exit 1 - fi -fi - -KRUNFW_FILES=() -while IFS= read -r line; do - KRUNFW_FILES+=("$line") -done < <(find "$LIB_DIR" -maxdepth 1 \( -type f -o -type l \) \( -name 'libkrunfw.dylib' -o -name 'libkrunfw.*.dylib' \) | sort -u) - -if [ "${#KRUNFW_FILES[@]}" -eq 0 ]; then - echo "libkrunfw not found under ${LIB_DIR}; set OPENSHELL_VM_RUNTIME_SOURCE_DIR" >&2 - exit 1 -fi - -# Check for provenance.json (custom runtime indicator) -PROVENANCE_FILE="${LIB_DIR}/provenance.json" -IS_CUSTOM="false" -if [ -f "$PROVENANCE_FILE" ]; then - IS_CUSTOM="true" - echo "custom runtime detected (provenance.json present)" -fi - -TARGETS=( - "${ROOT}/target/debug" - "${ROOT}/target/release" - "${ROOT}/target/aarch64-apple-darwin/debug" - "${ROOT}/target/aarch64-apple-darwin/release" -) - -for target_dir in "${TARGETS[@]}"; do - runtime_dir="${target_dir}/openshell-vm.runtime" - mkdir -p "$runtime_dir" - - install -m 0644 "$LIBKRUN" "${runtime_dir}/libkrun.dylib" - install -m 0755 "$GVPROXY_BIN" "${runtime_dir}/gvproxy" - for krunfw in "${KRUNFW_FILES[@]}"; do - install -m 0644 "$krunfw" "${runtime_dir}/$(basename "$krunfw")" - done - - # Copy provenance.json if this is a custom runtime. 
- if [ "$IS_CUSTOM" = "true" ] && [ -f "$PROVENANCE_FILE" ]; then - install -m 0644 "$PROVENANCE_FILE" "${runtime_dir}/provenance.json" - fi - - manifest_entries=() - manifest_entries+=(' "libkrun.dylib"') - manifest_entries+=(' "gvproxy"') - for krunfw in "${KRUNFW_FILES[@]}"; do - manifest_entries+=(" \"$(basename "$krunfw")\"") - done - if [ "$IS_CUSTOM" = "true" ]; then - manifest_entries+=(' "provenance.json"') - fi - - cat > "${runtime_dir}/manifest.json" </dev/null || true + + # Rewrite libkrunfw reference if present + local krunfw_path + krunfw_path=$(otool -L "$dylib" 2>/dev/null | grep libkrunfw | awk '{print $1}' || true) + if [ -n "$krunfw_path" ] && [[ "$krunfw_path" != @* ]]; then + install_name_tool -change "$krunfw_path" "@loader_path/libkrunfw.dylib" "$dylib" + fi + + # Re-codesign + codesign -f -s - "$dylib" 2>/dev/null || true +} + +# Bundle GPU dependencies (libepoxy, virglrenderer, MoltenVK) for Homebrew libkrun +bundle_gpu_dependencies() { + local work_dir="$1" + local brew_prefix + brew_prefix="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" + + # Dependencies to bundle + local deps=( + "${brew_prefix}/opt/libepoxy/lib/libepoxy.0.dylib" + "${brew_prefix}/opt/virglrenderer/lib/libvirglrenderer.1.dylib" + "${brew_prefix}/opt/molten-vk/lib/libMoltenVK.dylib" + ) + + for dep in "${deps[@]}"; do + if [ -f "$dep" ]; then + local dep_name + dep_name="$(basename "$dep")" + cp "$dep" "${work_dir}/${dep_name}" + echo " Copied: ${dep_name}" + fi + done + + # Rewrite all paths to use @loader_path + for dylib in "${work_dir}"/*.dylib; do + [ -f "$dylib" ] || continue + + # Rewrite install name + local dylib_name + dylib_name="$(basename "$dylib")" + install_name_tool -id "@loader_path/${dylib_name}" "$dylib" 2>/dev/null || true + + # Rewrite all Homebrew references to @loader_path + for dep in "${deps[@]}"; do + local dep_name + dep_name="$(basename "$dep")" + install_name_tool -change "$dep" "@loader_path/${dep_name}" "$dylib" 2>/dev/null || true + 
done + + # Also rewrite libkrunfw + local krunfw_path + krunfw_path=$(otool -L "$dylib" 2>/dev/null | grep libkrunfw | awk '{print $1}' || true) + if [ -n "$krunfw_path" ] && [[ "$krunfw_path" != @* ]]; then + install_name_tool -change "$krunfw_path" "@loader_path/libkrunfw.dylib" "$dylib" + fi + + # Re-codesign + codesign -f -s - "$dylib" 2>/dev/null || true + done + + echo " All dependencies rewritten to use @loader_path" +} +WORK_DIR="${ROOT}/target/vm-runtime" +OUTPUT_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${ROOT}/target/vm-runtime-compressed}" + +rm -rf "$WORK_DIR" +mkdir -p "$WORK_DIR" "$OUTPUT_DIR" + +echo "==> Detecting platform..." + +case "$(uname -s)-$(uname -m)" in + Darwin-arm64) + PLATFORM="darwin-aarch64" + echo " Platform: macOS ARM64" + + # Source priority for libkrun: + # 1. Custom build from build-libkrun-macos.sh (portable, no GPU deps) + # 2. Custom runtime with custom libkrunfw + # 3. Homebrew (has GPU deps, not portable) + LIBKRUN_BUILD_DIR="${ROOT}/target/libkrun-build" + CUSTOM_DIR="${ROOT}/target/custom-runtime" + BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" + + if [ -f "${LIBKRUN_BUILD_DIR}/libkrun.dylib" ]; then + echo " Using portable libkrun from ${LIBKRUN_BUILD_DIR}" + cp "${LIBKRUN_BUILD_DIR}/libkrun.dylib" "$WORK_DIR/" + cp "${LIBKRUN_BUILD_DIR}/libkrunfw.dylib" "$WORK_DIR/" + + # Verify portability + if otool -L "${LIBKRUN_BUILD_DIR}/libkrun.dylib" | grep -q "/opt/homebrew"; then + echo " Warning: libkrun has hardcoded Homebrew paths - may not be portable" + else + echo " ✓ libkrun is portable (no hardcoded paths)" + fi + elif [ -f "${CUSTOM_DIR}/provenance.json" ]; then + echo " Using custom runtime from ${CUSTOM_DIR}" + + # libkrun from Homebrew (needs path rewriting for portability) + if [ -f "${CUSTOM_DIR}/libkrun.dylib" ]; then + cp "${CUSTOM_DIR}/libkrun.dylib" "$WORK_DIR/" + else + cp "${BREW_PREFIX}/lib/libkrun.dylib" "$WORK_DIR/" + make_dylib_portable "$WORK_DIR/libkrun.dylib" + fi + + # libkrunfw 
from custom build + cp "${CUSTOM_DIR}/libkrunfw.dylib" "$WORK_DIR/" + else + echo " Using Homebrew runtime from ${BREW_PREFIX}/lib" + echo " Warning: Homebrew libkrun has GPU dependencies (libepoxy, virglrenderer)" + echo " For a portable build, run: mise run vm:runtime:build-libkrun-macos" + + cp "${BREW_PREFIX}/lib/libkrun.dylib" "$WORK_DIR/" + + # Copy libkrunfw + for krunfw in "${BREW_PREFIX}/lib"/libkrunfw*.dylib; do + [ -f "$krunfw" ] || continue + if [ -L "$krunfw" ]; then + target=$(readlink "$krunfw") + if [[ "$target" != /* ]]; then + target="${BREW_PREFIX}/lib/${target}" + fi + cp "$target" "$WORK_DIR/$(basename "$krunfw")" + else + cp "$krunfw" "$WORK_DIR/" + fi + done + + # If using Homebrew libkrun with GPU, we need to bundle the GPU dependencies + # for portability. Check if libkrun has GPU deps: + if otool -L "$WORK_DIR/libkrun.dylib" | grep -q "libepoxy\|virglrenderer"; then + echo " Bundling GPU dependencies for portability..." + bundle_gpu_dependencies "$WORK_DIR" + fi + fi + + # Normalize libkrunfw naming - ensure we have libkrunfw.dylib + if [ ! -f "$WORK_DIR/libkrunfw.dylib" ] && [ -f "$WORK_DIR/libkrunfw.5.dylib" ]; then + cp "$WORK_DIR/libkrunfw.5.dylib" "$WORK_DIR/libkrunfw.dylib" + fi + + # gvproxy - prefer Podman, fall back to Homebrew + if [ -x /opt/podman/bin/gvproxy ]; then + cp /opt/podman/bin/gvproxy "$WORK_DIR/" + echo " Using gvproxy from Podman" + elif [ -x "$(brew --prefix 2>/dev/null)/bin/gvproxy" ]; then + cp "$(brew --prefix)/bin/gvproxy" "$WORK_DIR/" + echo " Using gvproxy from Homebrew" + else + echo "Error: gvproxy not found. Install Podman Desktop or run: brew install gvproxy" >&2 + exit 1 + fi + ;; + + Linux-aarch64) + PLATFORM="linux-aarch64" + echo " Platform: Linux ARM64" + + BUILD_DIR="${ROOT}/target/libkrun-build" + if [ ! -f "${BUILD_DIR}/libkrun.so" ]; then + echo "Error: libkrun not found. 
Run: mise run vm:runtime:build-libkrun" >&2 + exit 1 + fi + + cp "${BUILD_DIR}/libkrun.so" "$WORK_DIR/" + + # Copy libkrunfw - find the versioned .so file + for krunfw in "${BUILD_DIR}"/libkrunfw.so*; do + [ -f "$krunfw" ] || continue + cp "$krunfw" "$WORK_DIR/" + done + + # Download gvproxy if not present + if [ ! -f "$WORK_DIR/gvproxy" ]; then + echo " Downloading gvproxy for linux-arm64..." + curl -fsSL -o "$WORK_DIR/gvproxy" \ + "https://github.com/containers/gvisor-tap-vsock/releases/download/v0.8.8/gvproxy-linux-arm64" + chmod +x "$WORK_DIR/gvproxy" + fi + ;; + + Linux-x86_64) + PLATFORM="linux-x86_64" + echo " Platform: Linux x86_64" + + BUILD_DIR="${ROOT}/target/libkrun-build" + if [ ! -f "${BUILD_DIR}/libkrun.so" ]; then + echo "Error: libkrun not found. Run: mise run vm:runtime:build-libkrun" >&2 + exit 1 + fi + + cp "${BUILD_DIR}/libkrun.so" "$WORK_DIR/" + + # Copy libkrunfw + for krunfw in "${BUILD_DIR}"/libkrunfw.so*; do + [ -f "$krunfw" ] || continue + cp "$krunfw" "$WORK_DIR/" + done + + # Download gvproxy if not present + if [ ! -f "$WORK_DIR/gvproxy" ]; then + echo " Downloading gvproxy for linux-amd64..." + curl -fsSL -o "$WORK_DIR/gvproxy" \ + "https://github.com/containers/gvisor-tap-vsock/releases/download/v0.8.8/gvproxy-linux-amd64" + chmod +x "$WORK_DIR/gvproxy" + fi + ;; + + *) + echo "Error: Unsupported platform: $(uname -s)-$(uname -m)" >&2 + echo "Supported platforms: Darwin-arm64, Linux-aarch64, Linux-x86_64" >&2 + exit 1 + ;; +esac + +echo "" +echo "==> Collected artifacts:" +ls -lah "$WORK_DIR" + +echo "" +echo "==> Compressing with zstd (level 19)..." 
+ +for file in "$WORK_DIR"/*; do + [ -f "$file" ] || continue + name=$(basename "$file") + original_size=$(du -h "$file" | cut -f1) + zstd -19 -f -q -o "${OUTPUT_DIR}/${name}.zst" "$file" + # Ensure compressed file is readable/writable (source may be read-only) + chmod 644 "${OUTPUT_DIR}/${name}.zst" + compressed_size=$(du -h "${OUTPUT_DIR}/${name}.zst" | cut -f1) + echo " ${name}: ${original_size} -> ${compressed_size}" +done + +# Check for rootfs tarball (built separately by build-rootfs-tarball.sh) +ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" +if [ -f "$ROOTFS_TARBALL" ]; then + echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" +else + echo "" + echo "Note: rootfs.tar.zst not found." + echo " For full embedded build, run: mise run vm:build:rootfs-tarball" + echo " For quick build (without rootfs), the binary will still work but" + echo " require the rootfs to be built separately on first run." +fi + +echo "" +echo "==> Compressed artifacts in ${OUTPUT_DIR}:" +ls -lah "$OUTPUT_DIR" + +TOTAL=$(du -sh "$OUTPUT_DIR" | cut -f1) +echo "" +echo "==> Total compressed size: ${TOTAL}" +echo "" +echo "Set this environment variable for cargo build:" +echo " export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=${OUTPUT_DIR}" diff --git a/tasks/scripts/package-openshell-vm-runtime.sh b/tasks/scripts/package-openshell-vm-runtime.sh deleted file mode 100755 index 2632e671a..000000000 --- a/tasks/scripts/package-openshell-vm-runtime.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -ARTIFACT_DIR="${ROOT}/artifacts" -TARGET_DIR="${ROOT}/target/release" - -if [ ! -x "${TARGET_DIR}/openshell-vm" ]; then - echo "target/release/openshell-vm not found; build it first with cargo build -p openshell-vm --release" >&2 - exit 1 -fi - -if [ ! 
-d "${TARGET_DIR}/openshell-vm.runtime" ]; then - echo "target/release/openshell-vm.runtime not found; run mise run vm:bundle-runtime first" >&2 - exit 1 -fi - -mkdir -p "${ARTIFACT_DIR}" -tar -czf "${ARTIFACT_DIR}/openshell-vm-aarch64-apple-darwin.tar.gz" \ - -C "${TARGET_DIR}" \ - openshell-vm \ - openshell-vm.runtime - -ls -lh "${ARTIFACT_DIR}/openshell-vm-aarch64-apple-darwin.tar.gz" diff --git a/tasks/vm.toml b/tasks/vm.toml index 812455df4..ee2cbbb81 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -3,12 +3,14 @@ # openshell-vm development helpers +# ═══════════════════════════════════════════════════════════════════════════ +# Main VM Commands +# ═══════════════════════════════════════════════════════════════════════════ + [vm] description = "Build and run the standalone openshell-vm microVM" run = [ - "mise run vm:build:binary", - "tasks/scripts/codesign-openshell-vm.sh", - "tasks/scripts/bundle-vm-runtime.sh", + "mise run vm:build:embedded", "tasks/scripts/ensure-vm-rootfs.sh", "tasks/scripts/sync-vm-rootfs.sh", "tasks/scripts/run-vm.sh", @@ -18,52 +20,89 @@ hide = false ["vm:build"] description = "Force a fresh openshell-vm rebuild, including the rootfs" run = [ - "mise run vm:build:binary", - "tasks/scripts/codesign-openshell-vm.sh", - "tasks/scripts/bundle-vm-runtime.sh", + "mise run vm:build:embedded", "OPENSHELL_VM_FORCE_ROOTFS_REBUILD=1 tasks/scripts/ensure-vm-rootfs.sh", "tasks/scripts/sync-vm-rootfs.sh", ] hide = false +# ═══════════════════════════════════════════════════════════════════════════ +# Embedded Binary Build (Single Binary, No Sidecar) +# ═══════════════════════════════════════════════════════════════════════════ + +["vm:build:embedded"] +description = "Build openshell-vm with embedded runtime (single binary, no sidecar)" +run = [ + "mise run vm:runtime:compress", + "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", + "tasks/scripts/codesign-openshell-vm.sh", +] +hide = false + 
["vm:build:binary"] -description = "Build the standalone openshell-vm binary" -run = "cargo build -p openshell-vm" +description = "Build the standalone openshell-vm binary (requires vm:runtime:compress first)" +run = "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm" hide = true -["vm:build:release"] -description = "Build the standalone openshell-vm binary in release mode" -run = "cargo build -p openshell-vm --release" -hide = true +# ═══════════════════════════════════════════════════════════════════════════ +# VM Runtime Artifact Management +# ═══════════════════════════════════════════════════════════════════════════ + +["vm:runtime:compress"] +description = "Gather and compress VM runtime artifacts for embedding" +run = "tasks/scripts/compress-vm-runtime.sh" +hide = false + +["vm:runtime:build-libkrunfw"] +description = "Build custom libkrunfw with bridge/netfilter kernel support" +run = "crates/openshell-vm/runtime/build-custom-libkrunfw.sh" +hide = false + +["vm:runtime:build-libkrun"] +description = "Build libkrun and libkrunfw from source (Linux only)" +run = "tasks/scripts/build-libkrun.sh" +hide = false + +["vm:runtime:build-libkrun-macos"] +description = "Build portable libkrun from source (macOS, no GPU deps)" +run = "tasks/scripts/build-libkrun-macos.sh" +hide = false + +# ═══════════════════════════════════════════════════════════════════════════ +# Rootfs Management +# ═══════════════════════════════════════════════════════════════════════════ ["vm:rootfs"] description = "Build the default openshell-vm rootfs if needed" run = "tasks/scripts/ensure-vm-rootfs.sh" hide = true +["vm:build:rootfs-tarball"] +description = "Build and compress FULL rootfs tarball for embedding (~2GB+)" +run = "tasks/scripts/build-rootfs-tarball.sh" +hide = false + +["vm:build:rootfs-tarball:minimal"] +description = "Build and compress MINIMAL rootfs tarball for embedding (~200-300MB)" +run = 
"tasks/scripts/build-rootfs-tarball.sh --minimal" +hide = false + +["vm:build:embedded:quick"] +description = "Build embedded binary using cached rootfs tarball (skips rootfs rebuild)" +run = [ + "mise run vm:runtime:compress", + "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", + "tasks/scripts/codesign-openshell-vm.sh", +] +hide = false + ["vm:codesign"] description = "Codesign the openshell-vm binary for Hypervisor.framework access on macOS" depends = ["vm:build:binary"] run = "tasks/scripts/codesign-openshell-vm.sh" hide = true -["vm:bundle-runtime"] -description = "Stage the openshell-vm sidecar runtime bundle next to local build outputs" -run = "tasks/scripts/bundle-vm-runtime.sh" -hide = false - -["vm:build-custom-runtime"] -description = "Build a custom libkrunfw with bridge/netfilter kernel support" -run = "crates/openshell-vm/runtime/build-custom-libkrunfw.sh" -hide = false - ["vm:check-capabilities"] description = "Check VM kernel capabilities (run inside the VM)" run = "echo 'This script must be run inside the VM. 
Copy it to the rootfs or exec into a running VM.'" hide = false - -["vm:package:openshell-vm"] -description = "Package the openshell-vm binary with its sidecar runtime bundle" -run = "tasks/scripts/package-openshell-vm-runtime.sh" -depends = ["vm:build:release", "vm:bundle-runtime"] -hide = false From d06b90b0e7a3772f054fff5fdb9651ce226650c1 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 30 Mar 2026 18:21:34 -0700 Subject: [PATCH 07/10] wip --- crates/openshell-vm/README.md | 235 ++++++++++++++++++ crates/openshell-vm/pins.env | 3 +- .../runtime/build-custom-libkrunfw.sh | 9 +- .../scripts/build-rootfs-minimal.sh | 85 ++++++- crates/openshell-vm/scripts/build-rootfs.sh | 113 +++++++-- crates/openshell-vm/src/ffi.rs | 14 +- crates/openshell-vm/src/lib.rs | 21 +- 7 files changed, 441 insertions(+), 39 deletions(-) create mode 100644 crates/openshell-vm/README.md diff --git a/crates/openshell-vm/README.md b/crates/openshell-vm/README.md new file mode 100644 index 000000000..1861d7294 --- /dev/null +++ b/crates/openshell-vm/README.md @@ -0,0 +1,235 @@ +# openshell-vm + +> Status: Experimental and work in progress (WIP). VM support is under active development and may change. + +MicroVM runtime for OpenShell, powered by [libkrun](https://github.com/containers/libkrun). Boots a lightweight ARM64 Linux VM on macOS (Apple Hypervisor.framework) or Linux (KVM) running a single-node k3s cluster with the OpenShell control plane. + +## Quick Start + +Build and run the VM in one command: + +```bash +mise run vm +``` + +This will: + +1. Compress runtime artifacts (libkrun, libkrunfw, gvproxy, rootfs) +2. Build the `openshell-vm` binary with embedded runtime +3. Codesign it (macOS) +4. Build the rootfs if needed +5. 
Boot the VM + +## Prerequisites + +- **macOS (Apple Silicon)** or **Linux (aarch64 with KVM)** +- Rust toolchain +- [mise](https://mise.jdx.dev/) task runner +- Docker (for rootfs builds) + +### macOS-Specific + +The binary must be codesigned with the Hypervisor.framework entitlement. The `mise run vm` flow handles this automatically. To codesign manually: + +```bash +codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm +``` + +## Build + +### Embedded Binary (Recommended) + +Produces a single self-extracting binary with all runtime artifacts baked in: + +```bash +mise run vm:build:embedded +``` + +On first run, the binary extracts its runtime to `~/.local/share/openshell/vm-runtime//`. + +### Quick Rebuild (Skip Rootfs) + +If you already have a cached rootfs tarball and just want to rebuild the binary: + +```bash +mise run vm:build:embedded:quick +``` + +### Force Full Rebuild + +Rebuilds everything including the rootfs: + +```bash +mise run vm:build +``` + +## Run + +### Default (Gateway Mode) + +Boots the full OpenShell gateway --- k3s + openshell-server + openshell-sandbox: + +```bash +mise run vm +``` + +Or run the binary directly: + +```bash +./target/debug/openshell-vm +``` + +### Custom Process + +Run an arbitrary process inside a fresh VM instead of k3s: + +```bash +./target/debug/openshell-vm --exec /bin/sh --vcpus 2 --mem 2048 +``` + +### Execute in a Running VM + +Attach to a running VM and run a command: + +```bash +./target/debug/openshell-vm exec -- ls / +./target/debug/openshell-vm exec -- sh # interactive shell +``` + +### Named Instances + +Run multiple isolated VM instances side-by-side: + +```bash +./target/debug/openshell-vm --name dev +./target/debug/openshell-vm --name staging +``` + +Each instance gets its own rootfs clone under `~/.local/share/openshell/openshell-vm/instances//`. 
+ +## CLI Reference + +``` +openshell-vm [OPTIONS] [COMMAND] + +Options: + --rootfs Path to aarch64 Linux rootfs directory + --name Named VM instance (auto-clones rootfs) + --exec Run a custom process instead of k3s + --args ... Arguments to the executable + --env ... Environment variables + --workdir Working directory inside the VM [default: /] + -p, --port ... Port mappings (host_port:guest_port) + --vcpus Virtual CPUs [default: 4 gateway, 2 exec] + --mem RAM in MiB [default: 8192 gateway, 2048 exec] + --krun-log-level <0-5> libkrun log level [default: 1] + --net Networking: gvproxy, tsi, none [default: gvproxy] + --reset Wipe runtime state before booting + +Subcommands: + exec Execute a command inside a running VM +``` + +## Rootfs + +The rootfs is an aarch64 Ubuntu filesystem containing k3s, pre-loaded container images, and the OpenShell binaries. + +### Full Rootfs (~2GB+) + +Pre-initialized k3s cluster state for fast boot (~3-5s): + +```bash +mise run vm:build:rootfs-tarball +``` + +### Minimal Rootfs (~200-300MB) + +Just k3s + supervisor, cold starts in ~30-60s: + +```bash +mise run vm:build:rootfs-tarball:minimal +``` + +## Custom Kernel (libkrunfw) + +The stock libkrunfw (e.g. from Homebrew) lacks bridge, netfilter, and conntrack support needed for pod networking. OpenShell builds a custom libkrunfw with these enabled. + +Build it: + +```bash +mise run vm:runtime:build-libkrunfw +``` + +See [`runtime/README.md`](runtime/README.md) for details on the kernel config and troubleshooting. 
+ +## Architecture + +``` +Host (macOS / Linux) + openshell-vm binary + ├── Embedded runtime (libkrun, libkrunfw, gvproxy, rootfs.tar.zst) + ├── FFI: loads libkrun at runtime via dlopen + ├── gvproxy: virtio-net networking (real eth0 + DHCP) + ├── virtio-fs: shares rootfs with guest + └── vsock: host-to-guest command execution (port 10777) + +Guest VM (aarch64 Linux) + PID 1: openshell-vm-init.sh + ├── Mounts filesystems, configures networking + ├── Sets up bridge CNI, generates PKI + └── Execs k3s server + ├── openshell-server (gateway control plane) + └── openshell-sandbox (pod supervisor) +``` + +## Environment Variables + +| Variable | When | Purpose | +|----------|------|---------| +| `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` | Build time | Path to compressed runtime artifacts | +| `OPENSHELL_VM_RUNTIME_DIR` | Runtime | Override the runtime bundle directory | +| `OPENSHELL_VM_DIAG=1` | Runtime | Enable diagnostic output inside the VM | + +## mise Tasks Reference + +| Task | Description | +|------|-------------| +| `vm` | Build and run the VM | +| `vm:build` | Force full rebuild including rootfs | +| `vm:build:embedded` | Build single binary with embedded runtime | +| `vm:build:embedded:quick` | Build using cached rootfs tarball | +| `vm:build:rootfs-tarball` | Build full rootfs tarball | +| `vm:build:rootfs-tarball:minimal` | Build minimal rootfs tarball | +| `vm:runtime:compress` | Compress runtime artifacts for embedding | +| `vm:runtime:build-libkrunfw` | Build custom libkrunfw | +| `vm:runtime:build-libkrun` | Build libkrun from source (Linux) | +| `vm:runtime:build-libkrun-macos` | Build libkrun from source (macOS) | +| `vm:check-capabilities` | Check VM kernel capabilities | + +## Testing + +Integration tests require a built rootfs and macOS ARM64 with libkrun: + +```bash +cargo test -p openshell-vm -- --ignored +``` + +Individual tests: + +```bash +# Full gateway boot test (boots VM, waits for gRPC on port 30051) +cargo test -p openshell-vm gateway_boots -- 
--ignored + +# Run a command inside the VM +cargo test -p openshell-vm gateway_exec_runs -- --ignored + +# Exec into a running VM +cargo test -p openshell-vm gateway_exec_attaches -- --ignored +``` + +Verify kernel capabilities inside a running VM: + +```bash +./target/debug/openshell-vm exec -- /srv/check-vm-capabilities.sh +./target/debug/openshell-vm exec -- /srv/check-vm-capabilities.sh --json +``` diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index a6c774dcc..ad9ff32dd 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -15,9 +15,10 @@ # 2. Run the relevant build script to verify. # 3. Commit pins.env alongside any script changes. -# ── k3s binary (arm64) ───────────────────────────────────────────────── +# ── k3s binary ───────────────────────────────────────────────────────── K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" K3S_ARM64_SHA256="${K3S_ARM64_SHA256:-228809a7ef47d25c1bdbe746944931ec2fd2edf842b9cf50f1dd4f9ec2505b0e}" +K3S_AMD64_SHA256="${K3S_AMD64_SHA256:-3ae8e35a62ac83e8e197c117858a564134057a7b8703cf73e67ce60d19f4a22b}" # ── Base Docker image (digest-pinned) ────────────────────────────────── # Tag: nvcr.io/nvidia/base/ubuntu:noble-20251013 diff --git a/crates/openshell-vm/runtime/build-custom-libkrunfw.sh b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh index 2bfbdbe56..c39461b4e 100755 --- a/crates/openshell-vm/runtime/build-custom-libkrunfw.sh +++ b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh @@ -149,7 +149,14 @@ else cat "$FRAGMENT" >> .config fi -make ARCH=arm64 olddefconfig +# Detect the kernel ARCH value from the host (or krunvm guest) architecture. 
+case "$(uname -m)" in + aarch64) KARCH="arm64" ;; + x86_64) KARCH="x86_64" ;; + *) KARCH="$(uname -m)" ;; +esac +echo " Kernel ARCH: ${KARCH}" +make ARCH="${KARCH}" olddefconfig # Verify critical configs are set REQUIRED=( diff --git a/crates/openshell-vm/scripts/build-rootfs-minimal.sh b/crates/openshell-vm/scripts/build-rootfs-minimal.sh index d73aeb221..cf7bb592e 100755 --- a/crates/openshell-vm/scripts/build-rootfs-minimal.sh +++ b/crates/openshell-vm/scripts/build-rootfs-minimal.sh @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Build a minimal aarch64 Ubuntu rootfs for embedding in openshell-vm. +# Build a minimal Ubuntu rootfs for embedding in openshell-vm. # # This produces a lightweight rootfs (~200-300MB) with: # - Base Ubuntu with k3s binary @@ -14,8 +14,11 @@ # First boot will be slower (~30-60s) as k3s initializes and pulls images, # but subsequent boots use cached state. # +# Supports aarch64 and x86_64 guest architectures. The target architecture +# is auto-detected from the host but can be overridden with --arch. +# # Usage: -# ./build-rootfs-minimal.sh [output_dir] +# ./build-rootfs-minimal.sh [--arch aarch64|x86_64] [output_dir] # # Requires: Docker, curl, helm @@ -30,8 +33,54 @@ if [ -f "$PINS_FILE" ]; then # shellcheck source=../pins.env source "$PINS_FILE" fi + +# ── Architecture detection ───────────────────────────────────────────── +# Allow override via --arch flag; default to host architecture. +GUEST_ARCH="" +POSITIONAL_ARGS=() +while [[ $# -gt 0 ]]; do + case "$1" in + --arch) + GUEST_ARCH="$2"; shift 2 ;; + *) + POSITIONAL_ARGS+=("$1"); shift ;; + esac +done + +if [ -z "$GUEST_ARCH" ]; then + case "$(uname -m)" in + aarch64|arm64) GUEST_ARCH="aarch64" ;; + x86_64) GUEST_ARCH="x86_64" ;; + *) + echo "ERROR: Unsupported host architecture: $(uname -m)" >&2 + echo " Use --arch aarch64 or --arch x86_64 to override." 
>&2 + exit 1 + ;; + esac +fi + +case "$GUEST_ARCH" in + aarch64) + DOCKER_PLATFORM="linux/arm64" + K3S_BINARY_SUFFIX="-arm64" + K3S_CHECKSUM_VAR="K3S_ARM64_SHA256" + RUST_TARGET="aarch64-unknown-linux-gnu" + ;; + x86_64) + DOCKER_PLATFORM="linux/amd64" + K3S_BINARY_SUFFIX="" # x86_64 binary has no suffix + K3S_CHECKSUM_VAR="K3S_AMD64_SHA256" + RUST_TARGET="x86_64-unknown-linux-gnu" + ;; + *) + echo "ERROR: Unsupported guest architecture: ${GUEST_ARCH}" >&2 + echo " Supported: aarch64, x86_64" >&2 + exit 1 + ;; +esac + DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" -ROOTFS_DIR="${1:-${DEFAULT_ROOTFS}}" +ROOTFS_DIR="${POSITIONAL_ARGS[0]:-${DEFAULT_ROOTFS}}" CONTAINER_NAME="krun-rootfs-minimal-builder" BASE_IMAGE_TAG="krun-rootfs:openshell-vm-minimal" K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" @@ -40,7 +89,18 @@ K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" +# Cross-platform checksum helper +verify_checksum() { + local expected="$1" file="$2" + if command -v sha256sum &>/dev/null; then + echo "${expected} ${file}" | sha256sum -c - + else + echo "${expected} ${file}" | shasum -a 256 -c - + fi +} + echo "==> Building minimal openshell-vm rootfs" +echo " Guest arch: ${GUEST_ARCH}" echo " k3s version: ${K3S_VERSION}" echo " Output: ${ROOTFS_DIR}" echo " Mode: minimal (no pre-loaded images, cold start)" @@ -80,29 +140,30 @@ if [ -f "${VM_STATE_FILE}" ]; then fi # ── Download k3s binary ───────────────────────────────────────────────── -K3S_BIN="/tmp/k3s-arm64-${K3S_VERSION}" +K3S_BIN="/tmp/k3s-${GUEST_ARCH}-${K3S_VERSION}" if [ -f "${K3S_BIN}" ]; then echo "==> Using cached k3s binary: ${K3S_BIN}" else - echo "==> Downloading k3s ${K3S_VERSION} for arm64..." - curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s-arm64" \ + echo "==> Downloading k3s ${K3S_VERSION} for ${GUEST_ARCH}..." 
+ curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s${K3S_BINARY_SUFFIX}" \ -o "${K3S_BIN}" chmod +x "${K3S_BIN}" fi # Verify k3s binary integrity. -if [ -n "${K3S_ARM64_SHA256:-}" ]; then +K3S_CHECKSUM="${!K3S_CHECKSUM_VAR:-}" +if [ -n "${K3S_CHECKSUM}" ]; then echo "==> Verifying k3s binary checksum..." - echo "${K3S_ARM64_SHA256} ${K3S_BIN}" | shasum -a 256 -c - + verify_checksum "${K3S_CHECKSUM}" "${K3S_BIN}" else - echo "WARNING: K3S_ARM64_SHA256 not set, skipping checksum verification" + echo "WARNING: ${K3S_CHECKSUM_VAR} not set, skipping checksum verification" fi # ── Build base image ─────────────────────────────────────────────────── docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" \ +docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} @@ -122,7 +183,7 @@ DOCKERFILE # Create container and export filesystem echo "==> Creating container..." -docker create --platform linux/arm64 --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true +docker create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true echo "==> Exporting filesystem..." if [ -d "${ROOTFS_DIR}" ]; then @@ -155,7 +216,7 @@ cp "${SCRIPT_DIR}/openshell-vm-exec-agent.py" "${ROOTFS_DIR}/srv/openshell-vm-ex chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" # ── Build and inject supervisor binary ───────────────────────────────── -SUPERVISOR_TARGET="aarch64-unknown-linux-gnu" +SUPERVISOR_TARGET="${RUST_TARGET}" SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" echo "==> Building openshell-sandbox supervisor binary (${SUPERVISOR_TARGET})..." 
diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d423ce5ab..8db62acc6 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Build an aarch64 Ubuntu rootfs for the openshell-vm microVM. +# Build a Ubuntu rootfs for the openshell-vm microVM. # # Produces a rootfs with k3s pre-installed, the OpenShell helm chart and # manifests baked in, container images pre-loaded, AND a fully initialized @@ -11,8 +11,11 @@ # On first VM boot, k3s resumes from this pre-baked state instead of # cold-starting, achieving ~3-5s startup times. # +# Supports aarch64 and x86_64 guest architectures. The target architecture +# is auto-detected from the host but can be overridden with --arch. +# # Usage: -# ./crates/openshell-vm/scripts/build-rootfs.sh [output_dir] +# ./crates/openshell-vm/scripts/build-rootfs.sh [--arch aarch64|x86_64] [output_dir] # # Requires: Docker (or compatible container runtime), curl, helm, zstd @@ -27,8 +30,54 @@ if [ -f "$PINS_FILE" ]; then # shellcheck source=../pins.env source "$PINS_FILE" fi + +# ── Architecture detection ───────────────────────────────────────────── +# Allow override via --arch flag; default to host architecture. +GUEST_ARCH="" +POSITIONAL_ARGS=() +while [[ $# -gt 0 ]]; do + case "$1" in + --arch) + GUEST_ARCH="$2"; shift 2 ;; + *) + POSITIONAL_ARGS+=("$1"); shift ;; + esac +done + +if [ -z "$GUEST_ARCH" ]; then + case "$(uname -m)" in + aarch64|arm64) GUEST_ARCH="aarch64" ;; + x86_64) GUEST_ARCH="x86_64" ;; + *) + echo "ERROR: Unsupported host architecture: $(uname -m)" >&2 + echo " Use --arch aarch64 or --arch x86_64 to override." 
>&2 + exit 1 + ;; + esac +fi + +case "$GUEST_ARCH" in + aarch64) + DOCKER_PLATFORM="linux/arm64" + K3S_BINARY_SUFFIX="-arm64" + K3S_CHECKSUM_VAR="K3S_ARM64_SHA256" + RUST_TARGET="aarch64-unknown-linux-gnu" + ;; + x86_64) + DOCKER_PLATFORM="linux/amd64" + K3S_BINARY_SUFFIX="" # x86_64 binary has no suffix + K3S_CHECKSUM_VAR="K3S_AMD64_SHA256" + RUST_TARGET="x86_64-unknown-linux-gnu" + ;; + *) + echo "ERROR: Unsupported guest architecture: ${GUEST_ARCH}" >&2 + echo " Supported: aarch64, x86_64" >&2 + exit 1 + ;; +esac + DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" -ROOTFS_DIR="${1:-${DEFAULT_ROOTFS}}" +ROOTFS_DIR="${POSITIONAL_ARGS[0]:-${DEFAULT_ROOTFS}}" CONTAINER_NAME="krun-rootfs-builder" INIT_CONTAINER_NAME="krun-k3s-init" BASE_IMAGE_TAG="krun-rootfs:openshell-vm" @@ -41,14 +90,25 @@ K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -# Container images to pre-load into k3s (arm64). +# Container images to pre-load into k3s. # AGENT_SANDBOX_IMAGE and COMMUNITY_SANDBOX_IMAGE are digest-pinned in pins.env. # SERVER_IMAGE is intentionally unpinned (local dev artifact). 
IMAGE_REPO_BASE="${IMAGE_REPO_BASE:-openshell}" IMAGE_TAG="${IMAGE_TAG:-dev}" SERVER_IMAGE="${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" +# Cross-platform checksum helper +verify_checksum() { + local expected="$1" file="$2" + if command -v sha256sum &>/dev/null; then + echo "${expected} ${file}" | sha256sum -c - + else + echo "${expected} ${file}" | shasum -a 256 -c - + fi +} + echo "==> Building openshell-vm rootfs" +echo " Guest arch: ${GUEST_ARCH}" echo " k3s version: ${K3S_VERSION}" echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" echo " Output: ${ROOTFS_DIR}" @@ -115,22 +175,23 @@ fi # ── Download k3s binary (outside Docker — much faster) ───────────────── -K3S_BIN="/tmp/k3s-arm64-${K3S_VERSION}" +K3S_BIN="/tmp/k3s-${GUEST_ARCH}-${K3S_VERSION}" if [ -f "${K3S_BIN}" ]; then echo "==> Using cached k3s binary: ${K3S_BIN}" else - echo "==> Downloading k3s ${K3S_VERSION} for arm64..." - curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s-arm64" \ + echo "==> Downloading k3s ${K3S_VERSION} for ${GUEST_ARCH}..." + curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s${K3S_BINARY_SUFFIX}" \ -o "${K3S_BIN}" chmod +x "${K3S_BIN}" fi # Verify k3s binary integrity. -if [ -n "${K3S_ARM64_SHA256:-}" ]; then +K3S_CHECKSUM="${!K3S_CHECKSUM_VAR:-}" +if [ -n "${K3S_CHECKSUM}" ]; then echo "==> Verifying k3s binary checksum..." - echo "${K3S_ARM64_SHA256} ${K3S_BIN}" | shasum -a 256 -c - + verify_checksum "${K3S_CHECKSUM}" "${K3S_BIN}" else - echo "WARNING: K3S_ARM64_SHA256 not set, skipping checksum verification" + echo "WARNING: ${K3S_CHECKSUM_VAR} not set, skipping checksum verification" fi # ── Build base image with dependencies ───────────────────────────────── @@ -140,7 +201,7 @@ docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." 
-docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" \ +docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} @@ -161,7 +222,7 @@ DOCKERFILE # Create a container and export the filesystem echo "==> Creating container..." -docker create --platform linux/arm64 --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true +docker create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true echo "==> Exporting filesystem..." # Previous builds may leave overlayfs work/ dirs with permissions that @@ -217,7 +278,7 @@ chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" # Dockerfile.cluster supervisor-builder stage; here we cross-compile # from the host using cargo-zigbuild. -SUPERVISOR_TARGET="aarch64-unknown-linux-gnu" +SUPERVISOR_TARGET="${RUST_TARGET}" SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" echo "==> Building openshell-sandbox supervisor binary (${SUPERVISOR_TARGET})..." @@ -281,9 +342,10 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do done # ── Pre-load container images ──────────────────────────────────────── -# Pull arm64 images and save as tarballs in the k3s airgap images -# directory. k3s auto-imports from /var/lib/rancher/k3s/agent/images/ -# on startup, so no internet access is needed at boot time. +# Pull images for the target architecture and save as tarballs in the +# k3s airgap images directory. k3s auto-imports from +# /var/lib/rancher/k3s/agent/images/ on startup, so no internet access +# is needed at boot time. # # Tarballs are cached in a persistent directory outside the rootfs so # they survive rebuilds. 
This avoids re-pulling and re-saving ~1 GiB @@ -293,7 +355,7 @@ IMAGES_DIR="${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" IMAGE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/openshell/openshell-vm/images" mkdir -p "${IMAGES_DIR}" "${IMAGE_CACHE_DIR}" -echo "==> Pre-loading container images (arm64)..." +echo "==> Pre-loading container images (${GUEST_ARCH})..." pull_and_save() { local image="$1" @@ -310,7 +372,7 @@ pull_and_save() { # Try to pull; if the registry is unavailable, fall back to the # local Docker image cache (image may exist from a previous pull). echo " pulling: ${image}..." - if ! docker pull --platform linux/arm64 "${image}" --quiet 2>/dev/null; then + if ! docker pull --platform "${DOCKER_PLATFORM}" "${image}" --quiet 2>/dev/null; then echo " pull failed, checking local Docker cache..." if ! docker image inspect "${image}" >/dev/null 2>&1; then echo "ERROR: image ${image} not available locally or from registry" @@ -469,13 +531,22 @@ fi # Helper: run a command inside the VM via the exec agent. vm_exec() { - DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" \ - "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" exec -- "$@" 2>&1 + if [ "$(uname -s)" = "Darwin" ]; then + DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" \ + "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" exec -- "$@" 2>&1 + else + LD_LIBRARY_PATH="${RUNTIME_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \ + "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" exec -- "$@" 2>&1 + fi } # Ensure no stale VM is using this rootfs. echo " Starting VM for pre-initialization..." 
-export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" +if [ "$(uname -s)" = "Darwin" ]; then + export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" +else + export LD_LIBRARY_PATH="${RUNTIME_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" +fi "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" --reset & VM_PID=$! diff --git a/crates/openshell-vm/src/ffi.rs b/crates/openshell-vm/src/ffi.rs index 6b661a88d..c8854fe15 100644 --- a/crates/openshell-vm/src/ffi.rs +++ b/crates/openshell-vm/src/ffi.rs @@ -239,12 +239,20 @@ fn compute_sha256(path: &Path) -> Result { let mut file = fs::File::open(path)?; - let mut child = Command::new("shasum") - .args(["-a", "256"]) + // sha256sum is standard on Linux; shasum ships with macOS/Perl. + let mut child = Command::new("sha256sum") .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::null()) - .spawn()?; + .spawn() + .or_else(|_| { + Command::new("shasum") + .args(["-a", "256"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn() + })?; // Stream file contents directly to shasum's stdin in 8KB chunks. { diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index e3e5f312a..ef91403d3 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -451,7 +451,26 @@ fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), VmError> { Ok(()) } -#[cfg(not(target_os = "macos"))] +#[cfg(target_os = "linux")] +fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), VmError> { + // On Linux, libkrun.so has a DT_NEEDED for libkrunfw.so. Even though we + // preload libkrunfw with RTLD_GLOBAL, the ELF dynamic linker still resolves + // DT_NEEDED entries through LD_LIBRARY_PATH / system paths. Without this, + // dlopen("libkrun.so") fails if libkrunfw.so is only in the runtime bundle. 
+ let existing = std::env::var_os("LD_LIBRARY_PATH"); + let mut paths = vec![runtime_dir.to_path_buf()]; + if let Some(existing) = existing { + paths.extend(std::env::split_paths(&existing)); + } + let joined = std::env::join_paths(paths) + .map_err(|e| VmError::HostSetup(format!("join LD_LIBRARY_PATH: {e}")))?; + unsafe { + std::env::set_var("LD_LIBRARY_PATH", joined); + } + Ok(()) +} + +#[cfg(not(any(target_os = "macos", target_os = "linux")))] fn configure_runtime_loader_env(_runtime_dir: &Path) -> Result<(), VmError> { Ok(()) } From ab2c8841cf1020ec29bed50ec096c0746795f9fe Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 20:37:20 -0700 Subject: [PATCH 08/10] feat(vm): add Linux networking support and recover stale kine DB on boot - Gate krun_add_net_unixgram FFI type, struct field, and VmContext method behind #[cfg(target_os = "macos")] to eliminate dead_code warnings on Linux - Add krun_add_net_unixstream FFI binding and add_net_unixstream VmContext method for Linux SOCK_STREAM networking via gvproxy QEMU listener - Switch gvproxy launch flags: -listen-qemu (Linux) vs -listen-vfkit (macOS) - Add recover_stale_kine_db: removes corrupt or bootstrap-locked SQLite kine database before boot so k3s always starts with a clean schema - Drop version segment from rootfs and runtime cache paths to avoid stale directories accumulating across upgrades - Improve build-libkrun.sh: two-phase kernel config merge using merge_config.sh and auto-detect LIBCLANG_PATH for clang-sys on Debian/Ubuntu - Ensure libkrunfw.so.5 soname file is present alongside versioned artifact - Add bundle-vm-runtime.sh task and vm:bundle-runtime mise target - Pin AGENT_SANDBOX_IMAGE to updated digest - Add pyelftools dev dependency --- crates/openshell-bootstrap/src/paths.rs | 7 +- crates/openshell-vm/build.rs | 15 ++++ crates/openshell-vm/pins.env | 2 +- crates/openshell-vm/src/embedded.rs | 8 +- crates/openshell-vm/src/exec.rs | 90 +++++++++++++++++++++ 
crates/openshell-vm/src/ffi.rs | 13 +++ crates/openshell-vm/src/lib.rs | 71 ++++++++++++++--- pyproject.toml | 1 + tasks/scripts/build-libkrun.sh | 102 ++++++++++++++++++++++-- tasks/scripts/bundle-vm-runtime.sh | 65 +++++++++++++++ tasks/scripts/compress-vm-runtime.sh | 18 +++++ tasks/vm.toml | 8 ++ uv.lock | 11 +++ 13 files changed, 380 insertions(+), 31 deletions(-) create mode 100755 tasks/scripts/bundle-vm-runtime.sh diff --git a/crates/openshell-bootstrap/src/paths.rs b/crates/openshell-bootstrap/src/paths.rs index b8028b4b4..d11888e1f 100644 --- a/crates/openshell-bootstrap/src/paths.rs +++ b/crates/openshell-bootstrap/src/paths.rs @@ -28,16 +28,11 @@ pub fn last_sandbox_path(gateway: &str) -> Result { /// Default rootfs directory for gateway microVMs. /// -/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/rootfs` -/// -/// The version is taken from the CARGO_PKG_VERSION at build time, allowing -/// multiple versions to coexist and enabling clean upgrades. +/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/rootfs` pub fn default_rootfs_dir() -> Result { - const VERSION: &str = env!("CARGO_PKG_VERSION"); Ok(xdg_data_dir()? .join("openshell") .join("openshell-vm") - .join(VERSION) .join("rootfs")) } diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs index 437f4f9be..89013838c 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -18,6 +18,21 @@ use std::{env, fs}; fn main() { println!("cargo:rerun-if-env-changed=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR"); + // Re-run if any compressed artifact changes. 
+ if let Ok(dir) = env::var("OPENSHELL_VM_RUNTIME_COMPRESSED_DIR") { + println!("cargo:rerun-if-changed={dir}"); + for name in &[ + "libkrun.so.zst", + "libkrunfw.so.5.zst", + "libkrun.dylib.zst", + "libkrunfw.5.dylib.zst", + "gvproxy.zst", + "rootfs.tar.zst", + ] { + println!("cargo:rerun-if-changed={dir}/{name}"); + } + } + let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set")); let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index ad9ff32dd..557bc2d3d 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -26,7 +26,7 @@ VM_BASE_IMAGE="${VM_BASE_IMAGE:-nvcr.io/nvidia/base/ubuntu@sha256:43fa5063e80fbb # ── Container images for rootfs pre-loading (digest-pinned) ──────────── # Tag: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0 -AGENT_SANDBOX_IMAGE="${AGENT_SANDBOX_IMAGE:-registry.k8s.io/agent-sandbox/agent-sandbox-controller@sha256:b536762a159b121af18bc004741235160605075ce826f16f95a2103afe2ef4db}" +AGENT_SANDBOX_IMAGE="${AGENT_SANDBOX_IMAGE:-registry.k8s.io/agent-sandbox/agent-sandbox-controller@sha256:ba71ea40ae0872791197badf2ab84f3f482df3902f1fce7ca9e076b1de9b57f6}" # Tag: ghcr.io/nvidia/openshell-community/sandboxes/base:latest COMMUNITY_SANDBOX_IMAGE="${COMMUNITY_SANDBOX_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base@sha256:d446c17105e7448e602238a8a5a4ddd0233c071082406522f81c31f8b1309525}" diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs index d898c269a..c301cfb24 100644 --- a/crates/openshell-vm/src/embedded.rs +++ b/crates/openshell-vm/src/embedded.rs @@ -8,7 +8,7 @@ //! //! Cache locations: //! - Runtime: `~/.local/share/openshell/vm-runtime/{version}/` -//! - Rootfs: `~/.local/share/openshell/openshell-vm/{version}/rootfs/` +//! 
- Rootfs: `~/.local/share/openshell/openshell-vm/rootfs/` use std::fs; use std::io::{Read, Write}; @@ -229,11 +229,7 @@ fn runtime_cache_base() -> Result { fn rootfs_cache_dir() -> Result { let base = openshell_core::paths::xdg_data_dir() .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; - Ok(base - .join("openshell") - .join("openshell-vm") - .join(VERSION) - .join("rootfs")) + Ok(base.join("openshell").join("openshell-vm").join("rootfs")) } fn rootfs_cache_base() -> Result { diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 961b5d7cf..de6bf4a6a 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -212,6 +212,96 @@ pub fn reset_runtime_state(rootfs: &Path, gateway_name: &str) -> Result<(), VmEr Ok(()) } +/// Recover from a corrupt or bootstrap-locked kine (SQLite) database. +/// +/// k3s uses kine with a SQLite backend at `var/lib/rancher/k3s/server/db/state.db`. +/// If the VM is killed mid-write (SIGKILL, host crash, power loss), SQLite may +/// not have checkpointed its WAL, leaving the database in one of two broken states: +/// +/// 1. **Corrupt file** — the SQLite header magic is missing or the file is +/// truncated. k3s opens the DB, gets `SQLITE_NOTADB` / `SQLITE_CORRUPT`, +/// and crashes at startup. +/// +/// 2. **Stale bootstrap lock** — the kine schema is intact but a previous server +/// instance left the kine bootstrap lock held. k3s loops forever on +/// "Bootstrap key already locked — waiting for data to be populated by +/// another server" every 5 s. The VM appears booted but k3s never becomes +/// ready. +/// +/// Neither condition can be fixed inside the VM: k3s reads `state.db` before the +/// init script has any chance to intervene. The only reliable recovery is to +/// delete the file before boot and let k3s create a fresh one. 
+/// +/// Since `state.db` contains only ephemeral single-node cluster state — all +/// Kubernetes objects are re-applied from the auto-deploy manifests in +/// `server/manifests/` on every cold start — removing it on every boot is safe. +/// +/// **What is lost:** cluster object records (Pods, Deployments, etc.) and the +/// bootstrap token. Both are re-created from manifests on boot. +/// +/// **What is preserved:** all container images and snapshots (under `k3s/agent/`), +/// PKI, and the `.initialized` sentinel. +/// +/// This function is a no-op if `state.db` does not exist (e.g. first boot or +/// after a full `--reset`). +pub fn recover_stale_kine_db(rootfs: &Path) { + let db_path = rootfs.join("var/lib/rancher/k3s/server/db/state.db"); + if !db_path.exists() { + return; // Nothing to check. + } + + // The SQLite file format begins with a 16-byte magic string. + // Reference: https://www.sqlite.org/fileformat.html#the_database_header + const SQLITE_MAGIC: &[u8] = b"SQLite format 3\x00"; + + let corrupt = match fs::read(&db_path) { + Err(_) => true, // Can't read → treat as corrupt. + Ok(bytes) if bytes.len() < 100 => true, // Too short to be a valid DB. + Ok(bytes) => !bytes.starts_with(SQLITE_MAGIC), + }; + + if corrupt { + eprintln!( + "Warning: kine database is corrupt ({}), removing for clean boot", + db_path.display() + ); + if let Err(e) = fs::remove_file(&db_path) { + eprintln!("Warning: failed to remove corrupt kine database: {e}"); + } + // Also remove any WAL/SHM sidecar files left by the interrupted write. + let _ = fs::remove_file(db_path.with_extension("db-wal")); + let _ = fs::remove_file(db_path.with_extension("db-shm")); + return; + } + + // The file has a valid SQLite header. Even a structurally intact DB can + // cause k3s to hang forever with "Bootstrap key already locked — waiting + // for data to be populated by another server". 
This is a kine application- + // level lock: when a k3s server starts, kine marks bootstrap as "in + // progress"; if that server is killed before it finishes, the lock row + // persists and no subsequent server can complete bootstrap. + // + // There is no reliable way to detect this condition without executing + // SQLite queries (which would require a dependency). However, `state.db` + // contains only ephemeral cluster state for this single-node VM — all + // Kubernetes objects are re-created from the auto-deploy manifests in + // `server/manifests/` on every cold start. Removing the DB on every boot + // is therefore safe and guarantees a clean bootstrap every time, at the + // cost of ~1-2s extra startup time for k3s to recreate the schema. + // + // The DB is NOT removed under --reset (reset_runtime_state wipes the + // entire k3s/server/ tree, which is a superset of this operation). + eprintln!( + "Removing stale kine database for clean boot ({})", + db_path.display() + ); + if let Err(e) = fs::remove_file(&db_path) { + eprintln!("Warning: failed to remove kine database: {e}"); + } + let _ = fs::remove_file(db_path.with_extension("db-wal")); + let _ = fs::remove_file(db_path.with_extension("db-shm")); +} + /// Acquire an exclusive lock on the rootfs lock file. /// /// The lock is held for the lifetime of the returned `File` handle. 
When diff --git a/crates/openshell-vm/src/ffi.rs b/crates/openshell-vm/src/ffi.rs index c8854fe15..0bfd1f7c8 100644 --- a/crates/openshell-vm/src/ffi.rs +++ b/crates/openshell-vm/src/ffi.rs @@ -61,6 +61,7 @@ type KrunAddVsockPort2 = type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunAddVsock = unsafe extern "C" fn(ctx_id: u32, tsi_features: u32) -> i32; +#[cfg(target_os = "macos")] type KrunAddNetUnixgram = unsafe extern "C" fn( ctx_id: u32, c_path: *const c_char, @@ -69,6 +70,14 @@ type KrunAddNetUnixgram = unsafe extern "C" fn( features: u32, flags: u32, ) -> i32; +type KrunAddNetUnixstream = unsafe extern "C" fn( + ctx_id: u32, + c_path: *const c_char, + fd: i32, + c_mac: *const u8, + features: u32, + flags: u32, +) -> i32; pub struct LibKrun { pub krun_init_log: KrunInitLog, @@ -84,7 +93,9 @@ pub struct LibKrun { pub krun_start_enter: KrunStartEnter, pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, pub krun_add_vsock: KrunAddVsock, + #[cfg(target_os = "macos")] pub krun_add_net_unixgram: KrunAddNetUnixgram, + pub krun_add_net_unixstream: KrunAddNetUnixstream, } static LIBKRUN: OnceLock = OnceLock::new(); @@ -155,7 +166,9 @@ impl LibKrun { &path, )?, krun_add_vsock: load_symbol(library, b"krun_add_vsock\0", &path)?, + #[cfg(target_os = "macos")] krun_add_net_unixgram: load_symbol(library, b"krun_add_net_unixgram\0", &path)?, + krun_add_net_unixstream: load_symbol(library, b"krun_add_net_unixstream\0", &path)?, }) } } diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index ef91403d3..b2612897b 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -26,8 +26,8 @@ use std::time::Instant; pub use exec::{ VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_running_vm, reset_runtime_state, vm_exec_socket_path, - vm_state_path, 
write_vm_runtime_state, + ensure_vm_not_running, exec_running_vm, recover_stale_kine_db, reset_runtime_state, + vm_exec_socket_path, vm_state_path, write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -624,6 +624,7 @@ impl VmContext { } } + #[cfg(target_os = "macos")] fn add_net_unixgram( &self, socket_path: &Path, @@ -647,6 +648,28 @@ impl VmContext { } } + fn add_net_unixstream( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixstream)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + 0, + ), + "krun_add_net_unixstream", + ) + } + } + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; @@ -905,6 +928,14 @@ pub fn launch(config: &VmConfig) -> Result { None }; + // Recover from a corrupt or bootstrap-locked kine (SQLite) database. + // Runs on every normal boot (not under --reset, which wipes the entire + // k3s/server/ tree anyway). Must happen after the lock so we know no + // other VM process is using the rootfs. + if !config.reset && config.exec_path == "/srv/openshell-vm-init.sh" { + recover_stale_kine_db(&config.rootfs); + } + // Wipe stale containerd/kubelet runtime state if requested. // This must happen after the lock (to confirm no other VM is using // the rootfs) but before booting (so the new VM starts clean). @@ -970,7 +1001,7 @@ pub fn launch(config: &VmConfig) -> Result { .to_path_buf(); let rootfs_key = vm_rootfs_key(&config.rootfs); let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let vfkit_sock = sock_base.with_extension("v"); + let net_sock = sock_base.with_extension("v"); let api_sock = sock_base.with_extension("a"); // Kill any stale gvproxy process from a previous run. 
@@ -979,8 +1010,8 @@ pub fn launch(config: &VmConfig) -> Result { kill_stale_gvproxy(&config.rootfs); // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint). - let _ = std::fs::remove_file(&vfkit_sock); + // libkrun creates as its datagram endpoint on macOS). + let _ = std::fs::remove_file(&net_sock); let _ = std::fs::remove_file(&api_sock); let krun_sock = sock_base.with_extension("v-krun.sock"); let _ = std::fs::remove_file(&krun_sock); @@ -991,9 +1022,21 @@ pub fn launch(config: &VmConfig) -> Result { let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); let gvproxy_log_file = std::fs::File::create(&gvproxy_log) .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + + // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit + // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. + #[cfg(target_os = "linux")] + let (gvproxy_net_flag, gvproxy_net_url) = + ("-listen-qemu", format!("unix://{}", net_sock.display())); + #[cfg(target_os = "macos")] + let (gvproxy_net_flag, gvproxy_net_url) = ( + "-listen-vfkit", + format!("unixgram://{}", net_sock.display()), + ); + let child = std::process::Command::new(binary) - .arg("-listen-vfkit") - .arg(format!("unixgram://{}", vfkit_sock.display())) + .arg(gvproxy_net_flag) + .arg(&gvproxy_net_url) .arg("-listen") .arg(format!("unix://{}", api_sock.display())) .arg("-ssh-port") @@ -1014,7 +1057,7 @@ pub fn launch(config: &VmConfig) -> Result { { let deadline = Instant::now() + std::time::Duration::from_secs(5); let mut interval = std::time::Duration::from_millis(5); - while !vfkit_sock.exists() { + while !net_sock.exists() { if Instant::now() >= deadline { return Err(VmError::Fork( "gvproxy socket did not appear within 5s".to_string(), @@ -1046,9 +1089,17 @@ pub fn launch(config: &VmConfig) -> Result { | NET_FEATURE_GUEST_UFO | NET_FEATURE_HOST_TSO4 | NET_FEATURE_HOST_UFO; - const NET_FLAG_VFKIT: u32 = 1 << 0; - 
vm.add_net_unixgram(&vfkit_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; + // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's + // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit + // magic byte for the vfkit listener. + #[cfg(target_os = "linux")] + vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?; + #[cfg(target_os = "macos")] + { + const NET_FLAG_VFKIT: u32 = 1 << 0; + vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; + } eprintln!( "Networking: gvproxy (virtio-net) [{:.1}s]", diff --git a/pyproject.toml b/pyproject.toml index 60d5177d5..899885929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dev = [ "maturin>=1.5,<2.0", "setuptools-scm>=8", "grpcio-tools>=1.60", + "pyelftools>=0.30", ] docs = [ "sphinx<=7.5", diff --git a/tasks/scripts/build-libkrun.sh b/tasks/scripts/build-libkrun.sh index 99eb140b7..7a31430ae 100755 --- a/tasks/scripts/build-libkrun.sh +++ b/tasks/scripts/build-libkrun.sh @@ -43,7 +43,7 @@ install_deps() { if command -v apt-get &>/dev/null; then # Debian/Ubuntu - DEPS="build-essential git python3 python3-pyelftools flex bison libelf-dev libssl-dev bc curl" + DEPS="build-essential git python3 python3-pyelftools flex bison libelf-dev libssl-dev bc curl libclang-dev" MISSING="" for dep in $DEPS; do if ! dpkg -s "$dep" &>/dev/null; then @@ -60,7 +60,7 @@ install_deps() { elif command -v dnf &>/dev/null; then # Fedora/RHEL - DEPS="make git python3 python3-pyelftools gcc flex bison elfutils-libelf-devel openssl-devel bc glibc-static curl" + DEPS="make git python3 python3-pyelftools gcc flex bison elfutils-libelf-devel openssl-devel bc glibc-static curl clang-devel" echo " Installing dependencies via dnf..." 
sudo dnf install -y $DEPS @@ -90,23 +90,95 @@ fi cd libkrunfw -# Copy custom kernel config +# Copy custom kernel config fragment if [ -f "$KERNEL_CONFIG" ]; then cp "$KERNEL_CONFIG" openshell.kconfig - echo " Applied custom kernel config: openshell.kconfig" + echo " Applied custom kernel config fragment: openshell.kconfig" else echo "Warning: Custom kernel config not found at ${KERNEL_CONFIG}" >&2 echo " Building with default config (k3s networking may not work)" >&2 fi -# Build libkrunfw echo " Building kernel and libkrunfw (this may take 15-20 minutes)..." -if [ -f openshell.kconfig ]; then - make KCONFIG_FRAGMENT=openshell.kconfig -j"$(nproc)" + +# The libkrunfw Makefile does not support a config fragment — it copies the +# base config and runs olddefconfig, then builds the kernel image in one +# make invocation. We cannot inject the fragment mid-build via make flags. +# +# Instead we drive the build in two phases: +# +# Phase 1: Run the Makefile's $(KERNEL_SOURCES) target, which: +# - downloads and extracts the kernel tarball (if needed) +# - applies patches +# - copies config-libkrunfw_aarch64 to $(KERNEL_SOURCES)/.config +# - runs olddefconfig +# +# Phase 2: Merge our fragment on top of the .config produced by Phase 1 +# using the kernel's own merge_config.sh, then re-run olddefconfig +# to resolve new dependency chains (e.g. CONFIG_BRIDGE pulls in +# CONFIG_BRIDGE_NETFILTER which needs CONFIG_NETFILTER etc). +# +# Phase 3: Let the Makefile build everything (kernel + kernel.c + .so), +# skipping the $(KERNEL_SOURCES) target since it already exists. + +KERNEL_VERSION="$(grep '^KERNEL_VERSION' Makefile | head -1 | awk '{print $3}')" +KERNEL_SOURCES="${KERNEL_VERSION}" + +# Phase 1: prepare kernel source tree + base .config. +# Run the Makefile's $(KERNEL_SOURCES) target whenever the .config is absent +# (either because the tree was never extracted, or because it was cleaned). 
+# The target is idempotent: if the directory already exists make skips the +# tarball extraction but still copies the base config and runs olddefconfig. +if [ ! -f "${KERNEL_SOURCES}/.config" ]; then + echo " Phase 1: preparing kernel source tree and base .config..." + # Remove the directory so make re-runs the full $(KERNEL_SOURCES) recipe + # (extract + patch + config copy + olddefconfig). + rm -rf "${KERNEL_SOURCES}" + make "${KERNEL_SOURCES}" else - make -j"$(nproc)" + echo " Phase 1: kernel source tree and .config already present, skipping" +fi + +# Phase 2: merge the openshell fragment on top +if [ -f openshell.kconfig ]; then + echo " Phase 2: merging openshell.kconfig fragment..." + + # merge_config.sh must be called with ARCH set so it finds the right Kconfig + # entry points. -m means "merge into existing .config" (vs starting fresh). + ARCH=arm64 KCONFIG_CONFIG="${KERNEL_SOURCES}/.config" \ + "${KERNEL_SOURCES}/scripts/kconfig/merge_config.sh" \ + -m -O "${KERNEL_SOURCES}" \ + "${KERNEL_SOURCES}/.config" \ + openshell.kconfig + + # Re-run olddefconfig to fill in any new symbols introduced by the fragment. + make -C "${KERNEL_SOURCES}" ARCH=arm64 olddefconfig + + # Verify that the key options were actually applied. + all_ok=true + for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT; do + val="$(grep "^${opt}=" "${KERNEL_SOURCES}/.config" 2>/dev/null || true)" + if [ -n "$val" ]; then + echo " ${opt}: ${val#*=}" + else + echo " WARNING: ${opt} not set after merge!" >&2 + all_ok=false + fi + done + if [ "$all_ok" = false ]; then + echo "ERROR: kernel config fragment merge failed — required options missing" >&2 + exit 1 + fi + + # The kernel binary and kernel.c from the previous (bad) build must be + # removed so make rebuilds them with the updated .config. 
+ rm -f kernel.c "${KERNEL_SOURCES}/arch/arm64/boot/Image" \ + "${KERNEL_SOURCES}/vmlinux" libkrunfw.so* fi +# Phase 3: build kernel image, kernel.c bundle, and the shared library +make -j"$(nproc)" + # Copy output cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" @@ -127,6 +199,20 @@ cd libkrun # Build with NET support for gvproxy networking echo " Building libkrun with NET=1..." + +# Locate libclang for clang-sys if LIBCLANG_PATH isn't already set. +# clang-sys looks for libclang.so or libclang-*.so; on Debian/Ubuntu the +# versioned file (e.g. libclang-18.so.18) lives under the LLVM lib dir. +if [ -z "${LIBCLANG_PATH:-}" ]; then + for llvm_lib in /usr/lib/llvm-*/lib; do + if ls "$llvm_lib"/libclang*.so* &>/dev/null; then + export LIBCLANG_PATH="$llvm_lib" + echo " LIBCLANG_PATH=$LIBCLANG_PATH" + break + fi + done +fi + make NET=1 -j"$(nproc)" # Copy output diff --git a/tasks/scripts/bundle-vm-runtime.sh b/tasks/scripts/bundle-vm-runtime.sh new file mode 100755 index 000000000..bc7c03ca1 --- /dev/null +++ b/tasks/scripts/bundle-vm-runtime.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Stage the openshell-vm sidecar runtime bundle next to local build outputs. +# +# Copies the uncompressed VM runtime libraries (libkrun, libkrunfw, gvproxy) +# from target/vm-runtime/ into the .runtime sidecar directories alongside +# each build output. This is required for: +# - build-rootfs.sh pre-initialization (boots the real VM to pre-bake k3s state) +# - Direct invocation of target/debug/openshell-vm without embedding +# +# The source artifacts are collected by compress-vm-runtime.sh into +# target/vm-runtime/ before compression; this script re-uses that work dir. 
+# +# Usage: +# ./tasks/scripts/bundle-vm-runtime.sh + +set -euo pipefail + +ROOT="$(git rev-parse --show-toplevel 2>/dev/null)" || ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +SOURCE_DIR="${ROOT}/target/vm-runtime" + +if [ ! -d "${SOURCE_DIR}" ]; then + echo "ERROR: VM runtime source not found at ${SOURCE_DIR}" + echo " Run: mise run vm:runtime:compress" + exit 1 +fi + +# Verify required files are present +for required in libkrun.so gvproxy; do + if ! ls "${SOURCE_DIR}/${required}" >/dev/null 2>&1; then + # Try platform-specific variants + if [ "$required" = "libkrun.so" ] && ls "${SOURCE_DIR}"/libkrun.dylib >/dev/null 2>&1; then + continue + fi + echo "ERROR: Required runtime file not found: ${SOURCE_DIR}/${required}" + echo " Run: mise run vm:runtime:compress" + exit 1 + fi +done + +TARGETS=( + "${ROOT}/target/debug" + "${ROOT}/target/release" +) + +for target_dir in "${TARGETS[@]}"; do + # Only stage if the binary exists (avoid creating orphan runtime dirs) + if [ ! -f "${target_dir}/openshell-vm" ] && [ ! -f "${target_dir}/openshell-vm.d" ]; then + continue + fi + + runtime_dir="${target_dir}/openshell-vm.runtime" + mkdir -p "${runtime_dir}" + + for file in "${SOURCE_DIR}"/*; do + [ -f "$file" ] || continue + name="$(basename "$file")" + install -m 0755 "$file" "${runtime_dir}/${name}" + done + + echo "staged runtime bundle in ${runtime_dir}" +done diff --git a/tasks/scripts/compress-vm-runtime.sh b/tasks/scripts/compress-vm-runtime.sh index 2159a59eb..18b1589b1 100755 --- a/tasks/scripts/compress-vm-runtime.sh +++ b/tasks/scripts/compress-vm-runtime.sh @@ -204,6 +204,15 @@ case "$(uname -s)-$(uname -m)" in cp "$krunfw" "$WORK_DIR/" done + # Ensure the soname symlink (libkrunfw.so.5) exists alongside the fully + # versioned file (libkrunfw.so.5.x.y). libloading loads by soname. + if [ ! 
-f "$WORK_DIR/libkrunfw.so.5" ]; then + versioned=$(ls "$WORK_DIR"/libkrunfw.so.5.* 2>/dev/null | head -n1) + if [ -n "$versioned" ]; then + cp "$versioned" "$WORK_DIR/libkrunfw.so.5" + fi + fi + # Download gvproxy if not present if [ ! -f "$WORK_DIR/gvproxy" ]; then echo " Downloading gvproxy for linux-arm64..." @@ -231,6 +240,15 @@ case "$(uname -s)-$(uname -m)" in cp "$krunfw" "$WORK_DIR/" done + # Ensure the soname symlink (libkrunfw.so.5) exists alongside the fully + # versioned file (libkrunfw.so.5.x.y). libloading loads by soname. + if [ ! -f "$WORK_DIR/libkrunfw.so.5" ]; then + versioned=$(ls "$WORK_DIR"/libkrunfw.so.5.* 2>/dev/null | head -n1) + if [ -n "$versioned" ]; then + cp "$versioned" "$WORK_DIR/libkrunfw.so.5" + fi + fi + # Download gvproxy if not present if [ ! -f "$WORK_DIR/gvproxy" ]; then echo " Downloading gvproxy for linux-amd64..." diff --git a/tasks/vm.toml b/tasks/vm.toml index ee2cbbb81..5d61a60b5 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -9,6 +9,7 @@ [vm] description = "Build and run the standalone openshell-vm microVM" +depends = ["build:docker:gateway"] run = [ "mise run vm:build:embedded", "tasks/scripts/ensure-vm-rootfs.sh", @@ -19,6 +20,7 @@ hide = false ["vm:build"] description = "Force a fresh openshell-vm rebuild, including the rootfs" +depends = ["build:docker:gateway"] run = [ "mise run vm:build:embedded", "OPENSHELL_VM_FORCE_ROOTFS_REBUILD=1 tasks/scripts/ensure-vm-rootfs.sh", @@ -36,9 +38,15 @@ run = [ "mise run vm:runtime:compress", "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", "tasks/scripts/codesign-openshell-vm.sh", + "tasks/scripts/bundle-vm-runtime.sh", ] hide = false +["vm:bundle-runtime"] +description = "Stage the openshell-vm sidecar runtime bundle next to local build outputs" +run = "tasks/scripts/bundle-vm-runtime.sh" +hide = false + ["vm:build:binary"] description = "Build the standalone openshell-vm binary (requires vm:runtime:compress first)" run = 
"OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm" diff --git a/uv.lock b/uv.lock index 687a035ae..cbed4151e 100644 --- a/uv.lock +++ b/uv.lock @@ -537,6 +537,7 @@ dependencies = [ dev = [ { name = "grpcio-tools" }, { name = "maturin" }, + { name = "pyelftools" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, @@ -566,6 +567,7 @@ requires-dist = [ dev = [ { name = "grpcio-tools", specifier = ">=1.60" }, { name = "maturin", specifier = ">=1.5,<2.0" }, + { name = "pyelftools", specifier = ">=0.30" }, { name = "pytest", specifier = ">=8.0" }, { name = "pytest-asyncio", specifier = ">=0.23" }, { name = "pytest-cov", specifier = ">=4.0" }, @@ -635,6 +637,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/0d/8ba33fa83a7dcde13eb3c1c2a0c1cc29950a048bfed6d9b0d8b6bd710b4c/pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde", size = 6723264, upload-time = "2024-12-17T10:53:35.645Z" }, ] +[[package]] +name = "pyelftools" +version = "0.32" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/ab/33968940b2deb3d92f5b146bc6d4009a5f95d1d06c148ea2f9ee965071af/pyelftools-0.32.tar.gz", hash = "sha256:6de90ee7b8263e740c8715a925382d4099b354f29ac48ea40d840cf7aa14ace5", size = 15047199, upload-time = "2025-02-19T14:20:05.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/43/700932c4f0638c3421177144a2e86448c0d75dbaee2c7936bda3f9fd0878/pyelftools-0.32-py3-none-any.whl", hash = "sha256:013df952a006db5e138b1edf6d8a68ecc50630adbd0d83a2d41e7f846163d738", size = 188525, upload-time = "2025-02-19T14:19:59.919Z" }, +] + [[package]] name = "pygments" version = "2.19.2" From 9c510a43be70f0d5a4b1e218d071316c1b1f953e Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 23:01:22 -0700 Subject: [PATCH 09/10] refactor(vm): extract per-gateway 
rootfs instead of cloning from shared template Each gateway instance now gets its own rootfs extracted directly from the embedded tarball, eliminating the shared template rootfs that all instances previously cloned from. The default (unnamed) gateway is now treated as a named instance with name "default", removing all Option-based branching. - Replace ensure_rootfs_extracted() with extract_rootfs_to(dest) in embedded.rs - Rewrite ensure_named_rootfs() to extract instead of clone - Remove clone_rootfs() and DEFAULT_GATEWAY_NAME - Default --name CLI arg to "default", simplifying all call sites - Remove dead default_rootfs_dir() from paths.rs --- crates/openshell-bootstrap/src/paths.rs | 10 -- crates/openshell-vm/src/embedded.rs | 65 ++++++------ crates/openshell-vm/src/exec.rs | 3 +- crates/openshell-vm/src/lib.rs | 128 ++++++++---------------- crates/openshell-vm/src/main.rs | 39 ++++---- 5 files changed, 94 insertions(+), 151 deletions(-) diff --git a/crates/openshell-bootstrap/src/paths.rs b/crates/openshell-bootstrap/src/paths.rs index d11888e1f..1c514f370 100644 --- a/crates/openshell-bootstrap/src/paths.rs +++ b/crates/openshell-bootstrap/src/paths.rs @@ -26,16 +26,6 @@ pub fn last_sandbox_path(gateway: &str) -> Result { Ok(gateways_dir()?.join(gateway).join("last_sandbox")) } -/// Default rootfs directory for gateway microVMs. -/// -/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/rootfs` -pub fn default_rootfs_dir() -> Result { - Ok(xdg_data_dir()? - .join("openshell") - .join("openshell-vm") - .join("rootfs")) -} - /// Base directory for openshell-vm data (without version). 
/// /// Location: `$XDG_DATA_HOME/openshell/openshell-vm/` diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs index c301cfb24..aff39a4ee 100644 --- a/crates/openshell-vm/src/embedded.rs +++ b/crates/openshell-vm/src/embedded.rs @@ -157,54 +157,65 @@ pub fn runtime_cache_path() -> Result { runtime_cache_dir() } -/// Ensures the embedded rootfs is extracted to the cache directory. +/// Extract the embedded rootfs to the given destination directory. /// -/// Returns the path to the rootfs directory. +/// If the destination already exists, it is returned as-is (no re-extraction). +/// Otherwise the embedded `rootfs.tar.zst` is decompressed and unpacked into `dest`. /// -/// On first call, this extracts the compressed embedded rootfs tarball to the cache. -/// Subsequent calls return the cached path if valid. -pub fn ensure_rootfs_extracted() -> Result { - // Check if embedded rootfs is available (non-empty) +/// A `.version` marker is written after successful extraction so that +/// version-mismatched rootfs directories are detected and rebuilt. +pub fn extract_rootfs_to(dest: &Path) -> Result<(), VmError> { if resources::ROOTFS.is_empty() { return Err(VmError::HostSetup( "Rootfs not embedded. Build with: mise run vm:build:embedded".to_string(), )); } - let rootfs_dir = rootfs_cache_dir()?; - let version_marker = rootfs_dir.join(".version"); + let version_marker = dest.join(".version"); - // Check if already extracted with correct version + // Already extracted with the correct version — nothing to do. 
if version_marker.exists() { if let Ok(cached_version) = fs::read_to_string(&version_marker) { if cached_version.trim() == VERSION { tracing::debug!( - path = %rootfs_dir.display(), + path = %dest.display(), "Using cached rootfs" ); - return Ok(rootfs_dir); + return Ok(()); } } } - // Clean up old versions before extracting new one - cleanup_old_rootfs_versions(&rootfs_dir)?; - - // Remove existing if present (version mismatch) - if rootfs_dir.exists() { - eprintln!("Removing outdated rootfs..."); - fs::remove_dir_all(&rootfs_dir) + // Remove existing if present (version mismatch or incomplete extraction). + if dest.exists() { + eprintln!("Removing outdated rootfs at {}...", dest.display()); + fs::remove_dir_all(dest) .map_err(|e| VmError::HostSetup(format!("remove old rootfs: {e}")))?; } - // Extract with progress bar - extract_rootfs_with_progress(resources::ROOTFS, &rootfs_dir)?; + // Extract with progress bar. + extract_rootfs_with_progress(resources::ROOTFS, dest)?; - // Write version marker + // Write version marker. fs::write(&version_marker, VERSION) .map_err(|e| VmError::HostSetup(format!("write rootfs version marker: {e}")))?; - Ok(rootfs_dir) + Ok(()) +} + +/// Clean up rootfs directories from older versions. +/// +/// Call this periodically (e.g. at startup) to reclaim disk from previous +/// releases. Removes all version directories under the openshell-vm base +/// except the current version. +pub fn cleanup_old_rootfs() -> Result<(), VmError> { + let base = rootfs_cache_base()?; + if !base.exists() { + return Ok(()); + } + + let current_version_dir = base.join(VERSION); + cleanup_old_versions_in_base(&base, ¤t_version_dir) } /// Check if the rootfs is embedded (non-empty). 
@@ -226,12 +237,6 @@ fn runtime_cache_base() -> Result { Ok(base.join("openshell").join("vm-runtime")) } -fn rootfs_cache_dir() -> Result { - let base = openshell_core::paths::xdg_data_dir() - .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; - Ok(base.join("openshell").join("openshell-vm").join("rootfs")) -} - fn rootfs_cache_base() -> Result { let base = openshell_core::paths::xdg_data_dir() .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; @@ -242,10 +247,6 @@ fn cleanup_old_versions(current_dir: &Path) -> Result<(), VmError> { cleanup_old_versions_in_base(&runtime_cache_base()?, current_dir) } -fn cleanup_old_rootfs_versions(current_dir: &Path) -> Result<(), VmError> { - cleanup_old_versions_in_base(&rootfs_cache_base()?, current_dir) -} - fn cleanup_old_versions_in_base(base: &Path, current_dir: &Path) -> Result<(), VmError> { if !base.exists() { return Ok(()); diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index de6bf4a6a..88ac64e60 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -538,8 +538,7 @@ fn rootfs_key(rootfs: &Path) -> String { } fn default_rootfs() -> Result { - openshell_bootstrap::paths::default_rootfs_dir() - .map_err(|e| VmError::RuntimeState(format!("resolve default VM rootfs: {e}"))) + crate::named_rootfs_dir("default") } fn load_vm_runtime_state(rootfs: Option<&Path>) -> Result { diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index b2612897b..c264db4ec 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -203,54 +203,57 @@ impl VmConfig { binary: default_runtime_gvproxy_path(), }, reset: false, - gateway_name: DEFAULT_GATEWAY_NAME.to_string(), + gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), } } } -/// Default gateway metadata name used by the legacy single-instance layout. 
-pub const DEFAULT_GATEWAY_NAME: &str = "openshell-vm"; - -/// Resolve the gateway metadata name for an optional instance name. -pub fn gateway_name(instance_name: Option<&str>) -> Result { - match instance_name { - Some(name) => Ok(format!( - "{DEFAULT_GATEWAY_NAME}-{}", - sanitize_instance_name(name)? - )), - None => Ok(DEFAULT_GATEWAY_NAME.to_string()), - } +/// Base prefix for gateway metadata names. +const GATEWAY_NAME_PREFIX: &str = "openshell-vm"; + +/// Resolve the gateway metadata name for an instance name. +pub fn gateway_name(instance_name: &str) -> Result { + Ok(format!( + "{GATEWAY_NAME_PREFIX}-{}", + sanitize_instance_name(instance_name)? + )) } -/// Resolve the rootfs path for a named instance. +/// Resolve the rootfs path for a named instance (including the default gateway). +/// +/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs` pub fn named_rootfs_dir(instance_name: &str) -> Result { let name = sanitize_instance_name(instance_name)?; - let base = openshell_bootstrap::paths::default_rootfs_dir() - .map_err(|e| VmError::RuntimeState(format!("resolve default VM rootfs: {e}")))?; - let parent = base.parent().ok_or_else(|| { - VmError::RuntimeState(format!("default rootfs has no parent: {}", base.display())) - })?; - Ok(parent.join("instances").join(name).join("rootfs")) + let base = openshell_bootstrap::paths::openshell_vm_base_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?; + Ok(base + .join(env!("CARGO_PKG_VERSION")) + .join("instances") + .join(name) + .join("rootfs")) } -/// Ensure a named instance rootfs exists, cloning from the default rootfs -/// on first use. +/// Ensure a named instance rootfs exists, extracting from the embedded +/// rootfs tarball on first use. +/// +/// The default (unnamed) gateway should be routed here as `"default"`. 
pub fn ensure_named_rootfs(instance_name: &str) -> Result { let instance_rootfs = named_rootfs_dir(instance_name)?; if instance_rootfs.is_dir() { return Ok(instance_rootfs); } - let default_rootfs = openshell_bootstrap::paths::default_rootfs_dir() - .map_err(|e| VmError::RuntimeState(format!("resolve default VM rootfs: {e}")))?; - if !default_rootfs.is_dir() { - return Err(VmError::RootfsNotFound { - path: default_rootfs.display().to_string(), - }); + if embedded::has_embedded_rootfs() { + // Clean up rootfs directories left by older binary versions. + embedded::cleanup_old_rootfs()?; + + embedded::extract_rootfs_to(&instance_rootfs)?; + return Ok(instance_rootfs); } - clone_rootfs(&default_rootfs, &instance_rootfs)?; - Ok(instance_rootfs) + Err(VmError::RootfsNotFound { + path: instance_rootfs.display().to_string(), + }) } fn sanitize_instance_name(name: &str) -> Result { @@ -275,54 +278,6 @@ fn sanitize_instance_name(name: &str) -> Result { Ok(out) } -fn clone_rootfs(source: &Path, dest: &Path) -> Result<(), VmError> { - let parent = dest.parent().ok_or_else(|| { - VmError::RuntimeState(format!("instance rootfs has no parent: {}", dest.display())) - })?; - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!( - "create instance parent dir {}: {e}", - parent.display() - )) - })?; - - let status = if cfg!(target_os = "macos") { - let clone_status = std::process::Command::new("cp") - .args(["-c", "-R"]) - .arg(source) - .arg(dest) - .status() - .map_err(|e| VmError::RuntimeState(format!("clone rootfs with cp failed: {e}")))?; - if clone_status.success() { - clone_status - } else { - std::process::Command::new("cp") - .args(["-R"]) - .arg(source) - .arg(dest) - .status() - .map_err(|e| VmError::RuntimeState(format!("copy rootfs with cp failed: {e}")))? 
- } - } else { - std::process::Command::new("cp") - .args(["-a", "--reflink=auto"]) - .arg(source) - .arg(dest) - .status() - .map_err(|e| VmError::RuntimeState(format!("clone rootfs with cp failed: {e}")))? - }; - - if !status.success() { - return Err(VmError::RuntimeState(format!( - "failed to clone rootfs {} -> {}", - source.display(), - dest.display() - ))); - } - - Ok(()) -} - // ── Helpers ───────────────────────────────────────────────────────────── /// Build a null-terminated C string array from a slice of strings. @@ -430,10 +385,11 @@ pub fn default_runtime_gvproxy_path() -> PathBuf { .join("gvproxy") } -/// Check if the given path matches the expected default rootfs location. -fn is_default_rootfs_path(path: &Path) -> bool { - // Check if path matches the pattern: ~/.local/share/openshell/openshell-vm/.../rootfs - path.to_string_lossy().contains("openshell/openshell-vm") && path.ends_with("rootfs") +/// Check if the given path looks like an openshell-vm instance rootfs. +fn is_instance_rootfs_path(path: &Path) -> bool { + // Matches: .../openshell/openshell-vm/.../instances/.../rootfs + let s = path.to_string_lossy(); + s.contains("openshell/openshell-vm") && s.contains("instances") && path.ends_with("rootfs") } #[cfg(target_os = "macos")] @@ -900,11 +856,9 @@ fn path_to_cstring(path: &Path) -> Result { /// Returns the VM exit code (from `waitpid`). 
#[allow(clippy::similar_names)] pub fn launch(config: &VmConfig) -> Result { - // Auto-extract embedded rootfs if using default path and it doesn't exist - if !config.rootfs.is_dir() { - if is_default_rootfs_path(&config.rootfs) && embedded::has_embedded_rootfs() { - embedded::ensure_rootfs_extracted()?; - } + // Auto-extract embedded rootfs if using an instance path and it doesn't exist + if !config.rootfs.is_dir() && is_instance_rootfs_path(&config.rootfs) && embedded::has_embedded_rootfs() { + embedded::extract_rootfs_to(&config.rootfs)?; } // Validate rootfs diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index e97648ad6..b0f7d95f4 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -4,8 +4,9 @@ //! Standalone openshell-vm binary. //! //! Boots a libkrun microVM running the OpenShell control plane (k3s + -//! openshell-server). By default it uses the pre-built rootfs at -//! `~/.local/share/openshell/openshell-vm/rootfs`. +//! openshell-server). Each named instance gets its own rootfs extracted from +//! the embedded tarball at +//! `~/.local/share/openshell/openshell-vm/{version}/instances//rootfs`. //! //! # Codesigning (macOS) //! @@ -32,18 +33,18 @@ struct Cli { command: Option, /// Path to the rootfs directory (aarch64 Linux). - /// Defaults to `~/.local/share/openshell/openshell-vm/rootfs`. + /// Overrides the default instance-based rootfs resolution. #[arg(long, value_hint = ValueHint::DirPath)] rootfs: Option, /// Named VM instance. /// - /// When set, the rootfs resolves to - /// `~/.local/share/openshell/openshell-vm/instances//rootfs`. - /// For launch mode, the instance rootfs is cloned from the default - /// rootfs on first use. - #[arg(long, conflicts_with = "rootfs")] - name: Option, + /// The rootfs resolves to + /// `~/.local/share/openshell/openshell-vm/{version}/instances//rootfs`. + /// Each instance gets its own rootfs extracted from the embedded tarball + /// on first use. 
+ #[arg(long, default_value = "default", conflicts_with = "rootfs")] + name: String, /// Executable path inside the VM. When set, runs this instead of /// the default k3s server. @@ -142,11 +143,10 @@ fn run(cli: Cli) -> Result> { } return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: match (cli.rootfs, cli.name.as_deref()) { - (Some(path), _) => Some(path), - (None, Some(name)) => Some(openshell_vm::named_rootfs_dir(name)?), - (None, None) => None, - }, + rootfs: Some( + cli.rootfs + .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?), + ), command, workdir, env, @@ -168,13 +168,12 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = match (cli.rootfs, cli.name.as_deref()) { - (Some(path), _) => path, - (None, Some(name)) => openshell_vm::ensure_named_rootfs(name)?, - (None, None) => openshell_bootstrap::paths::default_rootfs_dir()?, - }; + let rootfs = cli + .rootfs + .map(Ok) + .unwrap_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name))?; - let gateway_name = openshell_vm::gateway_name(cli.name.as_deref())?; + let gateway_name = openshell_vm::gateway_name(&cli.name)?; let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { From c2c6e8ca2725f9b36d705918f6f4266c1b45f0e5 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 23:27:46 -0700 Subject: [PATCH 10/10] wip --- architecture/custom-vm-runtime.md | 13 +- crates/openshell-vm/README.md | 4 +- crates/openshell-vm/pins.env | 2 +- crates/openshell-vm/scripts/api-proxy.py | 132 -------- .../scripts/build-rootfs-minimal.sh | 293 ------------------ crates/openshell-vm/scripts/build-rootfs.sh | 95 ++++-- crates/openshell-vm/scripts/hello-server.py | 49 --- {tasks/scripts => e2e/rust}/e2e-vm.sh | 0 tasks/scripts/{ => vm}/build-libkrun-macos.sh | 42 +-- tasks/scripts/{ => vm}/build-libkrun.sh | 2 +- .../scripts/{ => vm}/build-rootfs-tarball.sh | 36 +-- .../scripts/{ => vm}/codesign-openshell-vm.sh | 2 +- tasks/scripts/{ => vm}/compress-vm-runtime.sh 
| 147 ++------- tasks/scripts/{ => vm}/ensure-vm-rootfs.sh | 0 tasks/scripts/{ => vm}/run-vm.sh | 2 +- tasks/scripts/{ => vm}/sync-vm-rootfs.sh | 4 +- tasks/test.toml | 4 +- tasks/vm.toml | 33 +- 18 files changed, 162 insertions(+), 698 deletions(-) delete mode 100644 crates/openshell-vm/scripts/api-proxy.py delete mode 100755 crates/openshell-vm/scripts/build-rootfs-minimal.sh delete mode 100644 crates/openshell-vm/scripts/hello-server.py rename {tasks/scripts => e2e/rust}/e2e-vm.sh (100%) rename tasks/scripts/{ => vm}/build-libkrun-macos.sh (89%) rename tasks/scripts/{ => vm}/build-libkrun.sh (99%) rename tasks/scripts/{ => vm}/build-rootfs-tarball.sh (73%) rename tasks/scripts/{ => vm}/codesign-openshell-vm.sh (85%) rename tasks/scripts/{ => vm}/compress-vm-runtime.sh (57%) rename tasks/scripts/{ => vm}/ensure-vm-rootfs.sh (100%) rename tasks/scripts/{ => vm}/run-vm.sh (88%) rename tasks/scripts/{ => vm}/sync-vm-rootfs.sh (97%) diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index 5b0b4e287..58cf48b71 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -158,8 +158,8 @@ commands work the same way they would inside the VM shell. 
## Build Commands ```bash -# Build embedded binary with minimal rootfs (~120MB, recommended) -mise run vm:build:rootfs-tarball:minimal # Build minimal rootfs tarball +# Build embedded binary with base rootfs (~120MB, recommended) +mise run vm:build:rootfs-tarball:base # Build base rootfs tarball mise run vm:build:embedded # Build binary with embedded rootfs # Quick rebuild (uses cached artifacts, skips rootfs rebuild) @@ -197,9 +197,8 @@ mise run vm:build:embedded # Then build embedded binary | `crates/openshell-vm/scripts/openshell-vm-exec-agent.py` | Guest-side exec agent | | `crates/openshell-vm/scripts/check-vm-capabilities.sh` | Kernel capability checker | | `crates/openshell-vm/runtime/` | Build pipeline and kernel config | -| `tasks/scripts/compress-vm-runtime.sh` | Gather and compress runtime artifacts | -| `tasks/scripts/build-rootfs-tarball.sh` | Build and compress rootfs tarball | -| `tasks/scripts/build-libkrun.sh` | Build libkrun from source (Linux) | -| `crates/openshell-vm/scripts/build-rootfs.sh` | Build full rootfs with pre-loaded images | -| `crates/openshell-vm/scripts/build-rootfs-minimal.sh` | Build minimal rootfs without images | +| `tasks/scripts/vm/compress-vm-runtime.sh` | Gather and compress runtime artifacts | +| `tasks/scripts/vm/build-rootfs-tarball.sh` | Build and compress rootfs tarball | +| `tasks/scripts/vm/build-libkrun.sh` | Build libkrun from source (Linux) | +| `crates/openshell-vm/scripts/build-rootfs.sh` | Build rootfs (full by default, `--base` for lightweight) | | `tasks/vm.toml` | Mise task definitions | diff --git a/crates/openshell-vm/README.md b/crates/openshell-vm/README.md index 1861d7294..c908cf161 100644 --- a/crates/openshell-vm/README.md +++ b/crates/openshell-vm/README.md @@ -147,7 +147,7 @@ mise run vm:build:rootfs-tarball Just k3s + supervisor, cold starts in ~30-60s: ```bash -mise run vm:build:rootfs-tarball:minimal +mise run vm:build:rootfs-tarball:base ``` ## Custom Kernel (libkrunfw) @@ -199,7 +199,7 @@ 
Guest VM (aarch64 Linux) | `vm:build:embedded` | Build single binary with embedded runtime | | `vm:build:embedded:quick` | Build using cached rootfs tarball | | `vm:build:rootfs-tarball` | Build full rootfs tarball | -| `vm:build:rootfs-tarball:minimal` | Build minimal rootfs tarball | +| `vm:build:rootfs-tarball:base` | Build base rootfs tarball | | `vm:runtime:compress` | Compress runtime artifacts for embedding | | `vm:runtime:build-libkrunfw` | Build custom libkrunfw | | `vm:runtime:build-libkrun` | Build libkrun from source (Linux) | diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index 557bc2d3d..2855d962c 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -3,7 +3,7 @@ # Pinned dependency versions for openshell-vm builds. # -# This file is sourced by build-rootfs.sh, build-rootfs-minimal.sh, and +# This file is sourced by build-rootfs.sh and # build-custom-libkrunfw.sh. It centralises version pins and content-addressed # digests so that builds are reproducible and auditable. # diff --git a/crates/openshell-vm/scripts/api-proxy.py b/crates/openshell-vm/scripts/api-proxy.py deleted file mode 100644 index 6da224f13..000000000 --- a/crates/openshell-vm/scripts/api-proxy.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -TCP proxy that waits for the k3s apiserver to be ready on 127.0.0.1:6444, -then accepts connections on 0.0.0.0:6443 and forwards them to the apiserver. - -This decouples the TSI-exposed port from k3s's internal dynamiclistener, -which has TLS handshake issues when accessed through TSI. 
-""" - -import os -import socket -import sys -import threading -import time - -LISTEN_HOST = "0.0.0.0" -LISTEN_PORT = int(os.environ.get("PROXY_LISTEN_PORT", "6443")) -UPSTREAM_HOST = "127.0.0.1" -UPSTREAM_PORT = int(os.environ.get("PROXY_UPSTREAM_PORT", "6444")) -BUFFER_SIZE = 65536 - - -def wait_for_upstream(): - """Block until the upstream apiserver completes a TLS handshake. - - A raw TCP connect succeeds as soon as the port is bound, but the TLS - server may not be ready yet. We do a full TLS handshake to confirm. - """ - import ssl - - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - attempt = 0 - while True: - attempt += 1 - try: - sock = socket.create_connection((UPSTREAM_HOST, UPSTREAM_PORT), timeout=5) - ssock = ctx.wrap_socket(sock, server_hostname="localhost") - ssock.close() - print(f"[proxy] upstream TLS ready after {attempt} attempts", flush=True) - return - except ( - ConnectionRefusedError, - ConnectionResetError, - OSError, - ssl.SSLError, - ) as e: - if attempt % 5 == 0: - print( - f"[proxy] waiting for upstream (attempt {attempt}): {e}", flush=True - ) - time.sleep(1) - - -def forward(src, dst, label): - """Forward data between two sockets until one closes.""" - try: - while True: - data = src.recv(BUFFER_SIZE) - if not data: - break - dst.sendall(data) - except (BrokenPipeError, ConnectionResetError, OSError): - pass - finally: - try: - dst.shutdown(socket.SHUT_WR) - except OSError: - pass - - -def handle_client(client_sock, client_addr): - """Connect to upstream and forward bidirectionally.""" - print(f"[proxy] accepted connection from {client_addr}", flush=True) - try: - upstream = socket.create_connection((UPSTREAM_HOST, UPSTREAM_PORT), timeout=5) - print(f"[proxy] connected to upstream for {client_addr}", flush=True) - except OSError as e: - print( - f"[proxy] failed to connect to upstream for {client_addr}: {e}", flush=True - ) - client_sock.close() - return - - # Forward in both 
directions - t1 = threading.Thread( - target=forward, args=(client_sock, upstream, "client->upstream"), daemon=True - ) - t2 = threading.Thread( - target=forward, args=(upstream, client_sock, "upstream->client"), daemon=True - ) - t1.start() - t2.start() - t1.join() - t2.join() - print(f"[proxy] connection closed for {client_addr}", flush=True) - client_sock.close() - upstream.close() - - -def main(): - # Wait for the real apiserver to be ready before accepting connections - print( - f"[proxy] waiting for upstream at {UPSTREAM_HOST}:{UPSTREAM_PORT}...", - flush=True, - ) - wait_for_upstream() - - # Start listening - server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind((LISTEN_HOST, LISTEN_PORT)) - server.listen(64) - print( - f"[proxy] listening on {LISTEN_HOST}:{LISTEN_PORT} -> {UPSTREAM_HOST}:{UPSTREAM_PORT}", - flush=True, - ) - - while True: - client_sock, client_addr = server.accept() - threading.Thread( - target=handle_client, args=(client_sock, client_addr), daemon=True - ).start() - - -if __name__ == "__main__": - main() diff --git a/crates/openshell-vm/scripts/build-rootfs-minimal.sh b/crates/openshell-vm/scripts/build-rootfs-minimal.sh deleted file mode 100755 index cf7bb592e..000000000 --- a/crates/openshell-vm/scripts/build-rootfs-minimal.sh +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Build a minimal Ubuntu rootfs for embedding in openshell-vm. 
-# -# This produces a lightweight rootfs (~200-300MB) with: -# - Base Ubuntu with k3s binary -# - OpenShell supervisor binary -# - Helm charts and Kubernetes manifests -# - NO pre-loaded container images (pulled on demand) -# - NO pre-initialized k3s state (cold start on first boot) -# -# First boot will be slower (~30-60s) as k3s initializes and pulls images, -# but subsequent boots use cached state. -# -# Supports aarch64 and x86_64 guest architectures. The target architecture -# is auto-detected from the host but can be overridden with --arch. -# -# Usage: -# ./build-rootfs-minimal.sh [--arch aarch64|x86_64] [output_dir] -# -# Requires: Docker, curl, helm - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Source pinned dependency versions (digests, checksums, commit SHAs). -# Environment variables override pins — see pins.env for details. -PINS_FILE="${SCRIPT_DIR}/../pins.env" -if [ -f "$PINS_FILE" ]; then - # shellcheck source=../pins.env - source "$PINS_FILE" -fi - -# ── Architecture detection ───────────────────────────────────────────── -# Allow override via --arch flag; default to host architecture. -GUEST_ARCH="" -POSITIONAL_ARGS=() -while [[ $# -gt 0 ]]; do - case "$1" in - --arch) - GUEST_ARCH="$2"; shift 2 ;; - *) - POSITIONAL_ARGS+=("$1"); shift ;; - esac -done - -if [ -z "$GUEST_ARCH" ]; then - case "$(uname -m)" in - aarch64|arm64) GUEST_ARCH="aarch64" ;; - x86_64) GUEST_ARCH="x86_64" ;; - *) - echo "ERROR: Unsupported host architecture: $(uname -m)" >&2 - echo " Use --arch aarch64 or --arch x86_64 to override." 
>&2 - exit 1 - ;; - esac -fi - -case "$GUEST_ARCH" in - aarch64) - DOCKER_PLATFORM="linux/arm64" - K3S_BINARY_SUFFIX="-arm64" - K3S_CHECKSUM_VAR="K3S_ARM64_SHA256" - RUST_TARGET="aarch64-unknown-linux-gnu" - ;; - x86_64) - DOCKER_PLATFORM="linux/amd64" - K3S_BINARY_SUFFIX="" # x86_64 binary has no suffix - K3S_CHECKSUM_VAR="K3S_AMD64_SHA256" - RUST_TARGET="x86_64-unknown-linux-gnu" - ;; - *) - echo "ERROR: Unsupported guest architecture: ${GUEST_ARCH}" >&2 - echo " Supported: aarch64, x86_64" >&2 - exit 1 - ;; -esac - -DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" -ROOTFS_DIR="${POSITIONAL_ARGS[0]:-${DEFAULT_ROOTFS}}" -CONTAINER_NAME="krun-rootfs-minimal-builder" -BASE_IMAGE_TAG="krun-rootfs:openshell-vm-minimal" -K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" -K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" - -# Project root (two levels up from crates/openshell-vm/scripts/) -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" - -# Cross-platform checksum helper -verify_checksum() { - local expected="$1" file="$2" - if command -v sha256sum &>/dev/null; then - echo "${expected} ${file}" | sha256sum -c - - else - echo "${expected} ${file}" | shasum -a 256 -c - - fi -} - -echo "==> Building minimal openshell-vm rootfs" -echo " Guest arch: ${GUEST_ARCH}" -echo " k3s version: ${K3S_VERSION}" -echo " Output: ${ROOTFS_DIR}" -echo " Mode: minimal (no pre-loaded images, cold start)" -echo "" - -# ── Check for running VM ──────────────────────────────────────────────── -VM_LOCK_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm.lock" -if [ -f "${VM_LOCK_FILE}" ]; then - if ! 
python3 -c " -import fcntl, os, sys -fd = os.open(sys.argv[1], os.O_RDONLY) -try: - fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) - fcntl.flock(fd, fcntl.LOCK_UN) -except BlockingIOError: - sys.exit(1) -finally: - os.close(fd) -" "${VM_LOCK_FILE}" 2>/dev/null; then - HOLDER_PID=$(cat "${VM_LOCK_FILE}" 2>/dev/null | tr -d '[:space:]') - echo "ERROR: An openshell-vm (pid ${HOLDER_PID:-unknown}) holds a lock on this rootfs." - echo " Stop the VM first, then re-run this script." - exit 1 - fi -fi - -VM_STATE_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm-state.json" -if [ -f "${VM_STATE_FILE}" ]; then - VM_PID=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['pid'])" "${VM_STATE_FILE}" 2>/dev/null || echo "") - if [ -n "${VM_PID}" ] && kill -0 "${VM_PID}" 2>/dev/null; then - echo "ERROR: An openshell-vm is running (pid ${VM_PID}) using this rootfs." - echo " Stop the VM first, then re-run this script." - exit 1 - else - rm -f "${VM_STATE_FILE}" - fi -fi - -# ── Download k3s binary ───────────────────────────────────────────────── -K3S_BIN="/tmp/k3s-${GUEST_ARCH}-${K3S_VERSION}" -if [ -f "${K3S_BIN}" ]; then - echo "==> Using cached k3s binary: ${K3S_BIN}" -else - echo "==> Downloading k3s ${K3S_VERSION} for ${GUEST_ARCH}..." - curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s${K3S_BINARY_SUFFIX}" \ - -o "${K3S_BIN}" - chmod +x "${K3S_BIN}" -fi - -# Verify k3s binary integrity. -K3S_CHECKSUM="${!K3S_CHECKSUM_VAR:-}" -if [ -n "${K3S_CHECKSUM}" ]; then - echo "==> Verifying k3s binary checksum..." - verify_checksum "${K3S_CHECKSUM}" "${K3S_BIN}" -else - echo "WARNING: ${K3S_CHECKSUM_VAR} not set, skipping checksum verification" -fi - -# ── Build base image ─────────────────────────────────────────────────── -docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true - -echo "==> Building base image..." 
-docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' -ARG BASE_IMAGE -FROM ${BASE_IMAGE} -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - ca-certificates \ - iptables \ - iproute2 \ - python3 \ - busybox-static \ - zstd \ - && rm -rf /var/lib/apt/lists/* -RUN mkdir -p /usr/share/udhcpc && \ - ln -sf /bin/busybox /sbin/udhcpc -RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s -DOCKERFILE - -# Create container and export filesystem -echo "==> Creating container..." -docker create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true - -echo "==> Exporting filesystem..." -if [ -d "${ROOTFS_DIR}" ]; then - chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true - rm -rf "${ROOTFS_DIR}" -fi -mkdir -p "${ROOTFS_DIR}" -docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - -docker rm "${CONTAINER_NAME}" - -# ── Inject k3s binary ────────────────────────────────────────────────── -echo "==> Injecting k3s binary..." -cp "${K3S_BIN}" "${ROOTFS_DIR}/usr/local/bin/k3s" -chmod +x "${ROOTFS_DIR}/usr/local/bin/k3s" -ln -sf /usr/local/bin/k3s "${ROOTFS_DIR}/usr/local/bin/kubectl" - -# ── Inject scripts ───────────────────────────────────────────────────── -echo "==> Injecting scripts..." 
-mkdir -p "${ROOTFS_DIR}/srv" -cp "${SCRIPT_DIR}/openshell-vm-init.sh" "${ROOTFS_DIR}/srv/openshell-vm-init.sh" -chmod +x "${ROOTFS_DIR}/srv/openshell-vm-init.sh" - -cp "${SCRIPT_DIR}/hello-server.py" "${ROOTFS_DIR}/srv/hello-server.py" -chmod +x "${ROOTFS_DIR}/srv/hello-server.py" - -cp "${SCRIPT_DIR}/check-vm-capabilities.sh" "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" -chmod +x "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" - -cp "${SCRIPT_DIR}/openshell-vm-exec-agent.py" "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" -chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" - -# ── Build and inject supervisor binary ───────────────────────────────── -SUPERVISOR_TARGET="${RUST_TARGET}" -SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" - -echo "==> Building openshell-sandbox supervisor binary (${SUPERVISOR_TARGET})..." -if ! command -v cargo-zigbuild >/dev/null 2>&1; then - echo "ERROR: cargo-zigbuild is not installed." - echo " Install it with: cargo install cargo-zigbuild" - exit 1 -fi - -cargo zigbuild --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" \ - --manifest-path "${PROJECT_ROOT}/Cargo.toml" 2>&1 | tail -5 - -if [ ! -f "${SUPERVISOR_BIN}" ]; then - echo "ERROR: supervisor binary not found at ${SUPERVISOR_BIN}" - exit 1 -fi - -echo " Injecting supervisor binary into rootfs..." -mkdir -p "${ROOTFS_DIR}/opt/openshell/bin" -cp "${SUPERVISOR_BIN}" "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" -chmod +x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" - -# ── Package and inject helm chart ────────────────────────────────────── -HELM_CHART_DIR="${PROJECT_ROOT}/deploy/helm/openshell" -CHART_DEST="${ROOTFS_DIR}/var/lib/rancher/k3s/server/static/charts" - -if [ -d "${HELM_CHART_DIR}" ]; then - echo "==> Packaging helm chart..." 
- mkdir -p "${CHART_DEST}" - helm package "${HELM_CHART_DIR}" -d "${CHART_DEST}" - mkdir -p "${ROOTFS_DIR}/opt/openshell/charts" - cp "${CHART_DEST}"/*.tgz "${ROOTFS_DIR}/opt/openshell/charts/" -fi - -# ── Inject Kubernetes manifests ──────────────────────────────────────── -MANIFEST_SRC="${PROJECT_ROOT}/deploy/kube/manifests" -MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests" - -echo "==> Injecting Kubernetes manifests..." -mkdir -p "${MANIFEST_DEST}" - -for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do - if [ -f "${MANIFEST_SRC}/${manifest}" ]; then - cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/" - echo " ${manifest}" - fi -done - -# ── Create empty images directory ────────────────────────────────────── -# k3s expects this directory to exist for airgap image loading. -mkdir -p "${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" - -# ── Mark as minimal (not pre-initialized) ────────────────────────────── -# The init script checks for this file to determine if cold start is expected. -echo "minimal" > "${ROOTFS_DIR}/opt/openshell/.rootfs-type" - -# ── Verify ───────────────────────────────────────────────────────────── -if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then - echo "ERROR: k3s binary not found in rootfs." - exit 1 -fi - -if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then - echo "ERROR: openshell-sandbox supervisor binary not found in rootfs." - exit 1 -fi - -echo "" -echo "==> Minimal rootfs ready at: ${ROOTFS_DIR}" -echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" -echo " Type: minimal (cold start, images pulled on demand)" -echo "" -echo "Note: First boot will take ~30-60s as k3s initializes." -echo " Container images will be pulled from registries on first use." 
diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index 8db62acc6..09cfd3fcc 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -4,20 +4,28 @@ # Build a Ubuntu rootfs for the openshell-vm microVM. # -# Produces a rootfs with k3s pre-installed, the OpenShell helm chart and -# manifests baked in, container images pre-loaded, AND a fully initialized -# k3s cluster state (database, TLS, images imported, all services deployed). +# By default, produces a fully pre-initialized rootfs with k3s pre-installed, +# the OpenShell helm chart and manifests baked in, container images pre-loaded, +# AND a fully initialized k3s cluster state (database, TLS, images imported, +# all services deployed). On first VM boot, k3s resumes from this pre-baked +# state instead of cold-starting, achieving ~3-5s startup times. # -# On first VM boot, k3s resumes from this pre-baked state instead of -# cold-starting, achieving ~3-5s startup times. +# With --base, produces a lightweight rootfs (~200-300MB) with: +# - Base Ubuntu with k3s binary +# - OpenShell supervisor binary +# - Helm charts and Kubernetes manifests +# - NO pre-loaded container images (pulled on demand) +# - NO pre-initialized k3s state (cold start on first boot) +# First boot will be slower (~30-60s) as k3s initializes and pulls images. # # Supports aarch64 and x86_64 guest architectures. The target architecture # is auto-detected from the host but can be overridden with --arch. 
# # Usage: -# ./crates/openshell-vm/scripts/build-rootfs.sh [--arch aarch64|x86_64] [output_dir] +# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] # -# Requires: Docker (or compatible container runtime), curl, helm, zstd +# Requires: Docker (or compatible container runtime), curl, helm +# Full mode (default) also requires: zstd, sqlite3, a built openshell-vm binary set -euo pipefail @@ -31,12 +39,14 @@ if [ -f "$PINS_FILE" ]; then source "$PINS_FILE" fi -# ── Architecture detection ───────────────────────────────────────────── -# Allow override via --arch flag; default to host architecture. +# ── Argument parsing ─────────────────────────────────────────────────── +BASE_ONLY=false GUEST_ARCH="" POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in + --base) + BASE_ONLY=true; shift ;; --arch) GUEST_ARCH="$2"; shift 2 ;; *) @@ -44,6 +54,8 @@ while [[ $# -gt 0 ]]; do esac done +# ── Architecture detection ───────────────────────────────────────────── +# Allow override via --arch flag; default to host architecture. if [ -z "$GUEST_ARCH" ]; then case "$(uname -m)" in aarch64|arm64) GUEST_ARCH="aarch64" ;; @@ -79,7 +91,6 @@ esac DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" ROOTFS_DIR="${POSITIONAL_ARGS[0]:-${DEFAULT_ROOTFS}}" CONTAINER_NAME="krun-rootfs-builder" -INIT_CONTAINER_NAME="krun-k3s-init" BASE_IMAGE_TAG="krun-rootfs:openshell-vm" # K3S_VERSION uses the semver "+" form for GitHub releases. # The mise env may provide the Docker-tag form with "-" instead of "+"; @@ -90,7 +101,7 @@ K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -# Container images to pre-load into k3s. +# Container images to pre-load into k3s (full mode only). # AGENT_SANDBOX_IMAGE and COMMUNITY_SANDBOX_IMAGE are digest-pinned in pins.env. # SERVER_IMAGE is intentionally unpinned (local dev artifact). 
IMAGE_REPO_BASE="${IMAGE_REPO_BASE:-openshell}" @@ -107,11 +118,21 @@ verify_checksum() { fi } -echo "==> Building openshell-vm rootfs" -echo " Guest arch: ${GUEST_ARCH}" -echo " k3s version: ${K3S_VERSION}" -echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" -echo " Output: ${ROOTFS_DIR}" +if [ "$BASE_ONLY" = true ]; then + echo "==> Building base openshell-vm rootfs" + echo " Guest arch: ${GUEST_ARCH}" + echo " k3s version: ${K3S_VERSION}" + echo " Output: ${ROOTFS_DIR}" + echo " Mode: base (no pre-loaded images, cold start)" +else + echo "==> Building openshell-vm rootfs" + echo " Guest arch: ${GUEST_ARCH}" + echo " k3s version: ${K3S_VERSION}" + echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" + echo " Output: ${ROOTFS_DIR}" + echo " Mode: full (pre-loaded images, pre-initialized)" +fi +echo "" # ── Check for running VM ──────────────────────────────────────────────── # If an openshell-vm is using this rootfs via virtio-fs, wiping the rootfs @@ -198,7 +219,6 @@ fi # Clean up any previous run docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true -docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ @@ -254,15 +274,11 @@ chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/aux/* 2>/dev/null || tru # ── Inject scripts ──────────────────────────────────────────────────── -echo "==> Injecting openshell-vm-init.sh..." +echo "==> Injecting scripts..." mkdir -p "${ROOTFS_DIR}/srv" cp "${SCRIPT_DIR}/openshell-vm-init.sh" "${ROOTFS_DIR}/srv/openshell-vm-init.sh" chmod +x "${ROOTFS_DIR}/srv/openshell-vm-init.sh" -# Keep the hello server around for debugging -cp "${SCRIPT_DIR}/hello-server.py" "${ROOTFS_DIR}/srv/hello-server.py" -chmod +x "${ROOTFS_DIR}/srv/hello-server.py" - # Inject VM capability checker for runtime diagnostics. 
cp "${SCRIPT_DIR}/check-vm-capabilities.sh" "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" chmod +x "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" @@ -341,6 +357,41 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do fi done +# ── Base mode: mark rootfs type and skip pre-loading ─────────────────── + +if [ "$BASE_ONLY" = true ]; then + # k3s expects this directory to exist for airgap image loading. + mkdir -p "${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" + + # Mark as base (not pre-initialized). The init script checks for + # this file to determine if cold start is expected. + echo "base" > "${ROOTFS_DIR}/opt/openshell/.rootfs-type" + + # ── Verify ───────────────────────────────────────────────────────── + if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then + echo "ERROR: k3s binary not found in rootfs." + exit 1 + fi + + if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then + echo "ERROR: openshell-sandbox supervisor binary not found in rootfs." + exit 1 + fi + + echo "" + echo "==> Base rootfs ready at: ${ROOTFS_DIR}" + echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" + echo " Type: base (cold start, images pulled on demand)" + echo "" + echo "Note: First boot will take ~30-60s as k3s initializes." + echo " Container images will be pulled from registries on first use." + exit 0 +fi + +# ══════════════════════════════════════════════════════════════════════════ +# Full mode: pre-load images and pre-initialize k3s cluster state +# ══════════════════════════════════════════════════════════════════════════ + # ── Pre-load container images ──────────────────────────────────────── # Pull images for the target architecture and save as tarballs in the # k3s airgap images directory. 
k3s auto-imports from diff --git a/crates/openshell-vm/scripts/hello-server.py b/crates/openshell-vm/scripts/hello-server.py deleted file mode 100644 index f02d7d72e..000000000 --- a/crates/openshell-vm/scripts/hello-server.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Minimal HTTP server that responds with 'Hello from libkrun VM!' on port 8080.""" - -import json -import os -import platform -from http.server import HTTPServer, BaseHTTPRequestHandler - - -class HelloHandler(BaseHTTPRequestHandler): - def do_GET(self): - body = json.dumps( - { - "message": "Hello from libkrun VM!", - "hostname": platform.node(), - "platform": platform.platform(), - "arch": platform.machine(), - "pid": os.getpid(), - "path": self.path, - }, - indent=2, - ) - self.send_response(200) - self.send_header("Content-Type", "application/json") - self.send_header("Content-Length", str(len(body))) - self.end_headers() - self.wfile.write(body.encode()) - - def log_message(self, format, *args): - print(f"[hello-server] {args[0]}") - - -def main(): - host = "0.0.0.0" - port = 8080 - server = HTTPServer((host, port), HelloHandler) - print(f"Hello server listening on {host}:{port}") - try: - server.serve_forever() - except KeyboardInterrupt: - print("\nShutting down.") - server.server_close() - - -if __name__ == "__main__": - main() diff --git a/tasks/scripts/e2e-vm.sh b/e2e/rust/e2e-vm.sh similarity index 100% rename from tasks/scripts/e2e-vm.sh rename to e2e/rust/e2e-vm.sh diff --git a/tasks/scripts/build-libkrun-macos.sh b/tasks/scripts/vm/build-libkrun-macos.sh similarity index 89% rename from tasks/scripts/build-libkrun-macos.sh rename to tasks/scripts/vm/build-libkrun-macos.sh index 19972cf86..125ad9a10 100755 --- a/tasks/scripts/build-libkrun-macos.sh +++ b/tasks/scripts/vm/build-libkrun-macos.sh @@ -22,9 +22,11 @@ set -euo pipefail 
-ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" BUILD_DIR="${ROOT}/target/libkrun-build" OUTPUT_DIR="${BUILD_DIR}" +BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" +CUSTOM_RUNTIME="${ROOT}/target/custom-runtime" if [ "$(uname -s)" != "Darwin" ]; then echo "Error: This script only runs on macOS" >&2 @@ -54,7 +56,7 @@ check_deps() { fi # Check for lld (LLVM linker) - if ! command -v ld.lld &>/dev/null && ! [ -x "$(brew --prefix llvm 2>/dev/null)/bin/ld.lld" ]; then + if ! command -v ld.lld &>/dev/null && ! [ -x "${BREW_PREFIX}/opt/llvm/bin/ld.lld" ]; then MISSING="$MISSING lld" fi @@ -64,8 +66,6 @@ check_deps() { fi # Check for libkrunfw - BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" - CUSTOM_RUNTIME="${ROOT}/target/custom-runtime" if [ ! -f "${BREW_PREFIX}/lib/libkrunfw.dylib" ] && \ [ ! -f "${BREW_PREFIX}/lib/libkrunfw.5.dylib" ] && \ [ ! -f "${CUSTOM_RUNTIME}/libkrunfw.dylib" ]; then @@ -93,27 +93,20 @@ cd "$BUILD_DIR" LIBKRUN_REF="${LIBKRUN_REF:-v1.17.4}" -if [ -d libkrun ]; then - echo "==> Updating existing libkrun checkout..." - cd libkrun - git fetch origin --tags - git checkout "${LIBKRUN_REF}" 2>/dev/null || git checkout "tags/${LIBKRUN_REF}" 2>/dev/null || { - echo "Error: Could not checkout ${LIBKRUN_REF}" >&2 - exit 1 - } - cd .. -else - echo "==> Cloning libkrun (${LIBKRUN_REF})..." +if [ ! -d libkrun ]; then + echo "==> Cloning libkrun..." git clone https://github.com/containers/libkrun.git - cd libkrun - git fetch --tags - git checkout "${LIBKRUN_REF}" 2>/dev/null || git checkout "tags/${LIBKRUN_REF}" 2>/dev/null || { - echo "Error: Could not checkout ${LIBKRUN_REF}" >&2 - exit 1 - } - cd .. fi +echo "==> Checking out ${LIBKRUN_REF}..." +cd libkrun +git fetch origin --tags +git checkout "${LIBKRUN_REF}" 2>/dev/null || git checkout "tags/${LIBKRUN_REF}" 2>/dev/null || { + echo "Error: Could not checkout ${LIBKRUN_REF}" >&2 + exit 1 +} +cd .. 
+ LIBKRUN_COMMIT=$(git -C libkrun rev-parse HEAD) echo " Commit: ${LIBKRUN_COMMIT}" @@ -125,9 +118,6 @@ echo "" echo "==> Building libkrun with NET=1 BLK=1 (no GPU)..." # Find libkrunfw - prefer custom build with bridge support -BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" -CUSTOM_RUNTIME="${ROOT}/target/custom-runtime" - if [ -f "${CUSTOM_RUNTIME}/provenance.json" ] && [ -f "${CUSTOM_RUNTIME}/libkrunfw.dylib" ]; then LIBKRUNFW_DIR="${CUSTOM_RUNTIME}" echo " Using custom libkrunfw from ${LIBKRUNFW_DIR}" @@ -142,7 +132,7 @@ export DYLD_LIBRARY_PATH="${LIBKRUNFW_DIR}:${BREW_PREFIX}/lib:${DYLD_LIBRARY_PAT # Set up LLVM/clang for bindgen (required by krun_display/krun_input if they get compiled) # Note: DYLD_LIBRARY_PATH is needed at runtime for the build scripts that use libclang -LLVM_PREFIX="$(brew --prefix llvm 2>/dev/null || echo /opt/homebrew/opt/llvm)" +LLVM_PREFIX="${BREW_PREFIX}/opt/llvm" if [ -d "$LLVM_PREFIX" ]; then export LIBCLANG_PATH="${LLVM_PREFIX}/lib" export DYLD_LIBRARY_PATH="${LLVM_PREFIX}/lib:${DYLD_LIBRARY_PATH:-}" diff --git a/tasks/scripts/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh similarity index 99% rename from tasks/scripts/build-libkrun.sh rename to tasks/scripts/vm/build-libkrun.sh index 7a31430ae..10cadaf87 100755 --- a/tasks/scripts/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -20,7 +20,7 @@ set -euo pipefail -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" BUILD_DIR="${ROOT}/target/libkrun-build" OUTPUT_DIR="${BUILD_DIR}" KERNEL_CONFIG="${ROOT}/crates/openshell-vm/runtime/kernel/openshell.kconfig" diff --git a/tasks/scripts/build-rootfs-tarball.sh b/tasks/scripts/vm/build-rootfs-tarball.sh similarity index 73% rename from tasks/scripts/build-rootfs-tarball.sh rename to tasks/scripts/vm/build-rootfs-tarball.sh index 2e4b1f05e..02410624f 100755 --- a/tasks/scripts/build-rootfs-tarball.sh +++ b/tasks/scripts/vm/build-rootfs-tarball.sh @@ -5,14 +5,14 @@ # Build rootfs and compress to tarball for embedding in openshell-vm binary. # # This script: -# 1. Builds the rootfs using build-rootfs.sh or build-rootfs-minimal.sh +# 1. Builds the rootfs using build-rootfs.sh # 2. Compresses it to a zstd tarball for embedding # # Usage: -# ./build-rootfs-tarball.sh [--minimal] +# ./build-rootfs-tarball.sh [--base] # # Options: -# --minimal Build a minimal rootfs (~200-300MB) without pre-loaded images. +# --base Build a base rootfs (~200-300MB) without pre-loaded images. # First boot will be slower but binary size is much smaller. # Default: full rootfs with pre-loaded images (~2GB+). # @@ -21,24 +21,24 @@ set -euo pipefail -ROOT="$(git rev-parse --show-toplevel 2>/dev/null)" || ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" ROOTFS_BUILD_DIR="${ROOT}/target/rootfs-build" OUTPUT_DIR="${ROOT}/target/vm-runtime-compressed" OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" # Parse arguments -MINIMAL=false +BASE_ONLY=false for arg in "$@"; do case "$arg" in - --minimal) - MINIMAL=true + --base) + BASE_ONLY=true ;; --help|-h) - echo "Usage: $0 [--minimal]" + echo "Usage: $0 [--base]" echo "" echo "Options:" - echo " --minimal Build minimal rootfs (~200-300MB) without pre-loaded images" - echo " First boot will be slower but binary size is much smaller" + echo " --base Build base rootfs (~200-300MB) without pre-loaded images" + echo " First boot will be slower but binary size is much smaller" exit 0 ;; *) @@ -63,16 +63,16 @@ if ! docker info &>/dev/null; then exit 1 fi -if [ "$MINIMAL" = true ]; then - echo "==> Building MINIMAL rootfs for embedding" +if [ "$BASE_ONLY" = true ]; then + echo "==> Building BASE rootfs for embedding" echo " Build dir: ${ROOTFS_BUILD_DIR}" echo " Output: ${OUTPUT}" - echo " Mode: minimal (no pre-loaded images, ~200-300MB)" + echo " Mode: base (no pre-loaded images, ~200-300MB)" echo "" - # Build minimal rootfs - echo "==> Step 1/2: Building minimal rootfs..." - "${ROOT}/crates/openshell-vm/scripts/build-rootfs-minimal.sh" "${ROOTFS_BUILD_DIR}" + # Build base rootfs + echo "==> Step 1/2: Building base rootfs..." + "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" --base "${ROOTFS_BUILD_DIR}" else echo "==> Building FULL rootfs for embedding" echo " Build dir: ${ROOTFS_BUILD_DIR}" @@ -107,8 +107,8 @@ echo "" echo "==> Rootfs tarball created successfully!" 
echo " Output: ${OUTPUT}" echo " Compressed: $(du -sh "${OUTPUT}" | cut -f1)" -if [ "$MINIMAL" = true ]; then - echo " Type: minimal (first boot ~30-60s, images pulled on demand)" +if [ "$BASE_ONLY" = true ]; then + echo " Type: base (first boot ~30-60s, images pulled on demand)" else echo " Type: full (first boot ~3-5s, images pre-loaded)" fi diff --git a/tasks/scripts/codesign-openshell-vm.sh b/tasks/scripts/vm/codesign-openshell-vm.sh similarity index 85% rename from tasks/scripts/codesign-openshell-vm.sh rename to tasks/scripts/vm/codesign-openshell-vm.sh index 1217ad74f..0aeeca9b1 100755 --- a/tasks/scripts/codesign-openshell-vm.sh +++ b/tasks/scripts/vm/codesign-openshell-vm.sh @@ -8,5 +8,5 @@ if [ "$(uname -s)" != "Darwin" ]; then exit 0 fi -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" codesign --entitlements "${ROOT}/crates/openshell-vm/entitlements.plist" --force -s - "${ROOT}/target/debug/openshell-vm" diff --git a/tasks/scripts/compress-vm-runtime.sh b/tasks/scripts/vm/compress-vm-runtime.sh similarity index 57% rename from tasks/scripts/compress-vm-runtime.sh rename to tasks/scripts/vm/compress-vm-runtime.sh index 18b1589b1..4007ef13e 100755 --- a/tasks/scripts/compress-vm-runtime.sh +++ b/tasks/scripts/vm/compress-vm-runtime.sh @@ -18,8 +18,9 @@ set -euo pipefail -# Use git to find the project root reliably -ROOT="$(git rev-parse --show-toplevel 2>/dev/null)" || ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" + +GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # ── macOS dylib portability helpers ───────────────────────────────────── @@ -43,57 +44,6 @@ make_dylib_portable() { codesign -f -s - "$dylib" 2>/dev/null || true } -# Bundle GPU dependencies (libepoxy, virglrenderer, MoltenVK) for Homebrew libkrun -bundle_gpu_dependencies() { - local work_dir="$1" - local brew_prefix - brew_prefix="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" - - # Dependencies to bundle - local deps=( - "${brew_prefix}/opt/libepoxy/lib/libepoxy.0.dylib" - "${brew_prefix}/opt/virglrenderer/lib/libvirglrenderer.1.dylib" - "${brew_prefix}/opt/molten-vk/lib/libMoltenVK.dylib" - ) - - for dep in "${deps[@]}"; do - if [ -f "$dep" ]; then - local dep_name - dep_name="$(basename "$dep")" - cp "$dep" "${work_dir}/${dep_name}" - echo " Copied: ${dep_name}" - fi - done - - # Rewrite all paths to use @loader_path - for dylib in "${work_dir}"/*.dylib; do - [ -f "$dylib" ] || continue - - # Rewrite install name - local dylib_name - dylib_name="$(basename "$dylib")" - install_name_tool -id "@loader_path/${dylib_name}" "$dylib" 2>/dev/null || true - - # Rewrite all Homebrew references to @loader_path - for dep in "${deps[@]}"; do - local dep_name - dep_name="$(basename "$dep")" - install_name_tool -change "$dep" "@loader_path/${dep_name}" "$dylib" 2>/dev/null || true - done - - # Also rewrite libkrunfw - local krunfw_path - krunfw_path=$(otool -L "$dylib" 2>/dev/null | grep libkrunfw | awk '{print $1}' || true) - if [ -n "$krunfw_path" ] && [[ "$krunfw_path" != @* ]]; then - install_name_tool -change "$krunfw_path" "@loader_path/libkrunfw.dylib" "$dylib" - fi - - # Re-codesign - codesign -f -s - "$dylib" 2>/dev/null || true - done - - echo " All dependencies rewritten to use @loader_path" -} WORK_DIR="${ROOT}/target/vm-runtime" OUTPUT_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${ROOT}/target/vm-runtime-compressed}" @@ -110,7 +60,6 @@ case "$(uname -s)-$(uname -m)" in # Source priority for libkrun: 
# 1. Custom build from build-libkrun-macos.sh (portable, no GPU deps) # 2. Custom runtime with custom libkrunfw - # 3. Homebrew (has GPU deps, not portable) LIBKRUN_BUILD_DIR="${ROOT}/target/libkrun-build" CUSTOM_DIR="${ROOT}/target/custom-runtime" BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)" @@ -140,32 +89,9 @@ case "$(uname -s)-$(uname -m)" in # libkrunfw from custom build cp "${CUSTOM_DIR}/libkrunfw.dylib" "$WORK_DIR/" else - echo " Using Homebrew runtime from ${BREW_PREFIX}/lib" - echo " Warning: Homebrew libkrun has GPU dependencies (libepoxy, virglrenderer)" - echo " For a portable build, run: mise run vm:runtime:build-libkrun-macos" - - cp "${BREW_PREFIX}/lib/libkrun.dylib" "$WORK_DIR/" - - # Copy libkrunfw - for krunfw in "${BREW_PREFIX}/lib"/libkrunfw*.dylib; do - [ -f "$krunfw" ] || continue - if [ -L "$krunfw" ]; then - target=$(readlink "$krunfw") - if [[ "$target" != /* ]]; then - target="${BREW_PREFIX}/lib/${target}" - fi - cp "$target" "$WORK_DIR/$(basename "$krunfw")" - else - cp "$krunfw" "$WORK_DIR/" - fi - done - - # If using Homebrew libkrun with GPU, we need to bundle the GPU dependencies - # for portability. Check if libkrun has GPU deps: - if otool -L "$WORK_DIR/libkrun.dylib" | grep -q "libepoxy\|virglrenderer"; then - echo " Bundling GPU dependencies for portability..." - bundle_gpu_dependencies "$WORK_DIR" - fi + echo "Error: No portable libkrun build found." 
>&2 + echo " Run: mise run vm:runtime:build-libkrun-macos" >&2 + exit 1 fi # Normalize libkrunfw naming - ensure we have libkrunfw.dylib @@ -177,8 +103,8 @@ case "$(uname -s)-$(uname -m)" in if [ -x /opt/podman/bin/gvproxy ]; then cp /opt/podman/bin/gvproxy "$WORK_DIR/" echo " Using gvproxy from Podman" - elif [ -x "$(brew --prefix 2>/dev/null)/bin/gvproxy" ]; then - cp "$(brew --prefix)/bin/gvproxy" "$WORK_DIR/" + elif [ -x "${BREW_PREFIX}/bin/gvproxy" ]; then + cp "${BREW_PREFIX}/bin/gvproxy" "$WORK_DIR/" echo " Using gvproxy from Homebrew" else echo "Error: gvproxy not found. Install Podman Desktop or run: brew install gvproxy" >&2 @@ -186,9 +112,18 @@ case "$(uname -s)-$(uname -m)" in fi ;; - Linux-aarch64) - PLATFORM="linux-aarch64" - echo " Platform: Linux ARM64" + Linux-*) + ARCH="$(uname -m)" + case "$ARCH" in + aarch64) GVPROXY_ARCH="arm64" ;; + x86_64) GVPROXY_ARCH="amd64" ;; + *) + echo "Error: Unsupported Linux architecture: ${ARCH}" >&2 + exit 1 + ;; + esac + PLATFORM="linux-${ARCH}" + echo " Platform: Linux ${ARCH}" BUILD_DIR="${ROOT}/target/libkrun-build" if [ ! -f "${BUILD_DIR}/libkrun.so" ]; then @@ -215,45 +150,9 @@ case "$(uname -s)-$(uname -m)" in # Download gvproxy if not present if [ ! -f "$WORK_DIR/gvproxy" ]; then - echo " Downloading gvproxy for linux-arm64..." - curl -fsSL -o "$WORK_DIR/gvproxy" \ - "https://github.com/containers/gvisor-tap-vsock/releases/download/v0.8.8/gvproxy-linux-arm64" - chmod +x "$WORK_DIR/gvproxy" - fi - ;; - - Linux-x86_64) - PLATFORM="linux-x86_64" - echo " Platform: Linux x86_64" - - BUILD_DIR="${ROOT}/target/libkrun-build" - if [ ! -f "${BUILD_DIR}/libkrun.so" ]; then - echo "Error: libkrun not found. 
Run: mise run vm:runtime:build-libkrun" >&2 - exit 1 - fi - - cp "${BUILD_DIR}/libkrun.so" "$WORK_DIR/" - - # Copy libkrunfw - for krunfw in "${BUILD_DIR}"/libkrunfw.so*; do - [ -f "$krunfw" ] || continue - cp "$krunfw" "$WORK_DIR/" - done - - # Ensure the soname symlink (libkrunfw.so.5) exists alongside the fully - # versioned file (libkrunfw.so.5.x.y). libloading loads by soname. - if [ ! -f "$WORK_DIR/libkrunfw.so.5" ]; then - versioned=$(ls "$WORK_DIR"/libkrunfw.so.5.* 2>/dev/null | head -n1) - if [ -n "$versioned" ]; then - cp "$versioned" "$WORK_DIR/libkrunfw.so.5" - fi - fi - - # Download gvproxy if not present - if [ ! -f "$WORK_DIR/gvproxy" ]; then - echo " Downloading gvproxy for linux-amd64..." + echo " Downloading gvproxy for linux-${GVPROXY_ARCH}..." curl -fsSL -o "$WORK_DIR/gvproxy" \ - "https://github.com/containers/gvisor-tap-vsock/releases/download/v0.8.8/gvproxy-linux-amd64" + "https://github.com/containers/gvisor-tap-vsock/releases/download/${GVPROXY_VERSION}/gvproxy-linux-${GVPROXY_ARCH}" chmod +x "$WORK_DIR/gvproxy" fi ;; @@ -276,7 +175,7 @@ for file in "$WORK_DIR"/*; do [ -f "$file" ] || continue name=$(basename "$file") original_size=$(du -h "$file" | cut -f1) - zstd -19 -f -q -o "${OUTPUT_DIR}/${name}.zst" "$file" + zstd -19 -f -q -T0 -o "${OUTPUT_DIR}/${name}.zst" "$file" # Ensure compressed file is readable/writable (source may be read-only) chmod 644 "${OUTPUT_DIR}/${name}.zst" compressed_size=$(du -h "${OUTPUT_DIR}/${name}.zst" | cut -f1) diff --git a/tasks/scripts/ensure-vm-rootfs.sh b/tasks/scripts/vm/ensure-vm-rootfs.sh similarity index 100% rename from tasks/scripts/ensure-vm-rootfs.sh rename to tasks/scripts/vm/ensure-vm-rootfs.sh diff --git a/tasks/scripts/run-vm.sh b/tasks/scripts/vm/run-vm.sh similarity index 88% rename from tasks/scripts/run-vm.sh rename to tasks/scripts/vm/run-vm.sh index 9b9506017..397abb4a6 100755 --- a/tasks/scripts/run-vm.sh +++ b/tasks/scripts/vm/run-vm.sh @@ -4,7 +4,7 @@ set -euo pipefail -ROOT="$(cd 
"$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" RUNTIME_DIR="${ROOT}/target/debug/openshell-vm.runtime" GATEWAY_BIN="${ROOT}/target/debug/openshell-vm" diff --git a/tasks/scripts/sync-vm-rootfs.sh b/tasks/scripts/vm/sync-vm-rootfs.sh similarity index 97% rename from tasks/scripts/sync-vm-rootfs.sh rename to tasks/scripts/vm/sync-vm-rootfs.sh index edbc2f9b8..9543d38dc 100755 --- a/tasks/scripts/sync-vm-rootfs.sh +++ b/tasks/scripts/vm/sync-vm-rootfs.sh @@ -11,7 +11,7 @@ set -euo pipefail -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" ROOTFS_DIR="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/rootfs" SCRIPT_DIR="${ROOT}/crates/openshell-vm/scripts" @@ -24,7 +24,7 @@ fi echo "Syncing development artifacts into rootfs..." # ── Init scripts and utilities ───────────────────────────────────────── -for script in openshell-vm-init.sh openshell-vm-exec-agent.py check-vm-capabilities.sh hello-server.py; do +for script in openshell-vm-init.sh openshell-vm-exec-agent.py check-vm-capabilities.sh; do src="${SCRIPT_DIR}/${script}" dst="${ROOTFS_DIR}/srv/${script}" if [ -f "$src" ]; then diff --git a/tasks/test.toml b/tasks/test.toml index 7bd88708d..dc2338195 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -51,6 +51,6 @@ run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2 description = "Boot openshell-vm and run smoke e2e (macOS ARM64; pass -- --vm-port=N [--vm-name=NAME] to reuse)" depends = ["vm:build:binary", "vm:codesign", "vm:bundle-runtime", "vm:rootfs"] run = [ - "tasks/scripts/sync-vm-rootfs.sh", - "tasks/scripts/e2e-vm.sh", + "tasks/scripts/vm/sync-vm-rootfs.sh", + "e2e/rust/e2e-vm.sh", ] diff --git a/tasks/vm.toml b/tasks/vm.toml index 5d61a60b5..756bbcbe6 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -12,9 +12,9 @@ description = "Build and run the standalone openshell-vm 
microVM" depends = ["build:docker:gateway"] run = [ "mise run vm:build:embedded", - "tasks/scripts/ensure-vm-rootfs.sh", - "tasks/scripts/sync-vm-rootfs.sh", - "tasks/scripts/run-vm.sh", + "tasks/scripts/vm/ensure-vm-rootfs.sh", + "tasks/scripts/vm/sync-vm-rootfs.sh", + "tasks/scripts/vm/run-vm.sh", ] hide = false @@ -23,8 +23,8 @@ description = "Force a fresh openshell-vm rebuild, including the rootfs" depends = ["build:docker:gateway"] run = [ "mise run vm:build:embedded", - "OPENSHELL_VM_FORCE_ROOTFS_REBUILD=1 tasks/scripts/ensure-vm-rootfs.sh", - "tasks/scripts/sync-vm-rootfs.sh", + "OPENSHELL_VM_FORCE_ROOTFS_REBUILD=1 tasks/scripts/vm/ensure-vm-rootfs.sh", + "tasks/scripts/vm/sync-vm-rootfs.sh", ] hide = false @@ -37,8 +37,7 @@ description = "Build openshell-vm with embedded runtime (single binary, no sidec run = [ "mise run vm:runtime:compress", "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", - "tasks/scripts/codesign-openshell-vm.sh", - "tasks/scripts/bundle-vm-runtime.sh", + "tasks/scripts/vm/codesign-openshell-vm.sh", ] hide = false @@ -58,7 +57,7 @@ hide = true ["vm:runtime:compress"] description = "Gather and compress VM runtime artifacts for embedding" -run = "tasks/scripts/compress-vm-runtime.sh" +run = "tasks/scripts/vm/compress-vm-runtime.sh" hide = false ["vm:runtime:build-libkrunfw"] @@ -68,12 +67,12 @@ hide = false ["vm:runtime:build-libkrun"] description = "Build libkrun and libkrunfw from source (Linux only)" -run = "tasks/scripts/build-libkrun.sh" +run = "tasks/scripts/vm/build-libkrun.sh" hide = false ["vm:runtime:build-libkrun-macos"] description = "Build portable libkrun from source (macOS, no GPU deps)" -run = "tasks/scripts/build-libkrun-macos.sh" +run = "tasks/scripts/vm/build-libkrun-macos.sh" hide = false # ═══════════════════════════════════════════════════════════════════════════ @@ -82,17 +81,17 @@ hide = false ["vm:rootfs"] description = "Build the default openshell-vm rootfs if 
needed" -run = "tasks/scripts/ensure-vm-rootfs.sh" +run = "tasks/scripts/vm/ensure-vm-rootfs.sh" hide = true ["vm:build:rootfs-tarball"] description = "Build and compress FULL rootfs tarball for embedding (~2GB+)" -run = "tasks/scripts/build-rootfs-tarball.sh" +run = "tasks/scripts/vm/build-rootfs-tarball.sh" hide = false -["vm:build:rootfs-tarball:minimal"] -description = "Build and compress MINIMAL rootfs tarball for embedding (~200-300MB)" -run = "tasks/scripts/build-rootfs-tarball.sh --minimal" +["vm:build:rootfs-tarball:base"] +description = "Build and compress BASE rootfs tarball for embedding (~200-300MB)" +run = "tasks/scripts/vm/build-rootfs-tarball.sh --base" hide = false ["vm:build:embedded:quick"] @@ -100,14 +99,14 @@ description = "Build embedded binary using cached rootfs tarball (skips rootfs r run = [ "mise run vm:runtime:compress", "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", - "tasks/scripts/codesign-openshell-vm.sh", + "tasks/scripts/vm/codesign-openshell-vm.sh", ] hide = false ["vm:codesign"] description = "Codesign the openshell-vm binary for Hypervisor.framework access on macOS" depends = ["vm:build:binary"] -run = "tasks/scripts/codesign-openshell-vm.sh" +run = "tasks/scripts/vm/codesign-openshell-vm.sh" hide = true ["vm:check-capabilities"]