From 2ee61ee7608b31ad10c37667b2d490570d290003 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 14:59:07 -0700 Subject: [PATCH 01/10] feat(bootstrap): resume gateway from existing state and persist SSH handshake secret Add a resume code path to gateway start so existing Docker volume state (k3s, etcd, sandboxes, secrets) is reused instead of requiring a full destroy/recreate cycle. When the container is gone but the volume remains (e.g. Docker restart), the CLI automatically creates a new container with the existing volume and reconciles PKI and secrets. Move the SSH handshake HMAC secret from ephemeral generation in the cluster entrypoint (regenerated on every container start) to a Kubernetes Secret that persists in etcd on the Docker volume. This ensures sandbox SSH sessions survive container restarts. Key changes: - Add DeployOptions.resume flag with resume branch in deploy flow - Add cleanup_gateway_container for volume-preserving failure cleanup - Auto-resume in gateway_admin_deploy (stopped/volume-only states) - Auto-bootstrap tries resume first, falls back to recreate - Add unless-stopped Docker restart policy to gateway container - Reconcile SSH handshake secret as K8s Secret alongside TLS PKI - Update Helm chart to read secret via secretKeyRef - Add SSH handshake secret to cluster health check Closes #487 --- crates/openshell-bootstrap/src/constants.rs | 2 + crates/openshell-bootstrap/src/docker.rs | 51 +++++- crates/openshell-bootstrap/src/lib.rs | 146 ++++++++++++++++-- crates/openshell-cli/src/bootstrap.rs | 93 ++++++----- crates/openshell-cli/src/run.rs | 61 +++----- deploy/docker/cluster-entrypoint.sh | 11 +- deploy/docker/cluster-healthcheck.sh | 3 + .../helm/openshell/templates/statefulset.yaml | 5 +- deploy/helm/openshell/values.yaml | 8 +- .../kube/manifests/openshell-helmchart.yaml | 1 - tasks/scripts/cluster-deploy-fast.sh | 14 +- 11 files changed, 279 insertions(+), 116 deletions(-) diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index ff283b3ea..74e381fd2 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls"; pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; /// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes). pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; +/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). +pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake"; pub fn container_name(name: &str) -> String { format!("openshell-cluster-{name}") diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index cc63aacce..157823bf3 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -9,7 +9,8 @@ use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::{ ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum, - NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest, + NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, RestartPolicy, + RestartPolicyNameEnum, VolumeCreateRequest, }; use bollard::query_parameters::{ CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, @@ -555,6 +556,12 @@ pub async fn ensure_container( port_bindings: Some(port_bindings), binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), network_mode: Some(network_name(name)), + // Automatically restart the container when Docker restarts, unless the + // user explicitly stopped it with `gateway stop`. + restart_policy: Some(RestartPolicy { + name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), + maximum_retry_count: None, + }), // Add host gateway aliases for DNS resolution. // This allows both the entrypoint script and the running gateway // process to reach services on the Docker host. @@ -956,6 +963,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() Ok(()) } +/// Clean up the gateway container and network, preserving the persistent volume. +/// +/// Used when a resume attempt fails — we want to remove the container we may +/// have just created but keep the volume so the user can retry without losing +/// their k3s/etcd state and sandbox data. +pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> { + let container_name = container_name(name); + let net_name = network_name(name); + + // Disconnect container from network + let _ = docker + .disconnect_network( + &net_name, + NetworkDisconnectRequest { + container: container_name.clone(), + force: Some(true), + }, + ) + .await; + + let _ = stop_container(docker, &container_name).await; + + let remove_container = docker + .remove_container( + &container_name, + Some(RemoveContainerOptions { + force: true, + ..Default::default() + }), + ) + .await; + if let Err(err) = remove_container + && !is_not_found(&err) + { + return Err(err).into_diagnostic(); + } + + force_remove_network(docker, &net_name).await?; + + Ok(()) +} + /// Forcefully remove a Docker network, disconnecting any remaining /// containers first. This ensures that stale Docker network endpoints /// cannot prevent port bindings from being released. diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 7dcabe052..7aba7c630 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -26,13 +26,13 @@ use miette::{IntoDiagnostic, Result}; use std::sync::{Arc, Mutex}; use crate::constants::{ - CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, network_name, - volume_name, + CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, + SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name, }; use crate::docker::{ - check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container, - ensure_image, ensure_network, ensure_volume, resolve_gpu_device_ids, start_container, - stop_container, + check_existing_gateway, check_port_conflicts, cleanup_gateway_container, + destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume, + resolve_gpu_device_ids, start_container, stop_container, }; use crate::metadata::{ create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, @@ -123,6 +123,11 @@ pub struct DeployOptions { /// When false, an existing gateway is left as-is and deployment is /// skipped (the caller is responsible for prompting the user first). pub recreate: bool, + /// When true, resume from existing state (volume, stopped container) + /// instead of erroring out. The deploy flow reuses the existing volume + /// and creates a new container if needed, preserving k3s/etcd state, + /// sandbox pods, and secrets. + pub resume: bool, } impl DeployOptions { @@ -139,6 +144,7 @@ impl DeployOptions { registry_token: None, gpu: vec![], recreate: false, + resume: false, } } @@ -208,6 +214,13 @@ impl DeployOptions { self.recreate = recreate; self } + + /// Set whether to resume from existing state (volume, stopped container). + #[must_use] + pub fn with_resume(mut self, resume: bool) -> Self { + self.resume = resume; + self + } } #[derive(Debug, Clone)] @@ -272,6 +285,7 @@ where let registry_token = options.registry_token; let gpu = options.gpu; let recreate = options.recreate; + let resume = options.resume; // Wrap on_log in Arc> so we can share it with pull_remote_image // which needs a 'static callback for the bollard streaming pull. @@ -308,12 +322,28 @@ where .and_then(|info| info.cdi_spec_dirs) .is_some_and(|dirs| !dirs.is_empty()); - // If an existing gateway is found, either tear it down (when recreate is - // requested) or bail out so the caller can prompt the user / reuse it. + // Guard: recreate takes precedence if both are set (shouldn't happen in practice). + if resume && recreate { + tracing::warn!("both resume and recreate set; recreate takes precedence"); + } + + // If an existing gateway is found, decide how to proceed: + // - recreate: destroy everything and start fresh + // - resume: keep existing state and create/start the container + // - neither: error out (caller should prompt the user) if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { if recreate { log("[status] Removing existing gateway".to_string()); destroy_gateway_resources(&target_docker, &name).await?; + } else if resume { + if existing.container_running { + log("[status] Gateway is already running".to_string()); + } else { + log("[status] Resuming gateway from existing state".to_string()); + } + // Fall through to ensure_* calls — they are idempotent and will + // reuse the existing volume, create a container if needed, and + // start it. } else { return Err(miette::miette!( "Gateway '{name}' already exists (container_running={}).\n\ @@ -476,6 +506,11 @@ where store_pki_bundle(&name, &pki_bundle)?; + // Reconcile SSH handshake secret: reuse existing K8s secret if present, + // generate and persist a new one otherwise. This secret is stored in etcd + // (on the persistent volume) so it survives container restarts. + reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?; + // Push locally-built component images into the k3s containerd runtime. // This is the "push" path for local development — images are exported from // the local Docker daemon and streamed into the cluster's containerd so @@ -545,15 +580,30 @@ where docker: target_docker, }), Err(deploy_err) => { - // Automatically clean up Docker resources (volume, container, network, - // image) so the environment is left in a retryable state. - tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); - if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { - tracing::warn!( - "automatic cleanup after failed deploy also failed: {cleanup_err}. \ - Manual cleanup may be required: \ - openshell gateway destroy --name {name}" + if resume { + // When resuming, preserve the volume so the user can retry. + // Only clean up the container and network that we may have created. + tracing::info!( + "resume failed, cleaning up container for '{name}' (preserving volume)" ); + if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed resume also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } + } else { + // Automatically clean up Docker resources (volume, container, network, + // image) so the environment is left in a retryable state. + tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); + if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed deploy also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } } Err(deploy_err) } @@ -858,6 +908,72 @@ where Ok((bundle, true)) } +/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret. +/// +/// If the secret already exists in the cluster, this is a no-op. Otherwise a +/// fresh 32-byte hex secret is generated and applied. Because the secret lives +/// in etcd (backed by the persistent Docker volume), it survives container +/// restarts without regeneration — existing sandbox SSH sessions remain valid. +async fn reconcile_ssh_handshake_secret(docker: &Docker, name: &str, log: &F) -> Result<()> +where + F: Fn(String) + Sync, +{ + use miette::WrapErr; + + let cname = container_name(name); + let kubeconfig = constants::KUBECONFIG_PATH; + + // Check if the secret already exists. + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null" + ), + ], + ) + .await?; + + if exit_code == 0 && !output.trim().is_empty() { + tracing::debug!( + "existing SSH handshake secret found ({} bytes encoded)", + output.trim().len() + ); + log("[progress] Reusing existing SSH handshake secret".to_string()); + return Ok(()); + } + + // Generate a new 32-byte hex secret and create the K8s secret. + log("[progress] Generating SSH handshake secret".to_string()); + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \ + KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \ + --from-literal=secret=$SECRET --dry-run=client -o yaml | \ + KUBECONFIG={kubeconfig} kubectl apply -f -" + ), + ], + ) + .await?; + + if exit_code != 0 { + return Err(miette::miette!( + "failed to create SSH handshake secret (exit {exit_code}): {output}" + )) + .wrap_err("failed to apply SSH handshake secret"); + } + + Ok(()) +} + /// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`]. /// /// Returns an error string describing why secrets couldn't be loaded (for logging). diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index ea6410b91..9cbe8332a 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,47 +144,62 @@ pub async fn run_bootstrap( ); eprintln!(); - // Auto-bootstrap always recreates if stale Docker resources are found - // (e.g. metadata was deleted but container/volume still exist). - let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true); - if let Some(dest) = remote { - let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); - if let Some(key) = ssh_key { - remote_opts = remote_opts.with_ssh_key(key); + // Build base deploy options. The auto-bootstrap path tries to resume from + // existing state first (preserving sandboxes and secrets), falling back to + // a full recreate if the resume fails. + let build_options = |resume: bool, recreate: bool| { + let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) + .with_resume(resume) + .with_recreate(recreate) + .with_gpu(gpu); + if let Some(dest) = remote { + let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); + if let Some(key) = ssh_key { + remote_opts = remote_opts.with_ssh_key(key); + } + opts = opts.with_remote(remote_opts); } - options = options.with_remote(remote_opts); - } - // Read registry credentials from environment for the auto-bootstrap path. - // The explicit `--registry-username` / `--registry-token` flags are only - // on `gateway start`; when bootstrapping via `sandbox create`, the env - // vars are the mechanism. - if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") - && !username.trim().is_empty() - { - options = options.with_registry_username(username); - } - if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") - && !token.trim().is_empty() - { - options = options.with_registry_token(token); - } - // Read gateway host override from environment. Needed whenever the - // client cannot reach the Docker host at 127.0.0.1 — CI containers, - // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag - // is only on `gateway start`; this env var covers the auto-bootstrap - // path triggered by `sandbox create`. - if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") - && !host.trim().is_empty() + // Read registry credentials from environment for the auto-bootstrap path. + // The explicit `--registry-username` / `--registry-token` flags are only + // on `gateway start`; when bootstrapping via `sandbox create`, the env + // vars are the mechanism. + if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") + && !username.trim().is_empty() + { + opts = opts.with_registry_username(username); + } + if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") + && !token.trim().is_empty() + { + opts = opts.with_registry_token(token); + } + // Read gateway host override from environment. Needed whenever the + // client cannot reach the Docker host at 127.0.0.1 — CI containers, + // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag + // is only on `gateway start`; this env var covers the auto-bootstrap + // path triggered by `sandbox create`. + if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") + && !host.trim().is_empty() + { + opts = opts.with_gateway_host(host); + } + opts + }; + + // Try resume first to preserve existing sandboxes and secrets. + let handle = match deploy_gateway_with_panel( + build_options(true, false), + &gateway_name, + location, + ) + .await { - options = options.with_gateway_host(host); - } - options = options.with_gpu(if gpu { - vec!["auto".to_string()] - } else { - vec![] - }); - - let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; + Ok(handle) => handle, + Err(resume_err) => { + tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); + deploy_gateway_with_panel(build_options(false, true), &gateway_name, location).await? + } + }; let server = handle.gateway_endpoint().to_string(); print_deploy_summary(&gateway_name, &handle); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index c4f2833d2..4bdb94c56 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1369,49 +1369,27 @@ pub async fn gateway_admin_deploy( opts }); - // Check whether a gateway already exists. If so, prompt the user (unless - // --recreate was passed or we're in non-interactive mode). - let mut should_recreate = recreate; + // Check whether a gateway already exists and decide how to proceed: + // - --recreate: always destroy and start fresh + // - existing state (stopped container / volume only): auto-resume + // - already running: return immediately (nothing to do) + let should_recreate = recreate; + let mut should_resume = false; if let Some(existing) = openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? { - if !should_recreate { - let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal(); - if interactive { - let status = if existing.container_running { - "running" - } else if existing.container_exists { - "stopped" - } else { - "volume only" - }; - eprintln!(); - eprintln!( - "{} Gateway '{name}' already exists ({status}).", - "!".yellow().bold() - ); - if let Some(image) = &existing.container_image { - eprintln!(" {} {}", "Image:".dimmed(), image); - } - eprintln!(); - eprint!("Destroy and recreate? [y/N] "); - std::io::stderr().flush().ok(); - let mut input = String::new(); - std::io::stdin() - .read_line(&mut input) - .into_diagnostic() - .wrap_err("failed to read user input")?; - let choice = input.trim().to_lowercase(); - should_recreate = choice == "y" || choice == "yes"; - if !should_recreate { - eprintln!("Keeping existing gateway."); - return Ok(()); - } - } else { - // Non-interactive mode: reuse existing gateway silently. - eprintln!("Gateway '{name}' already exists, reusing."); - return Ok(()); - } + if should_recreate { + // --recreate flag: fall through to destroy and redeploy. + } else if existing.container_running { + // Already running — nothing to do. + eprintln!( + "{} Gateway '{name}' is already running.", + "✓".green().bold() + ); + return Ok(()); + } else { + // Stopped container or volume-only: auto-resume from existing state. + should_resume = true; } } @@ -1420,7 +1398,8 @@ pub async fn gateway_admin_deploy( .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) - .with_recreate(should_recreate); + .with_recreate(should_recreate) + .with_resume(should_resume); if let Some(opts) = remote_opts { options = options.with_remote(opts); } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index d4717d88e..367665db3 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -457,10 +457,10 @@ if [ -n "${IMAGE_PULL_POLICY:-}" ] && [ -f "$HELMCHART" ]; then sed -i "s|pullPolicy: Always|pullPolicy: ${IMAGE_PULL_POLICY}|" "$HELMCHART" fi -# Generate a random SSH handshake secret for the NSSH1 HMAC handshake between -# the gateway and sandbox SSH servers. This is required — the server will refuse -# to start without it. -SSH_HANDSHAKE_SECRET="${SSH_HANDSHAKE_SECRET:-$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')}" +# SSH handshake secret: previously generated here and injected via sed into the +# HelmChart CR. Now persisted as a Kubernetes Secret (openshell-ssh-handshake) +# created by the bootstrap process after k3s starts. This ensures the secret +# survives container restarts without regeneration. # Inject SSH gateway host/port into the HelmChart manifest so the openshell # server returns the correct address to CLI clients for SSH proxy CONNECT. @@ -479,9 +479,6 @@ if [ -f "$HELMCHART" ]; then # Clear the placeholder so the default (8080) is used sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART" fi - echo "Setting SSH handshake secret" - sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" - # Disable gateway auth: when set, the server accepts connections without # client certificates (for reverse-proxy / Cloudflare Tunnel deployments). if [ "${DISABLE_GATEWAY_AUTH:-}" = "true" ]; then diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index 68210b456..1bf76f71f 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -68,3 +68,6 @@ if [ "${DISABLE_TLS:-}" != "true" ]; then kubectl -n openshell get secret openshell-server-tls >/dev/null 2>&1 || exit 1 kubectl -n openshell get secret openshell-client-tls >/dev/null 2>&1 || exit 1 fi + +# Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets) +kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1 diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 1be8f14ab..ed503a78e 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -77,7 +77,10 @@ spec: value: {{ .Values.server.hostGatewayIP | quote }} {{- end }} - name: OPENSHELL_SSH_HANDSHAKE_SECRET - value: {{ required "server.sshHandshakeSecret is required" .Values.server.sshHandshakeSecret | quote }} + valueFrom: + secretKeyRef: + name: {{ .Values.server.sshHandshakeSecretName | quote }} + key: secret {{- if .Values.server.disableTls }} - name: OPENSHELL_DISABLE_TLS value: "true" diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index ccc8d1ffa..d698e8120 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -85,10 +85,10 @@ server: sshGatewayPort: 0 # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. - # HMAC secret used for the NSSH1 handshake between gateway and sandbox SSH. - # Required — the server will refuse to start if empty. For cluster deployments - # this is auto-generated by the entrypoint script. - sshHandshakeSecret: "" + # Name of the Kubernetes Secret holding the NSSH1 HMAC handshake key. + # The secret must contain a `secret` key with the hex-encoded HMAC key. + # For cluster deployments this is auto-created by the bootstrap process. + sshHandshakeSecretName: "openshell-ssh-handshake" # Host gateway IP for sandbox pod hostAliases. When set, sandbox pods get # hostAliases entries mapping host.docker.internal and host.openshell.internal # to this IP, allowing them to reach services running on the Docker host. diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 2245c72ed..ae22ddc6a 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -32,7 +32,6 @@ spec: sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ - sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080" hostGatewayIP: __HOST_GATEWAY_IP__ disableGatewayAuth: __DISABLE_GATEWAY_AUTH__ diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 600bdd6c7..307e76233 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -408,12 +408,13 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then # terminates mTLS (there is no server.tls.enabled toggle). Without this, # a prior Helm override or chart default change could silently regress # sandbox callbacks to plaintext. - # Retrieve the existing handshake secret from the running release, or generate - # a new one if this is the first deploy with the mandatory secret. - EXISTING_SECRET=$(cluster_exec "helm get values openshell -n openshell -o json 2>/dev/null \ - | grep -o '\"sshHandshakeSecret\":\"[^\"]*\"' \ - | cut -d'\"' -f4") || true - SSH_HANDSHAKE_SECRET="${EXISTING_SECRET:-$(openssl rand -hex 32)}" + # Ensure the SSH handshake K8s secret exists. The bootstrap process normally + # creates it, but fast-deploy may run before bootstrap on a fresh cluster. + EXISTING_SECRET=$(cluster_exec "kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null | base64 -d" 2>/dev/null) || true + if [ -z "${EXISTING_SECRET}" ]; then + SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)" + cluster_exec "kubectl -n openshell create secret generic openshell-ssh-handshake --from-literal=secret='${SSH_HANDSHAKE_SECRET}' --dry-run=client -o yaml | kubectl apply -f -" + fi # Retrieve the host gateway IP from the entrypoint-rendered HelmChart CR so # that hostAliases for host.openshell.internal are preserved across fast deploys. @@ -433,7 +434,6 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.certSecretName=openshell-server-tls \ --set server.tls.clientCaSecretName=openshell-server-client-ca \ --set server.tls.clientTlsSecretName=openshell-client-tls \ - --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ ${HOST_GATEWAY_ARGS} \ ${helm_wait_args}" helm_end=$(date +%s) From 55f30bcb25b3eda63f8859666f227855f6154133 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 21:43:22 -0700 Subject: [PATCH 02/10] add e2e tests --- e2e/rust/tests/gateway_resume.rs | 324 +++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 e2e/rust/tests/gateway_resume.rs diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs new file mode 100644 index 000000000..b9e9b9710 --- /dev/null +++ b/e2e/rust/tests/gateway_resume.rs @@ -0,0 +1,324 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! E2E tests for gateway resume from existing state. +//! +//! These tests verify that `openshell gateway start` resumes from existing +//! Docker volume state (after stop or container removal) and that the SSH +//! handshake secret persists across container restarts. +//! +//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. + +use std::process::{Command, Stdio}; +use std::time::Duration; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::strip_ansi; +use tokio::time::sleep; + +/// Default gateway name used by the e2e cluster. +const GATEWAY_NAME: &str = "openshell"; + +/// Docker container name for the default gateway. +fn container_name() -> String { + format!("openshell-cluster-{GATEWAY_NAME}") +} + +/// Run `openshell ` and return (combined output, exit code). +async fn run_cli(args: &[&str]) -> (String, i32) { + let mut cmd = openshell_cmd(); + cmd.args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code) +} + +/// Run `docker ` synchronously and return (stdout, exit code). +fn docker_cmd(args: &[&str]) -> (String, i32) { + let output = Command::new("docker") + .args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .expect("spawn docker"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let code = output.status.code().unwrap_or(-1); + (stdout, code) +} + +/// Wait for the gateway to become healthy by polling `openshell status`. +async fn wait_for_healthy(timeout: Duration) { + let start = std::time::Instant::now(); + loop { + let (output, code) = run_cli(&["status"]).await; + let clean = strip_ansi(&output).to_lowercase(); + if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("✓")) { + return; + } + if start.elapsed() > timeout { + panic!( + "gateway did not become healthy within {}s. Last output:\n{}", + timeout.as_secs(), + strip_ansi(&output) + ); + } + sleep(Duration::from_secs(3)).await; + } +} + +/// Read the SSH handshake secret from the K8s secret inside the cluster. +fn read_ssh_handshake_secret() -> Option { + let cname = container_name(); + let (output, code) = docker_cmd(&[ + "exec", + &cname, + "sh", + "-c", + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null", + ]); + if code == 0 && !output.trim().is_empty() { + Some(output.trim().to_string()) + } else { + None + } +} + +// ------------------------------------------------------------------- +// Test: `gateway start` on an already-running gateway succeeds +// ------------------------------------------------------------------- + +/// When the gateway is already running, `openshell gateway start` should +/// return immediately with exit code 0 and indicate it's already running. +#[tokio::test] +async fn gateway_start_on_running_gateway_succeeds() { + // Precondition: gateway is running (e2e cluster is up). + wait_for_healthy(Duration::from_secs(30)).await; + + let (output, code) = run_cli(&["gateway", "start"]).await; + let clean = strip_ansi(&output); + + assert_eq!( + code, 0, + "gateway start on running gateway should exit 0:\n{clean}" + ); + assert!( + clean.to_lowercase().contains("already running"), + "output should indicate gateway is already running:\n{clean}" + ); +} + +// ------------------------------------------------------------------- +// Test: gateway stop → start resumes, sandbox survives +// ------------------------------------------------------------------- + +/// After `gateway stop` then `gateway start`, the gateway should resume +/// from existing state. A sandbox created before the stop should still +/// appear in the sandbox list after restart. +#[tokio::test] +async fn gateway_stop_start_resumes_with_sandbox() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Create a sandbox that we'll check for after restart. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + // Extract sandbox name from output. + let sandbox_name = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Stop the gateway. + let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!( + stop_code, 0, + "gateway stop should succeed:\n{}", + strip_ansi(&stop_output) + ); + + // Wait a moment for the container to fully stop. + sleep(Duration::from_secs(3)).await; + + // Verify container is stopped. + let (inspect_out, _) = docker_cmd(&[ + "inspect", + "-f", + "{{.State.Running}}", + &container_name(), + ]); + assert_eq!( + inspect_out.trim(), + "false", + "container should be stopped after gateway stop" + ); + + // Start the gateway again — should resume from existing state. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after stop should succeed:\n{clean_start}" + ); + + // Wait for the gateway to become healthy again. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify the sandbox still exists. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}" + ); + + // Cleanup: delete the test sandbox. + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// ------------------------------------------------------------------- +// Test: container removed → gateway start resumes +// ------------------------------------------------------------------- + +/// After the Docker container is force-removed (simulating Docker restart), +/// `openshell gateway start` should resume from the existing volume. +#[tokio::test] +async fn gateway_start_resumes_after_container_removal() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Create a sandbox to verify state persistence. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + let sandbox_name = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Force-remove the container (simulates Docker restart / OOM kill). + let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); + assert_eq!(rm_code, 0, "docker rm -f should succeed"); + + // Verify the volume still exists. + let (vol_out, vol_code) = docker_cmd(&[ + "volume", + "inspect", + &format!("openshell-cluster-{GATEWAY_NAME}"), + ]); + assert_eq!( + vol_code, 0, + "volume should still exist after container removal:\n{vol_out}" + ); + + // Start the gateway — should resume from the volume. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after container removal should succeed:\n{clean_start}" + ); + + // Wait for healthy. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify sandbox survived. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}" + ); + + // Cleanup. + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// ------------------------------------------------------------------- +// Test: SSH handshake secret persists across container restart +// ------------------------------------------------------------------- + +/// The SSH handshake K8s secret should persist across gateway stop/start +/// cycles — the same base64-encoded value should be returned before and +/// after the restart. +#[tokio::test] +async fn ssh_handshake_secret_persists_across_restart() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Read the SSH handshake secret before restart. + let secret_before = read_ssh_handshake_secret() + .expect("SSH handshake secret should exist before restart"); + assert!( + !secret_before.is_empty(), + "SSH handshake secret should not be empty" + ); + + // Stop the gateway. + let (_, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!(stop_code, 0, "gateway stop should succeed"); + + sleep(Duration::from_secs(3)).await; + + // Start the gateway. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + assert_eq!( + start_code, 0, + "gateway start should succeed:\n{}", + strip_ansi(&start_output) + ); + + // Wait for healthy. + wait_for_healthy(Duration::from_secs(180)).await; + + // Read the secret after restart. + let secret_after = read_ssh_handshake_secret() + .expect("SSH handshake secret should exist after restart"); + + assert_eq!( + secret_before, secret_after, + "SSH handshake secret should be identical before and after restart" + ); +} From a2c278f5cd1e6344ee98a5c5b1a6f352db7b26b0 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 22:24:38 -0700 Subject: [PATCH 03/10] fix(bootstrap): handle stale network and PKI on gateway resume On resume after container kill, ensure_network destroys and recreates the Docker network with a new ID. The stopped container still referenced the old network ID, causing 'network not found' on start. Fix by reconciling the container's network attachment in ensure_container. Also, reconcile_pki was attempting to load K8s secrets before k3s had booted, failing transiently, and regenerating PKI unnecessarily. This triggered a server rollout restart causing TLS errors. Fix by waiting for the openshell namespace before attempting to read existing secrets. Add gRPC readiness check to gateway_admin_deploy so the CLI waits for the server to accept connections before declaring the gateway ready. Add e2e test covering container kill, stale network, sandbox persistence, and sandbox create after resume. --- crates/openshell-bootstrap/src/docker.rs | 82 ++++++++++++++++- crates/openshell-bootstrap/src/lib.rs | 12 ++- crates/openshell-cli/src/bootstrap.rs | 2 +- crates/openshell-cli/src/run.rs | 9 ++ e2e/rust/tests/gateway_resume.rs | 110 +++++++++++++++++++++++ 5 files changed, 207 insertions(+), 8 deletions(-) diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 157823bf3..bca30e073 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -8,9 +8,9 @@ use bollard::API_DEFAULT_VERSION; use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::{ - ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum, - NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, RestartPolicy, - RestartPolicyNameEnum, VolumeCreateRequest, + ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum, + NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, + RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest, }; use bollard::query_parameters::{ CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, @@ -506,6 +506,17 @@ pub async fn ensure_container( }; if image_matches { + // The container exists with the correct image, but its network + // attachment may be stale. When the gateway is resumed after a + // container kill, `ensure_network` destroys and recreates the + // Docker network (giving it a new ID). The stopped container + // still references the old network ID, so `docker start` would + // fail with "network not found". + // + // Fix: disconnect from any existing networks and reconnect to + // the current (just-created) network before returning. + let expected_net = network_name(name); + reconcile_container_network(docker, &container_name, &expected_net).await?; return Ok(()); } @@ -1042,6 +1053,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> { } } +/// Ensure a stopped container is connected to the expected Docker network. +/// +/// When a gateway is resumed after the container was killed (but not removed), +/// `ensure_network` destroys and recreates the network with a new ID. The +/// stopped container still holds a reference to the old network ID in its +/// config, so `docker start` would fail with a 404 "network not found" error. +/// +/// This function disconnects the container from any networks that no longer +/// match the expected network name and connects it to the correct one. +async fn reconcile_container_network( + docker: &Docker, + container_name: &str, + expected_network: &str, +) -> Result<()> { + let info = docker + .inspect_container(container_name, None::) + .await + .into_diagnostic() + .wrap_err("failed to inspect container for network reconciliation")?; + + // Check the container's current network attachments via NetworkSettings. + let attached_networks: Vec = info + .network_settings + .as_ref() + .and_then(|ns| ns.networks.as_ref()) + .map(|nets| nets.keys().cloned().collect()) + .unwrap_or_default(); + + // If the container is already attached to the expected network (by name), + // Docker will resolve the name to the current network ID on start. + // However, when the network was destroyed and recreated, the container's + // stored endpoint references the old ID. Disconnect and reconnect to + // pick up the new network ID. + for net_name in &attached_networks { + let _ = docker + .disconnect_network( + net_name, + NetworkDisconnectRequest { + container: container_name.to_string(), + force: Some(true), + }, + ) + .await; + } + + // Connect to the (freshly created) expected network. + docker + .connect_network( + expected_network, + NetworkConnectRequest { + container: container_name.to_string(), + endpoint_config: Some(EndpointSettings::default()), + }, + ) + .await + .into_diagnostic() + .wrap_err("failed to connect container to gateway network")?; + + tracing::debug!( + "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}" + ); + + Ok(()) +} + fn is_not_found(err: &BollardError) -> bool { matches!( err, diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 7aba7c630..aabc11f9f 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -880,6 +880,14 @@ where let cname = container_name(name); let kubeconfig = constants::KUBECONFIG_PATH; + // Wait for the k3s API server and openshell namespace before attempting + // to read secrets. Without this, kubectl fails transiently on resume + // (k3s hasn't booted yet), the code assumes secrets are gone, and + // regenerates PKI unnecessarily — triggering a server rollout restart + // and TLS errors for in-flight connections. + log("[progress] Waiting for openshell namespace".to_string()); + wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; + // Try to load existing secrets. match load_existing_pki_bundle(docker, &cname, kubeconfig).await { Ok(bundle) => { @@ -894,10 +902,6 @@ where } // Generate fresh PKI and apply to cluster. - // Namespace may still be creating on first bootstrap, so wait here only - // when rotation is actually needed. - log("[progress] Waiting for openshell namespace".to_string()); - wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; log("[progress] Generating TLS certificates".to_string()); let bundle = generate_pki(extra_sans)?; log("[progress] Applying TLS secrets to gateway".to_string()); diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 9cbe8332a..0928991ec 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -226,7 +226,7 @@ pub async fn run_bootstrap( /// Retry connecting to the gateway gRPC endpoint until it succeeds or a /// timeout is reached. Uses exponential backoff starting at 500 ms, doubling /// up to 4 s, with a total deadline of 30 s. -async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { +pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { const MAX_WAIT: Duration = Duration::from_secs(30); const INITIAL_BACKOFF: Duration = Duration::from_millis(500); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 4bdb94c56..964988d4d 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1415,6 +1415,15 @@ pub async fn gateway_admin_deploy( let handle = deploy_gateway_with_panel(options, name, location).await?; + // Wait for the gRPC endpoint to actually accept connections before + // declaring the gateway ready. The Docker health check may pass before + // the gRPC listener inside the pod is fully bound. + let server = handle.gateway_endpoint().to_string(); + let tls = TlsOptions::default() + .with_gateway_name(name) + .with_default_paths(&server); + crate::bootstrap::wait_for_grpc_ready(&server, &tls).await?; + print_deploy_summary(name, &handle); // Auto-activate: set this gateway as the active gateway. diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index b9e9b9710..dbe091ccb 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -276,6 +276,116 @@ async fn gateway_start_resumes_after_container_removal() { let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; } +// ------------------------------------------------------------------- +// Test: container killed → gateway start resumes, sandboxes survive, +// new sandbox create works +// ------------------------------------------------------------------- + +/// When a container is killed (stopped but NOT removed), `gateway start` +/// should resume from existing state. This validates three things: +/// +/// 1. The stale Docker network reference is reconciled (ensure_network +/// destroys and recreates the network with a new ID). +/// 2. Existing sandboxes created before the kill survive the restart. +/// 3. New `sandbox create` works after resume — the TLS certificates +/// are reused (not needlessly regenerated), so the CLI's mTLS certs +/// still match the server. +#[tokio::test] +async fn gateway_start_resumes_after_container_kill() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + let cname = container_name(); + let net_name = format!("openshell-cluster-{GATEWAY_NAME}"); + + // Create a sandbox before the kill to verify state persistence. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "kill-resume-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + let sandbox_before = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Kill the container (it remains as a stopped container, unlike `docker rm`). + let (_, kill_code) = docker_cmd(&["kill", &cname]); + assert_eq!(kill_code, 0, "docker kill should succeed"); + + sleep(Duration::from_secs(3)).await; + + // Remove the Docker network to simulate a stale network reference. + // The bootstrap `ensure_network` always destroys and recreates, so + // after this the container's stored network ID will be invalid. + let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]); + let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]); + assert_eq!( + net_rm_code, 0, + "docker network rm should succeed (or network already gone)" + ); + + // Start the gateway — must handle stale network + reuse existing PKI. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after kill should succeed:\n{clean_start}" + ); + + // Wait for the gateway to become healthy again. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify the pre-existing sandbox survived. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_before), + "sandbox '{sandbox_before}' should survive container kill + resume.\nList output:\n{clean_list}" + ); + + // Create a new sandbox to verify TLS is working end-to-end. + let (new_create_output, new_create_code) = + run_cli(&["sandbox", "create", "--", "echo", "post-resume-test"]).await; + let clean_new = strip_ansi(&new_create_output); + assert_eq!( + new_create_code, 0, + "sandbox create after resume should succeed (TLS must work):\n{clean_new}" + ); + + let sandbox_after = clean_new + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from post-resume create output"); + + // Cleanup. + let _ = run_cli(&["sandbox", "delete", &sandbox_before]).await; + let _ = run_cli(&["sandbox", "delete", &sandbox_after]).await; +} + // ------------------------------------------------------------------- // Test: SSH handshake secret persists across container restart // ------------------------------------------------------------------- From 3018f5df315c60366f0cabbf6d85ce42db0530f6 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 23:05:31 -0700 Subject: [PATCH 04/10] fix(e2e): match 'connected' status in gateway resume health check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wait_for_healthy helper checked for 'healthy', 'running', or '✓' but openshell status outputs 'Connected'. All five gateway_resume tests were failing because the health check never matched. --- e2e/rust/tests/gateway_resume.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index dbe091ccb..67d28d197 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -60,7 +60,7 @@ async fn wait_for_healthy(timeout: Duration) { loop { let (output, code) = run_cli(&["status"]).await; let clean = strip_ansi(&output).to_lowercase(); - if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("✓")) { + if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("connected") || clean.contains("✓")) { return; } if start.elapsed() > timeout { From 8494755514b411fad6156f4b46ced106af06658d Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 23:13:24 -0700 Subject: [PATCH 05/10] refactor(bootstrap): remove resume from DeployOptions, auto-detect internally The deploy flow now auto-detects whether to resume by checking for existing gateway state inside deploy_gateway_with_logs. Callers no longer need to compute and pass a resume flag. The explicit gateway start path still short-circuits for already-running gateways to avoid redundant work. --- crates/openshell-bootstrap/src/lib.rs | 45 ++++++--------------------- crates/openshell-cli/src/bootstrap.rs | 22 ++++++------- crates/openshell-cli/src/run.rs | 36 ++++++++------------- 3 files changed, 31 insertions(+), 72 deletions(-) diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index aabc11f9f..61f95fa31 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -123,11 +123,6 @@ pub struct DeployOptions { /// When false, an existing gateway is left as-is and deployment is /// skipped (the caller is responsible for prompting the user first). pub recreate: bool, - /// When true, resume from existing state (volume, stopped container) - /// instead of erroring out. The deploy flow reuses the existing volume - /// and creates a new container if needed, preserving k3s/etcd state, - /// sandbox pods, and secrets. - pub resume: bool, } impl DeployOptions { @@ -144,7 +139,6 @@ impl DeployOptions { registry_token: None, gpu: vec![], recreate: false, - resume: false, } } @@ -214,13 +208,6 @@ impl DeployOptions { self.recreate = recreate; self } - - /// Set whether to resume from existing state (volume, stopped container). - #[must_use] - pub fn with_resume(mut self, resume: bool) -> Self { - self.resume = resume; - self - } } #[derive(Debug, Clone)] @@ -285,7 +272,6 @@ where let registry_token = options.registry_token; let gpu = options.gpu; let recreate = options.recreate; - let resume = options.resume; // Wrap on_log in Arc> so we can share it with pull_remote_image // which needs a 'static callback for the bollard streaming pull. @@ -322,35 +308,22 @@ where .and_then(|info| info.cdi_spec_dirs) .is_some_and(|dirs| !dirs.is_empty()); - // Guard: recreate takes precedence if both are set (shouldn't happen in practice). - if resume && recreate { - tracing::warn!("both resume and recreate set; recreate takes precedence"); - } - // If an existing gateway is found, decide how to proceed: // - recreate: destroy everything and start fresh - // - resume: keep existing state and create/start the container - // - neither: error out (caller should prompt the user) + // - otherwise: auto-resume from existing state (the ensure_* calls are + // idempotent and will reuse the volume, create a container if needed, + // and start it) + let mut resume = false; if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { if recreate { log("[status] Removing existing gateway".to_string()); destroy_gateway_resources(&target_docker, &name).await?; - } else if resume { - if existing.container_running { - log("[status] Gateway is already running".to_string()); - } else { - log("[status] Resuming gateway from existing state".to_string()); - } - // Fall through to ensure_* calls — they are idempotent and will - // reuse the existing volume, create a container if needed, and - // start it. + } else if existing.container_running { + log("[status] Gateway is already running".to_string()); + resume = true; } else { - return Err(miette::miette!( - "Gateway '{name}' already exists (container_running={}).\n\ - Use --recreate to destroy and redeploy, or destroy it first with:\n\n \ - openshell gateway destroy --name {name}", - existing.container_running, - )); + log("[status] Resuming gateway from existing state".to_string()); + resume = true; } } diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 0928991ec..6a6e4d25e 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,12 +144,11 @@ pub async fn run_bootstrap( ); eprintln!(); - // Build base deploy options. The auto-bootstrap path tries to resume from - // existing state first (preserving sandboxes and secrets), falling back to - // a full recreate if the resume fails. - let build_options = |resume: bool, recreate: bool| { + // Build deploy options. The deploy flow auto-resumes from existing state + // (preserving sandboxes and secrets) when it finds an existing gateway. + // If the initial attempt fails, fall back to a full recreate. + let build_options = |recreate: bool| { let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) - .with_resume(resume) .with_recreate(recreate) .with_gpu(gpu); if let Some(dest) = remote { @@ -186,18 +185,15 @@ pub async fn run_bootstrap( opts }; - // Try resume first to preserve existing sandboxes and secrets. - let handle = match deploy_gateway_with_panel( - build_options(true, false), - &gateway_name, - location, - ) - .await + // Deploy the gateway. The deploy flow auto-resumes from existing state + // when it finds one. If that fails, fall back to a full recreate. + let handle = match deploy_gateway_with_panel(build_options(false), &gateway_name, location) + .await { Ok(handle) => handle, Err(resume_err) => { tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); - deploy_gateway_with_panel(build_options(false, true), &gateway_name, location).await? + deploy_gateway_with_panel(build_options(true), &gateway_name, location).await? } }; let server = handle.gateway_endpoint().to_string(); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 964988d4d..874c91a81 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1369,27 +1369,18 @@ pub async fn gateway_admin_deploy( opts }); - // Check whether a gateway already exists and decide how to proceed: - // - --recreate: always destroy and start fresh - // - existing state (stopped container / volume only): auto-resume - // - already running: return immediately (nothing to do) - let should_recreate = recreate; - let mut should_resume = false; - if let Some(existing) = - openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? - { - if should_recreate { - // --recreate flag: fall through to destroy and redeploy. - } else if existing.container_running { - // Already running — nothing to do. - eprintln!( - "{} Gateway '{name}' is already running.", - "✓".green().bold() - ); - return Ok(()); - } else { - // Stopped container or volume-only: auto-resume from existing state. - should_resume = true; + // If the gateway is already running and we're not recreating, short-circuit. + if !recreate { + if let Some(existing) = + openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? + { + if existing.container_running { + eprintln!( + "{} Gateway '{name}' is already running.", + "✓".green().bold() + ); + return Ok(()); + } } } @@ -1398,8 +1389,7 @@ pub async fn gateway_admin_deploy( .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) - .with_recreate(should_recreate) - .with_resume(should_resume); + .with_recreate(recreate); if let Some(opts) = remote_opts { options = options.with_remote(opts); } From 94be3e10ccae8df2c68f6e12aa6b6433eb517212 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 20 Mar 2026 07:01:39 -0700 Subject: [PATCH 06/10] fix(ssh): retry CONNECT on 412 when sandbox not yet ready The gateway returns HTTP 412 (Precondition Failed) when the sandbox pod exists but hasn't reached Ready phase yet. This is a transient state after allocation. Instead of failing immediately, retry with exponential backoff (1s to 8s) for up to 60 seconds. --- crates/openshell-cli/src/ssh.rs | 57 +++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 4b284bff1..369f03acb 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -693,27 +693,50 @@ pub async fn sandbox_ssh_proxy( .ok_or_else(|| miette::miette!("gateway URL missing port"))?; let connect_path = url.path(); - let mut stream: Box = - connect_gateway(scheme, gateway_host, gateway_port, tls).await?; - let request = format!( "CONNECT {connect_path} HTTP/1.1\r\nHost: {gateway_host}\r\nX-Sandbox-Id: {sandbox_id}\r\nX-Sandbox-Token: {token}\r\n\r\n" ); - stream - .write_all(request.as_bytes()) - .await - .into_diagnostic()?; - // Wrap in a BufReader **before** reading the HTTP response. The gateway - // may send the 200 OK response and the first SSH protocol bytes in the - // same TCP segment / WebSocket frame. A plain `read()` would consume - // those SSH bytes into our buffer and discard them, causing SSH to see a - // truncated protocol banner and exit with code 255. BufReader ensures - // any bytes read past the `\r\n\r\n` header boundary stay buffered and - // are returned by subsequent reads during the bidirectional copy phase. - let mut buf_stream = BufReader::new(stream); - let status = read_connect_status(&mut buf_stream).await?; - if status != 200 { + // The gateway returns 412 (Precondition Failed) when the sandbox pod + // exists but hasn't reached Ready phase yet. This is a transient state + // after sandbox allocation — retry with backoff instead of failing + // immediately. + const MAX_CONNECT_WAIT: std::time::Duration = std::time::Duration::from_secs(60); + const INITIAL_BACKOFF: std::time::Duration = std::time::Duration::from_secs(1); + + let start = std::time::Instant::now(); + let mut backoff = INITIAL_BACKOFF; + let mut buf_stream; + + loop { + let mut stream: Box = + connect_gateway(scheme, gateway_host, gateway_port, tls).await?; + stream + .write_all(request.as_bytes()) + .await + .into_diagnostic()?; + + // Wrap in a BufReader **before** reading the HTTP response. The gateway + // may send the 200 OK response and the first SSH protocol bytes in the + // same TCP segment / WebSocket frame. A plain `read()` would consume + // those SSH bytes into our buffer and discard them, causing SSH to see a + // truncated protocol banner and exit with code 255. BufReader ensures + // any bytes read past the `\r\n\r\n` header boundary stay buffered and + // are returned by subsequent reads during the bidirectional copy phase. + buf_stream = BufReader::new(stream); + let status = read_connect_status(&mut buf_stream).await?; + if status == 200 { + break; + } + if status == 412 && start.elapsed() < MAX_CONNECT_WAIT { + tracing::debug!( + elapsed = ?start.elapsed(), + "sandbox not yet ready (HTTP 412), retrying in {backoff:?}" + ); + tokio::time::sleep(backoff).await; + backoff = (backoff * 2).min(std::time::Duration::from_secs(8)); + continue; + } return Err(miette::miette!( "gateway CONNECT failed with status {status}" )); From e1bea6d4563f425877d7a589e4a4ed37528e83f4 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 20 Mar 2026 07:11:00 -0700 Subject: [PATCH 07/10] fix: resolve compile warnings across workspace - Remove duplicate Duration import and use unqualified Duration in ssh.rs - Prefix unused default_image parameter with underscore in sandbox/mod.rs - Make SecretResolver pub to match its use in pub function signature --- crates/openshell-cli/src/ssh.rs | 6 +++--- crates/openshell-sandbox/src/secrets.rs | 2 +- crates/openshell-server/src/sandbox/mod.rs | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 369f03acb..44b48da7a 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -701,8 +701,8 @@ pub async fn sandbox_ssh_proxy( // exists but hasn't reached Ready phase yet. This is a transient state // after sandbox allocation — retry with backoff instead of failing // immediately. - const MAX_CONNECT_WAIT: std::time::Duration = std::time::Duration::from_secs(60); - const INITIAL_BACKOFF: std::time::Duration = std::time::Duration::from_secs(1); + const MAX_CONNECT_WAIT: Duration = Duration::from_secs(60); + const INITIAL_BACKOFF: Duration = Duration::from_secs(1); let start = std::time::Instant::now(); let mut backoff = INITIAL_BACKOFF; @@ -734,7 +734,7 @@ pub async fn sandbox_ssh_proxy( "sandbox not yet ready (HTTP 412), retrying in {backoff:?}" ); tokio::time::sleep(backoff).await; - backoff = (backoff * 2).min(std::time::Duration::from_secs(8)); + backoff = (backoff * 2).min(Duration::from_secs(8)); continue; } return Err(miette::miette!( diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs index 88b84831e..a27537c91 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-sandbox/src/secrets.rs @@ -62,7 +62,7 @@ pub(crate) struct RewriteTargetResult { // --------------------------------------------------------------------------- #[derive(Debug, Clone, Default)] -pub(crate) struct SecretResolver { +pub struct SecretResolver { by_placeholder: HashMap, } diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index 3dca66493..9df8cc37f 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -962,6 +962,7 @@ fn sandbox_template_to_k8s( result } + fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option { let mut resources = struct_to_json(&template.resources).unwrap_or_else(|| serde_json::json!({})); From 8760a0f6d2bcd717c08e2b22bf981fdcb80fad6d Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 1 Apr 2026 13:04:14 -0700 Subject: [PATCH 08/10] fix: resolve rebase artifacts (type mismatch and formatting) --- crates/openshell-cli/src/bootstrap.rs | 6 +++++- crates/openshell-server/src/sandbox/mod.rs | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 6a6e4d25e..41ddc7081 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -150,7 +150,11 @@ pub async fn run_bootstrap( let build_options = |recreate: bool| { let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) .with_recreate(recreate) - .with_gpu(gpu); + .with_gpu(if gpu { + vec!["auto".to_string()] + } else { + vec![] + }); if let Some(dest) = remote { let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); if let Some(key) = ssh_key { diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index 9df8cc37f..3dca66493 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -962,7 +962,6 @@ fn sandbox_template_to_k8s( result } - fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option { let mut resources = struct_to_json(&template.resources).unwrap_or_else(|| serde_json::json!({})); From a6e1d22c9d497110be279bb6c8d2172ad92cd9a1 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 1 Apr 2026 13:36:46 -0700 Subject: [PATCH 09/10] fix(bootstrap): preserve container hostname across image-change recreation When a gateway is stopped and restarted with a different container image, ensure_container() removes the old container and creates a new one. The new container gets a different hostname (Docker default: container ID prefix), which k3s registers as a new node. Pods on the old node remain stuck in Terminating until the eviction timeout expires, causing the 30s health check to fail with 'connection reset by peer'. Preserve the old container's hostname before removal and set it on the replacement container so k3s sees the same node identity. For fresh containers, set the hostname to the container name for a stable default that survives future recreations. --- crates/openshell-bootstrap/src/docker.rs | 26 ++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index bca30e073..63059bf19 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -482,6 +482,12 @@ pub async fn ensure_container( ) -> Result<()> { let container_name = container_name(name); + // When an existing container is recreated due to an image change, we + // preserve its hostname so the new container registers with the same k3s + // node identity. Without this, k3s sees a brand-new node while pods on + // the old (now-dead) node remain stuck in Terminating. + let mut preserved_hostname: Option = None; + // Check if the container already exists match docker .inspect_container(&container_name, None::) @@ -520,12 +526,21 @@ pub async fn ensure_container( return Ok(()); } - // Image changed — remove the stale container so we can recreate it + // Image changed — remove the stale container so we can recreate it. + // Capture the hostname before removal so the replacement container + // keeps the same k3s node identity. + preserved_hostname = info + .config + .as_ref() + .and_then(|c| c.hostname.clone()) + .filter(|h| !h.is_empty()); + tracing::info!( - "Container {} exists but uses a different image (container={}, desired={}), recreating", + "Container {} exists but uses a different image (container={}, desired={}), recreating (preserving hostname {:?})", container_name, container_image_id.as_deref().map_or("unknown", truncate_id), desired_id.as_deref().map_or("unknown", truncate_id), + preserved_hostname, ); let _ = docker.stop_container(&container_name, None).await; @@ -732,7 +747,14 @@ pub async fn ensure_container( let env = Some(env_vars); + // Use the preserved hostname from a previous container (image-change + // recreation) so k3s keeps the same node identity. For fresh containers + // fall back to the Docker container name, giving a stable hostname that + // survives future image-change recreations. + let hostname = preserved_hostname.unwrap_or_else(|| container_name.clone()); + let config = ContainerCreateBody { + hostname: Some(hostname), image: Some(image_ref.to_string()), cmd: Some(cmd), env, From 4ba1b61921e1ef21af5e2cebfd00f29b61e718f2 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 1 Apr 2026 15:01:51 -0700 Subject: [PATCH 10/10] fix(bootstrap): robust stale node cleanup with retries and pod force-deletion Reverts the hostname preservation approach which caused k3s node password validation failures. Instead, makes clean_stale_nodes() reliable by: 1. Retrying with 3s backoff (up to ~45s) until kubectl becomes available after a container restart, instead of firing once and silently giving up. 2. Force-deleting pods stuck in Terminating on removed stale nodes so StatefulSets can immediately reschedule replacements. This fixes gateway resume failures after stop/start when the container image has changed (common in development), where the new container gets a different k3s node identity and pods on the old node never reschedule. --- .github/workflows/e2e-test.yml | 42 ++- crates/openshell-bootstrap/src/docker.rs | 49 ++- crates/openshell-bootstrap/src/lib.rs | 17 +- crates/openshell-bootstrap/src/runtime.rs | 140 +++++-- crates/openshell-cli/src/bootstrap.rs | 10 +- crates/openshell-cli/src/run.rs | 15 +- deploy/docker/cluster-healthcheck.sh | 9 + e2e/rust/tests/gateway_resume.rs | 435 +++++++++------------- rust-toolchain.toml | 5 + tasks/test.toml | 3 +- 10 files changed, 388 insertions(+), 337 deletions(-) create mode 100644 rust-toolchain.toml diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index f14ccb880..a89f4508f 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -19,9 +19,25 @@ permissions: jobs: e2e: - name: E2E + name: "E2E (${{ matrix.suite }})" runs-on: ${{ inputs.runner }} timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + include: + - suite: python + cluster: e2e-python + port: "8080" + cmd: "mise run --no-prepare --skip-deps e2e:python" + - suite: rust + cluster: e2e-rust + port: "8081" + cmd: "mise run --no-prepare --skip-deps e2e:rust" + - suite: gateway-resume + cluster: e2e-resume + port: "8082" + cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume" container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -38,6 +54,7 @@ jobs: OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_GATEWAY: ${{ matrix.cluster }} steps: - uses: actions/checkout@v4 @@ -48,21 +65,26 @@ jobs: run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - name: Install Python dependencies and generate protobuf stubs + if: matrix.suite == 'python' run: uv sync --frozen && mise run --no-prepare python:proto - - name: Bootstrap and deploy cluster + - name: Build Rust CLI + if: matrix.suite != 'python' + run: cargo build -p openshell-cli --features openshell-core/dev-settings + + - name: Install SSH client + if: matrix.suite != 'python' + run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* + + - name: Bootstrap cluster env: GATEWAY_HOST: host.docker.internal - GATEWAY_PORT: "8080" + GATEWAY_PORT: ${{ matrix.port }} + CLUSTER_NAME: ${{ matrix.cluster }} SKIP_IMAGE_PUSH: "1" SKIP_CLUSTER_IMAGE_BUILD: "1" OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} run: mise run --no-prepare --skip-deps cluster - - name: Install SSH client for Rust CLI e2e tests - run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* - - - name: Run E2E tests - run: | - mise run --no-prepare --skip-deps e2e:python - mise run --no-prepare --skip-deps e2e:rust + - name: Run tests + run: ${{ matrix.cmd }} diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 63059bf19..d9aaed7f4 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -467,6 +467,9 @@ pub async fn ensure_image( Ok(()) } +/// Returns the actual host port the container is using. When an existing +/// container is reused (same image), this may differ from `gateway_port` +/// because the container was originally created with a different port. pub async fn ensure_container( docker: &Docker, name: &str, @@ -479,15 +482,9 @@ pub async fn ensure_container( registry_username: Option<&str>, registry_token: Option<&str>, device_ids: &[String], -) -> Result<()> { +) -> Result { let container_name = container_name(name); - // When an existing container is recreated due to an image change, we - // preserve its hostname so the new container registers with the same k3s - // node identity. Without this, k3s sees a brand-new node while pods on - // the old (now-dead) node remain stuck in Terminating. - let mut preserved_hostname: Option = None; - // Check if the container already exists match docker .inspect_container(&container_name, None::) @@ -523,24 +520,31 @@ pub async fn ensure_container( // the current (just-created) network before returning. let expected_net = network_name(name); reconcile_container_network(docker, &container_name, &expected_net).await?; - return Ok(()); + + // Read the actual host port from the container's port bindings + // as a cross-check. The caller should already pass the correct + // port (from stored metadata), but this catches mismatches if + // the container was recreated with a different port externally. + let actual_port = info + .host_config + .as_ref() + .and_then(|hc| hc.port_bindings.as_ref()) + .and_then(|pb| pb.get("30051/tcp")) + .and_then(|bindings| bindings.as_ref()) + .and_then(|bindings| bindings.first()) + .and_then(|b| b.host_port.as_ref()) + .and_then(|p| p.parse::().ok()) + .unwrap_or(gateway_port); + + return Ok(actual_port); } // Image changed — remove the stale container so we can recreate it. - // Capture the hostname before removal so the replacement container - // keeps the same k3s node identity. - preserved_hostname = info - .config - .as_ref() - .and_then(|c| c.hostname.clone()) - .filter(|h| !h.is_empty()); - tracing::info!( - "Container {} exists but uses a different image (container={}, desired={}), recreating (preserving hostname {:?})", + "Container {} exists but uses a different image (container={}, desired={}), recreating", container_name, container_image_id.as_deref().map_or("unknown", truncate_id), desired_id.as_deref().map_or("unknown", truncate_id), - preserved_hostname, ); let _ = docker.stop_container(&container_name, None).await; @@ -747,14 +751,7 @@ pub async fn ensure_container( let env = Some(env_vars); - // Use the preserved hostname from a previous container (image-change - // recreation) so k3s keeps the same node identity. For fresh containers - // fall back to the Docker container name, giving a stable hostname that - // survives future image-change recreations. - let hostname = preserved_hostname.unwrap_or_else(|| container_name.clone()); - let config = ContainerCreateBody { - hostname: Some(hostname), image: Some(image_ref.to_string()), cmd: Some(cmd), env, @@ -774,7 +771,7 @@ pub async fn ensure_container( .await .into_diagnostic() .wrap_err("failed to create gateway container")?; - Ok(()) + Ok(gateway_port) } /// Information about a container that is holding a port we need. diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 61f95fa31..b569cabe7 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -429,7 +429,10 @@ where // See: https://github.com/NVIDIA/OpenShell/issues/463 let deploy_result: Result = async { let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported); - ensure_container( + // ensure_container returns the actual host port — which may differ from + // the requested `port` when reusing an existing container that was + // originally created with a different port. + let actual_port = ensure_container( &target_docker, &name, &image_ref, @@ -443,16 +446,22 @@ where &device_ids, ) .await?; + let port = actual_port; start_container(&target_docker, &name).await?; // Clean up stale k3s nodes left over from previous container instances that - // used the same persistent volume. Without this, pods remain scheduled on + // used the same persistent volume. Without this, pods remain scheduled on // NotReady ghost nodes and the health check will time out. + // + // The function retries internally until kubectl becomes available (k3s may + // still be initialising after the container start). It also force-deletes + // pods stuck in Terminating on the removed nodes so that StatefulSets can + // reschedule replacements immediately. match clean_stale_nodes(&target_docker, &name).await { Ok(0) => {} - Ok(n) => tracing::debug!("removed {n} stale node(s)"), + Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"), Err(err) => { - tracing::debug!("stale node cleanup failed (non-fatal): {err}"); + tracing::warn!("stale node cleanup failed (non-fatal): {err}"); } } diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs index 271fde8d4..2a10b2651 100644 --- a/crates/openshell-bootstrap/src/runtime.rs +++ b/crates/openshell-bootstrap/src/runtime.rs @@ -362,57 +362,135 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize) rendered } -/// Remove stale k3s nodes from a cluster with a reused persistent volume. +/// Remove stale k3s nodes and their orphaned pods from a resumed cluster. /// /// When a cluster container is recreated but the volume is reused, k3s registers /// a new node (using the container ID as the hostname) while old node entries /// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run, /// causing health checks to fail. /// -/// This function identifies all `NotReady` nodes and deletes them so k3s can -/// reschedule workloads onto the current (Ready) node. +/// This function retries with backoff until `kubectl` becomes available (k3s may +/// still be initialising), then: +/// 1. Deletes all `NotReady` nodes so k3s stops tracking them. +/// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and +/// Deployments can reschedule replacements on the current (Ready) node. /// /// Returns the number of stale nodes removed. pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { + // Retry until kubectl is responsive. k3s can take 10-20 s to start the + // API server after a container restart, so we allow up to ~45 s. + const MAX_ATTEMPTS: u32 = 15; + const RETRY_DELAY: Duration = Duration::from_secs(3); + let container_name = container_name(name); + let mut stale_nodes: Vec = Vec::new(); + + for attempt in 1..=MAX_ATTEMPTS { + // List ALL node names and the container's own hostname. Any node that + // is not the current container is stale — we cannot rely on the Ready + // condition because k3s may not have marked the old node NotReady yet + // when this runs shortly after container start. + let (output, exit_code) = exec_capture_with_exit( + docker, + &container_name, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \ + --no-headers -o custom-columns=NAME:.metadata.name \ + 2>/dev/null" + ), + ], + ) + .await?; + + if exit_code == 0 { + // Determine the current node name (container hostname). + let (hostname_out, _) = + exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) + .await?; + let current_hostname = hostname_out.trim().to_string(); + + stale_nodes = output + .lines() + .map(str::trim) + .filter(|l| !l.is_empty() && *l != current_hostname) + .map(ToString::to_string) + .collect(); + break; + } + + if attempt < MAX_ATTEMPTS { + tracing::debug!( + "kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s", + RETRY_DELAY.as_secs() + ); + tokio::time::sleep(RETRY_DELAY).await; + } + } + + if stale_nodes.is_empty() { + return Ok(0); + } + + let node_list = stale_nodes.join(" "); + let count = stale_nodes.len(); + tracing::info!("removing {} stale node(s): {}", count, node_list); - // Get the list of NotReady nodes. - // The last condition on a node is always type=Ready; we need to check its - // **status** (True/False/Unknown), not its type. Nodes where the Ready - // condition status is not "True" are stale and should be removed. - let (output, exit_code) = exec_capture_with_exit( + // Step 1: delete the stale node objects. + let (_output, exit_code) = exec_capture_with_exit( docker, &container_name, vec![ "sh".to_string(), "-c".to_string(), format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \ - --no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \ - 2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'" + "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found" ), ], ) .await?; if exit_code != 0 { - // kubectl not ready yet or no nodes — nothing to clean - return Ok(0); + tracing::warn!("failed to delete stale nodes (exit code {exit_code})"); } - let stale_nodes: Vec<&str> = output - .lines() - .map(str::trim) - .filter(|l| !l.is_empty()) - .collect(); - if stale_nodes.is_empty() { - return Ok(0); - } + // Step 2: force-delete pods stuck in Terminating. After the stale node is + // removed, pods that were scheduled on it transition to Terminating but + // will never complete graceful shutdown (the node is gone). StatefulSets + // will not create a replacement until the old pod is fully deleted. + let (_output, exit_code) = exec_capture_with_exit( + docker, + &container_name, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \ + --field-selector=status.phase=Running -o name 2>/dev/null; \ + for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \ + --no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \ + ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \ + KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \ + --force --grace-period=0 --ignore-not-found 2>/dev/null; \ + done" + ), + ], + ) + .await?; - let node_list = stale_nodes.join(" "); - let count = stale_nodes.len(); - tracing::info!("removing {} stale node(s): {}", count, node_list); + if exit_code != 0 { + tracing::debug!( + "force-delete of terminating pods returned exit code {exit_code} (non-fatal)" + ); + } + // Step 3: delete PersistentVolumeClaims in the openshell namespace whose + // backing PV has node affinity for a stale node. local-path-provisioner + // creates PVs tied to the original node; when the node changes, the PV is + // unschedulable and the `StatefulSet` pod stays Pending. Deleting the PVC + // (and its PV) lets the provisioner create a fresh one on the current node. let (_output, exit_code) = exec_capture_with_exit( docker, &container_name, @@ -420,14 +498,24 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { "sh".to_string(), "-c".to_string(), format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found" + r#"KUBECONFIG={KUBECONFIG_PATH}; export KUBECONFIG; \ + CURRENT_NODE=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1); \ + [ -z "$CURRENT_NODE" ] && exit 0; \ + for pv in $(kubectl get pv -o jsonpath='{{.items[*].metadata.name}}' 2>/dev/null); do \ + NODE=$(kubectl get pv "$pv" -o jsonpath='{{.spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]}}' 2>/dev/null); \ + [ "$NODE" = "$CURRENT_NODE" ] && continue; \ + NS=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.namespace}}' 2>/dev/null); \ + PVC=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.name}}' 2>/dev/null); \ + [ -n "$PVC" ] && kubectl delete pvc "$PVC" -n "$NS" --ignore-not-found 2>/dev/null; \ + kubectl delete pv "$pv" --ignore-not-found 2>/dev/null; \ + done"# ), ], ) .await?; if exit_code != 0 { - tracing::warn!("failed to delete stale nodes (exit code {exit_code})"); + tracing::debug!("PV/PVC cleanup returned exit code {exit_code} (non-fatal)"); } Ok(count) diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 41ddc7081..ee9a481aa 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -225,9 +225,13 @@ pub async fn run_bootstrap( /// Retry connecting to the gateway gRPC endpoint until it succeeds or a /// timeout is reached. Uses exponential backoff starting at 500 ms, doubling -/// up to 4 s, with a total deadline of 30 s. +/// up to 4 s, with a total deadline of 90 s. +/// +/// The generous timeout accounts for gateway resume scenarios where stale k3s +/// nodes must be cleaned up and workload pods rescheduled before the gRPC +/// endpoint becomes available. pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { - const MAX_WAIT: Duration = Duration::from_secs(30); + const MAX_WAIT: Duration = Duration::from_secs(90); const INITIAL_BACKOFF: Duration = Duration::from_millis(500); let start = std::time::Instant::now(); @@ -251,7 +255,7 @@ pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Resul Err(last_err .unwrap_or_else(|| miette::miette!("timed out waiting for gateway")) - .wrap_err("gateway deployed but not accepting connections after 30 s")) + .wrap_err("gateway deployed but not accepting connections after 90 s")) } #[cfg(test)] diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 874c91a81..81af9eac4 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1384,8 +1384,21 @@ pub async fn gateway_admin_deploy( } } + // When resuming an existing gateway (not recreating), prefer the port + // from stored metadata over the CLI default. The user may have originally + // bootstrapped on a non-default port (e.g. `--port 8082`) and a bare + // `gateway start` without `--port` should honour that. + let effective_port = if !recreate { + openshell_bootstrap::load_gateway_metadata(name) + .ok() + .filter(|m| m.gateway_port > 0) + .map_or(port, |m| m.gateway_port) + } else { + port + }; + let mut options = DeployOptions::new(name) - .with_port(port) + .with_port(effective_port) .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index 1bf76f71f..e2828c6e5 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -71,3 +71,12 @@ fi # Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets) kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1 + +# --------------------------------------------------------------------------- +# Verify the gateway NodePort (30051) is actually accepting TCP connections. +# After a container restart, kube-proxy may need extra time to re-program +# iptables rules for NodePort routing. Without this check the health check +# can pass before the port is routable, causing "Connection refused" on the +# host-mapped port. +# --------------------------------------------------------------------------- +timeout 2 bash -c 'echo >/dev/tcp/127.0.0.1/30051' 2>/dev/null || exit 1 diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index 67d28d197..01ad4941e 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -5,9 +5,10 @@ //! E2E tests for gateway resume from existing state. //! -//! These tests verify that `openshell gateway start` resumes from existing -//! Docker volume state (after stop or container removal) and that the SSH -//! handshake secret persists across container restarts. +//! All scenarios run inside a **single** `#[tokio::test]` so they execute +//! in a deterministic order and share a known-good gateway state. Each +//! scenario restores the gateway to a healthy state before the next one +//! begins, preventing cascading failures. //! //! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. @@ -18,12 +19,19 @@ use openshell_e2e::harness::binary::openshell_cmd; use openshell_e2e::harness::output::strip_ansi; use tokio::time::sleep; -/// Default gateway name used by the e2e cluster. -const GATEWAY_NAME: &str = "openshell"; +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- -/// Docker container name for the default gateway. +/// Resolve the gateway name from the `OPENSHELL_GATEWAY` env var (the same +/// variable the CLI reads), falling back to `"openshell"` which matches CI. +fn gateway_name() -> String { + std::env::var("OPENSHELL_GATEWAY").unwrap_or_else(|_| "openshell".to_string()) +} + +/// Docker container name for the e2e gateway. fn container_name() -> String { - format!("openshell-cluster-{GATEWAY_NAME}") + format!("openshell-cluster-{}", gateway_name()) } /// Run `openshell ` and return (combined output, exit code). @@ -60,7 +68,12 @@ async fn wait_for_healthy(timeout: Duration) { loop { let (output, code) = run_cli(&["status"]).await; let clean = strip_ansi(&output).to_lowercase(); - if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("connected") || clean.contains("✓")) { + if code == 0 + && (clean.contains("healthy") + || clean.contains("running") + || clean.contains("connected") + || clean.contains("✓")) + { return; } if start.elapsed() > timeout { @@ -91,17 +104,81 @@ fn read_ssh_handshake_secret() -> Option { } } -// ------------------------------------------------------------------- -// Test: `gateway start` on an already-running gateway succeeds -// ------------------------------------------------------------------- +/// Extract the sandbox name from `openshell sandbox create` output. +fn extract_sandbox_name(output: &str) -> String { + strip_ansi(output) + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output") +} + +/// Run `gateway start` and log the output if it fails (non-fatal — the +/// test relies on [`wait_for_healthy`] for the real assertion). +async fn start_gateway() { + let (output, code) = run_cli(&["gateway", "start"]).await; + if code != 0 { + eprintln!( + "gateway start exited {code} (may still recover):\n{}", + strip_ansi(&output) + ); + } +} + +// --------------------------------------------------------------------------- +// Orchestrated test suite +// --------------------------------------------------------------------------- -/// When the gateway is already running, `openshell gateway start` should -/// return immediately with exit code 0 and indicate it's already running. +/// Single entry-point that runs every resume scenario in a fixed order. +/// +/// Running as one `#[tokio::test]` gives us: +/// - **Deterministic ordering** — no async-mutex races. +/// - **Cascade prevention** — each scenario starts only after the previous +/// one left the gateway healthy. +/// - **No task-runner hacks** — no `--test-threads`, `--skip`, or split +/// cargo invocations. #[tokio::test] -async fn gateway_start_on_running_gateway_succeeds() { - // Precondition: gateway is running (e2e cluster is up). +async fn gateway_resume_scenarios() { + // The gateway must already be running (bootstrapped by the `cluster` task). wait_for_healthy(Duration::from_secs(30)).await; + // Warm the sandbox base image by creating (and deleting) a throwaway + // sandbox. On a fresh cluster the ~1 GB image pull can take minutes; + // doing it once up-front keeps the actual scenarios snappy. + eprintln!("--- warmup: pulling sandbox base image ---"); + let (output, code) = + run_cli(&["sandbox", "create", "--", "echo", "warmup"]).await; + if code == 0 { + let name = extract_sandbox_name(&output); + let _ = run_cli(&["sandbox", "delete", &name]).await; + } else { + eprintln!( + "warmup sandbox create failed (non-fatal, image may already be cached):\n{}", + strip_ansi(&output) + ); + } + + scenario_start_on_running_gateway().await; + scenario_ssh_secret_persists_across_restart().await; + scenario_stop_start_resumes_with_sandbox().await; + scenario_container_kill_resumes().await; + scenario_container_removal_resumes().await; +} + +// --------------------------------------------------------------------------- +// Scenario: `gateway start` on an already-running gateway +// --------------------------------------------------------------------------- + +async fn scenario_start_on_running_gateway() { + eprintln!("--- scenario: start on running gateway ---"); + let (output, code) = run_cli(&["gateway", "start"]).await; let clean = strip_ansi(&output); @@ -115,50 +192,60 @@ async fn gateway_start_on_running_gateway_succeeds() { ); } -// ------------------------------------------------------------------- -// Test: gateway stop → start resumes, sandbox survives -// ------------------------------------------------------------------- +// --------------------------------------------------------------------------- +// Scenario: SSH handshake secret persists across restart +// --------------------------------------------------------------------------- -/// After `gateway stop` then `gateway start`, the gateway should resume -/// from existing state. A sandbox created before the stop should still -/// appear in the sandbox list after restart. -#[tokio::test] -async fn gateway_stop_start_resumes_with_sandbox() { - // Precondition: gateway is healthy. - wait_for_healthy(Duration::from_secs(30)).await; +async fn scenario_ssh_secret_persists_across_restart() { + eprintln!("--- scenario: SSH secret persists across restart ---"); - // Create a sandbox that we'll check for after restart. - let (create_output, create_code) = - run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; - let clean_create = strip_ansi(&create_output); + let secret_before = + read_ssh_handshake_secret().expect("SSH handshake secret should exist before restart"); + assert!( + !secret_before.is_empty(), + "SSH handshake secret should not be empty" + ); + + // Stop → start. + let (_, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!(stop_code, 0, "gateway stop should succeed"); + sleep(Duration::from_secs(3)).await; + + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; + + let secret_after = + read_ssh_handshake_secret().expect("SSH handshake secret should exist after restart"); assert_eq!( - create_code, 0, - "sandbox create should succeed:\n{clean_create}" + secret_before, secret_after, + "SSH handshake secret should be identical before and after restart" ); +} - // Extract sandbox name from output. - let sandbox_name = clean_create - .lines() - .find_map(|line| { - if let Some((_, rest)) = line.split_once("Created sandbox:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else if let Some((_, rest)) = line.split_once("Name:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else { - None - } - }) - .expect("should extract sandbox name from create output"); +// --------------------------------------------------------------------------- +// Scenario: stop → start resumes, sandbox survives +// --------------------------------------------------------------------------- + +async fn scenario_stop_start_resumes_with_sandbox() { + eprintln!("--- scenario: stop/start resumes with sandbox ---"); + + // Create a sandbox. + let (output, code) = + run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; + assert_eq!( + code, 0, + "sandbox create should succeed:\n{}", + strip_ansi(&output) + ); + let sandbox_name = extract_sandbox_name(&output); - // Stop the gateway. + // Stop → start. let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; assert_eq!( stop_code, 0, "gateway stop should succeed:\n{}", strip_ansi(&stop_output) ); - - // Wait a moment for the container to fully stop. sleep(Duration::from_secs(3)).await; // Verify container is stopped. @@ -174,261 +261,77 @@ async fn gateway_stop_start_resumes_with_sandbox() { "container should be stopped after gateway stop" ); - // Start the gateway again — should resume from existing state. - let (start_output, start_code) = run_cli(&["gateway", "start"]).await; - let clean_start = strip_ansi(&start_output); - assert_eq!( - start_code, 0, - "gateway start after stop should succeed:\n{clean_start}" - ); - - // Wait for the gateway to become healthy again. - wait_for_healthy(Duration::from_secs(180)).await; - - // Verify the sandbox still exists. - let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; - let clean_list = strip_ansi(&list_output); - assert_eq!( - list_code, 0, - "sandbox list should succeed after resume:\n{clean_list}" - ); - assert!( - clean_list.contains(&sandbox_name), - "sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}" - ); - - // Cleanup: delete the test sandbox. - let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; -} - -// ------------------------------------------------------------------- -// Test: container removed → gateway start resumes -// ------------------------------------------------------------------- - -/// After the Docker container is force-removed (simulating Docker restart), -/// `openshell gateway start` should resume from the existing volume. -#[tokio::test] -async fn gateway_start_resumes_after_container_removal() { - // Precondition: gateway is healthy. - wait_for_healthy(Duration::from_secs(30)).await; - - // Create a sandbox to verify state persistence. - let (create_output, create_code) = - run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await; - let clean_create = strip_ansi(&create_output); - assert_eq!( - create_code, 0, - "sandbox create should succeed:\n{clean_create}" - ); - - let sandbox_name = clean_create - .lines() - .find_map(|line| { - if let Some((_, rest)) = line.split_once("Created sandbox:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else if let Some((_, rest)) = line.split_once("Name:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else { - None - } - }) - .expect("should extract sandbox name from create output"); - - // Force-remove the container (simulates Docker restart / OOM kill). - let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); - assert_eq!(rm_code, 0, "docker rm -f should succeed"); - - // Verify the volume still exists. - let (vol_out, vol_code) = docker_cmd(&[ - "volume", - "inspect", - &format!("openshell-cluster-{GATEWAY_NAME}"), - ]); - assert_eq!( - vol_code, 0, - "volume should still exist after container removal:\n{vol_out}" - ); - - // Start the gateway — should resume from the volume. - let (start_output, start_code) = run_cli(&["gateway", "start"]).await; - let clean_start = strip_ansi(&start_output); - assert_eq!( - start_code, 0, - "gateway start after container removal should succeed:\n{clean_start}" - ); - - // Wait for healthy. - wait_for_healthy(Duration::from_secs(180)).await; + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; // Verify sandbox survived. let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; let clean_list = strip_ansi(&list_output); assert_eq!( list_code, 0, - "sandbox list should succeed after resume:\n{clean_list}" + "sandbox list should succeed:\n{clean_list}" ); assert!( clean_list.contains(&sandbox_name), - "sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}" + "sandbox '{sandbox_name}' should survive stop/start.\nList:\n{clean_list}" ); - // Cleanup. let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; } -// ------------------------------------------------------------------- -// Test: container killed → gateway start resumes, sandboxes survive, -// new sandbox create works -// ------------------------------------------------------------------- +// --------------------------------------------------------------------------- +// Scenario: container killed → resume with stale network +// --------------------------------------------------------------------------- -/// When a container is killed (stopped but NOT removed), `gateway start` -/// should resume from existing state. This validates three things: -/// -/// 1. The stale Docker network reference is reconciled (ensure_network -/// destroys and recreates the network with a new ID). -/// 2. Existing sandboxes created before the kill survive the restart. -/// 3. New `sandbox create` works after resume — the TLS certificates -/// are reused (not needlessly regenerated), so the CLI's mTLS certs -/// still match the server. -#[tokio::test] -async fn gateway_start_resumes_after_container_kill() { - // Precondition: gateway is healthy. - wait_for_healthy(Duration::from_secs(30)).await; +async fn scenario_container_kill_resumes() { + eprintln!("--- scenario: container kill resumes ---"); let cname = container_name(); - let net_name = format!("openshell-cluster-{GATEWAY_NAME}"); + let net_name = format!("openshell-cluster-{}", gateway_name()); - // Create a sandbox before the kill to verify state persistence. - let (create_output, create_code) = - run_cli(&["sandbox", "create", "--", "echo", "kill-resume-test"]).await; - let clean_create = strip_ansi(&create_output); - assert_eq!( - create_code, 0, - "sandbox create should succeed:\n{clean_create}" - ); - - let sandbox_before = clean_create - .lines() - .find_map(|line| { - if let Some((_, rest)) = line.split_once("Created sandbox:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else if let Some((_, rest)) = line.split_once("Name:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else { - None - } - }) - .expect("should extract sandbox name from create output"); - - // Kill the container (it remains as a stopped container, unlike `docker rm`). + // Kill the container. let (_, kill_code) = docker_cmd(&["kill", &cname]); assert_eq!(kill_code, 0, "docker kill should succeed"); - sleep(Duration::from_secs(3)).await; - // Remove the Docker network to simulate a stale network reference. + // Remove the network to simulate a stale network reference. // The bootstrap `ensure_network` always destroys and recreates, so // after this the container's stored network ID will be invalid. let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]); let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]); assert_eq!( net_rm_code, 0, - "docker network rm should succeed (or network already gone)" - ); - - // Start the gateway — must handle stale network + reuse existing PKI. - let (start_output, start_code) = run_cli(&["gateway", "start"]).await; - let clean_start = strip_ansi(&start_output); - assert_eq!( - start_code, 0, - "gateway start after kill should succeed:\n{clean_start}" - ); - - // Wait for the gateway to become healthy again. - wait_for_healthy(Duration::from_secs(180)).await; - - // Verify the pre-existing sandbox survived. - let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; - let clean_list = strip_ansi(&list_output); - assert_eq!( - list_code, 0, - "sandbox list should succeed after resume:\n{clean_list}" + "docker network rm should succeed" ); - assert!( - clean_list.contains(&sandbox_before), - "sandbox '{sandbox_before}' should survive container kill + resume.\nList output:\n{clean_list}" - ); - - // Create a new sandbox to verify TLS is working end-to-end. - let (new_create_output, new_create_code) = - run_cli(&["sandbox", "create", "--", "echo", "post-resume-test"]).await; - let clean_new = strip_ansi(&new_create_output); - assert_eq!( - new_create_code, 0, - "sandbox create after resume should succeed (TLS must work):\n{clean_new}" - ); - - let sandbox_after = clean_new - .lines() - .find_map(|line| { - if let Some((_, rest)) = line.split_once("Created sandbox:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else if let Some((_, rest)) = line.split_once("Name:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else { - None - } - }) - .expect("should extract sandbox name from post-resume create output"); - // Cleanup. - let _ = run_cli(&["sandbox", "delete", &sandbox_before]).await; - let _ = run_cli(&["sandbox", "delete", &sandbox_after]).await; + // Resume — must handle stale network + reuse existing PKI. + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; } -// ------------------------------------------------------------------- -// Test: SSH handshake secret persists across container restart -// ------------------------------------------------------------------- - -/// The SSH handshake K8s secret should persist across gateway stop/start -/// cycles — the same base64-encoded value should be returned before and -/// after the restart. -#[tokio::test] -async fn ssh_handshake_secret_persists_across_restart() { - // Precondition: gateway is healthy. - wait_for_healthy(Duration::from_secs(30)).await; - - // Read the SSH handshake secret before restart. - let secret_before = read_ssh_handshake_secret() - .expect("SSH handshake secret should exist before restart"); - assert!( - !secret_before.is_empty(), - "SSH handshake secret should not be empty" - ); +// --------------------------------------------------------------------------- +// Scenario: container removed → resume from volume +// --------------------------------------------------------------------------- - // Stop the gateway. - let (_, stop_code) = run_cli(&["gateway", "stop"]).await; - assert_eq!(stop_code, 0, "gateway stop should succeed"); +async fn scenario_container_removal_resumes() { + eprintln!("--- scenario: container removal resumes ---"); - sleep(Duration::from_secs(3)).await; + // Force-remove the container. + let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); + assert_eq!(rm_code, 0, "docker rm -f should succeed"); - // Start the gateway. - let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + // Volume should survive. + let (vol_out, vol_code) = docker_cmd(&[ + "volume", + "inspect", + &format!("openshell-cluster-{}", gateway_name()), + ]); assert_eq!( - start_code, 0, - "gateway start should succeed:\n{}", - strip_ansi(&start_output) + vol_code, 0, + "volume should still exist after container removal:\n{vol_out}" ); - // Wait for healthy. - wait_for_healthy(Duration::from_secs(180)).await; - - // Read the secret after restart. - let secret_after = read_ssh_handshake_secret() - .expect("SSH handshake secret should exist after restart"); - - assert_eq!( - secret_before, secret_after, - "SSH handshake secret should be identical before and after restart" - ); + // Resume from volume. + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; } diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 000000000..25f96ab68 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[toolchain] +channel = "stable" diff --git a/tasks/test.toml b/tasks/test.toml index 6231c21e7..c383eafb5 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -32,7 +32,8 @@ description = "Run Rust CLI e2e tests (requires a running cluster)" depends = ["cluster"] run = [ "cargo build -p openshell-cli --features openshell-core/dev-settings", - "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e", + # gateway_resume tests run in a dedicated CI job with their own cluster. + "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- --skip gateway_resume_scenarios", ] ["e2e:python"]