def _configure_offline_mode(self):
    """Enable offline image loading on ``Helm`` when ``config.yml`` asks for it.

    Reads ``offline_mode`` (default ``False``) and ``images_dir`` (default
    ``"./images"``) from ``BASE_DIR / "config.yml"``.  When ``offline_mode``
    is truthy, calls
    ``Helm.configure_offline_mode(enabled=True, images_dir=...)``.

    The step is best-effort and never raises: a missing config file is
    ignored silently, and any other failure is reported as a printed warning
    so a broken config no longer disables offline mode without a trace.
    """
    try:
        # Imported lazily, exactly as the original hook did.
        from aiopslab.config import Config

        config_path = BASE_DIR / "config.yml"
        if not config_path.exists():
            # No config file: nothing to configure, and not an error.
            return

        config = Config(config_path)
        if config.get("offline_mode", False):
            # NOTE(review): assumes `Helm` is already imported at file level
            # in orchestrator.py -- confirm; the patch only adds BASE_DIR.
            Helm.configure_offline_mode(
                enabled=True,
                images_dir=config.get("images_dir", "./images"),
            )
    except Exception as exc:
        # Deliberately broad so orchestrator construction cannot fail here,
        # but report the problem instead of swallowing it.
        print(f"Warning: could not configure offline mode: {exc}")
diff --git a/aiopslab/plugins/__init__.py b/aiopslab/plugins/__init__.py new file mode 100644 index 00000000..5e3ba659 --- /dev/null +++ b/aiopslab/plugins/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""AIOpsLab Plugins Package""" diff --git a/aiopslab/plugins/offline_images/README.md b/aiopslab/plugins/offline_images/README.md new file mode 100644 index 00000000..dbf5e10f --- /dev/null +++ b/aiopslab/plugins/offline_images/README.md @@ -0,0 +1,102 @@ +# Offline Images Plugin + +This plugin enables AIOpsLab to work in environments with restricted or no internet access by loading pre-downloaded Docker images from local tar files into Kind clusters. + +## Why Use This? + +- **Network Restrictions**: Some environments don't have access to Docker Hub, GHCR, or other registries +- **Slow Networks**: Pulling large images repeatedly can be time-consuming +- **Reproducibility**: Pre-downloaded images ensure consistent versions across deployments + +## Quick Start + +### 1. Download Images (with internet access) + +```bash +# Run this on a machine with internet access +./scripts/download_images.sh ./images +``` + +This will download all required images and save them as tar files. + +### 2. Transfer Images (if needed) + +Copy the `./images` directory to your target machine. + +### 3. Configure AIOpsLab + +Edit `aiopslab/config.yml`: + +```yaml +# Enable offline mode +offline_mode: true +images_dir: ./images +``` + +### 4. Run as Usual + +```bash +python cli.py +# or +python service.py +``` + +Images will be automatically loaded from local tars before deploying applications. 
+ +## Manual Usage + +You can also use the ImageLoader programmatically: + +```python +from aiopslab.plugins.offline_images import ImageLoader + +# Initialize loader +loader = ImageLoader(images_dir="./images", cluster_name="kind") + +# Load all images +count = loader.load_all_from_directory() +print(f"Loaded {count} images") + +# Or load a specific image +loader.load_image_from_tar(loader.images_dir / "nginx_latest.tar") +``` + +## Tar File Naming Convention + +Image tar files should be named in the format: +``` +{registry}_{image_path}_{tag}.tar +``` + +Examples: +- `ghcr.io_open-telemetry_demo_1.11.1.tar` +- `docker.io_library_nginx_latest.tar` +- `quay.io_prometheus_prometheus_v2.47.2.tar` + +The `download_images.sh` script handles this naming automatically. + +## Supported Registries + +- `ghcr.io` (GitHub Container Registry) +- `docker.io` (Docker Hub) +- `quay.io` +- `registry.k8s.io` +- `gcr.io` (Google Container Registry) + +## Troubleshooting + +### Images not loading + +1. Check that the tar files exist in the images directory +2. Ensure Docker daemon is running +3. Verify Kind cluster is running: `kind get clusters` + +### Image name mismatch + +If an image fails to load, check that the tar filename follows the naming convention. +You can manually check the image name in a tar: + +```bash +docker load -i image.tar +# Output: Loaded image: registry/image:tag +``` diff --git a/aiopslab/plugins/offline_images/__init__.py b/aiopslab/plugins/offline_images/__init__.py new file mode 100644 index 00000000..ecb63ad4 --- /dev/null +++ b/aiopslab/plugins/offline_images/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Offline Images Plugin for AIOpsLab + +This plugin enables offline/local image loading for Kind clusters, +which is useful in environments with restricted network access. + +Usage: + 1. Download images: ./scripts/download_images.sh ./images + 2. 
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""
Image Loader for Offline Deployment

Loads pre-downloaded Docker images from local tar files so that a Kubernetes
cluster can be used without pulling from a registry.

The loading strategy is selected by ``k8s_host``:
  - "kind": `docker load` locally, then `kind load docker-image`
  - "localhost": `docker load` locally only
  - any other value: treated as a remote hostname; the tar file is streamed
    over SSH into `docker load` on that host
"""

import logging
import os
import subprocess
from pathlib import Path
from typing import Optional, Set

logger = logging.getLogger(__name__)


class ImageLoader:
    """Load Docker images from local tar files into a Kubernetes cluster.

    Tar files are expected to be named ``registry_image_tag.tar``, i.e. the
    image reference with ``/`` and ``:`` replaced by ``_``.  Examples:
        - ghcr.io_open-telemetry_demo_1.0.0.tar
        - docker.io_library_nginx_latest.tar
    """

    # Registries recognised as a leading "<registry>_" in a tar filename.
    _REGISTRIES = ("ghcr.io", "quay.io", "registry.k8s.io", "docker.io", "gcr.io")

    def __init__(self, images_dir: str, k8s_host: str = "kind",
                 cluster_name: str = "kind", k8s_user: Optional[str] = None,
                 ssh_key_path: Optional[str] = None):
        """
        Initialize ImageLoader.

        Args:
            images_dir: Directory containing pre-downloaded image tar files
            k8s_host: "kind", "localhost", or a remote hostname
            cluster_name: Name of the Kind cluster (used only for "kind")
            k8s_user: SSH username for a remote host; when omitted, ssh's
                own default user is used
            ssh_key_path: SSH private key for a remote host
                (default: ~/.ssh/id_rsa)
        """
        self.images_dir = Path(images_dir)
        self.k8s_host = k8s_host
        self.cluster_name = cluster_name
        self.k8s_user = k8s_user
        self.ssh_key_path = ssh_key_path or "~/.ssh/id_rsa"
        self.loaded_images: Set[str] = set()

    def _tar_name_to_image(self, tar_path: Path) -> str:
        """
        Convert a tar filename back to a Docker image reference.

        The text after the last underscore is taken as the tag; every other
        underscore becomes ``/``.  A filename without any tag separator is
        returned as an untagged reference.

        Args:
            tar_path: Path to the tar file

        Returns:
            Docker image name (e.g., "ghcr.io/open-telemetry/demo:1.0.0")
        """
        name = tar_path.stem  # drop the ".tar" extension

        registry = ""
        for known in self._REGISTRIES:
            prefix = known + "_"
            if name.startswith(prefix):
                # Exactly one slash after the registry host.
                registry = known + "/"
                name = name[len(prefix):]
                break

        image_path, separator, tag = name.rpartition("_")
        if not separator:
            # No tag part; keep the registry instead of dropping it.
            return f"{registry}{name}"
        return f"{registry}{image_path.replace('_', '/')}:{tag}"

    def _docker_load(self, tar_path: Path) -> Optional[str]:
        """Run `docker load -i <tar>` on this machine.

        Returns:
            The name on the first "Loaded image:" output line, an empty
            string if the load succeeded without printing one, or None if
            the load failed.
        """
        cmd = ["docker", "load", "-i", str(tar_path)]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:  # e.g. docker binary not installed
            logger.warning("Failed to docker load %s: %s", tar_path.name, exc)
            return None

        if result.returncode != 0:
            logger.warning("Failed to docker load %s: %s",
                           tar_path.name, result.stderr[:200])
            return None

        for line in result.stdout.splitlines():
            if "Loaded image:" in line:
                return line.split("Loaded image:")[-1].strip()
        return ""

    def _load_to_kind(self, image_name: str) -> bool:
        """Load image into Kind cluster via `kind load docker-image`."""
        cmd = ["kind", "load", "docker-image", image_name,
               "--name", self.cluster_name]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:  # e.g. kind binary not installed
            logger.warning("Failed to load %s to Kind: %s", image_name, exc)
            return False

        # A non-zero exit that only reports "already present" is a success.
        if result.returncode != 0 and "already present" not in result.stderr.lower():
            logger.warning("Failed to load %s to Kind: %s",
                           image_name, result.stderr[:200])
            return False
        return True

    def _load_to_localhost(self, tar_path: Path) -> bool:
        """No-op: `docker load` already ran in load_image_from_tar()."""
        return True

    def _load_to_remote(self, tar_path: Path, image_name: str) -> bool:
        """Stream the tar over SSH into `docker load` on the remote host."""
        ssh_key = os.path.expanduser(self.ssh_key_path)
        if self.k8s_user:
            target = f"{self.k8s_user}@{self.k8s_host}"
        else:
            target = self.k8s_host

        # Argument list, not a shell string: key paths with spaces work and
        # nothing from the configuration is interpreted by a local shell.
        # NOTE(review): StrictHostKeyChecking=no disables host key
        # verification; kept for compatibility with the original behaviour.
        cmd = ["ssh", "-i", ssh_key, "-o", "StrictHostKeyChecking=no",
               target, "docker load"]

        try:
            with open(tar_path, "rb") as tar_file:
                result = subprocess.run(cmd, stdin=tar_file, capture_output=True,
                                        text=True, timeout=300)
        except subprocess.TimeoutExpired:
            logger.warning("Timeout loading %s to remote host", image_name)
            return False
        except (OSError, subprocess.SubprocessError) as exc:
            logger.warning("Error loading %s to remote: %s", image_name, exc)
            return False

        if result.returncode != 0:
            logger.warning("Failed to load %s to remote: %s",
                           image_name, result.stderr[:200])
            return False
        return True

    def load_image_from_tar(self, tar_path: Path) -> bool:
        """
        Load a single image from tar file into the cluster.

        The loading method depends on k8s_host:
            - "kind": docker load + kind load docker-image
            - "localhost": docker load only
            - anything else: SSH + docker load on that host

        Args:
            tar_path: Path to the tar file

        Returns:
            True if successful, False otherwise
        """
        tar_path = Path(tar_path)  # tolerate a plain string path
        if not tar_path.exists():
            logger.warning("Tar file not found: %s", tar_path)
            return False

        # Step 1: load into the local Docker daemon (Kind and localhost modes).
        image_name = None
        if self.k8s_host in ("kind", "localhost"):
            image_name = self._docker_load(tar_path)
            if image_name is None:
                return False

        # docker printed no name (or remote mode): derive it from the filename.
        if not image_name:
            image_name = self._tar_name_to_image(tar_path)

        # Step 2: make the image available to the cluster.
        if self.k8s_host == "kind":
            loaded = self._load_to_kind(image_name)
        elif self.k8s_host == "localhost":
            loaded = self._load_to_localhost(tar_path)
        else:
            loaded = self._load_to_remote(tar_path, image_name)
        if not loaded:
            return False

        self.loaded_images.add(image_name)
        logger.info("Loaded image: %s (mode: %s)", image_name, self.k8s_host)
        return True

    def load_all_from_directory(self) -> int:
        """
        Load all tar files from the images directory into the cluster.

        Returns:
            Number of successfully loaded images
        """
        if not self.images_dir.exists():
            logger.warning("Images directory not found: %s", self.images_dir)
            return 0

        # Sorted so the processing order (and the log) is reproducible.
        tar_files = sorted(self.images_dir.glob("*.tar"))
        if not tar_files:
            logger.warning("No tar files found in %s", self.images_dir)
            return 0

        mode_desc = {
            "kind": "Kind cluster (docker load + kind load)",
            "localhost": "Localhost (docker load)",
        }.get(self.k8s_host, f"Remote host '{self.k8s_host}' (SSH + docker load)")

        logger.info("Loading %d images from %s", len(tar_files), self.images_dir)
        logger.info("  Mode: %s", mode_desc)

        success_count = 0
        for index, tar_file in enumerate(tar_files, 1):
            logger.info("[%d/%d] Loading %s...", index, len(tar_files), tar_file.name)
            if self.load_image_from_tar(tar_file):
                success_count += 1

        logger.info("Loaded %d/%d images successfully", success_count, len(tar_files))
        return success_count

    def is_image_loaded(self, image_name: str) -> bool:
        """Check if an image has been loaded by this loader instance."""
        return image_name in self.loaded_images


# Global instance (lazily initialized)
_loader: Optional[ImageLoader] = None


def get_loader() -> Optional[ImageLoader]:
    """Get the global ImageLoader instance (None until one is created)."""
    return _loader


def init_loader(images_dir: str, **kwargs) -> ImageLoader:
    """
    Initialize (or replace) the global ImageLoader.

    Args:
        images_dir: Directory containing image tar files
        **kwargs: Additional arguments passed to ImageLoader

    Returns:
        The initialized ImageLoader instance
    """
    global _loader
    _loader = ImageLoader(images_dir, **kwargs)
    return _loader


def ensure_images_loaded(images_dir: Optional[str] = None, **kwargs) -> bool:
    """
    Load every tar file from the images directory into the cluster.

    This is the main entry point for the offline images plugin.
    Call this before deploying applications.

    If a global loader already exists it is reused as-is and the arguments
    are ignored; otherwise one is created from them.

    Args:
        images_dir: Directory containing image tar files (optional if already initialized)
        **kwargs: Additional arguments (k8s_host, cluster_name, k8s_user, ssh_key_path)

    Returns:
        True if at least one image was loaded, False otherwise
    """
    global _loader

    if _loader is None:
        if images_dir is None:
            logger.warning("ImageLoader not initialized and no images_dir provided")
            return False
        _loader = ImageLoader(images_dir, **kwargs)

    return _loader.load_all_from_directory() > 0
@classmethod
def configure_offline_mode(cls, enabled: bool, images_dir: str = None):
    """Configure offline mode for Helm deployments.

    Stores the settings on the class and resets the "images loaded" flag,
    so the next call to ``_ensure_images_loaded`` loads images again.

    Args:
        enabled: Whether to enable offline mode
        images_dir: Directory containing pre-downloaded image tar files
    """
    cls._offline_mode = enabled
    cls._images_dir = images_dir
    cls._images_loaded = False
    if enabled:
        print("== Offline Mode Enabled ==")
        print(f"  Images directory: {images_dir}")


@classmethod
def _ensure_images_loaded(cls):
    """Load images from local tar files if offline mode is enabled.

    Does nothing when offline mode is off or a load has already been
    attempted since the last ``configure_offline_mode`` call.  Every failure
    is reported as a printed warning and never raised, so a Helm install
    still proceeds.
    """
    if not cls._offline_mode or cls._images_loaded:
        return

    if not cls._images_dir or not os.path.exists(cls._images_dir):
        print(f"Warning: Offline mode enabled but images_dir not found: {cls._images_dir}")
        return

    # Imported lazily so Helm keeps working when the plugin is absent.
    try:
        from aiopslab.plugins.offline_images import ensure_images_loaded
    except ImportError:
        print("Warning: offline_images plugin not found, skipping image loading")
        return

    print("== Loading Offline Images ==")
    try:
        loaded_any = ensure_images_loaded(cls._images_dir)
    except Exception as e:
        print(f"Warning: Failed to load offline images: {e}")
        return

    # Marked as done either way, as before, so each install does not rescan
    # the directory.
    cls._images_loaded = True
    if not loaded_any:
        # Previously ignored: the plugin returned False, meaning not a
        # single image was loaded.
        print(f"Warning: No offline images were loaded from {cls._images_dir}")
#!/bin/bash
#
# Pull every container image AIOpsLab needs and save each one as
# <output_dir>/<image reference with "/" and ":" replaced by "_">.tar
#
# Usage: ./download_images.sh [output_dir]    (default: ./images)
#

set -e

OUTPUT_DIR="${1:-./images}"
mkdir -p "$OUTPUT_DIR"

echo "=============================================="
echo "AIOpsLab Image Downloader"
echo "=============================================="
echo "Output directory: $OUTPUT_DIR"
echo ""

# List of required images for AIOpsLab applications
# Format: registry/image:tag
IMAGES=(
    # OpenTelemetry Astronomy Shop
    "ghcr.io/open-telemetry/demo:1.11.1-accountingservice"
    "ghcr.io/open-telemetry/demo:1.11.1-adservice"
    "ghcr.io/open-telemetry/demo:1.11.1-cartservice"
    "ghcr.io/open-telemetry/demo:1.11.1-checkoutservice"
    "ghcr.io/open-telemetry/demo:1.11.1-currencyservice"
    "ghcr.io/open-telemetry/demo:1.11.1-emailservice"
    "ghcr.io/open-telemetry/demo:1.11.1-flagd"
    "ghcr.io/open-telemetry/demo:1.11.1-frauddetectionservice"
    "ghcr.io/open-telemetry/demo:1.11.1-frontend"
    "ghcr.io/open-telemetry/demo:1.11.1-frontendproxy"
    "ghcr.io/open-telemetry/demo:1.11.1-imageprovider"
    "ghcr.io/open-telemetry/demo:1.11.1-kafka"
    "ghcr.io/open-telemetry/demo:1.11.1-loadgenerator"
    "ghcr.io/open-telemetry/demo:1.11.1-paymentservice"
    "ghcr.io/open-telemetry/demo:1.11.1-productcatalogservice"
    "ghcr.io/open-telemetry/demo:1.11.1-quoteservice"
    "ghcr.io/open-telemetry/demo:1.11.1-recommendationservice"
    "ghcr.io/open-telemetry/demo:1.11.1-shippingservice"
    "ghcr.io/open-telemetry/demo:1.11.1-valkey"

    # Observability stack
    "quay.io/prometheus/prometheus:v2.47.2"
    "grafana/grafana:10.2.0"
    "jaegertracing/all-in-one:1.50"
    "otel/opentelemetry-collector-contrib:0.88.0"

    # Infrastructure
    "docker.io/library/redis:7.2-alpine"
    "docker.io/library/postgres:16"
    "bitnami/kafka:3.6"

    # OpenEBS for local storage
    "openebs/provisioner-localpv:3.4.0"
    "openebs/linux-utils:3.4.0"
    "openebs/node-disk-manager:2.1.0"
    "openebs/node-disk-operator:2.1.0"
)

# Convert an image reference to a tar filename stem by replacing every
# "/" and ":" with "_" (pure parameter expansion, no subprocess needed).
image_to_filename() {
    local image="$1"
    local name="${image//\//_}"
    echo "${name//:/_}"
}

# Download and save images
total=${#IMAGES[@]}
count=0
success=0

for image in "${IMAGES[@]}"; do
    # Not ((count++)): that command returns status 1 when the old value is 0,
    # which makes `set -e` abort the script on the very first image.
    count=$((count + 1))
    filename=$(image_to_filename "$image")
    tar_path="$OUTPUT_DIR/${filename}.tar"

    echo ""
    echo "[$count/$total] $image"

    # Skip if already downloaded
    if [ -f "$tar_path" ]; then
        echo "  ✓ Already exists: $tar_path"
        success=$((success + 1))
        continue
    fi

    # Pull image
    echo "  Pulling..."
    if docker pull "$image" > /dev/null 2>&1; then
        echo "  Saving to $tar_path..."
        # Save under a temporary name and rename on success, so a failed or
        # interrupted save never leaves a partial tar that a later run would
        # skip as "Already exists".
        if docker save -o "${tar_path}.partial" "$image" \
                && mv -f "${tar_path}.partial" "$tar_path"; then
            echo "  ✓ Success"
            success=$((success + 1))
        else
            rm -f "${tar_path}.partial"
            echo "  ✗ Failed to save"
        fi
    else
        echo "  ✗ Failed to pull"
    fi
done

echo ""
echo "=============================================="
echo "Download complete: $success/$total images"
echo "Output directory: $OUTPUT_DIR"
echo "=============================================="

# Print disk usage
du -sh "$OUTPUT_DIR"