From 491074618910aaad887ad5c1fa021ba9d0c2129b Mon Sep 17 00:00:00 2001 From: fok666 Date: Sun, 1 Mar 2026 14:17:31 +0100 Subject: [PATCH] feat: Implement Lambda functions for building Python layers - Added `submit_build` Lambda function to handle build requests, validate input, and store build information in DynamoDB. - Created `process_build` Lambda function to process SQS messages and launch EC2 instances for building layers. - Introduced `check_status` Lambda function to retrieve build status and generate presigned URLs for artifacts. chore: Set up Terraform infrastructure for the build system - Configured API Gateway with routes for submitting builds and checking build status. - Created DynamoDB table for tracking build states with TTL for automatic cleanup. - Established SQS queue for managing build requests and a dead letter queue for failed messages. - Defined EC2 launch template for build workers using Amazon Linux 2023. - Set up IAM roles and policies for Lambda functions and EC2 instances. - Configured S3 bucket for storing build artifacts with lifecycle policies for expiration. - Added necessary Terraform variables and outputs for deployment configuration. - Included .gitignore for Terraform files and build artifacts. 
--- docs/index.html | 897 ++++++++++++++++++ infrastructure/README.md | 205 ++++ infrastructure/lambdas/check_status/index.py | 147 +++ infrastructure/lambdas/process_build/index.py | 357 +++++++ infrastructure/lambdas/submit_build/index.py | 122 +++ infrastructure/terraform/.gitignore | 18 + infrastructure/terraform/api_gateway.tf | 101 ++ infrastructure/terraform/dynamodb.tf | 26 + infrastructure/terraform/ec2.tf | 83 ++ infrastructure/terraform/iam.tf | 246 +++++ infrastructure/terraform/lambda.tf | 138 +++ infrastructure/terraform/main.tf | 55 ++ infrastructure/terraform/outputs.tf | 48 + infrastructure/terraform/s3.tf | 74 ++ infrastructure/terraform/sqs.tf | 31 + .../terraform/terraform.tfvars.example | 43 + infrastructure/terraform/variables.tf | 192 ++++ infrastructure/terraform/vpc.tf | 93 ++ 18 files changed, 2876 insertions(+) create mode 100644 docs/index.html create mode 100644 infrastructure/README.md create mode 100644 infrastructure/lambdas/check_status/index.py create mode 100644 infrastructure/lambdas/process_build/index.py create mode 100644 infrastructure/lambdas/submit_build/index.py create mode 100644 infrastructure/terraform/.gitignore create mode 100644 infrastructure/terraform/api_gateway.tf create mode 100644 infrastructure/terraform/dynamodb.tf create mode 100644 infrastructure/terraform/ec2.tf create mode 100644 infrastructure/terraform/iam.tf create mode 100644 infrastructure/terraform/lambda.tf create mode 100644 infrastructure/terraform/main.tf create mode 100644 infrastructure/terraform/outputs.tf create mode 100644 infrastructure/terraform/s3.tf create mode 100644 infrastructure/terraform/sqs.tf create mode 100644 infrastructure/terraform/terraform.tfvars.example create mode 100644 infrastructure/terraform/variables.tf create mode 100644 infrastructure/terraform/vpc.tf diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..aef4051 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,897 @@ + + + + + + Lambda 
Python Layer Builder + + + +
+
+

📦 Lambda Python Layer Builder

+

Build AWS Lambda Python layers for any architecture. Powered by Docker on EC2 Spot instances.

+
+ + +
+ +
+ +
+ + +
From Terraform output: api_url. Saved in browser storage.
+
+ + +
+

🔧 New Build

+ +
+ + +
+ +
+
+ + +
+ +
+ +
+ + +
+
+ +
+ +
+ + Combined archive +
+
+
+ + +
+ + +
+

📋 Builds

+
+
+
πŸ—οΈ
+

No builds yet. Submit a build to get started.

+
+
+
+ + +
+ +
+ + + + diff --git a/infrastructure/README.md b/infrastructure/README.md new file mode 100644 index 0000000..82d2fb1 --- /dev/null +++ b/infrastructure/README.md @@ -0,0 +1,205 @@ +# Lambda Python Layer Builder β€” Infrastructure + +Serverless infrastructure that builds AWS Lambda Python layers on-demand using EC2 Spot instances and Docker, with a GitHub Pages frontend. + +## Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ GitHub Pages (docs/index.html) β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ requirements.txt β”‚ Python version β”‚ Architecture β”‚ Submit β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ POST /builds + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ API Gateway (HTTP API) β”‚ +β”‚ POST /builds β†’ submit_build Lambda β”‚ +β”‚ GET /builds/{id} β†’ check_status Lambda β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β–Ό β–Ό 
+β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ submit_build Ξ» β”‚ β”‚ check_status Ξ» β”‚ +β”‚ β€’ Validates inputβ”‚ β”‚ β€’ Reads DynamoDB β”‚ +β”‚ β€’ Creates record β”‚ β”‚ β€’ Generates presignedβ”‚ +β”‚ β€’ Sends to SQS β”‚ β”‚ S3 download URLs β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β–Ό β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ SQS Build Queue β”‚ β”‚ DynamoDB β”‚ +β”‚ (with DLQ) β”‚ β”‚ buildId | status β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ s3_keys | TTL β”‚ + β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–Ό β–² +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ process_build Ξ» β”‚ β”‚ +β”‚ β€’ Launches EC2 β”‚ β”‚ +β”‚ Spot instance β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ + β–Ό β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ EC2 Spot Instance β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ 1. Install Docker β”‚ β”‚ β”‚ +β”‚ β”‚ 2. Pull/build Docker image β”‚ β”‚ β”‚ +β”‚ β”‚ 3. Run container to build β”‚ β”‚ β”‚ +β”‚ β”‚ Lambda layer zip files β”‚ β”‚ β”‚ +β”‚ β”‚ 4. Upload zips to S3 ─────────┼──┐ β”‚ β”‚ +β”‚ β”‚ 5. Update DynamoDB status β”€β”€β”€β”€β”€β”Όβ”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ 6. 
Self-terminate β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ S3 Artifacts β”‚ + β”‚ builds/{id}/*.zip β”‚ + β”‚ Lifecycle: 24h β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Flow + +1. **User** opens GitHub Pages, enters `requirements.txt`, selects Python version & architecture +2. **API Gateway** routes `POST /builds` to `submit_build` Lambda +3. **submit_build** validates input, creates DynamoDB record (QUEUED), sends SQS message +4. **SQS** triggers `process_build` Lambda +5. **process_build** launches an EC2 Spot instance with a user-data script +6. **EC2 instance** installs Docker, pulls pre-built images from GHCR (or builds from Dockerfile), runs the build, uploads zips to S3, updates DynamoDB (COMPLETED), self-terminates +7. **User** frontend polls `GET /builds/{id}` which returns status + presigned S3 download URLs +8. 
**Artifacts** auto-expire from S3 after configurable TTL (default 24h) + +## Cost Estimate + +| Component | Cost | Notes | +|-----------|------|-------| +| EC2 Spot (c5.xlarge) | ~$0.04/hr | ~$0.01 per build (15 min avg) | +| S3 | ~$0.023/GB/month | Artifacts auto-expire | +| Lambda | ~$0.20/1M requests | Minimal usage | +| API Gateway | $1.00/1M requests | HTTP API pricing | +| DynamoDB | Pay-per-request | ~$0.00 for low volume | +| SQS | $0.40/1M messages | Negligible | +| **Total (idle)** | **~$0/month** | No running infrastructure | +| **Per build** | **~$0.01-0.03** | Spot instance + S3 | + +## Prerequisites + +- AWS account with permissions to create VPC, EC2, Lambda, S3, SQS, DynamoDB, API Gateway, IAM +- [Terraform](https://www.terraform.io/downloads) >= 1.5.0 +- AWS CLI configured (`aws configure`) + +## Deployment + +```bash +cd infrastructure/terraform + +# Copy and customize configuration +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars with your preferences + +# Initialize and deploy +terraform init +terraform plan +terraform apply +``` + +After deployment, note the `api_url` output: + +``` +Outputs: + api_url = "https://xxxxxxxxxx.execute-api.eu-central-1.amazonaws.com" +``` + +### Configure GitHub Pages + +1. In your GitHub repository: **Settings β†’ Pages β†’ Source: Deploy from a branch** +2. Select **Branch: main**, **Folder: /docs** +3. Open your GitHub Pages URL +4. Click **βš™ API Settings** and paste the `api_url` from Terraform output +5. Start building layers! 
+ +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `aws_region` | `eu-central-1` | AWS region | +| `environment` | `prod` | Environment name | +| `artifact_ttl_hours` | `24` | Hours to keep artifacts in S3 | +| `ec2_instance_type` | `c5.xlarge` | Spot instance type | +| `ec2_volume_size` | `50` | EBS volume size (GB) | +| `ec2_max_build_time_minutes` | `30` | Safety timeout per build | +| `allowed_origins` | `["*"]` | CORS origins | +| `docker_image_prefix` | `ghcr.io/fok666/lambda-python-layer` | Pre-built image registry | + +## API Reference + +### POST /builds + +Submit a new build request. + +```json +{ + "requirements": "numpy==1.26.4\nrequests==2.32.4", + "python_version": "3.13", + "architectures": ["x86_64", "arm64"], + "single_file": true +} +``` + +**Response:** +```json +{ + "build_id": "a1b2c3d4-...", + "status": "QUEUED", + "expires_at": 1709398800 +} +``` + +### GET /builds/{buildId} + +Check build status. Returns presigned download URLs when completed. + +**Response (completed):** +```json +{ + "build_id": "a1b2c3d4-...", + "status": "COMPLETED", + "python_version": "3.13", + "architectures": ["x86_64", "arm64"], + "files": [ + { + "filename": "combined-python3.13-x86_64.zip", + "download_url": "https://s3.amazonaws.com/...", + "architecture": "x86_64" + }, + { + "filename": "combined-python3.13-aarch64.zip", + "download_url": "https://s3.amazonaws.com/...", + "architecture": "arm64" + } + ] +} +``` + +## Security + +- **S3 bucket**: Private, no public access. Downloads via presigned URLs only +- **EC2 instances**: No SSH, no inbound ports. 
Egress-only security group +- **IMDSv2**: Enforced on all EC2 instances +- **EBS encryption**: Enabled by default +- **IAM**: Least-privilege policies per component +- **DynamoDB TTL**: Automatic cleanup of old records +- **S3 lifecycle**: Automatic deletion of old artifacts + +## Teardown + +```bash +cd infrastructure/terraform +terraform destroy +``` + +> **Note:** S3 bucket must be empty before destruction. Terraform will fail if artifacts exist. Wait for lifecycle expiration or manually empty the bucket. diff --git a/infrastructure/lambdas/check_status/index.py b/infrastructure/lambdas/check_status/index.py new file mode 100644 index 0000000..342a072 --- /dev/null +++ b/infrastructure/lambdas/check_status/index.py @@ -0,0 +1,147 @@ +""" +Check Status Lambda +Returns the build status and generates presigned download URLs +for completed builds. + +API: GET /builds/{buildId} +Response: { + "build_id": "uuid", + "status": "COMPLETED", + "python_version": "3.13", + "architectures": ["x86_64", "arm64"], + "created_at": 1709312400, + "expires_at": 1709398800, + "files": [ + { + "filename": "combined-python3.13-x86_64.zip", + "download_url": "https://...", + "architecture": "x86_64" + } + ] +} +""" + +import json +import os +import re +import boto3 +from botocore.exceptions import ClientError + +dynamodb = boto3.resource("dynamodb") +s3_client = boto3.client("s3") + +TABLE_NAME = os.environ["DYNAMODB_TABLE"] +S3_BUCKET = os.environ["S3_BUCKET"] +ARTIFACT_TTL_HOURS = int(os.environ.get("ARTIFACT_TTL_HOURS", "24")) + +# Presigned URL expiry matches artifact TTL (capped at 7 days for S3 limit) +PRESIGN_EXPIRY = min(ARTIFACT_TTL_HOURS * 3600, 604800) + + +def handler(event, context): + """Handle GET /builds/{buildId} requests.""" + # Extract buildId from path parameters + build_id = (event.get("pathParameters") or {}).get("buildId") + + if not build_id: + return _response(400, {"error": "buildId is required"}) + + # Validate UUID format + uuid_pattern = re.compile( + 
r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I + ) + if not uuid_pattern.match(build_id): + return _response(400, {"error": "Invalid buildId format"}) + + # Fetch build record + table = dynamodb.Table(TABLE_NAME) + try: + result = table.get_item(Key={"buildId": build_id}) + except ClientError as e: + print(f"DynamoDB error: {e}") + return _response(500, {"error": "Failed to retrieve build status"}) + + item = result.get("Item") + if not item: + return _response(404, {"error": "Build not found"}) + + # Build base response + response_body = { + "build_id": item["buildId"], + "status": item["status"], + "python_version": item.get("python_version", "unknown"), + "architectures": item.get("architectures", []), + "single_file": item.get("single_file", True), + "created_at": int(item.get("created_at", 0)), + "expires_at": int(item.get("expires_at", 0)), + } + + # Add error message if failed + if item.get("error_message"): + response_body["error_message"] = item["error_message"] + + # Add completed timestamp + if item.get("completed_at"): + response_body["completed_at"] = int(item["completed_at"]) + + # Generate presigned download URLs for completed builds + if item["status"] == "COMPLETED" and item.get("s3_keys"): + s3_keys = item["s3_keys"].split(",") + files = [] + + for s3_key in s3_keys: + s3_key = s3_key.strip() + if not s3_key: + continue + + filename = s3_key.split("/")[-1] + architecture = _detect_architecture(filename) + + try: + download_url = s3_client.generate_presigned_url( + "get_object", + Params={"Bucket": S3_BUCKET, "Key": s3_key}, + ExpiresIn=PRESIGN_EXPIRY, + ) + files.append({ + "filename": filename, + "download_url": download_url, + "architecture": architecture, + "s3_key": s3_key, + }) + except ClientError as e: + print(f"Failed to generate presigned URL for {s3_key}: {e}") + files.append({ + "filename": filename, + "architecture": architecture, + "error": "Failed to generate download URL", + }) + + response_body["files"] = 
files + response_body["file_count"] = len(files) + + return _response(200, response_body) + + +def _detect_architecture(filename): + """Detect architecture from filename.""" + filename_lower = filename.lower() + if "x86_64" in filename_lower or "amd64" in filename_lower: + return "x86_64" + elif "aarch64" in filename_lower or "arm64" in filename_lower: + return "arm64" + return "unknown" + + +def _response(status_code, body): + """Create API Gateway response with CORS headers.""" + return { + "statusCode": status_code, + "headers": { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Headers": "Content-Type", + "Access-Control-Allow-Methods": "POST,GET,OPTIONS", + }, + "body": json.dumps(body), + } diff --git a/infrastructure/lambdas/process_build/index.py b/infrastructure/lambdas/process_build/index.py new file mode 100644 index 0000000..749225e --- /dev/null +++ b/infrastructure/lambdas/process_build/index.py @@ -0,0 +1,357 @@ +""" +Process Build Lambda +Triggered by SQS. Launches an EC2 Spot instance with a user-data script +that builds the Lambda layer packages using Docker containers. + +The EC2 instance: +1. Installs Docker +2. Pulls pre-built images from GHCR (or builds locally as fallback) +3. Runs the Docker container to build the Lambda layer zips +4. Uploads artifacts to S3 +5. Updates DynamoDB with status and S3 keys +6. 
Self-terminates +""" + +import json +import os +import base64 +import random +import boto3 + +ec2 = boto3.client("ec2") +dynamodb = boto3.resource("dynamodb") + +TABLE_NAME = os.environ["DYNAMODB_TABLE"] +S3_BUCKET = os.environ["S3_BUCKET"] +SUBNET_IDS = os.environ["SUBNET_IDS"].split(",") +SECURITY_GROUP_ID = os.environ["SECURITY_GROUP_ID"] +LAUNCH_TEMPLATE_ID = os.environ["LAUNCH_TEMPLATE_ID"] +INSTANCE_PROFILE_ARN = os.environ["INSTANCE_PROFILE_ARN"] +DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "ghcr.io/fok666/lambda-python-layer") +GITHUB_REPO_URL = os.environ.get("GITHUB_REPO_URL", "https://github.com/fok666/lambda-python-layer.git") +EC2_INSTANCE_TYPE = os.environ.get("EC2_INSTANCE_TYPE", "c5.xlarge") +MAX_BUILD_MINUTES = int(os.environ.get("MAX_BUILD_MINUTES", "30")) +PROJECT_NAME = os.environ.get("PROJECT_NAME", "lambda-layer-builder") + + +def handler(event, context): + """Process SQS messages containing build requests.""" + for record in event["Records"]: + message = json.loads(record["body"]) + try: + _process_build(message) + except Exception as e: + print(f"ERROR processing build {message.get('build_id', 'unknown')}: {e}") + _update_status(message.get("build_id"), "FAILED", error=str(e)) + raise + + +def _process_build(message): + """Launch EC2 Spot instance to perform the build.""" + build_id = message["build_id"] + python_version = message["python_version"] + architectures = message["architectures"] + requirements = message["requirements"] + single_file = message.get("single_file", True) + + print(f"Processing build {build_id}: Python {python_version}, " + f"arch={architectures}, single_file={single_file}") + + # Update status to PROCESSING + _update_status(build_id, "PROCESSING") + + # Generate user-data script + user_data = _generate_user_data( + build_id=build_id, + python_version=python_version, + architectures=architectures, + requirements=requirements, + single_file=single_file, + ) + + # Pick a random subnet for AZ diversity + 
subnet_id = random.choice(SUBNET_IDS) + + # Launch Spot instance + try: + response = ec2.run_instances( + LaunchTemplate={"LaunchTemplateId": LAUNCH_TEMPLATE_ID}, + InstanceType=EC2_INSTANCE_TYPE, + MinCount=1, + MaxCount=1, + SubnetId=subnet_id, + InstanceMarketOptions={ + "MarketType": "spot", + "SpotOptions": { + "SpotInstanceType": "one-time", + "InstanceInterruptionBehavior": "terminate", + }, + }, + UserData=base64.b64encode(user_data.encode()).decode(), + TagSpecifications=[ + { + "ResourceType": "instance", + "Tags": [ + {"Key": "Name", "Value": f"builder-{build_id[:8]}"}, + {"Key": "BuildId", "Value": build_id}, + {"Key": "Project", "Value": PROJECT_NAME}, + {"Key": "AutoTerminate", "Value": "true"}, + ], + } + ], + ) + + instance_id = response["Instances"][0]["InstanceId"] + print(f"Launched Spot instance {instance_id} for build {build_id}") + + # Store instance ID in DynamoDB + table = dynamodb.Table(TABLE_NAME) + table.update_item( + Key={"buildId": build_id}, + UpdateExpression="SET instance_id = :i", + ExpressionAttributeValues={":i": instance_id}, + ) + + except Exception as e: + error_msg = str(e) + print(f"Failed to launch instance for build {build_id}: {error_msg}") + + # If spot capacity unavailable, mark as failed with helpful message + if "InsufficientInstanceCapacity" in error_msg or "SpotMaxPriceTooLow" in error_msg: + _update_status(build_id, "FAILED", + error="Spot instance capacity unavailable. 
Please retry.") + else: + _update_status(build_id, "FAILED", error=error_msg) + raise + + +def _update_status(build_id, status, error=None): + """Update build status in DynamoDB.""" + if not build_id: + return + table = dynamodb.Table(TABLE_NAME) + update_expr = "SET #s = :s" + attr_names = {"#s": "status"} + attr_values = {":s": status} + + if error: + update_expr += ", error_message = :e" + attr_values[":e"] = error + + try: + table.update_item( + Key={"buildId": build_id}, + UpdateExpression=update_expr, + ExpressionAttributeNames=attr_names, + ExpressionAttributeValues=attr_values, + ) + except Exception as e: + print(f"Failed to update status for {build_id}: {e}") + + +def _generate_user_data(build_id, python_version, architectures, requirements, single_file): + """Generate the EC2 user-data bash script for the build.""" + req_escaped = requirements.replace("\\", "\\\\").replace("'", "'\\''") + arches_str = " ".join(architectures) + single_file_str = "true" if single_file else "false" + + return f"""#!/bin/bash +set -euo pipefail +exec > >(tee /var/log/build.log) 2>&1 + +echo "$(date): === Lambda Layer Builder ===" +echo "Build ID: {build_id}" +echo "Python: {python_version}" +echo "Architectures: {arches_str}" +echo "Single file: {single_file_str}" + +# --- Instance metadata (IMDSv2) --- +TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 300") +REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/placement/region) +INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/instance-id) + +export AWS_DEFAULT_REGION="$REGION" + +# --- Safety: auto-terminate after {MAX_BUILD_MINUTES} minutes --- +(sleep {MAX_BUILD_MINUTES * 60} && \ + echo "$(date): TIMEOUT - self-terminating" && \ + aws dynamodb update-item \ + --table-name "{TABLE_NAME}" \ + --key '{{"buildId": {{"S": "{build_id}"}}}}' \ + --update-expression 
"SET #s = :s, error_message = :e" \ + --expression-attribute-names '{{"#s": "status"}}' \ + --expression-attribute-values '{{":s": {{"S": "FAILED"}}, ":e": {{"S": "Build timed out after {MAX_BUILD_MINUTES} minutes"}}}}' && \ + aws ec2 terminate-instances --instance-ids "$INSTANCE_ID") & +WATCHDOG_PID=$! + +# --- Helper functions --- +update_status() {{ + local status=$1 + local extra="${{2:-}}" + aws dynamodb update-item \ + --table-name "{TABLE_NAME}" \ + --key '{{"buildId": {{"S": "{build_id}"}}}}' \ + --update-expression "SET #s = :s${{extra}}" \ + --expression-attribute-names '{{"#s": "status"}}' \ + --expression-attribute-values "$(echo '{{":s": {{"S": "'"$status"'"}}}}' )" \ + 2>/dev/null || true +}} + +cleanup() {{ + echo "$(date): Cleanup initiated" + kill $WATCHDOG_PID 2>/dev/null || true + echo "$(date): Self-terminating instance $INSTANCE_ID" + aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" 2>/dev/null || true +}} +trap cleanup EXIT + +# --- Install Docker --- +echo "$(date): Installing Docker..." 
+dnf install -y docker git aws-cli 2>/dev/null || yum install -y docker git aws-cli +systemctl start docker +systemctl enable docker + +# Enable QEMU for cross-architecture builds +docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 2>/dev/null || true + +# --- Create requirements file --- +mkdir -p /build/input /build/output +cat > /build/input/requirements.txt << 'REQUIREMENTS_EOF' +{requirements} +REQUIREMENTS_EOF + +echo "$(date): Requirements:" +cat /build/input/requirements.txt + +# --- Configuration --- +DOCKER_IMAGE_PREFIX="{DOCKER_IMAGE_PREFIX}" +S3_BUCKET="{S3_BUCKET}" +BUILD_ID="{build_id}" +PYTHON_VERSION="{python_version}" +SINGLE_FILE="{single_file_str}" + +# --- Build function --- +build_arch() {{ + local arch=$1 + local platform="" + local arch_label="" + + if [ "$arch" = "x86_64" ]; then + platform="linux/amd64" + arch_label="amd64" + else + platform="linux/arm64" + arch_label="arm64" + fi + + echo "" + echo "$(date): =========================================" + echo "$(date): Building for $arch ($platform)" + echo "$(date): =========================================" + + local image_tag="${{DOCKER_IMAGE_PREFIX}}:python${{PYTHON_VERSION}}-${{arch_label}}-latest" + + # Try pre-built image first, fall back to local build + if docker pull --platform "$platform" "$image_tag" 2>/dev/null; then + echo "$(date): Using pre-built image: $image_tag" + else + echo "$(date): Pre-built image unavailable, building locally..." + + if [ ! 
-d /build/repo ]; then + git clone {GITHUB_REPO_URL} /build/repo + fi + + # Select correct Dockerfile based on Python version + local dockerfile="/build/repo/Dockerfile.al2023" + if [[ "$PYTHON_VERSION" == "3.10" || "$PYTHON_VERSION" == "3.11" ]]; then + dockerfile="/build/repo/Dockerfile.al2" + fi + + docker buildx create --use --name builder 2>/dev/null || true + docker buildx build \ + --platform "$platform" \ + --build-arg PYTHON_VERSION=$PYTHON_VERSION \ + -t "$image_tag" \ + --load \ + -f "$dockerfile" \ + /build/repo/ + fi + + # Run the build container + if [ "$SINGLE_FILE" = "true" ]; then + docker run --rm \ + --platform "$platform" \ + -e SINGLE_FILE=true \ + -v /build/input/requirements.txt:/input/requirements.txt \ + -v /build/output:/package \ + "$image_tag" + else + docker run --rm \ + --platform "$platform" \ + -v /build/input/requirements.txt:/input/requirements.txt \ + -v /build/output:/package \ + "$image_tag" + fi + + echo "$(date): Build complete for $arch" +}} + +# --- Execute builds --- +for arch in {arches_str}; do + build_arch "$arch" +done + +# --- Upload artifacts to S3 --- +echo "" +echo "$(date): Uploading artifacts to S3..." +S3_KEYS="" +FILE_COUNT=0 + +for zip_file in /build/output/*.zip; do + if [ -f "$zip_file" ]; then + filename=$(basename "$zip_file") + s3_key="builds/$BUILD_ID/$filename" + aws s3 cp "$zip_file" "s3://$S3_BUCKET/$s3_key" + echo "$(date): Uploaded: s3://$S3_BUCKET/$s3_key ($(du -h "$zip_file" | cut -f1))" + + if [ -n "$S3_KEYS" ]; then + S3_KEYS="$S3_KEYS,$s3_key" + else + S3_KEYS="$s3_key" + fi + FILE_COUNT=$((FILE_COUNT + 1)) + fi +done + +if [ "$FILE_COUNT" -eq 0 ]; then + echo "$(date): ERROR - No zip files produced!" 
+ aws dynamodb update-item \ + --table-name "{TABLE_NAME}" \ + --key '{{"buildId": {{"S": "{build_id}"}}}}' \ + --update-expression "SET #s = :s, error_message = :e" \ + --expression-attribute-names '{{"#s": "status"}}' \ + --expression-attribute-values '{{":s": {{"S": "FAILED"}}, ":e": {{"S": "Build produced no output files"}}}}' + exit 1 +fi + +# --- Update DynamoDB with completion --- +COMPLETED_AT=$(date +%s) +aws dynamodb update-item \ + --table-name "{TABLE_NAME}" \ + --key '{{"buildId": {{"S": "{build_id}"}}}}' \ + --update-expression "SET #s = :s, s3_keys = :k, completed_at = :t, file_count = :fc" \ + --expression-attribute-names '{{"#s": "status"}}' \ + --expression-attribute-values '{{":s": {{"S": "COMPLETED"}}, ":k": {{"S": "'"$S3_KEYS"'"}}, ":t": {{"N": "'"$COMPLETED_AT"'"}}, ":fc": {{"N": "'"$FILE_COUNT"'"}}}}' + +echo "" +echo "$(date): =========================================" +echo "$(date): Build completed successfully!" +echo "$(date): Files: $FILE_COUNT" +echo "$(date): S3 Keys: $S3_KEYS" +echo "$(date): =========================================" + +# Instance will self-terminate via the EXIT trap +""" diff --git a/infrastructure/lambdas/submit_build/index.py b/infrastructure/lambdas/submit_build/index.py new file mode 100644 index 0000000..38a5d28 --- /dev/null +++ b/infrastructure/lambdas/submit_build/index.py @@ -0,0 +1,122 @@ +""" +Submit Build Lambda +Validates the build request, creates a DynamoDB record, and sends +the message to SQS for processing. 
+ +API: POST /builds +Body: { + "requirements": "numpy==1.26.4\nrequests==2.32.4", + "python_version": "3.13", + "architectures": ["x86_64", "arm64"], + "single_file": true +} +""" + +import json +import uuid +import time +import os +import boto3 + +dynamodb = boto3.resource("dynamodb") +sqs = boto3.client("sqs") + +TABLE_NAME = os.environ["DYNAMODB_TABLE"] +QUEUE_URL = os.environ["SQS_QUEUE_URL"] +ARTIFACT_TTL_HOURS = int(os.environ.get("ARTIFACT_TTL_HOURS", "24")) + +VALID_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"] +VALID_ARCHITECTURES = ["x86_64", "arm64"] +MAX_REQUIREMENTS_LENGTH = 10000 # 10KB max + + +def handler(event, context): + """Handle POST /builds requests.""" + try: + body = json.loads(event.get("body") or "{}") + except json.JSONDecodeError: + return _response(400, {"error": "Invalid JSON body"}) + + # --- Validate input --- + requirements = body.get("requirements", "").strip() + if not requirements: + return _response(400, {"error": "requirements is required"}) + + if len(requirements) > MAX_REQUIREMENTS_LENGTH: + return _response(400, { + "error": f"requirements too large (max {MAX_REQUIREMENTS_LENGTH} chars)" + }) + + python_version = body.get("python_version", "3.13") + if python_version not in VALID_PYTHON_VERSIONS: + return _response(400, { + "error": f"Invalid python_version. Must be one of: {VALID_PYTHON_VERSIONS}" + }) + + architectures = body.get("architectures", ["x86_64"]) + if not isinstance(architectures, list) or len(architectures) == 0: + return _response(400, {"error": "architectures must be a non-empty list"}) + + for arch in architectures: + if arch not in VALID_ARCHITECTURES: + return _response(400, { + "error": f"Invalid architecture: {arch}. 
Must be one of: {VALID_ARCHITECTURES}" + }) + + single_file = body.get("single_file", True) + if not isinstance(single_file, bool): + return _response(400, {"error": "single_file must be a boolean"}) + + # --- Create build record --- + build_id = str(uuid.uuid4()) + now = int(time.time()) + expires_at = now + (ARTIFACT_TTL_HOURS * 3600) + + table = dynamodb.Table(TABLE_NAME) + table.put_item(Item={ + "buildId": build_id, + "status": "QUEUED", + "python_version": python_version, + "architectures": architectures, + "requirements": requirements, + "single_file": single_file, + "created_at": now, + "expires_at": expires_at, + "ttl": expires_at + 86400, # DynamoDB TTL: 1 day after artifact expiry + }) + + # --- Queue for processing --- + sqs.send_message( + QueueUrl=QUEUE_URL, + MessageBody=json.dumps({ + "build_id": build_id, + "python_version": python_version, + "architectures": architectures, + "requirements": requirements, + "single_file": single_file, + }), + MessageGroupId=build_id[:8] if QUEUE_URL.endswith(".fifo") else None, + ) + + return _response(200, { + "build_id": build_id, + "status": "QUEUED", + "python_version": python_version, + "architectures": architectures, + "single_file": single_file, + "expires_at": expires_at, + }) + + +def _response(status_code, body): + """Create API Gateway response with CORS headers.""" + return { + "statusCode": status_code, + "headers": { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Headers": "Content-Type", + "Access-Control-Allow-Methods": "POST,GET,OPTIONS", + }, + "body": json.dumps(body), + } diff --git a/infrastructure/terraform/.gitignore b/infrastructure/terraform/.gitignore new file mode 100644 index 0000000..c75b474 --- /dev/null +++ b/infrastructure/terraform/.gitignore @@ -0,0 +1,18 @@ +# Terraform state files +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl + +# Build artifacts +.build/ + +# Variables file (may contain secrets) +terraform.tfvars + 
+# Crash logs +crash.log +crash.*.log + +# Plan files +*.tfplan diff --git a/infrastructure/terraform/api_gateway.tf b/infrastructure/terraform/api_gateway.tf new file mode 100644 index 0000000..2509b00 --- /dev/null +++ b/infrastructure/terraform/api_gateway.tf @@ -0,0 +1,101 @@ +# ============================================================================= +# API Gateway (HTTP API v2) +# ============================================================================= +# HTTP API with CORS support. Routes: +# POST /builds β†’ submit_build Lambda +# GET /builds/{buildId} β†’ check_status Lambda +# ============================================================================= + +resource "aws_apigatewayv2_api" "api" { + name = "${local.name_prefix}-api" + protocol_type = "HTTP" + description = "Lambda Python Layer Builder API" + + cors_configuration { + allow_origins = var.allowed_origins + allow_methods = ["GET", "POST", "OPTIONS"] + allow_headers = ["Content-Type", "Authorization"] + max_age = 86400 + } +} + +resource "aws_apigatewayv2_stage" "default" { + api_id = aws_apigatewayv2_api.api.id + name = "$default" + auto_deploy = true + + default_route_settings { + throttling_rate_limit = var.api_throttle_rate + throttling_burst_limit = var.api_throttle_burst + } + + access_log_settings { + destination_arn = aws_cloudwatch_log_group.api_gateway.arn + format = jsonencode({ + requestId = "$context.requestId" + ip = "$context.identity.sourceIp" + requestTime = "$context.requestTime" + httpMethod = "$context.httpMethod" + routeKey = "$context.routeKey" + status = "$context.status" + protocol = "$context.protocol" + responseLength = "$context.responseLength" + errorMessage = "$context.error.message" + }) + } +} + +resource "aws_cloudwatch_log_group" "api_gateway" { + name = "/aws/apigateway/${local.name_prefix}-api" + retention_in_days = 14 +} + +# ----------------------------------------------------------------------------- +# POST /builds - Submit Build +# 
----------------------------------------------------------------------------- + +resource "aws_apigatewayv2_integration" "submit_build" { + api_id = aws_apigatewayv2_api.api.id + integration_type = "AWS_PROXY" + integration_uri = aws_lambda_function.submit_build.invoke_arn + payload_format_version = "2.0" +} + +resource "aws_apigatewayv2_route" "submit_build" { + api_id = aws_apigatewayv2_api.api.id + route_key = "POST /builds" + target = "integrations/${aws_apigatewayv2_integration.submit_build.id}" +} + +resource "aws_lambda_permission" "submit_build_apigw" { + statement_id = "AllowAPIGatewayInvoke" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.submit_build.function_name + principal = "apigateway.amazonaws.com" + source_arn = "${aws_apigatewayv2_api.api.execution_arn}/*/*" +} + +# ----------------------------------------------------------------------------- +# GET /builds/{buildId} - Check Status +# ----------------------------------------------------------------------------- + +resource "aws_apigatewayv2_integration" "check_status" { + api_id = aws_apigatewayv2_api.api.id + integration_type = "AWS_PROXY" + integration_uri = aws_lambda_function.check_status.invoke_arn + payload_format_version = "2.0" +} + +resource "aws_apigatewayv2_route" "check_status" { + api_id = aws_apigatewayv2_api.api.id + route_key = "GET /builds/{buildId}" + target = "integrations/${aws_apigatewayv2_integration.check_status.id}" +} + +resource "aws_lambda_permission" "check_status_apigw" { + statement_id = "AllowAPIGatewayInvoke" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.check_status.function_name + principal = "apigateway.amazonaws.com" + source_arn = "${aws_apigatewayv2_api.api.execution_arn}/*/*" +} diff --git a/infrastructure/terraform/dynamodb.tf b/infrastructure/terraform/dynamodb.tf new file mode 100644 index 0000000..7c41249 --- /dev/null +++ b/infrastructure/terraform/dynamodb.tf @@ -0,0 +1,26 @@ +# 
============================================================================= +# DynamoDB Table for Build Tracking +# ============================================================================= +# Tracks build state: QUEUED β†’ PROCESSING β†’ COMPLETED | FAILED +# TTL automatically cleans up old records. +# ============================================================================= + +resource "aws_dynamodb_table" "builds" { + name = "${local.name_prefix}-builds" + billing_mode = "PAY_PER_REQUEST" + hash_key = "buildId" + + attribute { + name = "buildId" + type = "S" + } + + ttl { + attribute_name = "ttl" + enabled = true + } + + tags = { + Name = "${local.name_prefix}-builds" + } +} diff --git a/infrastructure/terraform/ec2.tf b/infrastructure/terraform/ec2.tf new file mode 100644 index 0000000..072a791 --- /dev/null +++ b/infrastructure/terraform/ec2.tf @@ -0,0 +1,83 @@ +# ============================================================================= +# EC2 Launch Template for Build Workers +# ============================================================================= +# Uses Amazon Linux 2023 with Docker pre-configured. +# Instances are launched as Spot by the process_build Lambda. +# Each instance self-terminates after build completion or timeout. 
+# ============================================================================= + +# Latest Amazon Linux 2023 AMI +data "aws_ami" "al2023" { + most_recent = true + owners = ["amazon"] + + filter { + name = "name" + values = ["al2023-ami-*-x86_64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } +} + +resource "aws_launch_template" "builder" { + name_prefix = "${local.name_prefix}-builder-" + image_id = data.aws_ami.al2023.id + instance_type = var.ec2_instance_type + + iam_instance_profile { + arn = aws_iam_instance_profile.ec2_builder.arn + } + + vpc_security_group_ids = [aws_security_group.builder.id] + + block_device_mappings { + device_name = "/dev/xvda" + + ebs { + volume_size = var.ec2_volume_size + volume_type = "gp3" + encrypted = true + delete_on_termination = true + } + } + + metadata_options { + http_endpoint = "enabled" + http_tokens = "required" # IMDSv2 only + http_put_response_hop_limit = 2 + } + + tag_specifications { + resource_type = "instance" + + tags = { + Name = "${local.name_prefix}-builder" + Project = var.project_name + AutoTerminate = "true" + } + } + + tag_specifications { + resource_type = "volume" + + tags = { + Name = "${local.name_prefix}-builder-vol" + Project = var.project_name + } + } + + # User data is provided dynamically by the process_build Lambda + # This template serves as a base configuration + + lifecycle { + create_before_destroy = true + } +} diff --git a/infrastructure/terraform/iam.tf b/infrastructure/terraform/iam.tf new file mode 100644 index 0000000..c0aa50c --- /dev/null +++ b/infrastructure/terraform/iam.tf @@ -0,0 +1,246 @@ +# ============================================================================= +# IAM Roles & Policies +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Lambda Execution Role - Submit Build +# 
----------------------------------------------------------------------------- + +resource "aws_iam_role" "lambda_submit" { + name = "${local.name_prefix}-lambda-submit" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] + }) +} + +resource "aws_iam_role_policy" "lambda_submit" { + name = "${local.name_prefix}-lambda-submit-policy" + role = aws_iam_role.lambda_submit.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "dynamodb:PutItem", + ] + Resource = aws_dynamodb_table.builds.arn + }, + { + Effect = "Allow" + Action = [ + "sqs:SendMessage", + ] + Resource = aws_sqs_queue.build_queue.arn + }, + { + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + ] + Resource = "arn:aws:logs:*:*:*" + } + ] + }) +} + +# ----------------------------------------------------------------------------- +# Lambda Execution Role - Process Build (SQS consumer, launches EC2) +# ----------------------------------------------------------------------------- + +resource "aws_iam_role" "lambda_process" { + name = "${local.name_prefix}-lambda-process" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] + }) +} + +resource "aws_iam_role_policy" "lambda_process" { + name = "${local.name_prefix}-lambda-process-policy" + role = aws_iam_role.lambda_process.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "dynamodb:UpdateItem", + ] + Resource = aws_dynamodb_table.builds.arn + }, + { + Effect = "Allow" + Action = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes", + ] + Resource = aws_sqs_queue.build_queue.arn + }, + { + Effect = "Allow" + 
Action = [ + "ec2:RunInstances", + "ec2:CreateTags", + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = aws_iam_role.ec2_builder.arn + }, + { + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + ] + Resource = "arn:aws:logs:*:*:*" + } + ] + }) +} + +# ----------------------------------------------------------------------------- +# Lambda Execution Role - Check Status +# ----------------------------------------------------------------------------- + +resource "aws_iam_role" "lambda_status" { + name = "${local.name_prefix}-lambda-status" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] + }) +} + +resource "aws_iam_role_policy" "lambda_status" { + name = "${local.name_prefix}-lambda-status-policy" + role = aws_iam_role.lambda_status.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "dynamodb:GetItem", + ] + Resource = aws_dynamodb_table.builds.arn + }, + { + Effect = "Allow" + Action = [ + "s3:GetObject", + ] + Resource = "${aws_s3_bucket.artifacts.arn}/builds/*" + }, + { + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + ] + Resource = "arn:aws:logs:*:*:*" + } + ] + }) +} + +# ----------------------------------------------------------------------------- +# EC2 Builder Instance Role +# ----------------------------------------------------------------------------- + +resource "aws_iam_role" "ec2_builder" { + name = "${local.name_prefix}-ec2-builder" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ec2.amazonaws.com" + } + }] + }) +} + +resource "aws_iam_role_policy" "ec2_builder" { + name = 
"${local.name_prefix}-ec2-builder-policy" + role = aws_iam_role.ec2_builder.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:PutObject", + "s3:PutObjectAcl", + ] + Resource = "${aws_s3_bucket.artifacts.arn}/builds/*" + }, + { + Effect = "Allow" + Action = [ + "dynamodb:UpdateItem", + ] + Resource = aws_dynamodb_table.builds.arn + }, + { + Effect = "Allow" + Action = [ + "ec2:TerminateInstances", + ] + Resource = "*" + Condition = { + StringEquals = { + "ec2:ResourceTag/Project" = var.project_name + } + } + }, + { + Effect = "Allow" + Action = [ + "ec2:DescribeInstances", + ] + Resource = "*" + } + ] + }) +} + +resource "aws_iam_instance_profile" "ec2_builder" { + name = "${local.name_prefix}-ec2-builder" + role = aws_iam_role.ec2_builder.name +} diff --git a/infrastructure/terraform/lambda.tf b/infrastructure/terraform/lambda.tf new file mode 100644 index 0000000..948c214 --- /dev/null +++ b/infrastructure/terraform/lambda.tf @@ -0,0 +1,138 @@ +# ============================================================================= +# Lambda Functions +# ============================================================================= + +# Package Lambda source code into zip archives +data "archive_file" "submit_build" { + type = "zip" + source_file = "${path.module}/../lambdas/submit_build/index.py" + output_path = "${path.module}/.build/submit_build.zip" +} + +data "archive_file" "process_build" { + type = "zip" + source_file = "${path.module}/../lambdas/process_build/index.py" + output_path = "${path.module}/.build/process_build.zip" +} + +data "archive_file" "check_status" { + type = "zip" + source_file = "${path.module}/../lambdas/check_status/index.py" + output_path = "${path.module}/.build/check_status.zip" +} + +# ----------------------------------------------------------------------------- +# Submit Build Lambda +# Validates request, creates DynamoDB record, sends message to SQS +# 
----------------------------------------------------------------------------- + +resource "aws_lambda_function" "submit_build" { + function_name = "${local.name_prefix}-submit-build" + filename = data.archive_file.submit_build.output_path + source_code_hash = data.archive_file.submit_build.output_base64sha256 + handler = "index.handler" + runtime = "python3.13" + timeout = 30 + memory_size = 128 + role = aws_iam_role.lambda_submit.arn + + environment { + variables = { + DYNAMODB_TABLE = aws_dynamodb_table.builds.name + SQS_QUEUE_URL = aws_sqs_queue.build_queue.url + ARTIFACT_TTL_HOURS = tostring(var.artifact_ttl_hours) + } + } + + tags = { + Name = "${local.name_prefix}-submit-build" + } +} + +resource "aws_cloudwatch_log_group" "submit_build" { + name = "/aws/lambda/${aws_lambda_function.submit_build.function_name}" + retention_in_days = 14 +} + +# ----------------------------------------------------------------------------- +# Process Build Lambda +# Triggered by SQS, launches EC2 Spot instance with build user-data +# ----------------------------------------------------------------------------- + +resource "aws_lambda_function" "process_build" { + function_name = "${local.name_prefix}-process-build" + filename = data.archive_file.process_build.output_path + source_code_hash = data.archive_file.process_build.output_base64sha256 + handler = "index.handler" + runtime = "python3.13" + timeout = 60 + memory_size = 256 + role = aws_iam_role.lambda_process.arn + + environment { + variables = { + DYNAMODB_TABLE = aws_dynamodb_table.builds.name + S3_BUCKET = aws_s3_bucket.artifacts.id + SUBNET_IDS = join(",", aws_subnet.public[*].id) + SECURITY_GROUP_ID = aws_security_group.builder.id + LAUNCH_TEMPLATE_ID = aws_launch_template.builder.id + INSTANCE_PROFILE_ARN = aws_iam_instance_profile.ec2_builder.arn + DOCKER_IMAGE_PREFIX = var.docker_image_prefix + GITHUB_REPO_URL = var.github_repo_url + EC2_INSTANCE_TYPE = var.ec2_instance_type + MAX_BUILD_MINUTES = 
tostring(var.ec2_max_build_time_minutes) + PROJECT_NAME = var.project_name + } + } + + tags = { + Name = "${local.name_prefix}-process-build" + } +} + +resource "aws_cloudwatch_log_group" "process_build" { + name = "/aws/lambda/${aws_lambda_function.process_build.function_name}" + retention_in_days = 14 +} + +# SQS trigger for process_build Lambda +resource "aws_lambda_event_source_mapping" "sqs_trigger" { + event_source_arn = aws_sqs_queue.build_queue.arn + function_name = aws_lambda_function.process_build.arn + batch_size = 1 + maximum_batching_window_in_seconds = 0 + enabled = true +} + +# ----------------------------------------------------------------------------- +# Check Status Lambda +# Returns build status and generates presigned download URLs +# ----------------------------------------------------------------------------- + +resource "aws_lambda_function" "check_status" { + function_name = "${local.name_prefix}-check-status" + filename = data.archive_file.check_status.output_path + source_code_hash = data.archive_file.check_status.output_base64sha256 + handler = "index.handler" + runtime = "python3.13" + timeout = 15 + memory_size = 128 + role = aws_iam_role.lambda_status.arn + + environment { + variables = { + DYNAMODB_TABLE = aws_dynamodb_table.builds.name + S3_BUCKET = aws_s3_bucket.artifacts.id + ARTIFACT_TTL_HOURS = tostring(var.artifact_ttl_hours) + } + } + + tags = { + Name = "${local.name_prefix}-check-status" + } +} + +resource "aws_cloudwatch_log_group" "check_status" { + name = "/aws/lambda/${aws_lambda_function.check_status.function_name}" + retention_in_days = 14 +} diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf new file mode 100644 index 0000000..b23c22e --- /dev/null +++ b/infrastructure/terraform/main.tf @@ -0,0 +1,55 @@ +# ============================================================================= +# Lambda Python Layer Builder - Infrastructure +# 
============================================================================= +# Serverless build system that spins up EC2 Spot instances to build +# AWS Lambda Python layers using Docker, with GitHub Pages as the frontend. +# ============================================================================= + +terraform { + required_version = ">= 1.5.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + archive = { + source = "hashicorp/archive" + version = "~> 2.0" + } + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + } +} + +provider "aws" { + region = var.aws_region + + # Credentials: export from the fok666 profile before running Terraform: + # eval "$(aws configure export-credentials --profile fok666 --format env)" + # terraform plan -var-file=prod.tfvars + + default_tags { + tags = { + Project = var.project_name + ManagedBy = "Terraform" + Environment = var.environment + } + } +} + +# Unique suffix for globally unique resource names +resource "random_id" "suffix" { + byte_length = 4 +} + +locals { + name_prefix = "${var.project_name}-${var.environment}" + suffix = random_id.suffix.hex +} + +# Current AWS account and region info +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} diff --git a/infrastructure/terraform/outputs.tf b/infrastructure/terraform/outputs.tf new file mode 100644 index 0000000..68d3022 --- /dev/null +++ b/infrastructure/terraform/outputs.tf @@ -0,0 +1,48 @@ +# ============================================================================= +# Outputs +# ============================================================================= + +output "api_url" { + description = "API Gateway URL. Configure this in the GitHub Pages frontend." + value = aws_apigatewayv2_api.api.api_endpoint +} + +output "s3_bucket_name" { + description = "S3 bucket name for build artifacts." 
+ value = aws_s3_bucket.artifacts.id +} + +output "dynamodb_table_name" { + description = "DynamoDB table name for build tracking." + value = aws_dynamodb_table.builds.name +} + +output "sqs_queue_url" { + description = "SQS queue URL for build requests." + value = aws_sqs_queue.build_queue.url +} + +output "vpc_id" { + description = "VPC ID." + value = aws_vpc.main.id +} + +output "submit_build_lambda" { + description = "Submit build Lambda function name." + value = aws_lambda_function.submit_build.function_name +} + +output "process_build_lambda" { + description = "Process build Lambda function name." + value = aws_lambda_function.process_build.function_name +} + +output "check_status_lambda" { + description = "Check status Lambda function name." + value = aws_lambda_function.check_status.function_name +} + +output "github_pages_config" { + description = "Paste this API URL into the GitHub Pages settings panel." + value = "API URL: ${aws_apigatewayv2_api.api.api_endpoint}" +} diff --git a/infrastructure/terraform/s3.tf b/infrastructure/terraform/s3.tf new file mode 100644 index 0000000..d57d3be --- /dev/null +++ b/infrastructure/terraform/s3.tf @@ -0,0 +1,74 @@ +# ============================================================================= +# S3 Bucket for Build Artifacts +# ============================================================================= +# Stores built Lambda layer zip files. Objects expire automatically +# after the configured TTL. Access is via presigned URLs only. 
+# ============================================================================= + +resource "aws_s3_bucket" "artifacts" { + bucket = "${local.name_prefix}-artifacts-${local.suffix}" + + tags = { + Name = "${local.name_prefix}-artifacts" + } +} + +resource "aws_s3_bucket_versioning" "artifacts" { + bucket = aws_s3_bucket.artifacts.id + + versioning_configuration { + status = "Disabled" + } +} + +resource "aws_s3_bucket_lifecycle_configuration" "artifacts" { + bucket = aws_s3_bucket.artifacts.id + + rule { + id = "expire-build-artifacts" + status = "Enabled" + + filter { + prefix = "builds/" + } + + expiration { + days = ceil(var.artifact_ttl_hours / 24) + } + + abort_incomplete_multipart_upload { + days_after_initiation = 1 + } + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "artifacts" { + bucket = aws_s3_bucket.artifacts.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +resource "aws_s3_bucket_public_access_block" "artifacts" { + bucket = aws_s3_bucket.artifacts.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_cors_configuration" "artifacts" { + bucket = aws_s3_bucket.artifacts.id + + cors_rule { + allowed_headers = ["*"] + allowed_methods = ["GET", "HEAD"] + allowed_origins = var.allowed_origins + expose_headers = ["Content-Length", "Content-Type"] + max_age_seconds = 3600 + } +} diff --git a/infrastructure/terraform/sqs.tf b/infrastructure/terraform/sqs.tf new file mode 100644 index 0000000..2adb614 --- /dev/null +++ b/infrastructure/terraform/sqs.tf @@ -0,0 +1,31 @@ +# ============================================================================= +# SQS Queue for Build Requests +# ============================================================================= +# Build requests are queued for decoupled, reliable processing. +# Failed messages go to a dead letter queue for investigation. 
+# ============================================================================= + +resource "aws_sqs_queue" "build_queue" { + name = "${local.name_prefix}-build-queue" + visibility_timeout_seconds = var.sqs_visibility_timeout + message_retention_seconds = 86400 # 24 hours + receive_wait_time_seconds = 10 # Long polling + + redrive_policy = jsonencode({ + deadLetterTargetArn = aws_sqs_queue.build_dlq.arn + maxReceiveCount = var.sqs_max_receive_count + }) + + tags = { + Name = "${local.name_prefix}-build-queue" + } +} + +resource "aws_sqs_queue" "build_dlq" { + name = "${local.name_prefix}-build-dlq" + message_retention_seconds = 604800 # 7 days + + tags = { + Name = "${local.name_prefix}-build-dlq" + } +} diff --git a/infrastructure/terraform/terraform.tfvars.example b/infrastructure/terraform/terraform.tfvars.example new file mode 100644 index 0000000..80b6531 --- /dev/null +++ b/infrastructure/terraform/terraform.tfvars.example @@ -0,0 +1,43 @@ +# ============================================================================= +# Lambda Python Layer Builder - Terraform Configuration +# ============================================================================= +# Copy this file to terraform.tfvars and customize for your environment. 
+# ============================================================================= + +# AWS region for all resources +aws_region = "eu-central-1" + +# Environment name +environment = "prod" + +# Project name (used as prefix for all resources) +project_name = "lambda-layer-builder" + +# Hours to keep build artifacts in S3 (1-168) +artifact_ttl_hours = 24 + +# Docker image prefix for pre-built images +# docker_image_prefix = "ghcr.io/fok666/lambda-python-layer" + +# GitHub repo URL (fallback for local Docker builds) +# github_repo_url = "https://github.com/fok666/lambda-python-layer.git" + +# EC2 Spot instance type for builds +# c5.xlarge = 4 vCPU, 8GB (~$0.04/hr spot) - Recommended +# c5.2xlarge = 8 vCPU, 16GB (~$0.08/hr spot) - Heavy builds +# m5.large = 2 vCPU, 8GB (~$0.02/hr spot) - Light builds +ec2_instance_type = "c5.xlarge" + +# EBS volume size in GB (20-200) +ec2_volume_size = 20 + +# Max build time before instance self-terminates (safety net) +ec2_max_build_time_minutes = 30 + +# CORS origins - restrict to your GitHub Pages URL in production +# Example: ["https://yourusername.github.io"] +allowed_origins = ["*"] + +# API request limits +# api_throttle_rate = 10 +# api_throttle_burst = 20 diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf new file mode 100644 index 0000000..3d20f5a --- /dev/null +++ b/infrastructure/terraform/variables.tf @@ -0,0 +1,192 @@ +# ============================================================================= +# General Configuration +# ============================================================================= + +variable "aws_region" { + description = "AWS region for all resources." + type = string + default = "eu-central-1" +} + +variable "environment" { + description = "Environment name (e.g., dev, staging, prod)." + type = string + default = "prod" +} + +variable "project_name" { + description = "Project name used as prefix for all resource names." 
+ type = string + default = "lambda-layer-builder" +} + +# ============================================================================= +# Build Configuration +# ============================================================================= + +variable "artifact_ttl_hours" { + description = <<-EOT + Number of hours to retain build artifacts in S3 before automatic deletion. + Also controls the DynamoDB TTL for build records (artifacts TTL + 24h). + + **Cost Impact:** + - Longer TTL = more S3 storage costs + - Shorter TTL = users must download promptly + + Default: 24 hours + EOT + type = number + default = 24 + + validation { + condition = var.artifact_ttl_hours >= 1 && var.artifact_ttl_hours <= 168 + error_message = "artifact_ttl_hours must be between 1 and 168 (1 week)." + } +} + +variable "docker_image_prefix" { + description = <<-EOT + Docker image prefix for pre-built Lambda layer builder images. + The system will try to pull pre-built images before falling back to local builds. + + Format: registry/repository (without tag) + Tags are auto-generated: python{version}-{arch}-latest + EOT + type = string + default = "ghcr.io/fok666/lambda-python-layer" +} + +variable "github_repo_url" { + description = "GitHub repository URL for cloning Dockerfiles (fallback when pre-built images unavailable)." + type = string + default = "https://github.com/fok666/lambda-python-layer.git" +} + +# ============================================================================= +# EC2 Spot Instance Configuration +# ============================================================================= + +variable "ec2_instance_type" { + description = <<-EOT + EC2 instance type for build workers. Needs sufficient CPU and memory + for Docker builds and Python package compilation. 
+ + **Recommended types:** + - c5.xlarge: 4 vCPU, 8 GB RAM (~$0.04/hr spot) - Good default + - c5.2xlarge: 8 vCPU, 16 GB RAM (~$0.08/hr spot) - Heavy builds + - m5.large: 2 vCPU, 8 GB RAM (~$0.02/hr spot) - Light builds + + **Cost Impact:** Spot instances are 60-90% cheaper than on-demand. + Typical build takes 5-15 minutes, costing $0.003-$0.02. + + Default: c5.xlarge (best balance of cost and build speed) + EOT + type = string + default = "c5.xlarge" +} + +variable "ec2_volume_size" { + description = <<-EOT + Root EBS volume size in GB for build instances. + Needs space for Docker images, build artifacts, and OS. + + **Sizing guide:** + - 30 GB: Minimal (1 Python version, small packages) + - 50 GB: Recommended (multiple versions, large packages) + - 100 GB: Heavy builds (many large packages like PyTorch) + + Default: 50 GB + EOT + type = number + default = 50 + + validation { + condition = var.ec2_volume_size >= 20 && var.ec2_volume_size <= 200 + error_message = "ec2_volume_size must be between 20 and 200 GB." + } +} + +variable "ec2_max_build_time_minutes" { + description = <<-EOT + Maximum time in minutes before a build instance self-terminates. + Safety net to prevent runaway costs from stuck instances. + + Default: 30 minutes (most builds complete in 5-15 minutes) + EOT + type = number + default = 30 + + validation { + condition = var.ec2_max_build_time_minutes >= 10 && var.ec2_max_build_time_minutes <= 120 + error_message = "ec2_max_build_time_minutes must be between 10 and 120." + } +} + +# ============================================================================= +# Networking +# ============================================================================= + +variable "vpc_cidr" { + description = "CIDR block for the VPC. Default: 10.0.0.0/16" + type = string + default = "10.0.0.0/16" +} + +variable "availability_zones" { + description = <<-EOT + List of availability zones for subnet placement. + At least 1 required. 
Using 2 improves spot instance availability. + + Leave empty to auto-select the first 2 AZs in the region. + EOT + type = list(string) + default = [] +} + +# ============================================================================= +# API & CORS +# ============================================================================= + +variable "allowed_origins" { + description = <<-EOT + List of allowed CORS origins for the API. + Use ["*"] during development, restrict to your GitHub Pages URL in production. + + Example: ["https://yourusername.github.io"] + EOT + type = list(string) + default = ["*"] +} + +variable "api_throttle_rate" { + description = "API Gateway throttle rate (requests per second)." + type = number + default = 10 +} + +variable "api_throttle_burst" { + description = "API Gateway throttle burst limit." + type = number + default = 20 +} + +# ============================================================================= +# SQS Configuration +# ============================================================================= + +variable "sqs_visibility_timeout" { + description = <<-EOT + SQS message visibility timeout in seconds. + Should be longer than the time it takes to launch an EC2 instance. + + Default: 300 seconds (5 minutes) + EOT + type = number + default = 300 +} + +variable "sqs_max_receive_count" { + description = "Number of times a message can be received before going to DLQ." + type = number + default = 3 +} diff --git a/infrastructure/terraform/vpc.tf b/infrastructure/terraform/vpc.tf new file mode 100644 index 0000000..ae28cf1 --- /dev/null +++ b/infrastructure/terraform/vpc.tf @@ -0,0 +1,93 @@ +# ============================================================================= +# VPC & Networking +# ============================================================================= +# Minimal VPC with public subnets for EC2 Spot instances. +# Instances need internet access to pull Docker images and upload to S3. 
+# No NAT Gateway needed - instances use public IPs (cost optimization). +# ============================================================================= + +data "aws_availability_zones" "available" { + state = "available" +} + +locals { + azs = length(var.availability_zones) > 0 ? var.availability_zones : slice( + data.aws_availability_zones.available.names, 0, + min(2, length(data.aws_availability_zones.available.names)) + ) +} + +resource "aws_vpc" "main" { + cidr_block = var.vpc_cidr + enable_dns_hostnames = true + enable_dns_support = true + + tags = { + Name = "${local.name_prefix}-vpc" + } +} + +resource "aws_internet_gateway" "main" { + vpc_id = aws_vpc.main.id + + tags = { + Name = "${local.name_prefix}-igw" + } +} + +resource "aws_subnet" "public" { + count = length(local.azs) + + vpc_id = aws_vpc.main.id + cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index) + availability_zone = local.azs[count.index] + map_public_ip_on_launch = true + + tags = { + Name = "${local.name_prefix}-public-${local.azs[count.index]}" + } +} + +resource "aws_route_table" "public" { + vpc_id = aws_vpc.main.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.main.id + } + + tags = { + Name = "${local.name_prefix}-public-rt" + } +} + +resource "aws_route_table_association" "public" { + count = length(aws_subnet.public) + + subnet_id = aws_subnet.public[count.index].id + route_table_id = aws_route_table.public.id +} + +# Security group for build instances - egress only, no inbound access +resource "aws_security_group" "builder" { + name_prefix = "${local.name_prefix}-builder-" + description = "Security group for Lambda layer builder EC2 instances" + vpc_id = aws_vpc.main.id + + # Allow all outbound traffic (Docker pulls, S3 uploads, API calls) + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + description = "Allow all outbound traffic" + } + + tags = { + Name = "${local.name_prefix}-builder-sg" + } + + lifecycle { + 
create_before_destroy = true + } +}