diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 9b9a231..664e37b 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -1,9 +1,24 @@ name: Integration Tests on: - pull_request: - types: [labeled] workflow_dispatch: + inputs: + nat_ami_id: + description: Explicit NAT AMI ID to use for the integration fixture + required: false + type: string + updated_nat_ami_id: + description: Optional replacement NAT AMI ID to exercise the AMI upgrade path + required: false + type: string + workflow_call: + inputs: + nat_ami_id: + required: false + type: string + updated_nat_ami_id: + required: false + type: string concurrency: group: nat-zero-integration @@ -13,11 +28,11 @@ permissions: id-token: write contents: read +env: + TEST_NAT_AMI_ID: ${{ vars.NAT_ZERO_TEST_AMI_ID }} + jobs: integration-test: - if: >- - github.event_name == 'workflow_dispatch' || - github.event.label.name == 'integration-test' runs-on: ubuntu-latest timeout-minutes: 15 environment: integration @@ -37,6 +52,24 @@ jobs: role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} aws-region: us-east-1 + - name: Resolve NAT AMI inputs + env: + INPUT_NAT_AMI_ID: ${{ inputs.nat_ami_id }} + INPUT_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id }} + run: | + nat_ami_id="${INPUT_NAT_AMI_ID:-$TEST_NAT_AMI_ID}" + + if [ -z "$nat_ami_id" ]; then + echo "default integration NAT AMI is not configured" >&2 + exit 1 + fi + + echo "NAT_ZERO_TEST_NAT_AMI_ID=$nat_ami_id" >> "$GITHUB_ENV" + + if [ -n "$INPUT_UPDATED_NAT_AMI_ID" ]; then + echo "NAT_ZERO_TEST_UPDATED_NAT_AMI_ID=$INPUT_UPDATED_NAT_AMI_ID" >> "$GITHUB_ENV" + fi + - name: Build Lambda binary working-directory: cmd/lambda run: | diff --git a/.github/workflows/manual-pr-checks.yml b/.github/workflows/manual-pr-checks.yml new file mode 100644 index 0000000..16e5c5b --- /dev/null +++ b/.github/workflows/manual-pr-checks.yml @@ -0,0 +1,43 @@ +name: Manual PR Checks + +on: + 
pull_request: + types: [labeled] + +permissions: + contents: write + id-token: write + issues: write + pull-requests: write + +jobs: + integration: + if: ${{ github.event.label.name == 'integration-test' }} + uses: ./.github/workflows/integration-tests.yml + secrets: inherit + + nat-images: + if: ${{ github.event.label.name == 'nat-images' }} + uses: ./.github/workflows/nat-images.yml + secrets: inherit + + clear-trigger-label: + if: >- + always() && + (github.event.label.name == 'integration-test' || github.event.label.name == 'nat-images') + needs: + - integration + - nat-images + runs-on: ubuntu-latest + steps: + - name: Remove trigger label + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + LABEL_NAME: ${{ github.event.label.name }} + REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + gh api \ + --method DELETE \ + "repos/$REPOSITORY/issues/$PR_NUMBER/labels/$LABEL_NAME" diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml new file mode 100644 index 0000000..868c426 --- /dev/null +++ b/.github/workflows/nat-images.yml @@ -0,0 +1,294 @@ +name: NAT Images + +on: + workflow_dispatch: + inputs: + build_subnet_id: + description: Public subnet ID to use for the Packer builder instance + required: false + type: string + source_region: + description: Region where the AMI is built before being copied globally + required: false + default: us-east-1 + type: string + run_integration_gate: + description: Run the us-east-1 integration gates before publishing and promoting the AMI + required: false + default: true + type: boolean + workflow_call: + inputs: + build_subnet_id: + required: false + type: string + source_region: + required: false + type: string + run_integration_gate: + required: false + default: true + type: boolean + +concurrency: + group: nat-zero-ami + cancel-in-progress: false + +permissions: + contents: read + id-token: write + +env: + PACKER_REGIONS_FILE: 
ami/nat-zero-private-all-regions.pkrvars.hcl + +jobs: + resolve-inputs: + runs-on: ubuntu-latest + permissions: + contents: read + outputs: + build_subnet_id: ${{ steps.resolve.outputs.build_subnet_id }} + source_region: ${{ steps.resolve.outputs.source_region }} + run_integration_gate: ${{ steps.resolve.outputs.run_integration_gate }} + should_publish: ${{ steps.resolve.outputs.should_publish }} + steps: + - name: Resolve workflow inputs + id: resolve + env: + EVENT_NAME: ${{ github.event_name }} + INPUT_BUILD_SUBNET_ID: ${{ inputs.build_subnet_id }} + INPUT_SOURCE_REGION: ${{ inputs.source_region }} + INPUT_RUN_INTEGRATION_GATE: ${{ inputs.run_integration_gate }} + DEFAULT_BUILD_SUBNET_ID: ${{ vars.NAT_ZERO_AMI_BUILD_SUBNET_ID }} + run: | + should_publish=true + # A called workflow sees the caller's event name (e.g. pull_request), never "workflow_call"; publish only on direct dispatch. + if [ "$EVENT_NAME" != "workflow_dispatch" ]; then + should_publish=false + fi + + build_subnet_id="${INPUT_BUILD_SUBNET_ID:-$DEFAULT_BUILD_SUBNET_ID}" + source_region="${INPUT_SOURCE_REGION:-us-east-1}" + run_integration_gate="${INPUT_RUN_INTEGRATION_GATE:-true}" + + if [ -z "$build_subnet_id" ]; then + echo "build_subnet_id input is required unless vars.NAT_ZERO_AMI_BUILD_SUBNET_ID is set" >&2 + exit 1 + fi + + { + echo "build_subnet_id=$build_subnet_id" + echo "source_region=$source_region" + echo "run_integration_gate=$run_integration_gate" + echo "should_publish=$should_publish" + } >> "$GITHUB_OUTPUT" + + build-and-copy: + needs: resolve-inputs + runs-on: ubuntu-latest + environment: ami-build + permissions: + id-token: write + contents: read + outputs: + ami_name: ${{ steps.metadata.outputs.ami_name }} + owner_account_id: ${{ steps.metadata.outputs.owner_account_id }} + source_ami_id: ${{ steps.build.outputs.source_ami_id }} + test_ami_id: ${{ steps.test-ami.outputs.test_ami_id }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232 # v3 + + - uses: 
aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 + with: + role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} + aws-region: ${{ needs.resolve-inputs.outputs.source_region }} + + - name: Prepare Packer copy regions + id: regions + run: | + bash scripts/render_packer_ami_regions.sh \ + "$PACKER_REGIONS_FILE" \ + "${{ needs.resolve-inputs.outputs.source_region }}" \ + "$RUNNER_TEMP/nat-zero-copy-regions.pkrvars.hcl" + + echo "var_file=$RUNNER_TEMP/nat-zero-copy-regions.pkrvars.hcl" >> "$GITHUB_OUTPUT" + + - name: Build AMI + id: build + working-directory: ami + env: + SOURCE_REGION: ${{ needs.resolve-inputs.outputs.source_region }} + run: | + rm -f manifest.json + packer init nat-zero.pkr.hcl + packer build \ + -color=false \ + -var-file "${{ steps.regions.outputs.var_file }}" \ + -var "region=${{ needs.resolve-inputs.outputs.source_region }}" \ + -var "subnet_id=${{ needs.resolve-inputs.outputs.build_subnet_id }}" \ + nat-zero.pkr.hcl + + source_ami_id="$( + jq -er '.builds[-1].artifact_id' manifest.json | + tr ',' '\n' | + awk -F: -v source_region="$SOURCE_REGION" '$1 == source_region && $2 != "" { print $2; exit }' + )" + if [ -z "$source_ami_id" ] || [ "$source_ami_id" = "null" ]; then + echo "failed to determine source AMI ID after packer build" >&2 + exit 1 + fi + + echo "source_ami_id=$source_ami_id" >> "$GITHUB_OUTPUT" + + - name: Resolve AMI metadata + id: metadata + env: + SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} + run: | + owner_account_id="$(aws sts get-caller-identity --query 'Account' --output text)" + ami_name="$(aws ec2 describe-images --region "${{ needs.resolve-inputs.outputs.source_region }}" --image-ids "$SOURCE_AMI_ID" --query 'Images[0].Name' --output text)" + + echo "owner_account_id=$owner_account_id" >> "$GITHUB_OUTPUT" + echo "ami_name=$ami_name" >> "$GITHUB_OUTPUT" + + - name: Resolve us-east-1 test AMI + id: test-ami + env: + AMI_NAME: ${{ steps.metadata.outputs.ami_name }} + 
OWNER_ACCOUNT_ID: ${{ steps.metadata.outputs.owner_account_id }} + SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} + run: | + if [ "${{ needs.resolve-inputs.outputs.source_region }}" = "us-east-1" ]; then + test_ami_id="$SOURCE_AMI_ID" + else + test_ami_id="$(aws ec2 describe-images \ + --region us-east-1 \ + --owners "$OWNER_ACCOUNT_ID" \ + --filters "Name=name,Values=$AMI_NAME" "Name=state,Values=available" \ + --query 'Images[0].ImageId' \ + --output text)" + fi + + if [ -z "$test_ami_id" ] || [ "$test_ami_id" = "None" ]; then + echo "failed to resolve the us-east-1 AMI copy for $AMI_NAME" >&2 + exit 1 + fi + + echo "test_ami_id=$test_ami_id" >> "$GITHUB_OUTPUT" + + integration: + if: ${{ needs.resolve-inputs.outputs.run_integration_gate == 'true' }} + needs: + - resolve-inputs + - build-and-copy + uses: ./.github/workflows/integration-tests.yml + secrets: inherit + with: + nat_ami_id: ${{ vars.NAT_ZERO_TEST_AMI_ID }} + updated_nat_ami_id: ${{ needs.build-and-copy.outputs.test_ami_id }} + + publish-public: + needs: + - resolve-inputs + - build-and-copy + - integration + if: >- + always() && + needs.resolve-inputs.outputs.should_publish == 'true' && + needs.build-and-copy.result == 'success' && + ( + needs.resolve-inputs.outputs.run_integration_gate != 'true' || + needs.integration.result == 'success' + ) + runs-on: ubuntu-latest + environment: ami-build + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 + with: + role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} + aws-region: ${{ needs.resolve-inputs.outputs.source_region }} + + - name: Make copied AMIs public + env: + AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} + OWNER_ACCOUNT_ID: ${{ needs.build-and-copy.outputs.owner_account_id }} + run: | + bash scripts/publish_ami_public.sh \ + "$OWNER_ACCOUNT_ID" \ + "$AMI_NAME" \ 
+ "${{ needs.resolve-inputs.outputs.source_region }}" \ + "$PACKER_REGIONS_FILE" + + open-promotion-pr: + needs: + - resolve-inputs + - build-and-copy + - publish-public + if: ${{ needs.resolve-inputs.outputs.should_publish == 'true' && needs.publish-public.result == 'success' }} + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 + with: + terraform_wrapper: false + + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.12" + + - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 + with: + go-version-file: cmd/lambda/go.mod + + - name: Install pre-commit + run: python -m pip install --upgrade pre-commit + + - name: Update promoted AMI defaults + env: + AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} + OWNER_ACCOUNT_ID: ${{ needs.build-and-copy.outputs.owner_account_id }} + run: | + bash scripts/update_ami_defaults.sh "$OWNER_ACCOUNT_ID" "$AMI_NAME" + terraform fmt -recursive + pre-commit run terraform-docs-go --all-files + + - name: Create or update promotion PR + env: + AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + branch_name="automation/promote-nat-zero-ami-$(tr -cs '[:alnum:]' '-' <<<"$AMI_NAME" | sed 's/^-//; s/-$//' | tr '[:upper:]' '[:lower:]')" + commit_title="feat: promote nat-zero AMI ${AMI_NAME}" + pr_title="$commit_title" + pr_body=$'Promotes the default nat-zero AMI after the automated Packer build, global copy, and us-east-1 integration gates passed.\n\nSquash-merge this PR to preserve the `feat:` title so release-please cuts the next module release PR.' 
+ + if git diff --quiet; then + echo "no default changes detected; nothing to promote" + exit 0 + fi + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + git checkout -B "$branch_name" + git add variables.tf README.md docs/reference.md + git commit -m "$commit_title" + git push --force --set-upstream origin "$branch_name" + + pr_number="$(gh pr list --head "$branch_name" --json number --jq '.[0].number')" + if [ -n "$pr_number" ]; then + gh pr edit "$pr_number" --title "$pr_title" --body "$pr_body" + else + gh pr create --base main --head "$branch_name" --title "$pr_title" --body "$pr_body" + fi diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index c6c4296..3a7df22 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -16,11 +16,16 @@ jobs: with: go-version-file: cmd/lambda/go.mod + - uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232 # v3 + - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 - name: Install tools run: | + sudo apt-get update + sudo apt-get install -y shellcheck go install honnef.co/go/tools/cmd/staticcheck@latest + go install github.com/rhysd/actionlint/cmd/actionlint@latest curl -s https://raw.githubusercontent.com/terraform-linters/tflint/master/install_linux.sh | bash - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 9c35b37..38b3876 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -37,22 +37,15 @@ jobs: go-version-file: cmd/lambda/go.mod - name: Build - run: GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap + run: GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -trimpath -buildvcs=false -tags lambda.norpc -ldflags='-s -w -buildid=' -o bootstrap - 
name: Package - run: zip lambda.zip bootstrap + run: | + TZ=UTC touch -t 198001010000 bootstrap + zip -q -X lambda.zip bootstrap + openssl dgst -sha256 -binary lambda.zip | openssl base64 -A > lambda.zip.base64sha256 - name: Upload to versioned release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip --clobber - - - name: Update rolling latest release - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh release create nat-zero-lambda-latest \ - --title "nat-zero Lambda (latest)" \ - --notes "Auto-built Go Lambda binary from ${{ needs.release-please.outputs.tag_name }}" \ - --latest=false 2>/dev/null || true - gh release upload nat-zero-lambda-latest lambda.zip --clobber + run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip lambda.zip.base64sha256 --clobber diff --git a/.gitignore b/.gitignore index 9476cc0..d70df16 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ cmd/lambda/lambda cmd/lambda/bootstrap *.zip +ami/manifest.json # Go vendor/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5ebf7f0..42acec9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,10 @@ repos: - id: detect-private-key - id: detect-aws-credentials args: ["--allow-missing-credentials"] + - repo: https://github.com/rhysd/actionlint + rev: v1.7.8 + hooks: + - id: actionlint - repo: https://github.com/TekWizely/pre-commit-golang rev: v1.0.0-rc.1 hooks: @@ -30,6 +34,23 @@ repos: files: '\.go$' exclude: "tests/integration/" pass_filenames: false + - id: shellcheck + name: shellcheck + language: system + entry: shellcheck + files: '(^scripts/.*\.sh$|^ami/.*\.sh$)' + - id: packer-fmt + name: packer fmt + language: system + entry: bash -c 'cd ami && packer fmt -check -diff .' 
+ files: '^ami/.*\.(pkr\.hcl|hcl)$' + pass_filenames: false + - id: packer-validate + name: packer validate + language: system + entry: bash -c 'base="$(mktemp)" && tmp="$base.pkrvars.hcl" && trap '\''rm -f "$base" "$tmp"'\'' EXIT && bash scripts/render_packer_ami_regions.sh ami/nat-zero-private-all-regions.pkrvars.hcl us-east-1 "$tmp" && cd ami && packer init nat-zero.pkr.hcl >/dev/null && packer validate -var-file "$tmp" -var "subnet_id=subnet-00000000000000000" nat-zero.pkr.hcl' + files: '^ami/.*$' + pass_filenames: false - repo: https://github.com/zricethezav/gitleaks rev: v8.16.4 hooks: @@ -38,6 +59,7 @@ repos: rev: v1.77.0 hooks: - id: terraform_fmt + - id: terraform_validate - id: terraform_tflint - repo: https://github.com/terraform-docs/terraform-docs rev: "v0.16.0" diff --git a/.terraform-docs-reference.yml b/.terraform-docs-reference.yml index f1ba32b..00fe526 100644 --- a/.terraform-docs-reference.yml +++ b/.terraform-docs-reference.yml @@ -1,5 +1,9 @@ formatter: "markdown table" +sections: + hide: + - providers + output: template: | {{ .Content }} diff --git a/.terraform-docs.yml b/.terraform-docs.yml index 8e30c37..78d3caf 100644 --- a/.terraform-docs.yml +++ b/.terraform-docs.yml @@ -1 +1,5 @@ formatter: "markdown table" + +sections: + hide: + - providers diff --git a/README.md b/README.md index 997dd83..3add691 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ nat-zero is a Terraform module that replaces always-on NAT with on-demand NAT instances. When a workload launches in a private subnet, a NAT instance starts automatically. When the last workload stops, the NAT shuts down and its Elastic IP is released. Idle cost: ~$0.80/month per AZ. -Built on [fck-nat](https://fck-nat.dev/) AMIs. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. +Built around a NAT Zero AMI baked in-repo and promoted through a dedicated workflow. 
Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. ``` AZ-A (active) AZ-B (idle) @@ -33,9 +33,9 @@ Built on [fck-nat](https://fck-nat.dev/) AMIs. Orchestrated by a single Go Lambd | **Idle** (no workloads) | **~$0.80/mo** | ~$7-8 | ~$36+ | | **Active** (workloads running) | ~$7-8 | ~$7-8 | ~$36+ | -AWS NAT Gateway costs ~$36/month per AZ even when idle. fck-nat brings that to ~$7-8/month, but the instance and EIP run 24/7. nat-zero releases the Elastic IP when idle, avoiding the [$3.60/month public IPv4 charge](https://aws.amazon.com/blogs/aws/new-aws-public-ipv4-address-charge-public-ip-insights/). +AWS NAT Gateway costs ~$36/month per AZ even when idle. `fck-nat` brings that to roughly ~$7-8/month, but the instance and EIP stay allocated 24/7. nat-zero releases the Elastic IP when idle, avoiding the [$3.60/month public IPv4 charge](https://aws.amazon.com/blogs/aws/new-aws-public-ipv4-address-charge-public-ip-insights/). -Best for dev/staging environments, CI/CD runners, batch jobs, and side projects where workloads run intermittently. +Best for dev/staging environments, CI/CD runners, batch jobs, and side projects where workloads run intermittently. If you need a simpler always-on NAT instance, `fck-nat` is still a sensible option. ## How it works @@ -82,6 +82,33 @@ module "nat_zero" { See [Examples](docs/examples.md) for spot instances, custom AMIs, and building from source. +## Lambda Code Paths + +The module intentionally supports exactly three ways to supply the Lambda binary: + +1. 
Default release artifact + - Normal path for end users + - The module downloads the versioned `lambda.zip` and reads the matching `lambda.zip.base64sha256` from the tagged GitHub release + - The checksum file exists so Terraform can know the Lambda code hash during `plan`, before it downloads the zip during `apply` + - When a new release publishes a different checksum, Terraform sees the `source_code_hash` change during `plan` and knows the Lambda must be updated +2. Pre-built local zip via `lambda_binary_path` + - Best for CI, unreleased branch testing, or custom binaries + - Terraform hashes the local file during plan +3. Apply-time build via `build_lambda_locally = true` + - Local development only + - Requires Go and `zip` + - May require a second apply after Lambda code changes + +## Recommended Usage + +| Audience | Recommended module ref | Recommended Lambda path | Why | +|----------|------------------------|-------------------------|-----| +| Normal end users | Release tag such as `?ref=v0.4.0` | Default release artifact | Stable module code, stable versioned Lambda artifact, and clean plan/apply behavior | +| CI, branch testing, unreleased validation | Branch or commit ref | `lambda_binary_path` | Lets Terraform see Lambda code changes during plan before the branch has been released | +| Local module development | Working tree | `build_lambda_locally = true` | Fastest iteration loop while changing Go code inside this repo | + +`ref=main` is suitable for development, but it is not the stable consumption path for end users. If `main` has unreleased Go changes, the default Lambda artifact still comes from the latest tagged release until a new release is cut. + ## Performance | Scenario | Time to connectivity | @@ -99,6 +126,7 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. - **EventBridge scope**: Captures all EC2 state changes in the account; Lambda filters by VPC ID. 
- **Startup delay**: First workload in an idle AZ waits ~10 seconds for internet. Design scripts to retry outbound connections. - **Dual ENI**: Persistent public + private ENIs survive stop/start cycles. +- **AMI compatibility**: The module defaults to the NAT Zero AMI track. Custom AMIs are supported only if they follow the same deterministic dual-ENI model. `fck-nat` AMIs are intentionally unsupported because their bootstrap interrogates IMDS/AWS to discover attached ENIs before nat-zero's EIP lifecycle has completed. - **Retries**: Failed Lambda invocations are retried up to 2 times by EventBridge. - **Clean destroy**: A cleanup action terminates NAT instances before `terraform destroy` removes ENIs. - **Config versioning**: Changing AMI or instance type auto-replaces NAT instances on next workload event. @@ -109,19 +137,12 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.3 | +| [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | +| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | -## Providers - -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | >= 5.0 | -| [null](#provider\_null) | >= 3.0 | -| [time](#provider\_time) | >= 0.9 | - ## Modules No modules. @@ -148,26 +169,28 @@ No modules. 
| [aws_route.nat_route](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/route) | resource | | [aws_security_group.nat_security_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | [null_resource.build_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.download_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [terraform_data.download_lambda](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [ami\_id](#input\_ami\_id) | Explicit AMI ID to use (overrides AMI lookup entirely) | `string` | `null` | no | +| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `"nat-zero-al2023-minimal-arm64-20260306-064438"` | no | +| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. 
| `string` | `"590144423513"` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | -| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | -| [custom\_ami\_name\_pattern](#input\_custom\_ami\_name\_pattern) | AMI name pattern when use\_fck\_nat\_ami is false | `string` | `null` | no | -| [custom\_ami\_owner](#input\_custom\_ami\_owner) | AMI owner account ID when use\_fck\_nat\_ami is false | `string` | `null` | no | +| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source during apply instead of downloading a pre-compiled release. This is primarily for local development and may require a second apply after code changes. | `bool` | `false` | no | | [enable\_logging](#input\_enable\_logging) | Create a CloudWatch log group for the Lambda function | `bool` | `true` | no | | [encrypt\_root\_volume](#input\_encrypt\_root\_volume) | Encrypt the root EBS volume. | `bool` | `true` | no | | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Updated automatically by CI. 
| `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | +| [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | @@ -179,7 +202,6 @@ No modules. | [private\_subnets\_cidr\_blocks](#input\_private\_subnets\_cidr\_blocks) | CIDR blocks for the private subnets (one per AZ, used in security group rules) | `list(string)` | n/a | yes | | [public\_subnets](#input\_public\_subnets) | Public subnet IDs (one per AZ) for NAT instance public ENIs | `list(string)` | n/a | yes | | [tags](#input\_tags) | Additional tags to apply to all resources | `map(string)` | `{}` | no | -| [use\_fck\_nat\_ami](#input\_use\_fck\_nat\_ami) | Use the public fck-nat AMI. Set to false to use a custom AMI. | `bool` | `true` | no | | [vpc\_id](#input\_vpc\_id) | The VPC ID where NAT instances will be deployed | `string` | n/a | yes | ## Outputs diff --git a/ami.tf b/ami.tf new file mode 100644 index 0000000..a999be2 --- /dev/null +++ b/ami.tf @@ -0,0 +1,23 @@ +locals { + ami_lookup_enabled = var.ami_id == null && var.ami_owner_account != null && var.ami_name_pattern != null +} + +data "aws_ami" "nat" { + count = local.ami_lookup_enabled ? 1 : 0 + most_recent = true + owners = [local.ami_lookup_enabled ? var.ami_owner_account : "000000000000"] + + filter { + name = "name" + values = [local.ami_lookup_enabled ? 
var.ami_name_pattern : "missing"] + } + + filter { + name = "state" + values = ["available"] + } +} + +locals { + effective_ami_id = var.ami_id != null ? var.ami_id : try(data.aws_ami.nat[0].id, null) +} diff --git a/ami/README.md b/ami/README.md new file mode 100644 index 0000000..d79bb21 --- /dev/null +++ b/ami/README.md @@ -0,0 +1,65 @@ +# NAT Zero AMI (arm64, AL2023 minimal) + +This directory contains the Packer build for the nat-zero AMI. + +## Supported Flavor + +- Architecture: `arm64` +- Base image: Amazon Linux 2023 minimal +- Runtime model: deterministic dual ENI (`ens5` public, `ens6` private) + +## Runtime Design Constraints + +- No IMDS calls in bootstrap/runtime NAT scripts +- No `aws` CLI calls in bootstrap/runtime NAT scripts +- No runtime ENI attach/detach or EIP association logic +- Small, readable bootstrap and NAT config scripts +- `fck-nat`-style bootstrap discovery is intentionally avoided because nat-zero relies on launch-template-owned ENIs and attaches the EIP later in the reconciliation loop +- Unencrypted AMI backing snapshot so the image can be made public; the module can still encrypt runtime NAT instance volumes +- Build-time OS patching via `dnf upgrade --refresh` before the AMI is created + +## Build + +1. Choose a public subnet ID in the target region. +2. Build with Packer: + +```bash +cd ami +packer init nat-zero.pkr.hcl +packer build \ + -var-file "nat-zero-private-all-regions.pkrvars.hcl" \ + -var "region=us-east-1" \ + -var "subnet_id=subnet-0123456789abcdef0" \ + nat-zero.pkr.hcl +``` + +The AMI name format is: + +- `nat-zero-al2023-minimal-arm64-<YYYYMMDD-hhmmss>` + +This full AMI name is used as the module default target for deterministic rollout. 
+ +## GitHub Workflow + +Workflow: `.github/workflows/nat-images.yml` + +- Requires GitHub environment secret `AMI_BUILD_ROLE_ARN` +- Requires GitHub environment secret `INTEGRATION_ROLE_ARN` when `run_integration_gate=true` +- Requires GitHub Actions variable `NAT_ZERO_AMI_BUILD_SUBNET_ID` for label-triggered PR validation runs +- Uses OIDC via `aws-actions/configure-aws-credentials` +- Triggers: + - `workflow_dispatch` + - PR label `nat-images` for pre-merge validation on the branch under review +- Inputs for `workflow_dispatch`: + - `build_subnet_id` + - `source_region` (default `us-east-1`) + - `run_integration_gate` (default `true`) +- Behavior: + - builds a new nat-zero AMI with Packer + - uses `nat-zero-private-all-regions.pkrvars.hcl` as the checked-in list of private regional copies + - runs integration tests against the new us-east-1 AMI before any public sharing + - publishes the copied AMIs only after the integration gates pass +- PR label runs stop after build + integration so they can validate the branch safely without publishing or opening a promotion PR +- updates `ami_owner_account`, `ami_name_pattern` (and generated docs) and opens a PR + +Merge the promotion PR to `main` to let release-please publish a new module release that points to the promoted AMI name. diff --git a/ami/files/snat.service b/ami/files/snat.service new file mode 100644 index 0000000..3754e39 --- /dev/null +++ b/ami/files/snat.service @@ -0,0 +1,12 @@ +[Unit] +Description=Configure deterministic IPv4 SNAT for NAT instance +Wants=network-online.target +After=network-online.target + +[Service] +ExecStart=/opt/nat/snat.sh +Type=oneshot +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/ami/files/snat.sh b/ami/files/snat.sh new file mode 100755 index 0000000..9d1b71a --- /dev/null +++ b/ami/files/snat.sh @@ -0,0 +1,43 @@ +#!/bin/sh +set -eu + +NAT_PUBLIC_IFACE="${NAT_PUBLIC_IFACE:-ens5}" +NAT_PRIVATE_IFACE="${NAT_PRIVATE_IFACE:-ens6}" + +if ! 
ip link show "$NAT_PUBLIC_IFACE" > /dev/null 2>&1; then + echo "Missing expected public interface: $NAT_PUBLIC_IFACE" >&2 + exit 1 +fi + +if ! ip link show "$NAT_PRIVATE_IFACE" > /dev/null 2>&1; then + echo "Missing expected private interface: $NAT_PRIVATE_IFACE" >&2 + exit 1 +fi + +cat > /etc/sysctl.d/99-nat.conf << 'EOF_SYSCTL' +net.ipv4.ip_forward = 1 +EOF_SYSCTL +sysctl --system > /dev/null + +cat > /etc/sysconfig/iptables << EOF_IPTABLES +*filter +:INPUT DROP [0:0] +:FORWARD DROP [0:0] +:OUTPUT ACCEPT [0:0] +-A INPUT -i lo -j ACCEPT +-A INPUT -i $NAT_PRIVATE_IFACE -j ACCEPT +-A INPUT -i $NAT_PUBLIC_IFACE -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT +-A FORWARD -i $NAT_PRIVATE_IFACE -o $NAT_PUBLIC_IFACE -j ACCEPT +-A FORWARD -i $NAT_PUBLIC_IFACE -o $NAT_PRIVATE_IFACE -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT +COMMIT + +*nat +:PREROUTING ACCEPT [0:0] +:INPUT ACCEPT [0:0] +:OUTPUT ACCEPT [0:0] +:POSTROUTING ACCEPT [0:0] +-A POSTROUTING -o $NAT_PUBLIC_IFACE -j MASQUERADE +COMMIT +EOF_IPTABLES + +iptables-restore < /etc/sysconfig/iptables diff --git a/ami/nat-zero-private-all-regions.pkrvars.hcl b/ami/nat-zero-private-all-regions.pkrvars.hcl new file mode 100644 index 0000000..502f312 --- /dev/null +++ b/ami/nat-zero-private-all-regions.pkrvars.hcl @@ -0,0 +1,38 @@ +# Keep this list aligned with the regions we want nat-zero published into. +# me-central-1 is intentionally excluded for now because EC2 CopyImage is +# currently throttling there for this account. 
+ami_regions = [ + "af-south-1", + "ap-east-1", + "ap-east-2", + "ap-northeast-1", + "ap-northeast-2", + "ap-northeast-3", + "ap-south-1", + "ap-south-2", + "ap-southeast-1", + "ap-southeast-2", + "ap-southeast-3", + "ap-southeast-4", + "ap-southeast-5", + "ap-southeast-6", + "ap-southeast-7", + "ca-central-1", + "ca-west-1", + "eu-central-1", + "eu-central-2", + "eu-north-1", + "eu-south-1", + "eu-south-2", + "eu-west-1", + "eu-west-2", + "eu-west-3", + "il-central-1", + "me-south-1", + "mx-central-1", + "sa-east-1", + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", +] diff --git a/ami/nat-zero.pkr.hcl b/ami/nat-zero.pkr.hcl new file mode 100644 index 0000000..579335f --- /dev/null +++ b/ami/nat-zero.pkr.hcl @@ -0,0 +1,99 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = ">= 1.2.0" + } + } +} + +variable "region" { + type = string + default = "us-east-1" +} + +variable "subnet_id" { + type = string +} + +variable "ami_name_prefix" { + type = string + default = "nat-zero-al2023-minimal-arm64" +} + +variable "root_volume_size" { + type = number + default = 4 +} + +variable "ami_regions" { + type = list(string) + default = [] +} + +source "amazon-ebs" "nat_zero" { + ami_name = "${var.ami_name_prefix}-${formatdate("YYYYMMDD-hhmmss", timestamp())}" + ami_regions = var.ami_regions + instance_type = "t4g.nano" + region = var.region + max_retries = 50 + subnet_id = var.subnet_id + ssh_username = "ec2-user" + + launch_block_device_mappings { + device_name = "/dev/xvda" + volume_size = var.root_volume_size + volume_type = "gp3" + delete_on_termination = true + # Public AMIs cannot reference encrypted backing snapshots. 
+ encrypted = false + } + + source_ami_filter { + filters = { + name = "al2023-ami-minimal-*-kernel-*-arm64" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["amazon"] + } + + tags = { + Name = "nat-zero-ami-build" + Project = "nat-zero" + Role = "nat" + ManagedBy = "packer" + OS = "al2023-minimal" + Architecture = "arm64" + } +} + +build { + name = "nat-zero-ami" + sources = ["source.amazon-ebs.nat_zero"] + + provisioner "file" { + source = "files/snat.sh" + destination = "/tmp/snat.sh" + } + + provisioner "file" { + source = "files/snat.service" + destination = "/tmp/snat.service" + } + + provisioner "shell" { + execute_command = "sudo -E sh -eux '{{ .Path }}'" + script = "scripts/install-deps.sh" + } + + provisioner "shell" { + execute_command = "sudo -E sh -eux '{{ .Path }}'" + script = "scripts/configure.sh" + } + + post-processor "manifest" { + output = "manifest.json" + } +} diff --git a/ami/scripts/configure.sh b/ami/scripts/configure.sh new file mode 100755 index 0000000..990e96c --- /dev/null +++ b/ami/scripts/configure.sh @@ -0,0 +1,14 @@ +#!/bin/sh +set -eu + +systemctl stop sshd +systemctl disable sshd +systemctl mask sshd +dnf remove -y openssh-server + +mkdir -p /opt/nat +install /tmp/snat.sh /opt/nat/snat.sh -m u+rx +cp /tmp/snat.service /etc/systemd/system/snat.service + +systemctl daemon-reload +systemctl enable snat diff --git a/ami/scripts/install-deps.sh b/ami/scripts/install-deps.sh new file mode 100755 index 0000000..a722e80 --- /dev/null +++ b/ami/scripts/install-deps.sh @@ -0,0 +1,9 @@ +#!/bin/sh +set -eu + +# Always bake from a fully patched AL2023 base so each AMI includes the +# latest published OS-level fixes available at build time. 
+dnf -y upgrade --refresh +dnf -y install iptables +dnf clean all +rm -rf /var/cache/dnf diff --git a/cmd/lambda/ec2iface.go b/cmd/lambda/ec2iface.go index 0db74c5..43b0ca6 100644 --- a/cmd/lambda/ec2iface.go +++ b/cmd/lambda/ec2iface.go @@ -18,8 +18,5 @@ type EC2API interface { DisassociateAddress(ctx context.Context, params *ec2.DisassociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateAddressOutput, error) ReleaseAddress(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) DescribeAddresses(ctx context.Context, params *ec2.DescribeAddressesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeAddressesOutput, error) - DescribeNetworkInterfaces(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) - DescribeImages(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) DescribeLaunchTemplates(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) - DescribeLaunchTemplateVersions(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) } diff --git a/cmd/lambda/ec2ops.go b/cmd/lambda/ec2ops.go index bc88a46..fd16644 100644 --- a/cmd/lambda/ec2ops.go +++ b/cmd/lambda/ec2ops.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "log" - "sort" "strings" "time" @@ -321,33 +320,6 @@ func (h *Handler) isCurrentConfig(inst *Instance) bool { // --- NAT lifecycle helpers --- -func (h *Handler) resolveAMI(ctx context.Context) string { - defer timed("resolve_ami")() - resp, err := h.EC2.DescribeImages(ctx, &ec2.DescribeImagesInput{ - Owners: []string{h.AMIOwner}, - Filters: []ec2types.Filter{ - {Name: aws.String("name"), Values: []string{h.AMIPattern}}, - {Name: aws.String("state"), 
Values: []string{"available"}}, - }, - }) - if err != nil { - log.Printf("AMI lookup failed, using launch template default: %v", err) - return "" - } - if len(resp.Images) == 0 { - return "" - } - - images := resp.Images - sort.Slice(images, func(i, j int) bool { - return aws.ToString(images[i].CreationDate) > aws.ToString(images[j].CreationDate) - }) - ami := images[0] - amiID := aws.ToString(ami.ImageId) - log.Printf("Using AMI %s (%s)", amiID, aws.ToString(ami.Name)) - return amiID -} - func (h *Handler) resolveLT(ctx context.Context, az, vpc string) (string, int64) { defer timed("resolve_lt")() resp, err := h.EC2.DescribeLaunchTemplates(ctx, &ec2.DescribeLaunchTemplatesInput{ @@ -361,16 +333,10 @@ func (h *Handler) resolveLT(ctx context.Context, az, vpc string) (string, int64) } ltID := aws.ToString(resp.LaunchTemplates[0].LaunchTemplateId) - - verResp, err := h.EC2.DescribeLaunchTemplateVersions(ctx, &ec2.DescribeLaunchTemplateVersionsInput{ - LaunchTemplateId: aws.String(ltID), - Versions: []string{"$Latest"}, - }) - if err != nil || len(verResp.LaunchTemplateVersions) == 0 { - return "", 0 + version := aws.ToInt64(resp.LaunchTemplates[0].LatestVersionNumber) + if version == 0 { + version = aws.ToInt64(resp.LaunchTemplates[0].DefaultVersionNumber) } - - version := aws.ToInt64(verResp.LaunchTemplateVersions[0].VersionNumber) return ltID, version } @@ -383,16 +349,18 @@ func (h *Handler) createNAT(ctx context.Context, az, vpc string) string { return "" } - amiID := h.resolveAMI(ctx) - input := &ec2.RunInstancesInput{ LaunchTemplate: &ec2types.LaunchTemplateSpecification{ LaunchTemplateId: aws.String(ltID), - Version: aws.String(fmt.Sprintf("%d", version)), }, MinCount: aws.Int32(1), MaxCount: aws.Int32(1), } + if version > 0 { + input.LaunchTemplate.Version = aws.String(fmt.Sprintf("%d", version)) + } else { + log.Printf("Launch template %s has no version metadata, using EC2 default version", ltID) + } if h.ConfigVersion != "" { input.TagSpecifications = 
[]ec2types.TagSpecification{{ @@ -403,10 +371,6 @@ func (h *Handler) createNAT(ctx context.Context, az, vpc string) string { }} } - if amiID != "" { - input.ImageId = aws.String(amiID) - } - resp, err := h.EC2.RunInstances(ctx, input) if err != nil { log.Printf("Failed to create NAT instance: %v", err) @@ -485,8 +449,21 @@ func (h *Handler) cleanupAll(ctx context.Context) { // before termination completes, Terraform may try to delete still-attached ENIs. func (h *Handler) waitForTermination(ctx context.Context, instanceIDs []string) { defer timed("wait_for_termination")() - for attempt := 0; attempt < 60; attempt++ { - time.Sleep(2 * time.Second) + const ( + pollInterval = 2 * time.Second + maxAttempts = 90 + deadlineBuffer = 5 * time.Second + ) + + deadline, hasDeadline := ctx.Deadline() + for attempt := 0; attempt < maxAttempts; attempt++ { + if hasDeadline && time.Until(deadline) <= deadlineBuffer { + log.Printf("Stopping termination wait with Lambda deadline %s away", time.Until(deadline).Round(time.Second)) + return + } + if attempt > 0 { + time.Sleep(pollInterval) + } resp, err := h.EC2.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ InstanceIds: instanceIDs, Filters: []ec2types.Filter{ diff --git a/cmd/lambda/ec2ops_test.go b/cmd/lambda/ec2ops_test.go index c20da4b..8bd3fb8 100644 --- a/cmd/lambda/ec2ops_test.go +++ b/cmd/lambda/ec2ops_test.go @@ -369,16 +369,12 @@ func TestReleaseEIPs(t *testing.T) { // --- createNAT() --- func TestCreateNAT(t *testing.T) { - setupLTAndAMI := func(mock *mockEC2) { + setupLT := func(mock *mockEC2) { mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { return &ec2.DescribeLaunchTemplatesOutput{ - LaunchTemplates: []ec2types.LaunchTemplate{{LaunchTemplateId: aws.String("lt-123")}}, - }, nil - } - mock.DescribeLaunchTemplateVersionsFn = func(ctx context.Context, params 
*ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - return &ec2.DescribeLaunchTemplateVersionsOutput{ - LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{{ - LaunchTemplateId: aws.String("lt-123"), VersionNumber: aws.Int64(1), + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + LatestVersionNumber: aws.Int64(1), }}, }, nil } @@ -386,16 +382,7 @@ func TestCreateNAT(t *testing.T) { t.Run("happy path", func(t *testing.T) { mock := &mockEC2{} - setupLTAndAMI(mock) - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{ - Images: []ec2types.Image{{ - ImageId: aws.String("ami-fcknat"), - Name: aws.String("fck-nat-al2023-1.0-arm64-20240101"), - CreationDate: aws.String("2024-01-01T00:00:00.000Z"), - }}, - }, nil - } + setupLT(mock) mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return &ec2.RunInstancesOutput{ Instances: []ec2types.Instance{{InstanceId: aws.String("i-new1")}}, @@ -408,6 +395,58 @@ func TestCreateNAT(t *testing.T) { } }) + t.Run("falls back to default version when latest version missing", func(t *testing.T) { + mock := &mockEC2{} + mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { + return &ec2.DescribeLaunchTemplatesOutput{ + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + DefaultVersionNumber: aws.Int64(2), + }}, + }, nil + } + mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { + if params.LaunchTemplate == nil || 
aws.ToString(params.LaunchTemplate.Version) != "2" { + t.Fatalf("expected launch template version 2, got %#v", params.LaunchTemplate) + } + return &ec2.RunInstancesOutput{ + Instances: []ec2types.Instance{{InstanceId: aws.String("i-new2")}}, + }, nil + } + h := newTestHandler(mock) + result := h.createNAT(context.Background(), testAZ, testVPC) + if result != "i-new2" { + t.Errorf("expected i-new2, got %s", result) + } + }) + + t.Run("uses template without explicit version when metadata missing", func(t *testing.T) { + mock := &mockEC2{} + mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { + return &ec2.DescribeLaunchTemplatesOutput{ + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + }}, + }, nil + } + mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { + if params.LaunchTemplate == nil || aws.ToString(params.LaunchTemplate.LaunchTemplateId) != "lt-123" { + t.Fatalf("expected launch template id lt-123, got %#v", params.LaunchTemplate) + } + if params.LaunchTemplate.Version != nil { + t.Fatalf("expected launch template version to be omitted, got %q", aws.ToString(params.LaunchTemplate.Version)) + } + return &ec2.RunInstancesOutput{ + Instances: []ec2types.Instance{{InstanceId: aws.String("i-new3")}}, + }, nil + } + h := newTestHandler(mock) + result := h.createNAT(context.Background(), testAZ, testVPC) + if result != "i-new3" { + t.Errorf("expected i-new3, got %s", result) + } + }) + t.Run("no launch template", func(t *testing.T) { mock := &mockEC2{} mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { @@ -422,10 +461,7 @@ func TestCreateNAT(t *testing.T) { t.Run("run instances fails", func(t 
*testing.T) { mock := &mockEC2{} - setupLTAndAMI(mock) - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } + setupLT(mock) mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return nil, fmt.Errorf("InsufficientInstanceCapacity: No capacity") } @@ -438,10 +474,7 @@ func TestCreateNAT(t *testing.T) { t.Run("config version tag included", func(t *testing.T) { mock := &mockEC2{} - setupLTAndAMI(mock) - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } + setupLT(mock) mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { if len(params.TagSpecifications) == 0 { t.Error("expected TagSpecifications") diff --git a/cmd/lambda/handler.go b/cmd/lambda/handler.go index 6866e1f..4edb04f 100644 --- a/cmd/lambda/handler.go +++ b/cmd/lambda/handler.go @@ -20,8 +20,6 @@ type Handler struct { IgnoreTagKey string IgnoreTagValue string TargetVPC string - AMIOwner string - AMIPattern string ConfigVersion string } diff --git a/cmd/lambda/handler_test.go b/cmd/lambda/handler_test.go index e86e613..8deb4d8 100644 --- a/cmd/lambda/handler_test.go +++ b/cmd/lambda/handler_test.go @@ -119,19 +119,12 @@ func TestReconcileScaleUp(t *testing.T) { } mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { return &ec2.DescribeLaunchTemplatesOutput{ - LaunchTemplates: []ec2types.LaunchTemplate{{LaunchTemplateId: aws.String("lt-123")}}, - }, nil - } - 
mock.DescribeLaunchTemplateVersionsFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - return &ec2.DescribeLaunchTemplateVersionsOutput{ - LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{{ - LaunchTemplateId: aws.String("lt-123"), VersionNumber: aws.Int64(1), + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + LatestVersionNumber: aws.Int64(1), }}, }, nil } - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return &ec2.RunInstancesOutput{ Instances: []ec2types.Instance{{InstanceId: aws.String("i-new1")}}, @@ -751,19 +744,12 @@ func TestReconcileNATEvent(t *testing.T) { } mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { return &ec2.DescribeLaunchTemplatesOutput{ - LaunchTemplates: []ec2types.LaunchTemplate{{LaunchTemplateId: aws.String("lt-123")}}, - }, nil - } - mock.DescribeLaunchTemplateVersionsFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - return &ec2.DescribeLaunchTemplateVersionsOutput{ - LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{{ - LaunchTemplateId: aws.String("lt-123"), VersionNumber: aws.Int64(1), + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + LatestVersionNumber: aws.Int64(1), }}, }, nil } - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns 
...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return &ec2.RunInstancesOutput{ Instances: []ec2types.Instance{{InstanceId: aws.String("i-new")}}, diff --git a/cmd/lambda/main.go b/cmd/lambda/main.go index bfe950c..643c12e 100644 --- a/cmd/lambda/main.go +++ b/cmd/lambda/main.go @@ -29,8 +29,6 @@ func main() { IgnoreTagKey: envOr("IGNORE_TAG_KEY", "nat-zero:ignore"), IgnoreTagValue: envOr("IGNORE_TAG_VALUE", "true"), TargetVPC: os.Getenv("TARGET_VPC_ID"), - AMIOwner: envOr("AMI_OWNER_ACCOUNT", "568608671756"), - AMIPattern: envOr("AMI_NAME_PATTERN", "fck-nat-al2023-*-arm64-*"), ConfigVersion: os.Getenv("CONFIG_VERSION"), } diff --git a/cmd/lambda/mock_test.go b/cmd/lambda/mock_test.go index 8fadd16..b96d762 100644 --- a/cmd/lambda/mock_test.go +++ b/cmd/lambda/mock_test.go @@ -11,20 +11,17 @@ import ( // mockEC2 implements EC2API with per-method function fields for test control. 
type mockEC2 struct { - DescribeInstancesFn func(ctx context.Context, params *ec2.DescribeInstancesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) - RunInstancesFn func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) - StartInstancesFn func(ctx context.Context, params *ec2.StartInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StartInstancesOutput, error) - StopInstancesFn func(ctx context.Context, params *ec2.StopInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StopInstancesOutput, error) - TerminateInstancesFn func(ctx context.Context, params *ec2.TerminateInstancesInput, optFns ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) - AllocateAddressFn func(ctx context.Context, params *ec2.AllocateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AllocateAddressOutput, error) - AssociateAddressFn func(ctx context.Context, params *ec2.AssociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AssociateAddressOutput, error) - DisassociateAddressFn func(ctx context.Context, params *ec2.DisassociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateAddressOutput, error) - ReleaseAddressFn func(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) - DescribeAddressesFn func(ctx context.Context, params *ec2.DescribeAddressesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeAddressesOutput, error) - DescribeNetworkInterfacesFn func(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) - DescribeImagesFn func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) - DescribeLaunchTemplatesFn func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) - 
DescribeLaunchTemplateVersionsFn func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) + DescribeInstancesFn func(ctx context.Context, params *ec2.DescribeInstancesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) + RunInstancesFn func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) + StartInstancesFn func(ctx context.Context, params *ec2.StartInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StartInstancesOutput, error) + StopInstancesFn func(ctx context.Context, params *ec2.StopInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StopInstancesOutput, error) + TerminateInstancesFn func(ctx context.Context, params *ec2.TerminateInstancesInput, optFns ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) + AllocateAddressFn func(ctx context.Context, params *ec2.AllocateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AllocateAddressOutput, error) + AssociateAddressFn func(ctx context.Context, params *ec2.AssociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AssociateAddressOutput, error) + DisassociateAddressFn func(ctx context.Context, params *ec2.DisassociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateAddressOutput, error) + ReleaseAddressFn func(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) + DescribeAddressesFn func(ctx context.Context, params *ec2.DescribeAddressesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeAddressesOutput, error) + DescribeLaunchTemplatesFn func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) // Call tracking (mutex-protected for concurrent access) mu sync.Mutex @@ -134,22 +131,6 @@ func (m *mockEC2) DescribeAddresses(ctx context.Context, params 
*ec2.DescribeAdd return &ec2.DescribeAddressesOutput{}, nil } -func (m *mockEC2) DescribeNetworkInterfaces(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { - m.track("DescribeNetworkInterfaces", params) - if m.DescribeNetworkInterfacesFn != nil { - return m.DescribeNetworkInterfacesFn(ctx, params, optFns...) - } - return &ec2.DescribeNetworkInterfacesOutput{}, nil -} - -func (m *mockEC2) DescribeImages(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - m.track("DescribeImages", params) - if m.DescribeImagesFn != nil { - return m.DescribeImagesFn(ctx, params, optFns...) - } - return &ec2.DescribeImagesOutput{}, nil -} - func (m *mockEC2) DescribeLaunchTemplates(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { m.track("DescribeLaunchTemplates", params) if m.DescribeLaunchTemplatesFn != nil { @@ -158,14 +139,6 @@ func (m *mockEC2) DescribeLaunchTemplates(ctx context.Context, params *ec2.Descr return &ec2.DescribeLaunchTemplatesOutput{}, nil } -func (m *mockEC2) DescribeLaunchTemplateVersions(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - m.track("DescribeLaunchTemplateVersions", params) - if m.DescribeLaunchTemplateVersionsFn != nil { - return m.DescribeLaunchTemplateVersionsFn(ctx, params, optFns...) 
- } - return &ec2.DescribeLaunchTemplateVersionsOutput{}, nil -} - // --- Test helper builders --- const ( @@ -220,8 +193,6 @@ func newTestHandler(mock *mockEC2) *Handler { IgnoreTagKey: "nat-zero:ignore", IgnoreTagValue: "true", TargetVPC: testVPC, - AMIOwner: "568608671756", - AMIPattern: "fck-nat-al2023-*-arm64-*", ConfigVersion: "", } } diff --git a/docs/examples.md b/docs/examples.md index 6ac5448..b0814bd 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -55,7 +55,7 @@ module "nat_zero" { private_route_table_ids = module.vpc.private_route_table_ids private_subnets_cidr_blocks = module.vpc.private_subnets_cidr_blocks - # Defaults: t4g.nano, fck-nat AMI, on-demand + # Defaults: t4g.nano, promoted public nat-zero AMI track, on-demand # Uncomment for spot instances: # market_type = "spot" @@ -92,7 +92,7 @@ module "nat_zero" { ## Custom AMI -To use a custom AMI instead of the default fck-nat AMI: +To use your own NAT Zero-compatible AMI instead of the default public nat-zero AMI: ```hcl module "nat_zero" { @@ -100,9 +100,8 @@ module "nat_zero" { # ... required variables ... - use_fck_nat_ami = false - custom_ami_owner = "123456789012" - custom_ami_name_pattern = "my-nat-ami-*" + ami_owner_account = "123456789012" + ami_name_pattern = "my-nat-ami-*" } ``` @@ -118,6 +117,8 @@ module "nat_zero" { } ``` +Custom AMIs must preserve nat-zero's deterministic dual-ENI boot model. `fck-nat` AMIs are not compatible because they query IMDS/AWS during bootstrap to infer ENI attachment, while nat-zero relies on the launch template ENIs being known up front and the EIP being attached later by the reconciler. + ## Disable Root Volume Encryption The root EBS volume is encrypted by default. To disable encryption (e.g., for environments without compliance requirements): @@ -132,9 +133,28 @@ module "nat_zero" { } ``` +## Lambda Code Paths + +This repo intentionally supports exactly three Lambda code paths: + +1. Default release path: do nothing extra. 
The module downloads the versioned `lambda.zip` and `lambda.zip.base64sha256` that match the tagged module release. + The checksum file exists so Terraform can know `source_code_hash` during `plan`, before it downloads the zip during `apply`. When the published checksum changes, Terraform can see the upstream Lambda code change in the plan. +2. Pre-built local zip: pass `lambda_binary_path` to test an unreleased branch or supply your own artifact. +3. Build during apply: set `build_lambda_locally = true` for local development only. + +## Recommended Usage By Audience + +| Audience | Recommended module ref | Recommended Lambda path | Why | +|----------|------------------------|-------------------------|-----| +| Normal end users | Release tag such as `?ref=v0.4.0` | Default release artifact | Stable module code, stable versioned Lambda artifact, and clean plan/apply behavior | +| CI, branch testing, unreleased validation | Branch or commit ref | `lambda_binary_path` | Lets Terraform see Lambda code changes during plan before the branch has been released | +| Local module development | Working tree | `build_lambda_locally = true` | Fastest iteration loop while changing Go code inside this repo | + +`ref=main` is fine for development, but it is not the stable consumer path. If `main` has unreleased Go changes, the default Lambda artifact still comes from the latest tagged release until the next release is published. + ## Building Lambda Locally -For development or if you want to build from source: +For development only, or if you explicitly want Terraform to build from source during `terraform apply`: ```hcl module "nat_zero" { @@ -146,4 +166,20 @@ module "nat_zero" { } ``` -Requires Go and `zip` installed locally. +Requires Go and `zip` installed locally. This is a non-standard path and may require a second apply after code changes. 
+ +## Using a Pre-built Local Lambda Zip + +For CI, branch testing, or if you want plan-time Lambda diffs without waiting for a release, build the zip outside Terraform and pass it in directly: + +```hcl +module "nat_zero" { + source = "github.com/MachineDotDev/nat-zero" + + # ... required variables ... + + lambda_binary_path = "${path.module}/.build/lambda.zip" +} +``` + +This is the right way to test an unreleased branch when the branch includes Lambda code changes. The default downloaded Lambda zip is pinned to the latest tagged module release. diff --git a/docs/index.md b/docs/index.md index fdd5f02..6b2ad71 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,7 +4,7 @@ nat-zero is a Terraform module that replaces always-on NAT with on-demand NAT instances. When a workload launches in a private subnet, a NAT instance starts automatically. When the last workload stops, the NAT shuts down and its Elastic IP is released. Idle cost: ~$0.80/month per AZ. -Built on [fck-nat](https://fck-nat.dev/) AMIs. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. +Built around a NAT Zero AMI baked in-repo and promoted through a dedicated workflow. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. ## Quick start @@ -34,6 +34,10 @@ module "nat_zero" { - [Architecture](architecture.md) — reconciliation model, decision matrix, event flows - [Performance](performance.md) — startup latency, Lambda execution times, cost breakdowns -- [Examples](examples.md) — spot instances, custom AMIs, building from source +- [Examples](examples.md) — spot instances, custom AMIs, Lambda code paths, recommended usage by audience - [Terraform Reference](reference.md) — inputs, outputs, resources - [Testing](testing.md) — integration test lifecycle and CI + +`fck-nat` AMIs are intentionally unsupported here. 
They discover ENIs via IMDS/AWS calls during bootstrap, which does not match nat-zero's launch-template-owned ENIs and delayed EIP attachment model. + +`fck-nat` itself is still a good fit when you want a conventional always-on NAT instance and do not need nat-zero's scale-to-zero lifecycle. diff --git a/docs/performance.md b/docs/performance.md index f7a5816..e85c1f0 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -20,7 +20,7 @@ All measurements from real integration tests in us-east-1 with `t4g.nano` instan 2.3 s RunInstances returns — NAT is "pending" Lambda returns. -~8.0 s NAT reaches "running" (EC2 boot + fck-nat config) +~8.0 s NAT reaches "running" (EC2 boot + NAT config) ~8.3 s EventBridge delivers NAT "running" event ~8.9 s Lambda: allocate EIP + associate (~3 s) diff --git a/docs/reference.md b/docs/reference.md index d286474..7777c48 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -2,19 +2,12 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.3 | +| [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | +| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | -## Providers - -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | >= 5.0 | -| [null](#provider\_null) | >= 3.0 | -| [time](#provider\_time) | >= 0.9 | - ## Modules No modules. @@ -41,26 +34,28 @@ No modules. 
| [aws_route.nat_route](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/route) | resource | | [aws_security_group.nat_security_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | [null_resource.build_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.download_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [terraform_data.download_lambda](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [ami\_id](#input\_ami\_id) | Explicit AMI ID to use (overrides AMI lookup entirely) | `string` | `null` | no | +| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `"nat-zero-al2023-minimal-arm64-20260306-064438"` | no | +| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. 
| `string` | `"590144423513"` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | -| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | -| [custom\_ami\_name\_pattern](#input\_custom\_ami\_name\_pattern) | AMI name pattern when use\_fck\_nat\_ami is false | `string` | `null` | no | -| [custom\_ami\_owner](#input\_custom\_ami\_owner) | AMI owner account ID when use\_fck\_nat\_ami is false | `string` | `null` | no | +| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source during apply instead of downloading a pre-compiled release. This is primarily for local development and may require a second apply after code changes. | `bool` | `false` | no | | [enable\_logging](#input\_enable\_logging) | Create a CloudWatch log group for the Lambda function | `bool` | `true` | no | | [encrypt\_root\_volume](#input\_encrypt\_root\_volume) | Encrypt the root EBS volume. | `bool` | `true` | no | | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Updated automatically by CI. 
| `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | +| [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | @@ -72,7 +67,6 @@ No modules. | [private\_subnets\_cidr\_blocks](#input\_private\_subnets\_cidr\_blocks) | CIDR blocks for the private subnets (one per AZ, used in security group rules) | `list(string)` | n/a | yes | | [public\_subnets](#input\_public\_subnets) | Public subnet IDs (one per AZ) for NAT instance public ENIs | `list(string)` | n/a | yes | | [tags](#input\_tags) | Additional tags to apply to all resources | `map(string)` | `{}` | no | -| [use\_fck\_nat\_ami](#input\_use\_fck\_nat\_ami) | Use the public fck-nat AMI. Set to false to use a custom AMI. | `bool` | `true` | no | | [vpc\_id](#input\_vpc\_id) | The VPC ID where NAT instances will be deployed | `string` | n/a | yes | ## Outputs diff --git a/docs/testing.md b/docs/testing.md index 0dce6a3..68fb08a 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -16,7 +16,7 @@ Integration tests require AWS credentials with permissions to manage EC2, IAM, L ## Integration Test Lifecycle -The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraform apply` / `destroy` cycle and four phases: +The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraform apply` / `destroy` cycle and five phases. 
Each run uses a unique `nat-test-*` module name so EventBridge, Lambda, and IAM resources do not collide across reruns. ### Phase 1: NAT Creation and Connectivity @@ -41,22 +41,31 @@ The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraf 3. Wait for NAT running with new EIP 4. Verify connectivity -### Phase 4: Cleanup Action +### Phase 4: AMI Replacement + +1. Reapply the fixture with `NAT_ZERO_TEST_UPDATED_NAT_AMI_ID` +2. Trigger reconciliation while a workload is active +3. Verify the old NAT instance is terminated +4. Verify the replacement NAT comes up on the new AMI and handles egress correctly + +### Phase 5: Cleanup Action 1. Invoke Lambda with `{action: "cleanup"}` 2. Verify all NAT instances terminated and EIPs released ### Teardown -`terraform destroy` removes all Terraform-managed resources. The cleanup action (Phase 4) ensures Lambda-created NAT instances are terminated first, so ENI deletion succeeds. +`terraform destroy` removes all Terraform-managed resources. The cleanup action (Phase 5) ensures Lambda-created NAT instances are terminated first, so ENI deletion succeeds. ## CI -Integration tests run in GitHub Actions when the `integration-test` label is added to a PR. They use OIDC to assume an AWS role in a dedicated test account. +Integration tests run in GitHub Actions when the `integration-test` label is added to a PR. A small router workflow handles the label event and then calls the reusable integration workflow. The tests use OIDC to assume an AWS role in a dedicated test account. - Concurrency: one test at a time (`cancel-in-progress: false`) - Timeout: 15 minutes - Region: us-east-1 +- Default NAT AMI: shared private test nat-zero AMI supplied via the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID` unless `nat_ami_id` is supplied explicitly +- These are integration-fixture overrides only. Normal module consumers should not set `nat_ami_id`; the module defaults to the published nat-zero AMI track. 
## Orphan Detection @@ -64,4 +73,4 @@ Integration tests run in GitHub Actions when the `integration-test` label is add ## Config Version Replacement -The Lambda tags NAT instances with a `ConfigVersion` hash (AMI + instance type + market type + volume size + encryption). When the config changes and a workload triggers reconciliation, the Lambda terminates the outdated NAT and creates a replacement. The integration test doesn't exercise this path directly, but it's covered by unit tests. +The Lambda tags NAT instances with a `ConfigVersion` hash (resolved AMI ID + instance type + market type + volume size + encryption). When the config changes and a workload triggers reconciliation, the Lambda terminates the outdated NAT and creates a replacement. The integration suite can now exercise this path by setting `NAT_ZERO_TEST_UPDATED_NAT_AMI_ID` before running `go test`. diff --git a/docs/workflows.md b/docs/workflows.md index 333c289..6378c7c 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -6,18 +6,19 @@ Internal reference for GitHub Actions workflows, repo rulesets, and the release | Workflow | File | Triggers | Required Check | |----------|------|----------|----------------| -| Pre-commit | `precommit.yml` | All PRs; push to `main` (filtered paths) | `precommit` | +| Manual PR Checks | `manual-pr-checks.yml` | PR labeled `integration-test` or `nat-images` | No (router workflow) | +| Pre-commit | `precommit.yml` | All PRs | `precommit` | | Go Tests | `go-tests.yml` | PRs touching `cmd/lambda/**`; push to `main` | `go-test` | -| Integration Tests | `integration-tests.yml` | PR labeled `integration-test`; manual dispatch | `integration-test` | +| Integration Tests | `integration-tests.yml` | Manual dispatch; reusable workflow | `integration-test` | +| NAT Images | `nat-images.yml` | Manual dispatch; reusable workflow | No (promotion workflow) | | Docs | `docs.yml` | Push to `main` (filtered paths) | No (post-merge deploy) | | Release | `release-please.yml` | 
Push to `main`; manual dispatch | No (post-merge) | ## Pre-commit (`precommit.yml`) -Runs the repo's `.pre-commit-config.yaml` hooks: terraform fmt, tflint, terraform-docs, Go staticcheck, etc. +Runs the repo's `.pre-commit-config.yaml` hooks: terraform fmt/validate, tflint, terraform-docs, Go staticcheck, actionlint, shellcheck, and Packer fmt/validate. - **PR trigger**: All pull requests, all paths (no path filter). -- **Push trigger**: Only on `main`, only when `*.tf`, `cmd/lambda/**`, `.pre-commit-config.yaml`, or `.terraform-docs.yml` change. - **Job name**: `precommit` (required status check for merge). ## Go Tests (`go-tests.yml`) @@ -29,17 +30,33 @@ Runs `go test -v -race ./...` in `cmd/lambda/` (Lambda unit tests). - **Job name**: `go-test` (required status check for merge). - **Note**: Path-filtered. If a PR doesn't touch Go code, this check won't run and won't block merge (see ruleset notes below). +## Manual PR Checks (`manual-pr-checks.yml`) + +Single entry point for expensive, manually requested PR checks. + +- **PR trigger**: `labeled` type only. +- **Labels**: + - `integration-test` -> calls the reusable integration workflow + - `nat-images` -> calls the reusable NAT image workflow +- **Why this exists**: GitHub cannot filter `pull_request:labeled` by label name up front. A single router workflow keeps that complexity in one place and prevents both heavyweight workflows from waking up on every label event. +- **How it appears on the PR**: the called reusable jobs show up as normal PR checks under the router workflow run. +- **One-shot labels**: the router removes the trigger label once the called workflow has finished (whether it passed, failed, or was cancelled), so adding the same label later will trigger a fresh run again. + ## Integration Tests (`integration-tests.yml`) Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises the Lambda lifecycle (create NAT, scale-down, restart, cleanup), then destroys everything. -- **PR trigger**: `labeled` type only.
Runs when the `integration-test` label is added. - **Manual trigger**: `workflow_dispatch`. -- **Condition**: `github.event.label.name == 'integration-test'` (or manual dispatch). +- **Reusable trigger**: `workflow_call`. - **Concurrency**: Group `nat-zero-integration`, `cancel-in-progress: false`. Only one integration test runs at a time; new ones queue. - **Environment**: `integration` (holds the `INTEGRATION_ROLE_ARN` secret for OIDC). - **Timeout**: 15 minutes. - **Job name**: `integration-test` (required status check for merge). +- **Optional inputs**: + - `nat_ami_id` to force the integration fixture onto a specific NAT AMI. If omitted, the workflow uses the shared private test AMI from the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID`. + - `updated_nat_ami_id` to exercise the AMI replacement path after a second `terraform apply`. + +These inputs are test-only fixture controls. Normal module consumers should omit them and use the published nat-zero AMI defaults. ### Steps @@ -48,6 +65,22 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t 3. Build the Lambda binary from source (`cmd/lambda/` -> `.build/lambda.zip`). 4. Run `go test -v -timeout 10m -count=1` in `tests/integration/`. +## NAT Images (`nat-images.yml`) + +Manual promotion workflow for the default public nat-zero AMI. + +1. Build the AMI with Packer in the chosen source region. +2. Let Packer privately copy it to the regions listed in `ami/nat-zero-private-all-regions.pkrvars.hcl`. +3. Run one us-east-1 integration gate on a single stack: + - deploy from the shared private test NAT AMI in `NAT_ZERO_TEST_AMI_ID` + - exercise the normal NAT lifecycle + - reapply the module with the new AMI + - verify the old NAT is replaced and the new NAT works +4. After the integration gates pass, run a small publish script that opens launch permissions for the copied AMIs. +5. 
Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. + +For pre-merge validation on a branch, add the `nat-images` label to the PR. The router workflow calls `nat-images.yml` as a reusable workflow, which uses the GitHub Actions variable `NAT_ZERO_AMI_BUILD_SUBNET_ID`, runs the build and integration gates on the PR branch, and intentionally skips the public-sharing and promotion-PR jobs. + ## Docs (`docs.yml`) Deploys MkDocs Material to GitHub Pages. @@ -85,9 +118,11 @@ Runs `googleapis/release-please-action@v4` with: Only runs when `release_created == 'true'` (i.e., the push that merges a release PR). 1. Cross-compiles the Go Lambda for `linux/arm64`. -2. Zips as `lambda.zip`. -3. **Uploads to the versioned release** (e.g., `v0.1.0`). -4. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip. This provides a stable URL for the module's default `lambda_binary_url`. +2. Creates a deterministic `lambda.zip`. +3. Writes `lambda.zip.base64sha256`, containing the base64-encoded SHA256 for the zip. +4. **Uploads the zip and checksum to the versioned release** (e.g., `v0.1.0`). + +That is the full release artifact flow. There is no second workflow that edits the release PR, and there is no rolling "latest" Lambda artifact to keep in sync. ### Changelog sections @@ -104,16 +139,16 @@ Only runs when `release_created == 'true'` (i.e., the push that merges a release ### `main` branch ruleset -- **No direct push**: creation, update, deletion, and non-fast-forward all blocked. 
- **PRs required** with: - - 1 approving review + - 0 required approvals - Stale reviews dismissed on push - - Last push approval required (reviewer cannot be the person who pushed the last commit) - All review threads must be resolved - - **Squash merge only** -- **Required status checks**: `precommit`, `go-test`, `integration-test` - - `strict_required_status_checks_policy: false` -- checks that don't run (path filtering / label gating) won't block merge. -- **Bypass**: Admin role can bypass always. +- **Required status checks**: `precommit`, `go-test` + - strict mode enabled, so required checks must be up to date with `main` +- **Linear history required** +- **No force push** +- **No branch deletion** +- **Bypass**: Admin role can bypass because `enforce_admins` is disabled. ### `tags` ruleset @@ -127,8 +162,9 @@ Only runs when `release_created == 'true'` (i.e., the push that merges a release Open PR -> precommit runs (always) -> go-test runs (if cmd/lambda/** changed) - -> Add "integration-test" label -> integration tests run against real AWS - -> 1 approval + threads resolved + -> Add "integration-test" label -> router calls integration tests + -> Add "nat-images" label -> router calls the NAT image build/integration gate + -> threads resolved -> Squash merge to main Post-merge to main: @@ -137,5 +173,22 @@ Post-merge to main: Merge release PR: -> release-please creates GitHub Release + tag - -> build-lambda uploads lambda.zip to release + rolling latest + -> build-lambda uploads lambda.zip + lambda.zip.base64sha256 to that versioned release ``` + +## Lambda Code Paths + +The module intentionally supports exactly three ways to supply Lambda code: + +1. 
Default release artifact + - Best for normal users + - Terraform downloads the versioned `lambda.zip` and reads the matching `lambda.zip.base64sha256` + - The checksum file lets Terraform know `source_code_hash` during `plan`, before the zip is downloaded during `apply` + - A changed published checksum shows up as a Lambda code change in `terraform plan` +2. Pre-built local zip via `lambda_binary_path` + - Best for CI, branch testing, or custom unreleased binaries + - Terraform hashes the local file during plan +3. Apply-time build via `build_lambda_locally = true` + - Best for local development only + - Requires Go and `zip` + - May require a second apply after Lambda code changes diff --git a/examples/basic/main.tf b/examples/basic/main.tf index 7910af8..1567949 100644 --- a/examples/basic/main.tf +++ b/examples/basic/main.tf @@ -48,7 +48,7 @@ module "nat_zero" { private_route_table_ids = module.vpc.private_route_table_ids private_subnets_cidr_blocks = module.vpc.private_subnets_cidr_blocks - # Defaults: t4g.nano, fck-nat AMI, on-demand + # Defaults: t4g.nano, nat-zero AMI, on-demand # Uncomment for spot instances: # market_type = "spot" diff --git a/iam.tf b/iam.tf index 0365f10..f9274a2 100644 --- a/iam.tf +++ b/iam.tf @@ -47,10 +47,9 @@ resource "aws_iam_role_policy" "lambda_iam_policy" { Effect = "Allow" Action = [ "ec2:DescribeInstances", - "ec2:DescribeImages", "ec2:DescribeLaunchTemplates", + # EC2 resolves launch template versions during RunInstances authorization. "ec2:DescribeLaunchTemplateVersions", - "ec2:DescribeNetworkInterfaces", "ec2:DescribeAddresses", ] Resource = "*" diff --git a/lambda.tf b/lambda.tf index 049ec2e..e03a6bd 100644 --- a/lambda.tf +++ b/lambda.tf @@ -17,15 +17,40 @@ resource "time_sleep" "lambda_ready" { destroy_duration = var.enable_logging ? "10s" : "0s" } -resource "null_resource" "download_lambda" { - count = var.build_lambda_locally ? 
0 : 1 +locals { + module_release_version = jsondecode(file("${path.module}/.release-please-manifest.json"))["."] + default_lambda_binary_url = "https://github.com/MachineDotDev/nat-zero/releases/download/v${local.module_release_version}/lambda.zip" + lambda_binary_hash_url = "${local.default_lambda_binary_url}.base64sha256" + downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" + local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) + local_lambda_source_hash = var.lambda_binary_path != null ? ( + filebase64sha256(var.lambda_binary_path) + ) : ( + fileexists(local.downloaded_lambda_zip_path) ? filebase64sha256(local.downloaded_lambda_zip_path) : null + ) + downloaded_lambda_source_hash = one(data.http.lambda_binary_hash[*].response_body) + lambda_source_hash = var.build_lambda_locally || var.lambda_binary_path != null ? local.local_lambda_source_hash : trimspace(local.downloaded_lambda_source_hash) +} - triggers = { - url = var.lambda_binary_url - } +data "http" "lambda_binary_hash" { + count = var.build_lambda_locally || var.lambda_binary_path != null ? 0 : 1 + url = local.lambda_binary_hash_url +} + +resource "terraform_data" "download_lambda" { + count = var.build_lambda_locally || var.lambda_binary_path != null ? 
0 : 1 + + triggers_replace = [ + local.default_lambda_binary_url, + local.lambda_binary_hash_url, + trimspace(local.downloaded_lambda_source_hash), + ] provisioner "local-exec" { - command = "test -f ${path.module}/.build/lambda.zip || (mkdir -p ${path.module}/.build && curl -sfL -o ${path.module}/.build/lambda.zip ${var.lambda_binary_url})" + command = <<-EOT + mkdir -p "${path.module}/.build" && \ + curl -sfL -o "${local.downloaded_lambda_zip_path}" "${local.default_lambda_binary_url}" + EOT } } @@ -34,7 +59,10 @@ resource "null_resource" "build_lambda" { triggers = { source_hash = sha256(join("", [ - for f in sort(fileset("${path.module}/cmd/lambda", "*.go")) : + for f in sort(concat( + tolist(fileset("${path.module}/cmd/lambda", "*.go")), + ["go.mod", "go.sum"], + )) : filesha256("${path.module}/cmd/lambda/${f}") ])) } @@ -42,8 +70,9 @@ resource "null_resource" "build_lambda" { provisioner "local-exec" { command = <<-EOT cd ${path.module}/cmd/lambda && \ - GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap && \ - zip lambda.zip bootstrap && \ + GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -trimpath -buildvcs=false -tags lambda.norpc -ldflags='-s -w -buildid=' -o bootstrap && \ + TZ=UTC touch -t 198001010000 bootstrap && \ + zip -q -X lambda.zip bootstrap && \ mkdir -p ../../.build && \ cp lambda.zip ../../.build/lambda.zip && \ rm bootstrap lambda.zip @@ -52,12 +81,12 @@ resource "null_resource" "build_lambda" { } resource "aws_lambda_function" "nat_zero" { - filename = "${path.module}/.build/lambda.zip" + filename = local.local_lambda_zip_path function_name = "${var.name}-nat-zero" handler = "bootstrap" role = aws_iam_role.lambda_iam_role.arn runtime = "provided.al2023" - source_code_hash = fileexists("${path.module}/.build/lambda.zip") ? 
filebase64sha256("${path.module}/.build/lambda.zip") : null + source_code_hash = local.lambda_source_hash architectures = ["arm64"] timeout = 90 reserved_concurrent_executions = 1 @@ -66,17 +95,13 @@ resource "aws_lambda_function" "nat_zero" { environment { variables = { - NAT_TAG_KEY = var.nat_tag_key - NAT_TAG_VALUE = var.nat_tag_value - IGNORE_TAG_KEY = var.ignore_tag_key - IGNORE_TAG_VALUE = var.ignore_tag_value - TARGET_VPC_ID = var.vpc_id - AMI_OWNER_ACCOUNT = var.use_fck_nat_ami ? "568608671756" : var.custom_ami_owner - AMI_NAME_PATTERN = var.use_fck_nat_ami ? "fck-nat-al2023-*-arm64-*" : var.custom_ami_name_pattern + NAT_TAG_KEY = var.nat_tag_key + NAT_TAG_VALUE = var.nat_tag_value + IGNORE_TAG_KEY = var.ignore_tag_key + IGNORE_TAG_VALUE = var.ignore_tag_value + TARGET_VPC_ID = var.vpc_id CONFIG_VERSION = sha256(join(",", [ - var.use_fck_nat_ami ? "568608671756" : var.custom_ami_owner, - var.use_fck_nat_ami ? "fck-nat-al2023-*-arm64-*" : var.custom_ami_name_pattern, - coalesce(var.ami_id, "none"), + coalesce(local.effective_ami_id, "missing"), var.instance_type, var.market_type, tostring(var.block_device_size), @@ -85,7 +110,14 @@ resource "aws_lambda_function" "nat_zero" { } } - depends_on = [time_sleep.lambda_ready, null_resource.download_lambda, null_resource.build_lambda] + lifecycle { + precondition { + condition = !(var.build_lambda_locally && var.lambda_binary_path != null) + error_message = "build_lambda_locally and lambda_binary_path cannot be used together." 
+ } + } + + depends_on = [time_sleep.lambda_ready, terraform_data.download_lambda, null_resource.build_lambda] } resource "aws_lambda_function_event_invoke_config" "nat_zero_invoke_config" { diff --git a/launch_template.tf b/launch_template.tf index 2388791..05ff688 100644 --- a/launch_template.tf +++ b/launch_template.tf @@ -11,7 +11,7 @@ resource "aws_launch_template" "nat_launch_template" { count = length(var.availability_zones) name = "${var.name}-${var.availability_zones[count.index]}-launch-template" instance_type = var.instance_type - image_id = var.ami_id + image_id = local.effective_ami_id iam_instance_profile { arn = aws_iam_instance_profile.nat_instance_profile.arn @@ -76,4 +76,11 @@ resource "aws_launch_template" "nat_launch_template" { }, local.common_tags, ) + + lifecycle { + precondition { + condition = local.effective_ami_id != null + error_message = "Set ami_id or configure a resolvable AMI source with ami_owner_account and ami_name_pattern." + } + } } diff --git a/scripts/publish_ami_public.sh b/scripts/publish_ami_public.sh new file mode 100755 index 0000000..f18d838 --- /dev/null +++ b/scripts/publish_ami_public.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 4 ]; then + echo "usage: $0 <owner_account_id> <ami_name> <source_region> <regions_file>" >&2 + exit 1 +fi + +owner_account_id="$1" +ami_name="$2" +source_region="$3" +regions_file="$4" + +mapfile -t publish_regions < <(awk -F'"' '/"/ {print $2}' "$regions_file") + +if [ "${#publish_regions[@]}" -eq 0 ]; then + echo "no regions found in $regions_file" >&2 + exit 1 +fi + +source_present=0 +for region in "${publish_regions[@]}"; do + if [ "$region" = "$source_region" ]; then + source_present=1 + break + fi +done +if [ "$source_present" -eq 0 ]; then + publish_regions+=("$source_region") +fi + +cleanup() { + local region + + for region in "${publish_regions[@]}"; do + aws ec2 enable-image-block-public-access \ + --region "$region" \ + --image-block-public-access-state block-new-sharing >/dev/null || echo "warning: failed to re-block public AMI sharing in $region; continuing with remaining regions" >&2 + done + + for region in
"${publish_regions[@]}"; do + for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + state="$( + aws ec2 get-image-block-public-access-state \ + --region "$region" \ + --query 'ImageBlockPublicAccessState' \ + --output text + )" + if [ "$state" = "block-new-sharing" ]; then + break + fi + sleep 20 + done + done +} + +trap cleanup EXIT + +for region in "${publish_regions[@]}"; do + aws ec2 disable-image-block-public-access --region "$region" >/dev/null +done + +for region in "${publish_regions[@]}"; do + for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + state="$( + aws ec2 get-image-block-public-access-state \ + --region "$region" \ + --query 'ImageBlockPublicAccessState' \ + --output text + )" + if [ "$state" = "unblocked" ]; then + break + fi + sleep 20 + done +done + +for region in "${publish_regions[@]}"; do + image_id="$( + aws ec2 describe-images \ + --region "$region" \ + --owners "$owner_account_id" \ + --filters "Name=name,Values=$ami_name" "Name=state,Values=available" \ + --query 'Images[0].ImageId' \ + --output text + )" + + if [ -z "$image_id" ] || [ "$image_id" = "None" ]; then + echo "failed to resolve image for $region" >&2 + exit 1 + fi + + aws ec2 modify-image-attribute \ + --region "$region" \ + --image-id "$image_id" \ + --launch-permission 'Add=[{Group=all}]' +done diff --git a/scripts/render_packer_ami_regions.sh b/scripts/render_packer_ami_regions.sh new file mode 100755 index 0000000..3b02ecf --- /dev/null +++ b/scripts/render_packer_ami_regions.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 3 ]; then + echo "usage: $0 <regions_file> <source_region> <output_file>" >&2 + exit 1 +fi + +regions_file="$1" +source_region="$2" +output_file="$3" + +mapfile -t configured_regions < <(awk -F'"' '/"/ {print $2}' "$regions_file") + +if [ "${#configured_regions[@]}" -eq 0 ]; then + echo "no regions found in $regions_file" >&2 + exit 1 +fi + +{ + echo "ami_regions = [" + for region in "${configured_regions[@]}"; do + if [ "$region" = "$source_region" ]; then + continue +
fi + printf ' "%s",\n' "$region" + done + echo "]" +} >"$output_file" diff --git a/scripts/update_ami_defaults.sh b/scripts/update_ami_defaults.sh new file mode 100644 index 0000000..92f20a5 --- /dev/null +++ b/scripts/update_ami_defaults.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 2 ]; then + echo "usage: $0 <owner_account_id> <ami_name_pattern>" >&2 + exit 1 +fi + +owner_account_id="$1" +ami_name_pattern="$2" + +update_variable_default() { + local file="$1" + local variable_name="$2" + local replacement="$3" + local tmp_file + + tmp_file="$(mktemp)" + if ! awk -v variable_name="$variable_name" -v replacement="$replacement" ' + BEGIN { + in_variable = 0 + updated = 0 + } + $0 ~ "^variable \"" variable_name "\" \\{" { + in_variable = 1 + } + in_variable && $1 == "default" { + sub(/=.*/, "= " replacement) + in_variable = 0 + updated = 1 + } + { + print + } + END { + if (updated == 0) { + exit 1 + } + } + ' "$file" > "$tmp_file"; then + rm -f "$tmp_file" + echo "failed to update default for ${variable_name}" >&2 + exit 1 + fi + + mv "$tmp_file" "$file" +} + +update_variable_default "variables.tf" "ami_owner_account" "\"${owner_account_id}\"" +update_variable_default "variables.tf" "ami_name_pattern" "\"${ami_name_pattern}\"" diff --git a/tests/integration/fixture/main.tf b/tests/integration/fixture/main.tf index 0a608b4..473c9c7 100644 --- a/tests/integration/fixture/main.tf +++ b/tests/integration/fixture/main.tf @@ -71,10 +71,20 @@ variable "encrypt_root_volume" { default = true } +variable "nat_ami_id" { + type = string + default = null +} + +variable "name" { + type = string + default = "nat-test" +} + module "nat_zero" { source = "../../../" - name = "nat-test" + name = var.name vpc_id = data.aws_vpc.default.id availability_zones = [data.aws_subnet.public.availability_zone] public_subnets = [data.aws_subnet.public.id] @@ -86,6 +96,13 @@ module "nat_zero" { instance_type = var.nat_instance_type market_type = "on-demand" encrypt_root_volume = var.encrypt_root_volume +
+ # Test-only overrides: + # - ami_id lets the integration suite force a specific baseline or upgraded NAT AMI. + # - lambda_binary_path lets branch tests exercise unreleased Lambda code. + # Normal module consumers should omit both and use the published defaults. + ami_id = var.nat_ami_id + lambda_binary_path = fileexists("${path.module}/../../../.build/lambda.zip") ? abspath("${path.module}/../../../.build/lambda.zip") : null } output "vpc_id" { diff --git a/tests/integration/nat_zero_test.go b/tests/integration/nat_zero_test.go index dc4af3a..abdb182 100644 --- a/tests/integration/nat_zero_test.go +++ b/tests/integration/nat_zero_test.go @@ -4,11 +4,13 @@ import ( "encoding/base64" "encoding/json" "fmt" + "os" "strings" "testing" "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/cloudwatchevents" "github.com/aws/aws-sdk-go/service/cloudwatchlogs" @@ -59,6 +61,7 @@ type phase struct { // connectivity, scale-down, restart, cleanup action, and terraform destroy. 
func TestNatZero(t *testing.T) { runID := fmt.Sprintf("tt-%d", time.Now().Unix()) + moduleName := fmt.Sprintf("nat-test-%s", runID) sess := session.Must(session.NewSession(&aws.Config{Region: aws.String(awsRegion)})) ec2Client := ec2.New(sess) iamClient := iam.New(sess) @@ -112,9 +115,24 @@ func TestNatZero(t *testing.T) { t.Logf("Deleted SQS queue %s", queueName) }() + initialNatAMI := strings.TrimSpace(os.Getenv("NAT_ZERO_TEST_NAT_AMI_ID")) + updatedNatAMI := strings.TrimSpace(os.Getenv("NAT_ZERO_TEST_UPDATED_NAT_AMI_ID")) + tfVars := map[string]interface{}{ + "name": moduleName, + } + t.Logf("Integration module name: %s", moduleName) + if initialNatAMI != "" { + tfVars["nat_ami_id"] = initialNatAMI + t.Logf("Initial NAT AMI override: %s", initialNatAMI) + } + if updatedNatAMI != "" { + t.Logf("Updated NAT AMI target: %s", updatedNatAMI) + } + opts := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ TerraformDir: "./fixture", NoColor: true, + Vars: tfVars, }) defer func() { destroyStart := time.Now() @@ -170,50 +188,34 @@ func TestNatZero(t *testing.T) { amiID := getLatestAL2023AMI(t, ec2Client) // Shared across phases — set by Phase 1, used by Phase 2. - var workloadID string + var activeWorkloadID string + runPhase := func(name string, fn func(t *testing.T)) bool { + if t.Run(name, fn) { + return true + } + t.Logf("Phase %s failed, aborting remaining phases so deferred cleanup can run", name) + return false + } // ── Phase 1: NAT creation and connectivity ────────────────────────── // Launch a workload and let EventBridge trigger the Lambda automatically. 
- t.Run("NATCreationAndConnectivity", func(t *testing.T) { + if !runPhase("NATCreationAndConnectivity", func(t *testing.T) { wlStart := time.Now() - workloadID = launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) + activeWorkloadID = launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) record("Launch workload instance", time.Since(wlStart)) - t.Logf("Launched workload %s in VPC %s", workloadID, vpcID) + t.Logf("Launched workload %s in VPC %s", activeWorkloadID, vpcID) // EventBridge fires when the workload goes pending/running, // triggering the Lambda to create a NAT and attach an EIP. t.Log("Waiting for NAT to be running with EIP (via EventBridge)...") start := time.Now() - var natInstance *ec2.Instance - retry.DoWithRetry(t, "NAT running with EIP", 100, 2*time.Second, func() (string, error) { - nats := findNATInstances(t, ec2Client, vpcID) - for _, n := range nats { - if aws.StringValue(n.State.Name) == "running" { - for _, eni := range n.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && - eni.Association != nil && eni.Association.PublicIp != nil { - natInstance = n - return "OK", nil - } - } - return "", fmt.Errorf("NAT running but no EIP yet") - } - } - return "", fmt.Errorf("no running NAT (%d found)", len(nats)) - }) + natInstance := waitForRunningNATWithEIP(t, ec2Client, vpcID, "NAT running with EIP") natUpTime := time.Since(start) record("Wait for NAT running with EIP", natUpTime) t.Logf("NAT up with EIP in %s", natUpTime.Round(time.Millisecond)) - // Get NAT public IP from primary ENI. - var natEIP string - for _, eni := range natInstance.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && eni.Association != nil { - natEIP = aws.StringValue(eni.Association.PublicIp) - break - } - } + natEIP := natPublicIP(natInstance) require.NotEmpty(t, natEIP, "NAT should have a public IP") // Validate NAT tags. 
@@ -246,24 +248,27 @@ func TestNatZero(t *testing.T) { assert.Equal(t, natEIP, msg.EgressIP, "workload egress IP should match NAT EIP") t.Logf("Confirmed: workload egresses via NAT EIP %s", natEIP) - }) + }) { + return + } // ── Phase 2: NAT scale-down ───────────────────────────────────────── // Terminate the workload and let EventBridge drive the full // scale-down flow: stop NAT, then detach/release EIP. - t.Run("NATScaleDown", func(t *testing.T) { - require.NotEmpty(t, workloadID, "Phase 1 must set workloadID") + if !runPhase("NATScaleDown", func(t *testing.T) { + require.NotEmpty(t, activeWorkloadID, "Phase 1 must set activeWorkloadID") // Terminate the workload instance. EventBridge fires shutting-down // and terminated events which trigger the Lambda to stop the NAT. t.Log("Terminating workload to trigger NAT scale-down...") termStart := time.Now() _, err := ec2Client.TerminateInstances(&ec2.TerminateInstancesInput{ - InstanceIds: []*string{aws.String(workloadID)}, + InstanceIds: []*string{aws.String(activeWorkloadID)}, }) require.NoError(t, err) record("Terminate workload instance", time.Since(termStart)) + activeWorkloadID = "" // Wait for NAT to reach stopped state. t.Log("Waiting for NAT to stop (via EventBridge)...") @@ -308,39 +313,26 @@ func TestNatZero(t *testing.T) { }) record("Wait for EIP released", time.Since(eipStart)) t.Log("NAT stopped and EIP released") - }) + }) { + return + } // ── Phase 3: NAT restart from stopped state ───────────────────────── // Launch a new workload and let EventBridge trigger the restart. 
- t.Run("NATRestart", func(t *testing.T) { + if !runPhase("NATRestart", func(t *testing.T) { t.Log("Launching new workload to trigger NAT restart...") wlStart := time.Now() newWorkloadID := launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) record("Launch workload instance (restart)", time.Since(wlStart)) t.Logf("Launched workload %s", newWorkloadID) + activeWorkloadID = newWorkloadID // EventBridge fires when the new workload goes pending/running, // triggering the Lambda to start the stopped NAT. t.Log("Waiting for restarted NAT to be running with EIP (via EventBridge)...") start := time.Now() - var natInstance *ec2.Instance - retry.DoWithRetry(t, "NAT restarted with EIP", 100, 2*time.Second, func() (string, error) { - nats := findNATInstances(t, ec2Client, vpcID) - for _, n := range nats { - if aws.StringValue(n.State.Name) == "running" { - for _, eni := range n.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && - eni.Association != nil && eni.Association.PublicIp != nil { - natInstance = n - return "OK", nil - } - } - return "", fmt.Errorf("NAT running but no EIP yet") - } - } - return "", fmt.Errorf("no running NAT (%d found)", len(nats)) - }) + natInstance := waitForRunningNATWithEIP(t, ec2Client, vpcID, "NAT restarted with EIP") natRestartTime := time.Since(start) record("Wait for NAT restarted with EIP", natRestartTime) t.Logf("NAT restarted with EIP in %s", natRestartTime.Round(time.Millisecond)) @@ -348,13 +340,7 @@ func TestNatZero(t *testing.T) { require.NotNil(t, natInstance, "NAT should be running") // Verify the restarted NAT has an EIP. 
- var natEIP string - for _, eni := range natInstance.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && eni.Association != nil { - natEIP = aws.StringValue(eni.Association.PublicIp) - break - } - } + natEIP := natPublicIP(natInstance) require.NotEmpty(t, natEIP, "Restarted NAT should have a public IP") t.Logf("Restarted NAT has EIP %s", natEIP) @@ -372,11 +358,74 @@ func TestNatZero(t *testing.T) { } else { t.Logf("Workload egressed via NAT auto-assigned IP %s (EIP %s attached after; expected during restart)", msg.EgressIP, natEIP) } - }) + }) { + return + } - // ── Phase 4: Cleanup action ───────────────────────────────────────── + // ── Phase 4: NAT replacement on AMI update ───────────────────────── - t.Run("CleanupAction", func(t *testing.T) { + if !runPhase("NATAMIUpgrade", func(t *testing.T) { + if updatedNatAMI == "" { + t.Skip("NAT_ZERO_TEST_UPDATED_NAT_AMI_ID not set") + } + require.NotEmpty(t, activeWorkloadID, "AMI update phase requires an active workload") + + currentNat := waitForRunningNATWithEIP(t, ec2Client, vpcID, "current NAT running with EIP") + oldNatID := aws.StringValue(currentNat.InstanceId) + oldNatAMI := aws.StringValue(currentNat.ImageId) + require.NotEmpty(t, oldNatID, "current NAT should have an instance id") + require.NotEmpty(t, oldNatAMI, "current NAT should have an AMI id") + if oldNatAMI == updatedNatAMI { + t.Skipf("current NAT already uses target AMI %s", updatedNatAMI) + } + t.Logf("Updating NAT AMI from %s to %s", oldNatAMI, updatedNatAMI) + + applyStart := time.Now() + opts.Vars["nat_ami_id"] = updatedNatAMI + terraform.Apply(t, opts) + record("Terraform apply (AMI update)", time.Since(applyStart)) + + invokeTerminateStart := time.Now() + invokeLambda(t, lambdaClient, lambdaName, map[string]string{ + "instance_id": activeWorkloadID, + "state": "running", + }) + record("Lambda invoke (AMI update terminate)", time.Since(invokeTerminateStart)) + + waitTermStart := time.Now() + waitForInstanceTerminated(t, 
ec2Client, oldNatID) + record("Wait for outdated NAT terminated", time.Since(waitTermStart)) + + // The old NAT termination emits the next EventBridge signal. That + // should drive creation of the replacement NAT without another manual + // invoke, which would race the single-concurrency reconciler. + replacementStart := time.Now() + replacementNat := waitForRunningNATWithEIP(t, ec2Client, vpcID, "replacement NAT running with EIP") + record("Wait for replacement NAT running with EIP", time.Since(replacementStart)) + + require.NotEqual(t, oldNatID, aws.StringValue(replacementNat.InstanceId), "replacement NAT should be a new instance") + require.Equal(t, updatedNatAMI, aws.StringValue(replacementNat.ImageId), "replacement NAT should use updated AMI") + + replacementEIP := natPublicIP(replacementNat) + require.NotEmpty(t, replacementEIP, "replacement NAT should have a public IP") + + upgradeWorkloadStart := time.Now() + upgradeWorkloadID := launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) + record("Launch workload instance (AMI update)", time.Since(upgradeWorkloadStart)) + activeWorkloadID = upgradeWorkloadID + + t.Log("Waiting for workload connectivity via replacement NAT (SQS)...") + egressStart := time.Now() + msg := waitForEgress(t, sqsClient, queueURL, 4*time.Minute) + record("Wait for workload egress IP (AMI update)", time.Since(egressStart)) + require.Equal(t, replacementEIP, msg.EgressIP, "workload egress IP should match replacement NAT EIP") + }) { + return + } + + // ── Phase 5: Cleanup action ───────────────────────────────────────── + + runPhase("CleanupAction", func(t *testing.T) { // Terminate all test workloads before cleanup to match production // destroy ordering where Terraform deletes the EventBridge target // (stopping new events) before invoking the cleanup Lambda. 
@@ -470,10 +519,21 @@ func TestNatZero(t *testing.T) { func invokeLambda(t *testing.T, client *lambda.Lambda, funcName string, payload map[string]string) { t.Helper() body, _ := json.Marshal(payload) - out, err := client.Invoke(&lambda.InvokeInput{ - FunctionName: aws.String(funcName), - Payload: body, - LogType: aws.String("Tail"), + var out *lambda.InvokeOutput + _, err := retry.DoWithRetryE(t, "lambda invoke", 20, 3*time.Second, func() (string, error) { + var invokeErr error + out, invokeErr = client.Invoke(&lambda.InvokeInput{ + FunctionName: aws.String(funcName), + Payload: body, + LogType: aws.String("Tail"), + }) + if invokeErr == nil { + return "OK", nil + } + if isLambdaConcurrencyThrottle(invokeErr) { + return "", invokeErr + } + return "", retry.FatalError{Underlying: invokeErr} }) require.NoError(t, err, "Lambda invocation failed") if out.FunctionError != nil { @@ -494,6 +554,17 @@ func invokeLambda(t *testing.T, client *lambda.Lambda, funcName string, payload t.Logf("Lambda invoked: %v", payload) } +func isLambdaConcurrencyThrottle(err error) bool { + awsErr, ok := err.(awserr.Error) + if !ok { + return false + } + if awsErr.Code() != "TooManyRequestsException" { + return false + } + return strings.Contains(awsErr.Message(), "ReservedFunctionConcurrentInvocationLimitExceeded") +} + // dumpLambdaLogs prints recent Lambda CloudWatch log events for post-mortem debugging. 
func dumpLambdaLogs(t *testing.T, client *cloudwatchlogs.CloudWatchLogs, logGroup string) { t.Helper() @@ -658,6 +729,56 @@ func findWorkloadsInState(t *testing.T, c *ec2.EC2, vpcID, runID string, states return res } +func waitForRunningNATWithEIP(t *testing.T, c *ec2.EC2, vpcID, description string) *ec2.Instance { + t.Helper() + + var natInstance *ec2.Instance + retry.DoWithRetry(t, description, 100, 2*time.Second, func() (string, error) { + nats := findNATInstances(t, c, vpcID) + for _, n := range nats { + if aws.StringValue(n.State.Name) == "running" && natPublicIP(n) != "" { + natInstance = n + return "OK", nil + } + if aws.StringValue(n.State.Name) == "running" { + return "", fmt.Errorf("NAT running but no EIP yet") + } + } + return "", fmt.Errorf("no running NAT (%d found)", len(nats)) + }) + return natInstance +} + +func waitForInstanceTerminated(t *testing.T, c *ec2.EC2, instanceID string) { + t.Helper() + + retry.DoWithRetry(t, "instance terminated", 60, 2*time.Second, func() (string, error) { + out, err := c.DescribeInstances(&ec2.DescribeInstancesInput{ + InstanceIds: []*string{aws.String(instanceID)}, + }) + if err != nil { + return "", err + } + if len(out.Reservations) == 0 || len(out.Reservations[0].Instances) == 0 { + return "OK", nil + } + state := aws.StringValue(out.Reservations[0].Instances[0].State.Name) + if state == ec2.InstanceStateNameTerminated { + return "OK", nil + } + return "", fmt.Errorf("instance %s still %s", instanceID, state) + }) +} + +func natPublicIP(nat *ec2.Instance) string { + for _, eni := range nat.NetworkInterfaces { + if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && eni.Association != nil { + return aws.StringValue(eni.Association.PublicIp) + } + } + return "" +} + func launchWorkload(t *testing.T, c *ec2.EC2, subnet, ami, runID, profile, queueURL string) string { t.Helper() out, err := c.RunInstances(&ec2.RunInstancesInput{ @@ -849,13 +970,25 @@ func TestNoOrphanedResources(t *testing.T) { return found }}, 
{"Lambda", func() []string { - _, err := lambdaClient.GetFunction(&lambda.GetFunctionInput{ - FunctionName: aws.String(testPrefix + "-nat-zero"), - }) - if err == nil { - return []string{"Lambda nat-test-nat-zero"} + var found []string + var marker *string + for { + out, err := lambdaClient.ListFunctions(&lambda.ListFunctionsInput{Marker: marker}) + if err != nil { + return nil + } + for _, fn := range out.Functions { + name := aws.StringValue(fn.FunctionName) + if strings.HasPrefix(name, testPrefix) { + found = append(found, fmt.Sprintf("Lambda %s", name)) + } + } + if out.NextMarker == nil || aws.StringValue(out.NextMarker) == "" { + break + } + marker = out.NextMarker } - return nil + return found }}, {"LogGroups", func() []string { out, err := cwClient.DescribeLogGroups(&cloudwatchlogs.DescribeLogGroupsInput{ diff --git a/variables.tf b/variables.tf index 47c046a..941c9de 100644 --- a/variables.tf +++ b/variables.tf @@ -68,29 +68,22 @@ variable "encrypt_root_volume" { description = "Encrypt the root EBS volume." } -# AMI configuration -variable "use_fck_nat_ami" { - type = bool - default = true - description = "Use the public fck-nat AMI. Set to false to use a custom AMI." -} - variable "ami_id" { type = string default = null description = "Explicit AMI ID to use (overrides AMI lookup entirely)" } -variable "custom_ami_owner" { +variable "ami_owner_account" { type = string - default = null - description = "AMI owner account ID when use_fck_nat_ami is false" + default = "590144423513" + description = "Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI." } -variable "custom_ami_name_pattern" { +variable "ami_name_pattern" { type = string - default = null - description = "AMI name pattern when use_fck_nat_ami is false" + default = "nat-zero-al2023-minimal-arm64-20260306-064438" + description = "AMI name pattern used when resolving the default nat-zero AMI. 
Override this to use your own shared AMI." } variable "nat_tag_key" { @@ -143,11 +136,11 @@ variable "log_retention_days" { variable "build_lambda_locally" { type = bool default = false - description = "Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally." + description = "Build the Lambda binary from Go source during apply instead of downloading a pre-compiled release. This is primarily for local development and may require a second apply after code changes." } -variable "lambda_binary_url" { +variable "lambda_binary_path" { type = string - default = "https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip" - description = "URL to the pre-compiled Go Lambda zip. Updated automatically by CI." + default = null + description = "Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation." } diff --git a/versions.tf b/versions.tf index dd2367d..ff35e50 100644 --- a/versions.tf +++ b/versions.tf @@ -1,11 +1,15 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.4" required_providers { aws = { source = "hashicorp/aws" version = ">= 5.0" } + http = { + source = "hashicorp/http" + version = ">= 3.0" + } null = { source = "hashicorp/null" version = ">= 3.0"