From 44ced933c4ac37833e937769f9e997063b875c1b Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Wed, 4 Mar 2026 13:59:39 +1000 Subject: [PATCH 1/2] fix: add boot-time fck-nat SNAT guard via launch template user_data --- files/nat_bootstrap_guard_user_data.sh | 139 +++++++++++++++++++++++++ lambda.tf | 1 + launch_template.tf | 6 ++ tests/integration/nat_zero_test.go | 35 +++++++ 4 files changed, 181 insertions(+) create mode 100644 files/nat_bootstrap_guard_user_data.sh diff --git a/files/nat_bootstrap_guard_user_data.sh b/files/nat_bootstrap_guard_user_data.sh new file mode 100644 index 0000000..7500126 --- /dev/null +++ b/files/nat_bootstrap_guard_user_data.sh @@ -0,0 +1,139 @@ +#!/bin/bash +set -euo pipefail + +cat >/usr/local/sbin/nat-zero-fck-nat-guard.sh <<'GUARD_SCRIPT' +#!/bin/bash +set -uo pipefail + +LOG_TAG="nat-zero-fck-nat-guard" +IMDS_BASE_URL="http://169.254.169.254/latest" +MAX_ATTEMPTS=15 +SLEEP_SECONDS=4 + +log() { + local msg="$1" + logger -t "$LOG_TAG" "$msg" + echo "$LOG_TAG: $msg" +} + +get_imds_token() { + curl --silent --show-error --fail --connect-timeout 1 --max-time 2 \ + -X PUT "$IMDS_BASE_URL/api/token" \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 60" +} + +resolve_public_interface() { + local token macs_raw mac mac_no_slash device_number iface iface_mac + + token="$(get_imds_token)" || return 1 + + macs_raw="$(curl --silent --show-error --fail --connect-timeout 1 --max-time 2 \ + -H "X-aws-ec2-metadata-token: $token" \ + "$IMDS_BASE_URL/meta-data/network/interfaces/macs/")" || return 1 + + for mac in $macs_raw; do + mac_no_slash="${mac%/}" + + device_number="$(curl --silent --show-error --fail --connect-timeout 1 --max-time 2 \ + -H "X-aws-ec2-metadata-token: $token" \ + "$IMDS_BASE_URL/meta-data/network/interfaces/macs/$mac_no_slash/device-number")" || return 1 + + if [ "$device_number" != "0" ]; then + continue + fi + + for iface in /sys/class/net/*; do + iface="${iface##*/}" + if [ "$iface" = "lo" ] || [ ! -f "/sys/class/net/$iface/address" ]; then + continue + fi + + iface_mac="$(cat "/sys/class/net/$iface/address" 2>/dev/null || true)" + if [ "$iface_mac" = "$mac_no_slash" ]; then + echo "$iface" + return 0 + fi + done + done + + return 1 +} + +has_snat_rule_for_interface() { + local iface="$1" + + iptables -t nat -S POSTROUTING 2>/dev/null | awk -v ifn="$iface" ' + $1 == "-A" && $2 == "POSTROUTING" { + has_if = 0 + has_masq = 0 + for (i = 3; i <= NF; i++) { + if ($i == "-o" && (i + 1) <= NF && $(i + 1) == ifn) { + has_if = 1 + } + if ($i == "-j" && (i + 1) <= NF && $(i + 1) == "MASQUERADE") { + has_masq = 1 + } + } + if (has_if && has_masq) { + found = 1 + } + } + END { exit(found ? 0 : 1) } + ' +} + +main() { + local attempt public_interface + + for attempt in $(seq 1 "$MAX_ATTEMPTS"); do + public_interface="$(resolve_public_interface 2>/dev/null || true)" + if [ -z "$public_interface" ]; then + log "attempt $attempt/$MAX_ATTEMPTS: IMDS or interface lookup not ready" + sleep "$SLEEP_SECONDS" + continue + fi + + log "attempt $attempt/$MAX_ATTEMPTS: resolved public interface $public_interface" + + if ! systemctl restart fck-nat.service; then + log "attempt $attempt/$MAX_ATTEMPTS: failed restarting fck-nat.service" + sleep "$SLEEP_SECONDS" + continue + fi + + if has_snat_rule_for_interface "$public_interface"; then + log "SNAT MASQUERADE rule is installed on $public_interface" + return 0 + fi + + log "attempt $attempt/$MAX_ATTEMPTS: SNAT MASQUERADE rule not present on $public_interface" + sleep "$SLEEP_SECONDS" + done + + log "exhausted retries without a valid SNAT rule" + return 1 +} + +main "$@" +GUARD_SCRIPT + +chmod 0755 /usr/local/sbin/nat-zero-fck-nat-guard.sh + +cat >/etc/systemd/system/nat-zero-fck-nat-guard.service <<'GUARD_UNIT' +[Unit] +Description=nat-zero guard for fck-nat IMDS/interface race +Wants=network-online.target fck-nat.service +After=network-online.target fck-nat.service + +[Service] +Type=oneshot +ExecStart=/usr/local/sbin/nat-zero-fck-nat-guard.sh + +[Install] +WantedBy=multi-user.target +GUARD_UNIT + +systemctl daemon-reload +systemctl enable nat-zero-fck-nat-guard.service +# Run once now, but do not fail cloud-init if retries are exhausted. +systemctl start nat-zero-fck-nat-guard.service || true diff --git a/lambda.tf b/lambda.tf index 049ec2e..6018d67 100644 --- a/lambda.tf +++ b/lambda.tf @@ -81,6 +81,7 @@ resource "aws_lambda_function" "nat_zero" { var.market_type, tostring(var.block_device_size), tostring(var.encrypt_root_volume), + local.fck_nat_bootstrap_guard_version, ])) } } diff --git a/launch_template.tf b/launch_template.tf index 2388791..74da1d7 100644 --- a/launch_template.tf +++ b/launch_template.tf @@ -5,6 +5,10 @@ locals { }, var.tags, ) + + # Bump when bootstrap guard behavior changes so existing NAT instances + # are replaced via the existing ConfigVersion reconciliation. + fck_nat_bootstrap_guard_version = "fck-nat-bootstrap-guard-v1" } resource "aws_launch_template" "nat_launch_template" { @@ -45,6 +49,8 @@ resource "aws_launch_template" "nat_launch_template" { http_tokens = "required" } + user_data = base64encode(file("${path.module}/files/nat_bootstrap_guard_user_data.sh")) + network_interfaces { network_interface_id = aws_network_interface.nat_public_network_interface[count.index].id device_index = 0 diff --git a/tests/integration/nat_zero_test.go b/tests/integration/nat_zero_test.go index dc4af3a..b351de6 100644 --- a/tests/integration/nat_zero_test.go +++ b/tests/integration/nat_zero_test.go @@ -130,6 +130,7 @@ func TestNatZero(t *testing.T) { lambdaName := terraform.Output(t, opts, "lambda_function_name") encryptRootVolume = terraform.Output(t, opts, "encrypt_root_volume") t.Logf("VPC: %s, private subnet: %s, Lambda: %s", vpcID, privateSubnet, lambdaName) + assertLaunchTemplateBootstrapGuard(t, ec2Client, vpcID) // Terminate test workload instances before terraform destroy. defer func() { @@ -718,6 +719,40 @@ func waitForEgress(t *testing.T, client *sqs.SQS, queueURL string, timeout time. return egressMessage{} // unreachable } +func assertLaunchTemplateBootstrapGuard(t *testing.T, c *ec2.EC2, vpcID string) { + t.Helper() + + templates, err := c.DescribeLaunchTemplates(&ec2.DescribeLaunchTemplatesInput{ + Filters: []*ec2.Filter{ + {Name: aws.String("tag:VpcId"), Values: []*string{aws.String(vpcID)}}, + }, + }) + require.NoError(t, err) + require.NotEmpty(t, templates.LaunchTemplates, "expected launch template tagged with VpcId=%s", vpcID) + + for _, lt := range templates.LaunchTemplates { + versions, err := c.DescribeLaunchTemplateVersions(&ec2.DescribeLaunchTemplateVersionsInput{ + LaunchTemplateId: lt.LaunchTemplateId, + Versions: []*string{aws.String("$Latest")}, + }) + require.NoError(t, err) + require.Len(t, versions.LaunchTemplateVersions, 1) + + data := versions.LaunchTemplateVersions[0].LaunchTemplateData + require.NotNil(t, data) + + encodedUserData := aws.StringValue(data.UserData) + require.NotEmpty(t, encodedUserData, "launch template %s missing user_data", aws.StringValue(lt.LaunchTemplateId)) + + decodedUserData, err := base64.StdEncoding.DecodeString(encodedUserData) + require.NoError(t, err) + + script := string(decodedUserData) + assert.Contains(t, script, "nat-zero-fck-nat-guard.sh") + assert.Contains(t, script, "nat-zero-fck-nat-guard.service") + } +} + func assertRouteTableEntry(t *testing.T, c *ec2.EC2, vpcID string, nat *ec2.Instance) { t.Helper() var privateENI string From 43628485b5b5c94c183a1df9858cc86fa088a1d5 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Wed, 4 Mar 2026 14:01:56 +1000 Subject: [PATCH 2/2] ci: trigger go-tests on test directory changes --- .github/workflows/go-tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/go-tests.yml b/.github/workflows/go-tests.yml index 827b23b..54fb8f1 100644 --- a/.github/workflows/go-tests.yml +++ b/.github/workflows/go-tests.yml @@ -4,11 +4,15 @@ on: pull_request: paths: - "cmd/lambda/**" + - "test/**" + - "tests/**" - ".github/workflows/go-tests.yml" push: branches: [main] paths: - "cmd/lambda/**" + - "test/**" + - "tests/**" - ".github/workflows/go-tests.yml" permissions: