Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/go-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@ on:
pull_request:
paths:
- "cmd/lambda/**"
- "test/**"
- "tests/**"
- ".github/workflows/go-tests.yml"
push:
branches: [main]
paths:
- "cmd/lambda/**"
- "test/**"
- "tests/**"
- ".github/workflows/go-tests.yml"

permissions:
Expand Down
139 changes: 139 additions & 0 deletions files/nat_bootstrap_guard_user_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#!/bin/bash
# Bootstrap script: writes a guard script and a systemd oneshot unit to disk,
# then enables and starts that unit. Any unhandled failure aborts the script.
set -euo pipefail

# Write the guard script verbatim. The quoted heredoc delimiter disables
# expansion, so every "$..." in the body is evaluated when the guard runs,
# not while this installer runs. Note the guard itself enables -u and
# pipefail but not -e.
cat >/usr/local/sbin/nat-zero-fck-nat-guard.sh <<'GUARD_SCRIPT'
#!/bin/bash
set -uo pipefail

# Tag handed to logger(1) and prefixed to every line the guard prints.
LOG_TAG="nat-zero-fck-nat-guard"
# Base URL for all instance-metadata requests (token and interface lookups).
IMDS_BASE_URL="http://169.254.169.254/latest"
# Retry budget used by main: number of attempts, and seconds to sleep
# before retrying.
MAX_ATTEMPTS=15
SLEEP_SECONDS=4

# Emit one message both to syslog (tagged with LOG_TAG) and to stdout.
# Globals:   LOG_TAG (read)
# Arguments: $1 - message text
# Outputs:   "<LOG_TAG>: <message>" on stdout; the bare message via logger(1)
log() {
  local text="$1"

  logger -t "$LOG_TAG" "$text"
  printf '%s: %s\n' "$LOG_TAG" "$text"
}

# Request a metadata session token (PUT to the token endpoint, 60 s TTL).
# Globals:   IMDS_BASE_URL (read)
# Outputs:   the response body (the token) on stdout
# Returns:   curl's exit status; --fail turns HTTP errors into a non-zero
#            status and the timeouts bound each call to about two seconds.
get_imds_token() {
  local -a curl_opts=(
    --silent --show-error --fail
    --connect-timeout 1 --max-time 2
  )

  curl "${curl_opts[@]}" \
    -X PUT "$IMDS_BASE_URL/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 60"
}

# Print the name of the local network interface whose MAC address matches the
# interface that the metadata service reports with device-number 0.
# Globals:   IMDS_BASE_URL (read)
# Outputs:   the interface name (a /sys/class/net entry) on stdout
# Returns:   0 when a match was printed; 1 when any metadata request fails or
#            no local interface carries the device-0 MAC address.
resolve_public_interface() {
  local imds_token mac_listing mac_entry eni_mac dev_index nic_path nic_name nic_mac
  local macs_url="$IMDS_BASE_URL/meta-data/network/interfaces/macs"
  local -a imds_get=(
    curl --silent --show-error --fail --connect-timeout 1 --max-time 2
  )

  imds_token="$(get_imds_token)" || return 1

  mac_listing="$("${imds_get[@]}" \
    -H "X-aws-ec2-metadata-token: $imds_token" \
    "$macs_url/")" || return 1

  # Intentionally unquoted: the listing is split into one word per MAC entry.
  for mac_entry in $mac_listing; do
    # Listing entries may carry a trailing slash; strip it to get the bare MAC.
    eni_mac="${mac_entry%/}"

    dev_index="$("${imds_get[@]}" \
      -H "X-aws-ec2-metadata-token: $imds_token" \
      "$macs_url/$eni_mac/device-number")" || return 1

    # Only the interface attached as device 0 is of interest.
    [[ "$dev_index" == "0" ]] || continue

    for nic_path in /sys/class/net/*; do
      nic_name="${nic_path##*/}"

      # Skip loopback and entries that expose no hardware address file.
      [[ "$nic_name" != "lo" && -f "/sys/class/net/$nic_name/address" ]] || continue

      nic_mac="$(cat "/sys/class/net/$nic_name/address" 2>/dev/null || true)"
      if [[ "$nic_mac" == "$eni_mac" ]]; then
        echo "$nic_name"
        return 0
      fi
    done
  done

  return 1
}

# Report whether the nat table's POSTROUTING chain contains a rule that
# masquerades traffic leaving through the given interface.
# Arguments: $1 - interface name to look for after "-o"
# Returns:   0 when some "-A POSTROUTING" rule has both "-o <iface>" and
#            "-j MASQUERADE"; non-zero otherwise (including when iptables
#            fails, because the surrounding script enables pipefail).
# A negated match ("! -o <iface>") excludes the interface from the rule and
# is therefore not counted as a rule for that interface.
has_snat_rule_for_interface() {
  local iface="$1"

  # The awk program reads all input instead of exiting at the first hit, so
  # iptables is never cut off by a closed pipe.
  iptables -t nat -S POSTROUTING 2>/dev/null | awk -v ifn="$iface" '
    $1 == "-A" && $2 == "POSTROUTING" {
      has_if = 0
      has_masq = 0
      # Stop one field early: every option inspected here needs a value.
      for (i = 3; i < NF; i++) {
        if ($i == "-o" && $(i + 1) == ifn && $(i - 1) != "!") {
          has_if = 1
        }
        if ($i == "-j" && $(i + 1) == "MASQUERADE") {
          has_masq = 1
        }
      }
      if (has_if && has_masq) {
        found = 1
      }
    }
    END { exit(found ? 0 : 1) }
  '
}

# Retry loop of the guard: resolve the public interface, restart
# fck-nat.service, then confirm a MASQUERADE rule exists on that interface.
# Globals:   MAX_ATTEMPTS, SLEEP_SECONDS (read)
# Outputs:   progress messages via log
# Returns:   0 as soon as the rule is observed; 1 once every attempt is used
main() {
  local attempt public_interface

  for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
    # Pause only between attempts, so that the final failure is reported
    # right away instead of after one more sleep that nothing waits for.
    if ((attempt > 1)); then
      sleep "$SLEEP_SECONDS"
    fi

    # A failed lookup is an expected transient state here; its diagnostics
    # are discarded and an empty result simply means "retry".
    public_interface="$(resolve_public_interface 2>/dev/null || true)"
    if [ -z "$public_interface" ]; then
      log "attempt $attempt/$MAX_ATTEMPTS: IMDS or interface lookup not ready"
      continue
    fi

    log "attempt $attempt/$MAX_ATTEMPTS: resolved public interface $public_interface"

    if ! systemctl restart fck-nat.service; then
      log "attempt $attempt/$MAX_ATTEMPTS: failed restarting fck-nat.service"
      continue
    fi

    if has_snat_rule_for_interface "$public_interface"; then
      log "SNAT MASQUERADE rule is installed on $public_interface"
      return 0
    fi

    log "attempt $attempt/$MAX_ATTEMPTS: SNAT MASQUERADE rule not present on $public_interface"
  done

  log "exhausted retries without a valid SNAT rule"
  return 1
}

# Entry point of the guard script; as the last command, the status of main
# becomes the exit status of the script.
main "$@"
GUARD_SCRIPT

# The unit's ExecStart runs the script directly, so it must be executable.
chmod 0755 /usr/local/sbin/nat-zero-fck-nat-guard.sh

# Install a oneshot unit that wants, and is ordered after,
# network-online.target and fck-nat.service.
cat >/etc/systemd/system/nat-zero-fck-nat-guard.service <<'GUARD_UNIT'
[Unit]
Description=nat-zero guard for fck-nat IMDS/interface race
Wants=network-online.target fck-nat.service
After=network-online.target fck-nat.service

[Service]
Type=oneshot
ExecStart=/usr/local/sbin/nat-zero-fck-nat-guard.sh

[Install]
WantedBy=multi-user.target
GUARD_UNIT

# Make systemd load the new unit file, then enable it (the unit declares
# WantedBy=multi-user.target).
systemctl daemon-reload
systemctl enable nat-zero-fck-nat-guard.service
# Run once now, but do not fail cloud-init if retries are exhausted.
systemctl start nat-zero-fck-nat-guard.service || true
1 change: 1 addition & 0 deletions lambda.tf
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ resource "aws_lambda_function" "nat_zero" {
var.market_type,
tostring(var.block_device_size),
tostring(var.encrypt_root_volume),
local.fck_nat_bootstrap_guard_version,
]))
}
}
Expand Down
6 changes: 6 additions & 0 deletions launch_template.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ locals {
},
var.tags,
)

# Bump when bootstrap guard behavior changes so existing NAT instances
# are replaced via the existing ConfigVersion reconciliation.
fck_nat_bootstrap_guard_version = "fck-nat-bootstrap-guard-v1"
}

resource "aws_launch_template" "nat_launch_template" {
Expand Down Expand Up @@ -45,6 +49,8 @@ resource "aws_launch_template" "nat_launch_template" {
http_tokens = "required"
}

user_data = base64encode(file("${path.module}/files/nat_bootstrap_guard_user_data.sh"))

network_interfaces {
network_interface_id = aws_network_interface.nat_public_network_interface[count.index].id
device_index = 0
Expand Down
35 changes: 35 additions & 0 deletions tests/integration/nat_zero_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ func TestNatZero(t *testing.T) {
lambdaName := terraform.Output(t, opts, "lambda_function_name")
encryptRootVolume = terraform.Output(t, opts, "encrypt_root_volume")
t.Logf("VPC: %s, private subnet: %s, Lambda: %s", vpcID, privateSubnet, lambdaName)
assertLaunchTemplateBootstrapGuard(t, ec2Client, vpcID)

// Terminate test workload instances before terraform destroy.
defer func() {
Expand Down Expand Up @@ -718,6 +719,40 @@ func waitForEgress(t *testing.T, client *sqs.SQS, queueURL string, timeout time.
return egressMessage{} // unreachable
}

// assertLaunchTemplateBootstrapGuard looks up every launch template tagged
// with VpcId=vpcID and checks that the user data of its $Latest version, once
// base64-decoded, mentions both the guard script and the guard systemd unit.
// Lookup, shape and decoding problems abort the test via require; a missing
// marker is recorded via assert so that all templates are still inspected.
func assertLaunchTemplateBootstrapGuard(t *testing.T, c *ec2.EC2, vpcID string) {
	t.Helper()

	vpcFilter := &ec2.Filter{
		Name:   aws.String("tag:VpcId"),
		Values: []*string{aws.String(vpcID)},
	}
	found, err := c.DescribeLaunchTemplates(&ec2.DescribeLaunchTemplatesInput{
		Filters: []*ec2.Filter{vpcFilter},
	})
	require.NoError(t, err)
	require.NotEmpty(t, found.LaunchTemplates, "expected launch template tagged with VpcId=%s", vpcID)

	// Strings that the decoded user data must contain.
	expectedMarkers := []string{
		"nat-zero-fck-nat-guard.sh",
		"nat-zero-fck-nat-guard.service",
	}

	for _, tmpl := range found.LaunchTemplates {
		templateID := aws.StringValue(tmpl.LaunchTemplateId)

		latest, err := c.DescribeLaunchTemplateVersions(&ec2.DescribeLaunchTemplateVersionsInput{
			LaunchTemplateId: tmpl.LaunchTemplateId,
			Versions:         []*string{aws.String("$Latest")},
		})
		require.NoError(t, err)
		require.Len(t, latest.LaunchTemplateVersions, 1)

		templateData := latest.LaunchTemplateVersions[0].LaunchTemplateData
		require.NotNil(t, templateData)

		rawUserData := aws.StringValue(templateData.UserData)
		require.NotEmpty(t, rawUserData, "launch template %s missing user_data", templateID)

		plain, err := base64.StdEncoding.DecodeString(rawUserData)
		require.NoError(t, err)

		script := string(plain)
		for _, marker := range expectedMarkers {
			assert.Contains(t, script, marker)
		}
	}
}

func assertRouteTableEntry(t *testing.T, c *ec2.EC2, vpcID string, nat *ec2.Instance) {
t.Helper()
var privateENI string
Expand Down
Loading