From 9473d099256ec6fd8ba0d6a48f49e13b31b24052 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:47:56 +0100
Subject: [PATCH 01/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/quickstart/docker-compose.yml | 48 ++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 examples/quickstart/docker-compose.yml

diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
new file mode 100644
index 0000000..0b5affa
--- /dev/null
+++ b/examples/quickstart/docker-compose.yml
@@ -0,0 +1,48 @@
+# Fairvisor Edge — Quickstart stack (standalone mode)
+#
+# Usage:
+#   docker compose up -d
+#   curl -s http://localhost:8080/readyz          # health check
+#   curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
+#     -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-upstream-key" \
+#     -H "Content-Type: application/json" \
+#     -d @../../fixtures/normal_request.json      # expect 200
+#   curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
+#     -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-upstream-key" \
+#     -H "Content-Type: application/json" \
+#     -d @../../fixtures/over_limit_request.json  # expect 429
+#
+# This file is also the base for the e2e-smoke CI check.
+# CI extends it via tests/e2e/docker-compose.test.yml; do not diverge the
+# service name, port, or volume contract without updating CI as well.
+
+services:
+  edge:
+    image: ghcr.io/fairvisor/fairvisor-edge:latest
+    ports:
+      - "8080:8080"
+    environment:
+      FAIRVISOR_CONFIG_FILE: /etc/fairvisor/policy.json
+      FAIRVISOR_MODE: wrapper
+      FAIRVISOR_SHARED_DICT_SIZE: 32m
+      FAIRVISOR_LOG_LEVEL: info
+      FAIRVISOR_WORKER_PROCESSES: "1"
+    volumes:
+      - ./policy.json:/etc/fairvisor/policy.json:ro
+    healthcheck:
+      test: ["CMD", "curl", "-sf", "http://127.0.0.1:8080/readyz"]
+      interval: 2s
+      timeout: 2s
+      retries: 15
+      start_period: 5s
+
+  mock_llm:
+    image: nginx:1.27-alpine
+    volumes:
+      - ./mock-llm.conf:/etc/nginx/nginx.conf:ro
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:80/"]
+      interval: 2s
+      timeout: 2s
+      retries: 10
+      start_period: 5s

From c608efce8c579c8841286365014e5c58eecb81d4 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:47:57 +0100
Subject: [PATCH 02/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/quickstart/policy.json | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 examples/quickstart/policy.json

diff --git a/examples/quickstart/policy.json b/examples/quickstart/policy.json
new file mode 100644
index 0000000..f5520aa
--- /dev/null
+++ b/examples/quickstart/policy.json
@@ -0,0 +1,31 @@
+{
+  "bundle_version": 1,
+  "issued_at": "2026-01-01T00:00:00Z",
+  "expires_at": "2030-01-01T00:00:00Z",
+  "policies": [
+    {
+      "id": "quickstart-tpm-policy",
+      "spec": {
+        "selector": {
+          "pathPrefix": "/openai/",
+          "methods": ["POST"]
+        },
+        "mode": "enforce",
+        "rules": [
+          {
+            "name": "tpm-limit",
+            "limit_keys": ["jwt:sub"],
+            "algorithm": "token_bucket_llm",
+            "algorithm_config": {
+              "tokens_per_minute": 100,
+              "tokens_per_day": 1000,
+              "burst_tokens": 100,
+              "default_max_completion_tokens": 50
+            }
+          }
+        ]
+      }
+    }
+  ],
+  "kill_switches": []
+}

From 046ac3ba7fbd1f53e8f412aa63aed12241901c67 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:47:57 +0100
Subject: [PATCH 03/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/quickstart/mock-llm.conf | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 examples/quickstart/mock-llm.conf

diff --git a/examples/quickstart/mock-llm.conf b/examples/quickstart/mock-llm.conf
new file mode 100644
index 0000000..26603ab
--- /dev/null
+++ b/examples/quickstart/mock-llm.conf
@@ -0,0 +1,10 @@
+events {}
+http {
+  server {
+    listen 80;
+    location / {
+      default_type application/json;
+      return 200 '{"id":"chatcmpl-qs","object":"chat.completion","choices":[{"index":0,"message":{"role":"assistant","content":"Hello from the mock backend!"},"finish_reason":"stop"}],"usage":{"prompt_tokens":10,"completion_tokens":8,"total_tokens":18}}';
+    }
+  }
+}

From 7b4cbbedad973de8891c6ecfb1628c471c728f26 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:47:58 +0100
Subject: [PATCH 04/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/quickstart/README.md | 97 +++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 examples/quickstart/README.md

diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md
new file mode 100644
index 0000000..a23fc78
--- /dev/null
+++ b/examples/quickstart/README.md
@@ -0,0 +1,97 @@
+# Fairvisor Edge — Quickstart
+
+Go from `git clone` to working policy enforcement in one step.
+
+## Prerequisites
+
+- Docker with Compose V2 (`docker compose version`)
+- Port 8080 free on localhost
+
+## Start
+
+```bash
+docker compose up -d
+```
+
+Wait for the edge service to report healthy:
+
+```bash
+docker compose ps
+# edge should show "healthy"
+```
+
+## Verify enforcement
+
+**Allowed request** — should return `200`:
+
+```bash
+curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
+  -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-key" \
+  -H "Content-Type: application/json" \
+  -d @../../fixtures/normal_request.json
+```
+
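+The response body has the same shape as `../../fixtures/allow_response.json` (field values differ; the fixture's `_`-prefixed keys are annotations, not part of the body).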
+
+**Over-limit request** — should return `429`:
+
+```bash
+curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
+  -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-key" \
+  -H "Content-Type: application/json" \
+  -d @../../fixtures/over_limit_request.json
+```
+
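+Expected response body matches the `error` object in `../../fixtures/reject_tpm_exceeded.json` (its `_comment` and `_headers` keys are annotations, and the header values there are illustrative).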
+The response will also include:
+- `X-Fairvisor-Reason: tpm_exceeded`
+- `Retry-After: 60`
+- `RateLimit-Limit: 100`
+- `RateLimit-Remaining: 0`
+
+## Wrapper mode and auth
+
+This quickstart runs in `FAIRVISOR_MODE=wrapper`. The composite Bearer token format is:
+
+```
+Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
+```
+
+- `CLIENT_JWT` — a signed JWT identifying the calling client/tenant (used for policy enforcement)
+- `UPSTREAM_KEY` — the real upstream API key forwarded to the provider (e.g. `sk-...` for OpenAI)
+
+Fairvisor strips the composite header and injects the correct provider auth before forwarding. The upstream key is **never returned to the caller** — see `../../fixtures/allow_response.json` for proof (no `Authorization`, `x-api-key`, or `x-goog-api-key` headers in the response).
+
+## Provider-prefixed paths
+
+Wrapper mode routes by path prefix:
+
+| Path prefix | Upstream | Auth header |
+|---|---|---|
+| `/openai/v1/...` | `https://api.openai.com/v1/...` | `Authorization: Bearer UPSTREAM_KEY` |
+| `/anthropic/v1/...` | `https://api.anthropic.com/v1/...` | `x-api-key: UPSTREAM_KEY` |
+| `/gemini/v1beta/...` | `https://generativelanguage.googleapis.com/v1beta/...` | `x-goog-api-key: UPSTREAM_KEY` |
+
+## Anthropic example
+
+```bash
+curl -s -X POST http://localhost:8080/anthropic/v1/messages \
+  -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-ant-fake-key" \
+  -H "Content-Type: application/json" \
+  -d @../../fixtures/anthropic_normal_request.json
+```
+
+A rejected Anthropic request returns an Anthropic-native error body — see `../../fixtures/reject_anthropic.json`.
+
+## Teardown
+
+```bash
+docker compose down
+```
+
+## Next steps
+
+- See `../recipes/` for team budgets, runaway agent guard, and provider failover scenarios
+- See `../../fixtures/` for all sample request/response artifacts
+- See [fairvisor/benchmark](https://github.com/fairvisor/benchmark) for performance benchmarks
+- See [docs/install/](../../docs/install/) for Kubernetes, VM, and SaaS deployment options

From a0551ffdbec47ebe82da9db01ae00edde616cdc8 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:47:59 +0100
Subject: [PATCH 05/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/recipes/team-budgets/policy.json | 47 +++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 examples/recipes/team-budgets/policy.json

diff --git a/examples/recipes/team-budgets/policy.json b/examples/recipes/team-budgets/policy.json
new file mode 100644
index 0000000..d361a30
--- /dev/null
+++ b/examples/recipes/team-budgets/policy.json
@@ -0,0 +1,47 @@
+{
+  "bundle_version": 1,
+  "issued_at": "2026-01-01T00:00:00Z",
+  "expires_at": "2030-01-01T00:00:00Z",
+  "policies": [
+    {
+      "id": "team-token-budget",
+      "spec": {
+        "selector": {
+          "pathPrefix": "/openai/",
+          "methods": ["POST"]
+        },
+        "mode": "enforce",
+        "rules": [
+          {
+            "name": "per-team-tpm",
+            "limit_keys": ["jwt:team_id"],
+            "algorithm": "token_bucket_llm",
+            "algorithm_config": {
+              "tokens_per_minute": 120000,
+              "tokens_per_day": 2000000,
+              "burst_tokens": 120000,
+              "default_max_completion_tokens": 1024
+            }
+          },
+          {
+            "name": "per-team-cost-budget",
+            "limit_keys": ["jwt:team_id"],
+            "algorithm": "cost_based",
+            "algorithm_config": {
+              "budget": 50000,
+              "period": "30d",
+              "cost_key": "fixed",
+              "fixed_cost": 1,
+              "staged_actions": [
+                { "threshold_percent": 80, "action": "warn" },
+                { "threshold_percent": 95, "action": "throttle", "delay_ms": 500 },
+                { "threshold_percent": 100, "action": "reject" }
+              ]
+            }
+          }
+        ]
+      }
+    }
+  ],
+  "kill_switches": []
+}

From a36312e84f5ad4b82d6ee71a96ce5dc88ff8efba Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:47:59 +0100
Subject: [PATCH 06/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/recipes/team-budgets/README.md | 45 +++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 examples/recipes/team-budgets/README.md

diff --git a/examples/recipes/team-budgets/README.md b/examples/recipes/team-budgets/README.md
new file mode 100644
index 0000000..54c1551
--- /dev/null
+++ b/examples/recipes/team-budgets/README.md
@@ -0,0 +1,45 @@
+# Recipe: Team Budgets
+
+Enforce per-team token and cost limits using JWT claims.
+
+## How it works
+
+Each request carries a JWT with a `team_id` claim. Fairvisor uses this as
+the bucket key for two independent rules:
+
+1. **TPM/TPD limit** — token-rate enforcement per minute and per day
+2. **Monthly cost budget** — cumulative cost cap with staged warn/throttle/reject
+
+## Deploy
+
+```bash
+# Copy policy to your edge config path
+cp policy.json /etc/fairvisor/policy.json
+
+# Or run it in the quickstart stack (replaces the ./policy.json that stack mounts):
+cp policy.json ../../quickstart/policy.json && docker compose -f ../../quickstart/docker-compose.yml up -d
+```
+
+## JWT shape expected
+
+```json
+{
+  "sub": "user-123",
+  "team_id": "engineering",
+  "plan": "pro",
+  "exp": 9999999999
+}
+```
+
+## Staged actions at cost budget thresholds
+
+| Threshold | Action |
+|---|---|
+| 80% | Warn (allow, log, emit business event) |
+| 95% | Throttle (allow with 500 ms delay) |
+| 100% | Reject (429, `budget_exceeded`) |
+
+## Related fixtures
+
+- `../../../fixtures/reject_tpd_exceeded.json` — TPD reject body
+- `../../../fixtures/reject_tpm_exceeded.json` — TPM reject body

From d70e2c4b5de167beae91c056dc5185017caed207 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:00 +0100
Subject: [PATCH 07/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 .../recipes/runaway-agent-guard/policy.json   | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 examples/recipes/runaway-agent-guard/policy.json

diff --git a/examples/recipes/runaway-agent-guard/policy.json b/examples/recipes/runaway-agent-guard/policy.json
new file mode 100644
index 0000000..2715b60
--- /dev/null
+++ b/examples/recipes/runaway-agent-guard/policy.json
@@ -0,0 +1,40 @@
+{
+  "bundle_version": 1,
+  "issued_at": "2026-01-01T00:00:00Z",
+  "expires_at": "2030-01-01T00:00:00Z",
+  "policies": [
+    {
+      "id": "runaway-agent-guard",
+      "spec": {
+        "selector": {
+          "pathPrefix": "/",
+          "methods": ["POST"]
+        },
+        "mode": "enforce",
+        "rules": [
+          {
+            "name": "loop-detection",
+            "limit_keys": ["jwt:agent_id"],
+            "algorithm": "loop_detector",
+            "algorithm_config": {
+              "window_seconds": 60,
+              "max_requests": 30,
+              "cooldown_seconds": 120
+            }
+          },
+          {
+            "name": "agent-tpm-guard",
+            "limit_keys": ["jwt:agent_id"],
+            "algorithm": "token_bucket_llm",
+            "algorithm_config": {
+              "tokens_per_minute": 50000,
+              "burst_tokens": 50000,
+              "default_max_completion_tokens": 512
+            }
+          }
+        ]
+      }
+    }
+  ],
+  "kill_switches": []
+}

From 489a28ab04e9de94ff4e46695c6e7328d2265623 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:01 +0100
Subject: [PATCH 08/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 .../recipes/runaway-agent-guard/README.md     | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 examples/recipes/runaway-agent-guard/README.md

diff --git a/examples/recipes/runaway-agent-guard/README.md b/examples/recipes/runaway-agent-guard/README.md
new file mode 100644
index 0000000..7b34491
--- /dev/null
+++ b/examples/recipes/runaway-agent-guard/README.md
@@ -0,0 +1,50 @@
+# Recipe: Runaway Agent Guard
+
+Stop runaway agentic workflows before they exhaust your token budget or
+billing limit.
+
+## Problem
+
+Autonomous agents (LangChain, AutoGPT, custom loops) can enter retry storms
+or infinite planning loops. Without enforcement, a single runaway agent
+can consume thousands of dollars of API budget in minutes.
+
+## How it works
+
+Two rules cooperate:
+
+1. **Loop detector** — counts requests per `agent_id` in a sliding window.
+   If the agent fires more than 30 requests in 60 seconds, it trips a
+   120-second cooldown. This catches tight retry loops.
+
+2. **TPM guard** — caps tokens per minute per agent. A burst-heavy agent
+   that passes the loop check still cannot drain the token pool.
+
+## Deploy
+
+```bash
+cp policy.json /etc/fairvisor/policy.json
+```
+
+## JWT shape expected
+
+```json
+{
+  "sub": "user-456",
+  "agent_id": "autoagent-prod-7",
+  "exp": 9999999999
+}
+```
+
+## Kill switch for incidents
+
+If an agent causes an incident, flip a kill switch without restarting edge:
+
+```bash
+# Via CLI
+fairvisor kill-switch enable agent-id=autoagent-prod-7
+
+# Or update the policy bundle with a kill_switch entry and hot-reload
+```
+
+See `../../../docs/cookbook/kill-switch-incident-response.md` for the full incident playbook.

From 53a70356f89fc37b3f29d487f2200a9957d2c770 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:02 +0100
Subject: [PATCH 09/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 .../recipes/provider-failover/policy.json     | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 examples/recipes/provider-failover/policy.json

diff --git a/examples/recipes/provider-failover/policy.json b/examples/recipes/provider-failover/policy.json
new file mode 100644
index 0000000..5974c77
--- /dev/null
+++ b/examples/recipes/provider-failover/policy.json
@@ -0,0 +1,62 @@
+{
+  "bundle_version": 1,
+  "issued_at": "2026-01-01T00:00:00Z",
+  "expires_at": "2030-01-01T00:00:00Z",
+  "policies": [
+    {
+      "id": "provider-failover-primary",
+      "spec": {
+        "selector": {
+          "pathPrefix": "/openai/",
+          "methods": ["POST"]
+        },
+        "mode": "enforce",
+        "rules": [
+          {
+            "name": "openai-tpm",
+            "limit_keys": ["jwt:org_id"],
+            "algorithm": "token_bucket_llm",
+            "algorithm_config": {
+              "tokens_per_minute": 200000,
+              "burst_tokens": 200000,
+              "default_max_completion_tokens": 2048
+            }
+          },
+          {
+            "name": "openai-circuit-breaker",
+            "limit_keys": ["jwt:org_id"],
+            "algorithm": "circuit_breaker",
+            "algorithm_config": {
+              "spend_window_seconds": 300,
+              "spend_threshold": 100000,
+              "cooldown_seconds": 600
+            }
+          }
+        ]
+      }
+    },
+    {
+      "id": "provider-failover-fallback",
+      "spec": {
+        "selector": {
+          "pathPrefix": "/anthropic/",
+          "methods": ["POST"]
+        },
+        "mode": "enforce",
+        "rules": [
+          {
+            "name": "anthropic-tpm",
+            "limit_keys": ["jwt:org_id"],
+            "algorithm": "token_bucket_llm",
+            "algorithm_config": {
+              "tokens_per_minute": 100000,
+              "burst_tokens": 100000,
+              "default_max_completion_tokens": 2048
+            }
+          }
+        ]
+      }
+    }
+  ],
+  "kill_switches": []
+}

From 3b078d09d99083e1c5914e4b7615c69ee6e7f495 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:03 +0100
Subject: [PATCH 10/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 examples/recipes/provider-failover/README.md | 52 ++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 examples/recipes/provider-failover/README.md

diff --git a/examples/recipes/provider-failover/README.md b/examples/recipes/provider-failover/README.md
new file mode 100644
index 0000000..520226c
--- /dev/null
+++ b/examples/recipes/provider-failover/README.md
@@ -0,0 +1,52 @@
+# Recipe: Provider Failover / Edge Control
+
+Run two provider paths under independent policy budgets. When the primary
+provider (OpenAI) trips a circuit breaker, your client-side router can
+switch to the fallback (Anthropic) — both paths enforced by the same edge.
+
+## How it works
+
+- `/openai/v1/...` — enforced by an OpenAI TPM limit + a spend-based circuit breaker
+- `/anthropic/v1/...` — enforced by an Anthropic TPM limit
+
+The circuit breaker on the OpenAI path auto-trips when cumulative spend
+exceeds the threshold in a 5-minute window, then auto-resets after 10 minutes.
+Your application can detect the 429 with `X-Fairvisor-Reason: circuit_breaker_open`
+and switch to the Anthropic path without any Fairvisor configuration change.
+
+## Deploy
+
+```bash
+cp policy.json /etc/fairvisor/policy.json
+```
+
+## Client-side failover pattern
+
+```python
+import httpx
+
+EDGE = "http://localhost:8080"
+AUTH = "Bearer my-client-jwt.payload.sig:sk-my-upstream-key"
+# provider -> (endpoint, model): Anthropic serves /v1/messages, not /v1/chat/completions.
+ROUTES = {"openai": ("chat/completions", "gpt-4o"), "anthropic": ("messages", "claude-3-5-haiku-20241022")}
+
+def chat(messages, provider="openai"):
+    endpoint, model = ROUTES[provider]
+    body = {"model": model, "max_tokens": 1024, "messages": messages}
+    resp = httpx.post(f"{EDGE}/{provider}/v1/{endpoint}", headers={"Authorization": AUTH}, json=body)
+    if resp.status_code == 429:
+        reason = resp.headers.get("X-Fairvisor-Reason", "")
+        if reason == "circuit_breaker_open" and provider == "openai":
+            return chat(messages, provider="anthropic")
+    resp.raise_for_status()
+    return resp.json()
+```
+
+## Auth note
+
+The composite `CLIENT_JWT:UPSTREAM_KEY` format is the same for all providers.
+Fairvisor injects the correct provider-native auth header:
+- OpenAI: `Authorization: Bearer UPSTREAM_KEY`
+- Anthropic: `x-api-key: UPSTREAM_KEY`
+
+The upstream key is stripped from responses — it never reaches your client.

From 4b4d24922be8e1406f38f94d1f4172dfc639c000 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:04 +0100
Subject: [PATCH 11/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/reject_tpm_exceeded.json | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 fixtures/reject_tpm_exceeded.json

diff --git a/fixtures/reject_tpm_exceeded.json b/fixtures/reject_tpm_exceeded.json
new file mode 100644
index 0000000..26f45d0
--- /dev/null
+++ b/fixtures/reject_tpm_exceeded.json
@@ -0,0 +1,17 @@
+{
+  "_comment": "429 body returned when the per-minute token budget is exhausted.",
+  "_headers": {
+    "X-Fairvisor-Reason": "tpm_exceeded",
+    "Retry-After": "60",
+    "RateLimit-Limit": "120000",
+    "RateLimit-Remaining": "0",
+    "RateLimit-Reset": "<unix timestamp of next window>",
+    "Content-Type": "application/json"
+  },
+  "error": {
+    "type": "rate_limit_error",
+    "code": "tpm_exceeded",
+    "message": "Token budget exceeded for this tenant.",
+    "param": null
+  }
+}

From 70ed186f9604d7ccc6cabb6e8d42ecbb5ae2e7d8 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:05 +0100
Subject: [PATCH 12/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/reject_tpd_exceeded.json | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 fixtures/reject_tpd_exceeded.json

diff --git a/fixtures/reject_tpd_exceeded.json b/fixtures/reject_tpd_exceeded.json
new file mode 100644
index 0000000..8d2bcdb
--- /dev/null
+++ b/fixtures/reject_tpd_exceeded.json
@@ -0,0 +1,16 @@
+{
+  "_comment": "429 body returned when the per-day token budget is exhausted.",
+  "_headers": {
+    "X-Fairvisor-Reason": "tpd_exceeded",
+    "Retry-After": "86400",
+    "RateLimit-Limit": "2000000",
+    "RateLimit-Remaining": "0",
+    "Content-Type": "application/json"
+  },
+  "error": {
+    "type": "rate_limit_error",
+    "code": "tpd_exceeded",
+    "message": "Token budget exceeded for this tenant.",
+    "param": null
+  }
+}

From e03dfcc0ada593af89c2306b7eaa2777f238d3f1 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:06 +0100
Subject: [PATCH 13/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/reject_prompt_too_large.json | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 fixtures/reject_prompt_too_large.json

diff --git a/fixtures/reject_prompt_too_large.json b/fixtures/reject_prompt_too_large.json
new file mode 100644
index 0000000..9c4cf8c
--- /dev/null
+++ b/fixtures/reject_prompt_too_large.json
@@ -0,0 +1,13 @@
+{
+  "_comment": "429 body returned when the request exceeds max_prompt_tokens.",
+  "_headers": {
+    "X-Fairvisor-Reason": "prompt_too_large",
+    "Content-Type": "application/json"
+  },
+  "error": {
+    "type": "rate_limit_error",
+    "code": "prompt_too_large",
+    "message": "Request prompt exceeds the maximum allowed token count for this policy.",
+    "param": null
+  }
+}

From b538176d31b3e001fa17a4e7e17c1d50dac88d4e Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:07 +0100
Subject: [PATCH 14/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/allow_response.json | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 fixtures/allow_response.json

diff --git a/fixtures/allow_response.json b/fixtures/allow_response.json
new file mode 100644
index 0000000..7cc0312
--- /dev/null
+++ b/fixtures/allow_response.json
@@ -0,0 +1,28 @@
+{
+  "_comment": "Sample 200 response for an allowed request in wrapper mode. Note: no Authorization, x-api-key, or x-goog-api-key headers — upstream auth is stripped on the response side.",
+  "_status": 200,
+  "_headers": {
+    "Content-Type": "application/json",
+    "X-Fairvisor-Reason": null,
+    "Authorization": null,
+    "x-api-key": null,
+    "x-goog-api-key": null
+  },
+  "id": "chatcmpl-example",
+  "object": "chat.completion",
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "Hello! How can I help you today?"
+      },
+      "finish_reason": "stop"
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 10,
+    "completion_tokens": 9,
+    "total_tokens": 19
+  }
+}

From f021fd66e98e14318834043bca602fa630d8cda5 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:08 +0100
Subject: [PATCH 15/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/normal_request.json | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 fixtures/normal_request.json

diff --git a/fixtures/normal_request.json b/fixtures/normal_request.json
new file mode 100644
index 0000000..049a4e4
--- /dev/null
+++ b/fixtures/normal_request.json
@@ -0,0 +1,10 @@
+{
+  "model": "gpt-4o-mini",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Say hello in one sentence."
+    }
+  ],
+  "max_tokens": 20
+}

From 3b54d3ff21dd3f9ff9f2f0bc97b6f17f59ea4156 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:08 +0100
Subject: [PATCH 16/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/over_limit_request.json | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 fixtures/over_limit_request.json

diff --git a/fixtures/over_limit_request.json b/fixtures/over_limit_request.json
new file mode 100644
index 0000000..b3b554f
--- /dev/null
+++ b/fixtures/over_limit_request.json
@@ -0,0 +1,10 @@
+{
+  "model": "gpt-4o",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Say hello in one sentence."
+    }
+  ],
+  "max_tokens": 200000
+}

From e1dd56da63c818b8705c312d724905d5b0f37be2 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:09 +0100
Subject: [PATCH 17/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/reject_openai.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 fixtures/reject_openai.json

diff --git a/fixtures/reject_openai.json b/fixtures/reject_openai.json
new file mode 100644
index 0000000..eabd023
--- /dev/null
+++ b/fixtures/reject_openai.json
@@ -0,0 +1,14 @@
+{
+  "_comment": "OpenAI-native 429 reject body. Used for /openai/* paths and OpenAI-compatible providers.",
+  "_headers": {
+    "X-Fairvisor-Reason": "tpm_exceeded",
+    "Retry-After": "60",
+    "Content-Type": "application/json"
+  },
+  "error": {
+    "type": "rate_limit_error",
+    "code": "tpm_exceeded",
+    "message": "Token budget exceeded for this tenant.",
+    "param": null
+  }
+}

From 800c4f9cc9a03ad436558254cc791d740d35dbb1 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:10 +0100
Subject: [PATCH 18/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/reject_anthropic.json | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 fixtures/reject_anthropic.json

diff --git a/fixtures/reject_anthropic.json b/fixtures/reject_anthropic.json
new file mode 100644
index 0000000..bdf468f
--- /dev/null
+++ b/fixtures/reject_anthropic.json
@@ -0,0 +1,13 @@
+{
+  "_comment": "Anthropic-native 429 reject body. Used for /anthropic/* paths.",
+  "_headers": {
+    "X-Fairvisor-Reason": "tpm_exceeded",
+    "Retry-After": "60",
+    "Content-Type": "application/json"
+  },
+  "type": "error",
+  "error": {
+    "type": "rate_limit_error",
+    "message": "Token budget exceeded for this tenant."
+  }
+}

From f13e64111623eae652a232c207f61624c0311e2a Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:10 +0100
Subject: [PATCH 19/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/reject_gemini.json | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 fixtures/reject_gemini.json

diff --git a/fixtures/reject_gemini.json b/fixtures/reject_gemini.json
new file mode 100644
index 0000000..f0df901
--- /dev/null
+++ b/fixtures/reject_gemini.json
@@ -0,0 +1,13 @@
+{
+  "_comment": "Gemini-native 429 reject body. Used for /gemini/* paths.",
+  "_headers": {
+    "X-Fairvisor-Reason": "tpm_exceeded",
+    "Retry-After": "60",
+    "Content-Type": "application/json"
+  },
+  "error": {
+    "code": 429,
+    "message": "Token budget exceeded for this tenant.",
+    "status": "RESOURCE_EXHAUSTED"
+  }
+}

From fbcf12d5b906454aa1d8c570e957acd5621be676 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:48:11 +0100
Subject: [PATCH 20/64] feat(quickstart): add runnable quickstart, recipes, and
 fixtures (issue #32)

---
 fixtures/anthropic_normal_request.json | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 fixtures/anthropic_normal_request.json

diff --git a/fixtures/anthropic_normal_request.json b/fixtures/anthropic_normal_request.json
new file mode 100644
index 0000000..bcffdbf
--- /dev/null
+++ b/fixtures/anthropic_normal_request.json
@@ -0,0 +1,10 @@
+{
+  "model": "claude-3-5-haiku-20241022",
+  "max_tokens": 20,
+  "messages": [
+    {
+      "role": "user",
+      "content": "Say hello in one sentence."
+    }
+  ]
+}

From a93b37735b0b80e8ed392bf111b0aabaaa15c835 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Tue, 17 Mar 2026 11:49:41 +0100
Subject: [PATCH 21/64] docs(readme): add quickstart pointer, update project
 layout, fix benchmark link (issue #32)

---
 README.md | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 07bc768..c267fa7 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,12 @@ Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacem
 
 ## Quick start
 
+> **Runnable quickstart:** `examples/quickstart/` — `docker compose up -d` and run your first enforce/reject test in under a minute. See [`examples/quickstart/README.md`](examples/quickstart/README.md).
+>
+> **Recipes:** `examples/recipes/` — deployable team budgets, runaway agent guard, and provider failover examples.
+>
+> **Sample artifacts:** `fixtures/` — canonical request/response fixtures for enforce, reject (TPM, TPD, prompt-too-large), and provider-native error bodies (OpenAI, Anthropic, Gemini).
+
 ### 1. Create a policy
 
 ```bash
@@ -304,7 +310,7 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll
 
 **No external datastore.** All enforcement state lives in in-process shared memory (`ngx.shared.dict`). No Redis, no Postgres, no network round-trips in the decision path.
 
-> Reproduce: `git clone https://github.com/fairvisor/benchmark && cd benchmark && ./run-all.sh`
+> Reproduce: see [fairvisor/benchmark](https://github.com/fairvisor/benchmark) — the canonical benchmark source of truth for Fairvisor Edge performance numbers.
 
 ## Deployment
 
@@ -348,14 +354,16 @@ If the SaaS is unreachable, the edge keeps enforcing with the last-known policy
 ## Project layout
 
 ```
-src/fairvisor/    runtime modules (OpenResty/LuaJIT)
-cli/              command-line tooling
-spec/             unit and integration tests (busted)
-tests/e2e/        Docker-based E2E tests (pytest)
-examples/         sample policy bundles
-helm/             Helm chart
-docker/           Docker artifacts
-docs/             reference documentation
+src/fairvisor/           runtime modules (OpenResty/LuaJIT)
+cli/                     command-line tooling
+spec/                    unit and integration tests (busted)
+tests/e2e/               Docker-based E2E tests (pytest)
+examples/quickstart/     runnable quickstart (docker compose up -d)
+examples/recipes/        deployable policy recipes (team budgets, agent guard, failover)
+fixtures/                canonical request/response sample artifacts
+helm/                    Helm chart
+docker/                  Docker artifacts
+docs/                    reference documentation
 ```
 
 ## Contributing
@@ -376,3 +384,4 @@ pytest tests/e2e -v  # E2E (requires Docker)
 ---
 
 **Docs:** [docs.fairvisor.com](https://docs.fairvisor.com/docs/) · **Website:** [fairvisor.com](https://fairvisor.com) · **Quickstart:** [5 minutes to enforcement](https://docs.fairvisor.com/docs/quickstart/)
+

From a4ad21cdab0121ae56b760b086a7aa8b20f5aaca Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:50:48 +0100
Subject: [PATCH 22/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/quickstart/docker-compose.yml | 28 +++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
index 0b5affa..870812d 100644
--- a/examples/quickstart/docker-compose.yml
+++ b/examples/quickstart/docker-compose.yml
@@ -1,16 +1,22 @@
-# Fairvisor Edge — Quickstart stack (standalone mode)
+# Fairvisor Edge — Quickstart stack (standalone + reverse proxy mode)
 #
 # Usage:
 #   docker compose up -d
-#   curl -s http://localhost:8080/readyz          # health check
-#   curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
-#     -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-upstream-key" \
+#   curl -s http://localhost:8080/readyz                          # health check
+#   curl -s -X POST http://localhost:8080/v1/chat/completions \
 #     -H "Content-Type: application/json" \
-#     -d @fixtures/normal_request.json            # expect 200
-#   curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
-#     -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-upstream-key" \
+#     -d @../../fixtures/normal_request.json                      # expect 200
+#   curl -s -X POST http://localhost:8080/v1/chat/completions \
 #     -H "Content-Type: application/json" \
-#     -d @fixtures/over_limit_request.json        # expect 429
+#     -d @../../fixtures/over_limit_request.json                  # expect 429
+#
+# This stack runs in FAIRVISOR_MODE=reverse_proxy — requests to /v1/* are
+# enforced by policy then forwarded to the local mock LLM backend.
+# No real API keys required.
+#
+# Wrapper mode (routing by provider prefix, real upstream keys) is documented
+# in README.md under "Wrapper mode". It requires real provider credentials and
+# cannot be demonstrated with this mock stack.
 #
 # This file is also the base for the e2e-smoke CI check.
 # CI extends it via tests/e2e/docker-compose.test.yml; do not diverge the
@@ -23,12 +29,16 @@ services:
       - "8080:8080"
     environment:
       FAIRVISOR_CONFIG_FILE: /etc/fairvisor/policy.json
-      FAIRVISOR_MODE: wrapper
+      FAIRVISOR_MODE: reverse_proxy
+      FAIRVISOR_BACKEND_URL: http://mock_llm:80
       FAIRVISOR_SHARED_DICT_SIZE: 32m
       FAIRVISOR_LOG_LEVEL: info
       FAIRVISOR_WORKER_PROCESSES: "1"
     volumes:
       - ./policy.json:/etc/fairvisor/policy.json:ro
+    depends_on:
+      mock_llm:
+        condition: service_healthy
     healthcheck:
       test: ["CMD", "curl", "-sf", "http://127.0.0.1:8080/readyz"]
       interval: 2s

From 411c6c7feb65f135358e7963cb03cbe5b8d6de81 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:50:54 +0100
Subject: [PATCH 23/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/quickstart/policy.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/quickstart/policy.json b/examples/quickstart/policy.json
index f5520aa..fb9b375 100644
--- a/examples/quickstart/policy.json
+++ b/examples/quickstart/policy.json
@@ -7,20 +7,20 @@
       "id": "quickstart-tpm-policy",
       "spec": {
         "selector": {
-          "pathPrefix": "/openai/",
+          "pathPrefix": "/v1/",
           "methods": ["POST"]
         },
         "mode": "enforce",
         "rules": [
           {
             "name": "tpm-limit",
-            "limit_keys": ["jwt:sub"],
+            "limit_keys": ["ip:address"],
             "algorithm": "token_bucket_llm",
             "algorithm_config": {
               "tokens_per_minute": 100,
               "tokens_per_day": 1000,
               "burst_tokens": 100,
-              "default_max_completion_tokens": 50
+              "default_max_completion": 50
             }
           }
         ]

From 288c9c75acdbe252aea4ff994b11cd12b0163e35 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:50:59 +0100
Subject: [PATCH 24/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/quickstart/README.md | 63 ++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md
index a23fc78..3acb943 100644
--- a/examples/quickstart/README.md
+++ b/examples/quickstart/README.md
@@ -22,66 +22,77 @@ docker compose ps
 
 ## Verify enforcement
 
+This quickstart runs in `FAIRVISOR_MODE=reverse_proxy`. Requests to `/v1/*`
+are enforced by the TPM policy and forwarded to a local mock LLM backend.
+No real API keys are required.
+
 **Allowed request** — should return `200`:
 
 ```bash
-curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
-  -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-key" \
+curl -s -X POST http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d @../../fixtures/normal_request.json
 ```
 
-Expected response matches `../../fixtures/allow_response.json`.
+Expected response body shape matches `../../fixtures/allow_response.json`.
 
 **Over-limit request** — should return `429`:
 
 ```bash
-curl -s -X POST http://localhost:8080/openai/v1/chat/completions \
-  -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-fake-key" \
+curl -s -X POST http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d @../../fixtures/over_limit_request.json
 ```
 
-Expected response body matches `../../fixtures/reject_tpm_exceeded.json`.
+Expected response body shape matches `../../fixtures/reject_tpm_exceeded.json`.
 The response will also include:
 - `X-Fairvisor-Reason: tpm_exceeded`
 - `Retry-After: 60`
-- `RateLimit-Limit: 100`
+- `RateLimit-Limit: 100` (matches the quickstart policy `tokens_per_minute`)
 - `RateLimit-Remaining: 0`
 
-## Wrapper mode and auth
+## How the policy works
+
+The quickstart policy (`policy.json`) enforces a TPM limit keyed on `ip:address`:
+
+- `tokens_per_minute: 100` — allows roughly 2 small requests per minute
+- `tokens_per_day: 1000` — daily cap
+- `default_max_completion: 50` — pessimistic reservation per request when `max_tokens` is not set
+
+Sending `over_limit_request.json` (which sets `max_tokens: 200000`) immediately
+exceeds the 100-token per-minute budget and triggers a `429`.
+
+## Wrapper mode (real provider routing)
+
+Wrapper mode routes requests to real upstream providers using provider-prefixed paths
+and a composite Bearer token. It requires real provider API keys and cannot be
+demonstrated with this mock stack.
 
-This quickstart runs in `FAIRVISOR_MODE=wrapper`. The composite Bearer token format is:
+**Path and auth format:**
 
 ```
+POST /openai/v1/chat/completions
 Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
 ```
 
-- `CLIENT_JWT` — a signed JWT identifying the calling client/tenant (used for policy enforcement)
-- `UPSTREAM_KEY` — the real upstream API key forwarded to the provider (e.g. `sk-...` for OpenAI)
+Where:
+- `CLIENT_JWT` — signed JWT identifying the calling client/tenant (used for policy enforcement)
+- `UPSTREAM_KEY` — real upstream API key forwarded to the provider (e.g. `sk-...` for OpenAI)
 
-Fairvisor strips the composite header and injects the correct provider auth before forwarding. The upstream key is **never returned to the caller** — see `../../fixtures/allow_response.json` for proof (no `Authorization`, `x-api-key`, or `x-goog-api-key` headers in the response).
+Fairvisor strips the composite header, injects the correct provider auth before forwarding,
+and **never returns upstream auth headers to the caller**
+(see `../../fixtures/allow_response.json`).
 
-## Provider-prefixed paths
+**Provider-prefixed paths:**
 
-Wrapper mode routes by path prefix:
-
-| Path prefix | Upstream | Auth header |
+| Path prefix | Upstream | Auth header injected |
 |---|---|---|
 | `/openai/v1/...` | `https://api.openai.com/v1/...` | `Authorization: Bearer UPSTREAM_KEY` |
 | `/anthropic/v1/...` | `https://api.anthropic.com/v1/...` | `x-api-key: UPSTREAM_KEY` |
 | `/gemini/v1beta/...` | `https://generativelanguage.googleapis.com/v1beta/...` | `x-goog-api-key: UPSTREAM_KEY` |
 
-## Anthropic example
-
-```bash
-curl -s -X POST http://localhost:8080/anthropic/v1/messages \
-  -H "Authorization: Bearer demo-client-jwt.demo-payload.demo-sig:sk-ant-fake-key" \
-  -H "Content-Type: application/json" \
-  -d @../../fixtures/anthropic_normal_request.json
-```
-
-A rejected Anthropic request returns an Anthropic-native error body — see `../../fixtures/reject_anthropic.json`.
+To run in wrapper mode, set `FAIRVISOR_MODE: wrapper` in the compose env, change the policy
+`pathPrefix` from `/v1/` to a provider prefix (e.g. `/openai/`), and supply real credentials in the `Authorization` header.
 
 ## Teardown
 

From 80365c9b2102493728eb94cd63bfb113f05a7bdb Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:51:06 +0100
Subject: [PATCH 25/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/recipes/team-budgets/policy.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/recipes/team-budgets/policy.json b/examples/recipes/team-budgets/policy.json
index d361a30..87d7c63 100644
--- a/examples/recipes/team-budgets/policy.json
+++ b/examples/recipes/team-budgets/policy.json
@@ -20,7 +20,7 @@
               "tokens_per_minute": 120000,
               "tokens_per_day": 2000000,
               "burst_tokens": 120000,
-              "default_max_completion_tokens": 1024
+              "default_max_completion": 1024
             }
           },
           {

From e7785995c1f0987b6cf9d368c48b766443bb6f1f Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:51:12 +0100
Subject: [PATCH 26/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/recipes/runaway-agent-guard/policy.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/recipes/runaway-agent-guard/policy.json b/examples/recipes/runaway-agent-guard/policy.json
index 2715b60..38de248 100644
--- a/examples/recipes/runaway-agent-guard/policy.json
+++ b/examples/recipes/runaway-agent-guard/policy.json
@@ -29,7 +29,7 @@
             "algorithm_config": {
               "tokens_per_minute": 50000,
               "burst_tokens": 50000,
-              "default_max_completion_tokens": 512
+              "default_max_completion": 512
             }
           }
         ]

From 399cd93e2a2b3bedc10e0e9d62908b59722e7a47 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:51:18 +0100
Subject: [PATCH 27/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../recipes/provider-failover/policy.json     | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/examples/recipes/provider-failover/policy.json b/examples/recipes/provider-failover/policy.json
index 5974c77..03533dd 100644
--- a/examples/recipes/provider-failover/policy.json
+++ b/examples/recipes/provider-failover/policy.json
@@ -19,20 +19,17 @@
             "algorithm_config": {
               "tokens_per_minute": 200000,
               "burst_tokens": 200000,
-              "default_max_completion_tokens": 2048
-            }
-          },
-          {
-            "name": "openai-circuit-breaker",
-            "limit_keys": ["jwt:org_id"],
-            "algorithm": "circuit_breaker",
-            "algorithm_config": {
-              "spend_window_seconds": 300,
-              "spend_threshold": 100000,
-              "cooldown_seconds": 600
+              "default_max_completion": 2048
             }
           }
-        ]
+        ],
+        "circuit_breaker": {
+          "enabled": true,
+          "spend_rate_threshold_per_minute": 10000,
+          "action": "reject",
+          "alert": true,
+          "auto_reset_after_minutes": 10
+        }
       }
     },
     {
@@ -51,7 +48,7 @@
             "algorithm_config": {
               "tokens_per_minute": 100000,
               "burst_tokens": 100000,
-              "default_max_completion_tokens": 2048
+              "default_max_completion": 2048
             }
           }
         ]

From e327a0cf7a3e6cf00bf02a596f02f3508120573d Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:51:25 +0100
Subject: [PATCH 28/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fixtures/reject_tpm_exceeded.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fixtures/reject_tpm_exceeded.json b/fixtures/reject_tpm_exceeded.json
index 26f45d0..0805778 100644
--- a/fixtures/reject_tpm_exceeded.json
+++ b/fixtures/reject_tpm_exceeded.json
@@ -1,9 +1,9 @@
 {
-  "_comment": "429 body returned when the per-minute token budget is exhausted.",
+  "_comment": "Illustrative 429 body returned when the per-minute token budget is exhausted. RateLimit-Limit reflects the policy's tokens_per_minute value.",
   "_headers": {
     "X-Fairvisor-Reason": "tpm_exceeded",
     "Retry-After": "60",
-    "RateLimit-Limit": "120000",
+    "RateLimit-Limit": "<tokens_per_minute from policy>",
     "RateLimit-Remaining": "0",
     "RateLimit-Reset": "<unix timestamp of next window>",
     "Content-Type": "application/json"

From ee2ab171e13536df91aa570085111bb78f4a25da Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Wed, 18 Mar 2026 09:51:31 +0100
Subject: [PATCH 29/64] =?UTF-8?q?fix(quickstart):=20fix=20review=20issues?=
 =?UTF-8?q?=20=E2=80=94=20reverse=5Fproxy=20mode,=20correct=20config=20sha?=
 =?UTF-8?q?pes=20(issue=20#32)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fixtures/reject_tpd_exceeded.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fixtures/reject_tpd_exceeded.json b/fixtures/reject_tpd_exceeded.json
index 8d2bcdb..83cb2ea 100644
--- a/fixtures/reject_tpd_exceeded.json
+++ b/fixtures/reject_tpd_exceeded.json
@@ -1,9 +1,9 @@
 {
-  "_comment": "429 body returned when the per-day token budget is exhausted.",
+  "_comment": "Illustrative 429 body returned when the per-day token budget is exhausted. RateLimit-Limit reflects the policy's tokens_per_day value.",
   "_headers": {
     "X-Fairvisor-Reason": "tpd_exceeded",
-    "Retry-After": "86400",
-    "RateLimit-Limit": "2000000",
+    "Retry-After": "<seconds until midnight UTC>",
+    "RateLimit-Limit": "<tokens_per_day from policy>",
     "RateLimit-Remaining": "0",
     "Content-Type": "application/json"
   },

From 348789e37d4cfdf35d6778ff23e6518fbe753dec Mon Sep 17 00:00:00 2001
From: Claude Agent <agent@fairvisor.ai>
Date: Wed, 18 Mar 2026 13:58:50 +0000
Subject: [PATCH 30/64] refactor: replace provider-failover recipe with
 circuit-breaker

Failover between providers is a client-side pattern, not a Fairvisor
feature. Replace with a focused circuit-breaker recipe that demonstrates
the actual cost-spike auto-shutdown capability:
- spend_rate_threshold_per_minute triggers full traffic block
- auto_reset_after_minutes provides hands-free cooldown
- per-org TPM limit continues to run in parallel
---
 examples/recipes/circuit-breaker/README.md    | 43 +++++++++++++++
 .../policy.json                               | 34 +++---------
 examples/recipes/provider-failover/README.md  | 52 -------------------
 3 files changed, 49 insertions(+), 80 deletions(-)
 create mode 100644 examples/recipes/circuit-breaker/README.md
 rename examples/recipes/{provider-failover => circuit-breaker}/policy.json (51%)
 delete mode 100644 examples/recipes/provider-failover/README.md

diff --git a/examples/recipes/circuit-breaker/README.md b/examples/recipes/circuit-breaker/README.md
new file mode 100644
index 0000000..ad1227e
--- /dev/null
+++ b/examples/recipes/circuit-breaker/README.md
@@ -0,0 +1,43 @@
+# Recipe: Circuit Breaker — Cost Spike Auto-Shutdown
+
+Automatically block all LLM traffic when the aggregate token spend rate
+exceeds a budget threshold, then self-reset after a cooldown period.
+
+## How it works
+
+- Normal traffic: per-org TPM limit enforced (`100 000 tokens/min`)
+- Spike detection: if the rolling spend rate hits `500 000 tokens/min`
+  the circuit breaker opens and **all requests return `429`** with
+  `X-Fairvisor-Reason: circuit_breaker_open`
+- Auto-reset: after 10 minutes without breaker-triggering load, the
+  circuit resets automatically — no manual intervention needed
+- `alert: true` logs the trip event to the Fairvisor audit log
+
+## Deploy
+
+```bash
+cp policy.json /etc/fairvisor/policy.json
+```
+
+## Expected behaviour
+
+```bash
+# Normal request — passes
+curl -s -o /dev/null -w "%{http_code}" \
+  -H "Authorization: Bearer <jwt>:<upstream-key>" -H "Content-Type: application/json" \
+  http://localhost:8080/v1/chat/completions \
+  -d '{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}'
+# → 200
+
+# After spend spike trips the breaker:
+# → 429  X-Fairvisor-Reason: circuit_breaker_open
+#        Retry-After: 600
+```
+
+## Tuning
+
+| Field | Description |
+|---|---|
+| `spend_rate_threshold_per_minute` | Tokens/min rolling spend that opens the breaker |
+| `auto_reset_after_minutes` | Cooldown before automatic reset (0 = manual only) |
+| `tokens_per_minute` | Per-org steady-state limit (independent of breaker) |
diff --git a/examples/recipes/provider-failover/policy.json b/examples/recipes/circuit-breaker/policy.json
similarity index 51%
rename from examples/recipes/provider-failover/policy.json
rename to examples/recipes/circuit-breaker/policy.json
index 03533dd..7d58c8d 100644
--- a/examples/recipes/provider-failover/policy.json
+++ b/examples/recipes/circuit-breaker/policy.json
@@ -4,55 +4,33 @@
   "expires_at": "2030-01-01T00:00:00Z",
   "policies": [
     {
-      "id": "provider-failover-primary",
+      "id": "cost-spike-guard",
       "spec": {
         "selector": {
-          "pathPrefix": "/openai/",
+          "pathPrefix": "/v1/",
           "methods": ["POST"]
         },
         "mode": "enforce",
         "rules": [
           {
-            "name": "openai-tpm",
+            "name": "per-org-tpm",
             "limit_keys": ["jwt:org_id"],
             "algorithm": "token_bucket_llm",
             "algorithm_config": {
-              "tokens_per_minute": 200000,
-              "burst_tokens": 200000,
+              "tokens_per_minute": 100000,
+              "burst_tokens": 100000,
               "default_max_completion": 2048
             }
           }
         ],
         "circuit_breaker": {
           "enabled": true,
-          "spend_rate_threshold_per_minute": 10000,
+          "spend_rate_threshold_per_minute": 500000,
           "action": "reject",
           "alert": true,
           "auto_reset_after_minutes": 10
         }
       }
-    },
-    {
-      "id": "provider-failover-fallback",
-      "spec": {
-        "selector": {
-          "pathPrefix": "/anthropic/",
-          "methods": ["POST"]
-        },
-        "mode": "enforce",
-        "rules": [
-          {
-            "name": "anthropic-tpm",
-            "limit_keys": ["jwt:org_id"],
-            "algorithm": "token_bucket_llm",
-            "algorithm_config": {
-              "tokens_per_minute": 100000,
-              "burst_tokens": 100000,
-              "default_max_completion": 2048
-            }
-          }
-        ]
-      }
     }
   ],
   "kill_switches": []
diff --git a/examples/recipes/provider-failover/README.md b/examples/recipes/provider-failover/README.md
deleted file mode 100644
index 520226c..0000000
--- a/examples/recipes/provider-failover/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# Recipe: Provider Failover / Edge Control
-
-Run two provider paths under independent policy budgets. When the primary
-provider (OpenAI) trips a circuit breaker, your client-side router can
-switch to the fallback (Anthropic) — both paths enforced by the same edge.
-
-## How it works
-
-- `/openai/v1/...` — enforced by an OpenAI TPM limit + a spend-based circuit breaker
-- `/anthropic/v1/...` — enforced by an Anthropic TPM limit
-
-The circuit breaker on the OpenAI path auto-trips when cumulative spend
-exceeds the threshold in a 5-minute window, then auto-resets after 10 minutes.
-Your application can detect the 429 with `X-Fairvisor-Reason: circuit_breaker_open`
-and switch to the Anthropic path without any Fairvisor configuration change.
-
-## Deploy
-
-```bash
-cp policy.json /etc/fairvisor/policy.json
-```
-
-## Client-side failover pattern
-
-```python
-import httpx
-
-EDGE = "http://localhost:8080"
-AUTH = "Bearer my-client-jwt.payload.sig:sk-my-upstream-key"
-
-def chat(messages, provider="openai"):
-    resp = httpx.post(
-        f"{EDGE}/{provider}/v1/chat/completions",
-        headers={"Authorization": AUTH, "Content-Type": "application/json"},
-        json={"model": "gpt-4o", "messages": messages},
-    )
-    if resp.status_code == 429:
-        reason = resp.headers.get("X-Fairvisor-Reason", "")
-        if reason == "circuit_breaker_open" and provider == "openai":
-            return chat(messages, provider="anthropic")
-    resp.raise_for_status()
-    return resp.json()
-```
-
-## Auth note
-
-The composite `CLIENT_JWT:UPSTREAM_KEY` format is the same for all providers.
-Fairvisor injects the correct provider-native auth header:
-- OpenAI: `Authorization: Bearer UPSTREAM_KEY`
-- Anthropic: `x-api-key: UPSTREAM_KEY`
-
-The upstream key is stripped from responses — it never reaches your client.

From 7a13c7ffa7b8e3018c321955d8bea3b7ca06fc73 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Thu, 19 Mar 2026 13:08:06 +0100
Subject: [PATCH 31/64] docs: add wrapper mode to README + integration links in
 comparison section

Refs #17 #19 from job discussion
---
 README.md | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ed06a20..bf6bda0 100644
--- a/README.md
+++ b/README.md
@@ -84,13 +84,13 @@ If you have an existing gateway, the question is whether Fairvisor adds anything
 
 **If nginx `limit_req` is enough for you**, use it. It has zero overhead and is the right tool for simple per-IP global throttling. Fairvisor becomes relevant when you need per-tenant awareness, JWT-claim-based bucketing, or cost/token tracking that `limit_req` has no model for.
 
-**If you are already running Kong**, the built-in rate limiting plugin stores counters in Redis or Postgres — every decision is a network call. Fairvisor can run alongside Kong as an `auth_request` decision service with no external state.
+**If you are already running Kong**, the built-in rate limiting plugin stores counters in Redis or Postgres — every decision is a network call. Fairvisor can run alongside Kong as an `auth_request` decision service with no external state. See [Kong / Traefik integration →](https://docs.fairvisor.com/docs/gateway/)
 
-**If you are running Envoy**, the [global rate limit service](https://github.com/envoyproxy/ratelimit) requires deploying a separate Redis-backed service with its own config language. Fairvisor is one container, one JSON file, and integrates via `ext_authz` in the same position.
+**If you are running Envoy**, the [global rate limit service](https://github.com/envoyproxy/ratelimit) requires deploying a separate Redis-backed service with its own config language. Fairvisor is one container, one JSON file, and integrates via `ext_authz` in the same position. See [Envoy ext_authz integration →](https://docs.fairvisor.com/docs/gateway/envoy/)
 
 **If you are on Cloudflare or Akamai**, per-JWT-claim limits, LLM token budgets, and cost caps are not in the platform's model. If your limits are tenant-aware or cost-aware, you need something that runs in your own stack.
 
-Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacement. See [docs/gateway-integration.md](docs/gateway-integration.md) for integration patterns.
+Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacement. See [nginx auth_request →](https://docs.fairvisor.com/docs/gateway/nginx/) · [Envoy ext_authz →](https://docs.fairvisor.com/docs/gateway/envoy/) · [Kong / Traefik →](https://docs.fairvisor.com/docs/gateway/) for integration patterns.
 
 ## Quick start
 
@@ -194,7 +194,9 @@ Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible e
 
 **Reverse proxy mode** — Fairvisor sits inline. Traffic arrives at Fairvisor directly, gets evaluated, and is proxied to the upstream if allowed. No separate gateway needed.
 
-Both modes use the same policy bundle and return the same rejection headers.
+**Wrapper mode** — Fairvisor acts as a transparent LLM proxy. Clients send requests to Fairvisor's provider-prefixed endpoints (`/openai/v1/chat/completions`, `/anthropic/v1/messages`, `/gemini/v1beta/...`). Fairvisor enforces token budgets and cost limits, strips the composite client `Authorization` header, injects the upstream API key, and forwards the request. No client code changes are needed — swap the base URL and send the composite `Bearer CLIENT_JWT:UPSTREAM_KEY` token.
+
+All three modes use the same policy bundle and return the same rejection headers.
 
 When a request is rejected:
 
@@ -245,7 +247,25 @@ Headers follow [RFC 9333 RateLimit Fields](https://www.rfc-editor.org/rfc/rfc933
           reject ──► 429 + RFC 9333 headers
 ```
 
-Both modes use the same policy bundle and produce the same rejection headers.
+**Wrapper mode** (transparent LLM proxy — swap base URL, no client changes):
+
+```
+ Client ──► Fairvisor Edge  POST /openai/v1/chat/completions
+            (wrapper)        Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
+                  │
+                  │  1. Validate CLIENT_JWT, extract org_id / user_id claims
+                  │  2. Enforce token budget (TPM / TPD / cost)
+                  │  3. Strip Authorization header, inject upstream API key
+                  │  4. Forward to https://api.openai.com (or Anthropic/Gemini)
+                  │  5. Count response tokens, refund unused reservation
+                  │
+          allow ──► upstream response (Authorization header stripped from reply)
+          reject ──► 429 + X-Fairvisor-Reason: tpm_exceeded / budget_exhausted
+```
+
+Supported upstream paths out of the box: `/openai/*`, `/anthropic/*`, `/gemini/*`.
+
+All three modes use the same policy bundle and produce the same rejection headers.
 
 ## Enforcement capabilities
 

From 480a409c68ff16d6076cc86662d2d197d4e771a3 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Thu, 19 Mar 2026 13:20:59 +0100
Subject: [PATCH 32/64] docs: rewrite LLM token budget section to showcase
 wrapper mode

---
 README.md | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index bf6bda0..c193cee 100644
--- a/README.md
+++ b/README.md
@@ -162,11 +162,15 @@ curl -s -w "\nHTTP %{http_code}\n" \
 
 ## LLM token budget in 30 seconds
 
+The fastest path is **wrapper mode**: Fairvisor sits in front of the LLM API, enforces budgets, and replaces the composite client token with the upstream key before forwarding, so the provider never sees the client JWT. No gateway changes needed — just point your client at Fairvisor instead of OpenAI.
+
+**1. Policy** — one rule, per-org TPM + daily cap:
+
 ```json
 {
   "id": "llm-budget",
   "spec": {
-    "selector": { "pathPrefix": "/v1/chat" },
+    "selector": { "pathPrefix": "/openai/v1/chat" },
     "mode": "enforce",
     "rules": [
       {
@@ -184,9 +188,27 @@ curl -s -w "\nHTTP %{http_code}\n" \
 }
 ```
 
-Each organization (from the JWT `org_id` claim) gets its own independent 60k TPM / 1.2M TPD budget. Requests over the limit return a `429` with an OpenAI-compatible error body — no client changes needed.
+**2. Call the API** — token format `Bearer <client-jwt>:<upstream-key>`:
+
+```bash
+curl https://your-fairvisor-host/openai/v1/chat/completions -H "Authorization: Bearer eyJhbGc...:sk-proj-..." -H "Content-Type: application/json" -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}'
+```
+
+Fairvisor validates the JWT, extracts `org_id`, charges tokens against the budget, strips the `Authorization` header, and forwards with the upstream key. The upstream never sees the client JWT.
+
+When the budget is exhausted:
+
+```http
+HTTP/1.1 429 Too Many Requests
+X-Fairvisor-Reason: tpm_exceeded
+Retry-After: 12
+RateLimit-Limit: 60000
+RateLimit-Remaining: 0
+```
+
+Each organization gets its own independent 60k TPM / 1.2M TPD budget. Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint.
 
-Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint.
+> **Decision service / reverse proxy mode:** if you already have a gateway, use `selector: { "pathPrefix": "/v1/chat" }` and call `POST /v1/decision` from your existing `auth_request` or `ext_authz` hook instead.
 
 ## How a request flows
 

From c297873449fe8bf7263c957330bf4b7325c4eb05 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Thu, 19 Mar 2026 13:43:04 +0100
Subject: [PATCH 33/64] docs: wrapper mode selector pathPrefix "/" covers all
 providers

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c193cee..75f48c0 100644
--- a/README.md
+++ b/README.md
@@ -170,7 +170,7 @@ The fastest path is **wrapper mode**: Fairvisor sits in front of the LLM API, en
 {
   "id": "llm-budget",
   "spec": {
-    "selector": { "pathPrefix": "/openai/v1/chat" },
+    "selector": { "pathPrefix": "/" },
     "mode": "enforce",
     "rules": [
       {
@@ -208,6 +208,8 @@ RateLimit-Remaining: 0
 
 Each organization gets its own independent 60k TPM / 1.2M TPD budget. Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint.
 
+The selector matches the incoming wrapper path. Use `pathPrefix: "/"` to cover all providers, or `pathPrefix: "/openai"` to limit to one provider only.
+
 > **Decision service / reverse proxy mode:** if you already have a gateway, use `selector: { "pathPrefix": "/v1/chat" }` and call `POST /v1/decision` from your existing `auth_request` or `ext_authz` hook instead.
 
 ## How a request flows

From 989cc041b8d366c2605cf01f222e6dec0e2bee87 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Thu, 19 Mar 2026 20:40:34 +0100
Subject: [PATCH 34/64] docs: replace ASCII architecture diagrams with Mermaid
 sequence diagrams

---
 README.md | 98 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index 75f48c0..69f6c53 100644
--- a/README.md
+++ b/README.md
@@ -238,56 +238,70 @@ Headers follow [RFC 9333 RateLimit Fields](https://www.rfc-editor.org/rfc/rfc933
 
 ### Architecture
 
-**Decision service mode** (sidecar — your gateway calls `/v1/decision`, handles forwarding itself):
-
-```
- Client ──► Your gateway (nginx / Envoy / Kong)
-                  │
-                  │  POST /v1/decision
-                  │  (auth_request / ext_authz)
-                  ▼
-          ┌─────────────────────┐
-          │   Fairvisor Edge    │
-          │  decision_service   │
-          │                     │
-          │  rule_engine        │
-          │  ngx.shared.dict    │  ◄── no Redis, no network
-          └──────────┬──────────┘
-                     │
-          204 allow  │  429 reject
-                     ▼
-          gateway proxies or returns rejection
+**Decision service mode** — sidecar: your gateway calls `/v1/decision`, handles forwarding itself.
+
+```mermaid
+sequenceDiagram
+    participant C as Client
+    participant G as Your Gateway<br/>(nginx / Envoy / Kong)
+    participant F as Fairvisor Edge<br/>decision_service
+    participant U as Upstream service
+
+    C->>G: Request
+    G->>F: POST /v1/decision<br/>(auth_request / ext_authz)
+    alt allow
+        F-->>G: 204 No Content
+        G->>U: Forward request
+        U-->>G: Response
+        G-->>C: Response
+    else reject
+        F-->>G: 429 + RateLimit headers
+        G-->>C: 429 Too Many Requests
+    end
 ```
 
-**Reverse proxy mode** (inline — Fairvisor handles proxying):
+No Redis, no external state — all counters live in `ngx.shared.dict`.
 
-```
- Client ──► Fairvisor Edge (reverse_proxy)
-                  │
-                  │  access.lua → rule_engine
-                  │  ngx.shared.dict
-                  │
-          allow ──► upstream service
-          reject ──► 429 + RFC 9333 headers
-```
+**Reverse proxy mode** — inline: Fairvisor handles both enforcement and proxying.
 
-**Wrapper mode** (transparent LLM proxy — swap base URL, no client changes):
+```mermaid
+sequenceDiagram
+    participant C as Client
+    participant F as Fairvisor Edge<br/>reverse_proxy
+    participant U as Upstream service
 
+    C->>F: Request
+    alt allow
+        F->>U: Forward request
+        U-->>F: Response
+        F-->>C: Response
+    else reject
+        F-->>C: 429 + RateLimit headers
+    end
 ```
- Client ──► Fairvisor Edge  POST /openai/v1/chat/completions
-            (wrapper)        Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
-                  │
-                  │  1. Validate CLIENT_JWT, extract org_id / user_id claims
-                  │  2. Enforce token budget (TPM / TPD / cost)
-                  │  3. Strip Authorization header, inject upstream API key
-                  │  4. Forward to https://api.openai.com (or Anthropic/Gemini)
-                  │  5. Count response tokens, refund unused reservation
-                  │
-          allow ──► upstream response (Authorization header stripped from reply)
-          reject ──► 429 + X-Fairvisor-Reason: tpm_exceeded / budget_exhausted
+
+**Wrapper mode** — transparent LLM proxy: swap the base URL, no other client changes needed.
+
+```mermaid
+sequenceDiagram
+    participant C as Client
+    participant F as Fairvisor Edge<br/>wrapper
+    participant U as Upstream LLM<br/>(OpenAI / Anthropic / Gemini)
+
+    C->>F: POST /openai/v1/chat/completions<br/>Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
+    F->>F: 1. Validate JWT · extract org_id claims
+    F->>F: 2. Enforce TPM / TPD / cost budget
+    alt budget ok
+        F->>U: POST /v1/chat/completions<br/>Authorization: Bearer UPSTREAM_KEY
+        U-->>F: 200 OK + token usage
+        F->>F: 3. Count tokens · refund unused reservation
+        F-->>C: 200 OK (Authorization stripped from reply)
+    else budget exceeded
+        F-->>C: 429 X-Fairvisor-Reason: tpm_exceeded
+    end
 ```
 
-Supported upstream paths out of the box: `/openai/*`, `/anthropic/*`, `/gemini/*`.
+Supported upstream paths: `/openai/*`, `/anthropic/*`, `/gemini/*`, `/grok/*`.
 
 All three modes use the same policy bundle and produce the same rejection headers.
 

From a2bb9dad92085dacbe180518580f5cb662626814 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Thu, 19 Mar 2026 20:42:41 +0100
Subject: [PATCH 35/64] =?UTF-8?q?docs:=20fix=20JWT=20wording=20=E2=80=94?=
 =?UTF-8?q?=20Fairvisor=20parses=20claims,=20does=20not=20validate=20signa?=
 =?UTF-8?q?ture?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 69f6c53..4c4b663 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,7 @@ The fastest path is **wrapper mode**: Fairvisor sits in front of the LLM API, en
 curl https://your-fairvisor-host/openai/v1/chat/completions   -H "Authorization: Bearer eyJhbGc...:sk-proj-..."   -H "Content-Type: application/json"   -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}'
 ```
 
-Fairvisor validates the JWT, extracts `org_id`, charges tokens against the budget, strips the `Authorization` header, and forwards with the upstream key. The upstream never sees the client JWT.
+Fairvisor parses the JWT claims (no signature validation — the JWT is trusted as-is), extracts `org_id`, charges tokens against the budget, strips the `Authorization` header, and forwards with the upstream key. The upstream never sees the client JWT.
 
 When the budget is exhausted:
 
@@ -289,7 +289,7 @@ sequenceDiagram
     participant U as Upstream LLM<br/>(OpenAI / Anthropic / Gemini)
 
     C->>F: POST /openai/v1/chat/completions<br/>Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
-    F->>F: 1. Validate JWT · extract org_id claims
+    F->>F: 1. Parse JWT claims (org_id, user_id)
     F->>F: 2. Enforce TPM / TPD / cost budget
     alt budget ok
         F->>U: POST /v1/chat/completions<br/>Authorization: Bearer UPSTREAM_KEY

From 4759b72ff8430c16b1e9cc790ff76cd3283284aa Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Thu, 19 Mar 2026 20:45:13 +0100
Subject: [PATCH 36/64] trim README: remove benchmark methodology, Contributing
 section; fix recipe name

---
 README.md | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 4c4b663..d739b01 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,6 @@
 - [CLI](#cli)
 - [SaaS control plane (optional)](#saas-control-plane-optional)
 - [Project layout](#project-layout)
-- [Contributing](#contributing)
 - [License](#license)
 
 ---
@@ -96,7 +95,7 @@ Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacem
 
 > **Runnable quickstart:** `examples/quickstart/` — `docker compose up -d` and run your first enforce/reject test in under a minute. See [`examples/quickstart/README.md`](examples/quickstart/README.md).
 >
-> **Recipes:** `examples/recipes/` — deployable team budgets, runaway agent guard, and provider failover examples.
+> **Recipes:** `examples/recipes/` — deployable team budgets, runaway agent guard, and circuit-breaker examples.
 >
 > **Sample artifacts:** `fixtures/` — canonical request/response fixtures for enforce, reject (TPM, TPD, prompt-too-large), and provider-native error bodies (OpenAI, Anthropic, Gemini).
 
@@ -338,17 +337,6 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll
 
 ## Performance
 
-### Benchmark methodology (March 2026)
-
-- **Hosts:** 2 × AWS `c7i.xlarge` (4 vCPU, 8 GiB each), cluster placement group, eu-central-1
-- **OS:** Ubuntu 24.04 LTS
-- **Runtime:** OpenResty 1.29.2.1, Fairvisor latest `main` (no Docker)
-- **Load tool:** `k6` v0.54.0, `constant-arrival-rate`, 10,000 RPS for 60s, 10s warmup
-- **Benchmark script:** `run-all.sh` from `fairvisor/benchmark`
-- **Topology:** two-host — Fairvisor and k6 on separate machines (VPC private network)
-- **Decision endpoint contract:** `POST /v1/decision` with `X-Original-Method` and `X-Original-URI`
-- **Note:** reverse proxy numbers include policy evaluation and upstream proxy hop to backend nginx.
-
 ### Latest measured latency @ 10,000 RPS
 
 | Percentile | Decision service | Reverse proxy | Raw nginx (baseline) |
@@ -426,17 +414,6 @@ docker/                  Docker artifacts
 docs/                    reference documentation
 ```
 
-## Contributing
-
-See [CONTRIBUTING.md](CONTRIBUTING.md). Bug reports, issues, and pull requests welcome.
-
-Run the test suite:
-
-```bash
-busted spec          # unit + integration
-pytest tests/e2e -v  # E2E (requires Docker)
-```
-
 ## License
 
 [Mozilla Public License 2.0](LICENSE)

From 5fac6e22c2286342b3cb547035c5bb095073c903 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 09:58:16 +0100
Subject: [PATCH 37/64] README: rework tagline, add hook + mode selector, drop
 Policy as code + ngx.shared.dict refs

---
 README.md | 56 ++++++++-----------------------------------------------
 1 file changed, 8 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index d739b01..2da0365 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,7 @@
 
 <h1 align="center">FAIRVISOR</h1>
 
-<h3 align="center">Turn API limits into enforceable business policy.</h3>
-
-<p align="center">
-  Every API that charges per token, serves paying tenants, or runs agentic pipelines needs<br>
-  enforceable limits — not just rate-limit middleware bolted on as an afterthought.<br>
-  <br>
-  Open-source edge enforcement engine for rate limits, quotas, and cost budgets.<br>
-  Runs standalone or with a SaaS control plane for team governance.
-</p>
+<h3 align="center">Stop one tenant from burning everyone's LLM budget.</h3>
 
 <p align="center">
   <a href="https://github.com/fairvisor/edge/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MPL--2.0-blue" alt="License: MPL-2.0"></a>
@@ -34,13 +26,11 @@
 
 ## Table of Contents
 
-- [What is Fairvisor?](#what-is-fairvisor)
 - [Why not nginx / Kong / Envoy?](#why-not-nginx--kong--envoy)
 - [Quick start](#quick-start)
 - [LLM token budget in 30 seconds](#llm-token-budget-in-30-seconds)
 - [How a request flows](#how-a-request-flows)
 - [Enforcement capabilities](#enforcement-capabilities)
-- [Policy as code](#policy-as-code)
 - [Performance](#performance)
 - [Deployment](#deployment)
 - [CLI](#cli)
@@ -50,21 +40,7 @@
 
 ---
 
-## What is Fairvisor?
-
-Fairvisor Edge is a **policy enforcement layer** that sits between your API gateway and your upstream services. Every request is evaluated against a declarative JSON policy bundle and receives a deterministic allow or reject verdict — with machine-readable rejection headers and sub-millisecond latency.
-
-It is **not** a reverse proxy replacement. It is **not** a WAF. It is a dedicated, composable enforcement point for:
-
-- **Rate limits and quotas** — per route, per tenant, per JWT claim, per API key
-- **Cost budgets** — cumulative spend caps per org, team, or endpoint
-- **LLM token limits** — TPM/TPD budgets with pre-request reservation and post-response refund
-- **Kill switches** — instant traffic blocking per descriptor, no restart required
-- **Shadow mode** — dry-run enforcement against real traffic before going live
-- **Loop detection** — stops runaway agentic workflows at the edge
-- **Circuit breaker** — auto-trips on spend spikes, auto-resets after cooldown
-
-All controls are defined in one versioned policy bundle. Policies hot-reload without restarting the process.
+Every LLM call costs tokens — and when multiple teams, customers, or agents share the same API credentials, a single bad actor can exhaust the budget for everyone. Fairvisor is a lightweight enforcement engine that sits in front of your LLM API and gives each tenant, team, or user their own token budget. No Redis, no separate rate-limit service — one container, one JSON policy file, sub-millisecond enforcement overhead.
 
 ## Why not nginx / Kong / Envoy?
 
@@ -93,6 +69,11 @@ Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacem
 
 ## Quick start
 
+> **Which mode is right for you?**
+> - **Wrapper** — your app calls OpenAI / Anthropic / Gemini directly → point your client at Fairvisor instead, no other code changes needed.
+> - **Reverse proxy** — you have a single upstream service → Fairvisor sits in front and enforces before forwarding.
+> - **Decision service** — you already run nginx, Envoy, or Kong → call `POST /v1/decision` from `auth_request` / `ext_authz`.
+
 > **Runnable quickstart:** `examples/quickstart/` — `docker compose up -d` and run your first enforce/reject test in under a minute. See [`examples/quickstart/README.md`](examples/quickstart/README.md).
 >
 > **Recipes:** `examples/recipes/` — deployable team budgets, runaway agent guard, and circuit-breaker examples.
@@ -259,8 +240,6 @@ sequenceDiagram
     end
 ```
 
-No Redis, no external state — all counters live in `ngx.shared.dict`.
-
 **Reverse proxy mode** — inline: Fairvisor handles both enforcement and proxying.
 
 ```mermaid
@@ -318,23 +297,6 @@ All three modes use the same policy bundle and produce the same rejection header
 
 Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:addr`, `ip:country`). Combine multiple keys per rule for compound matching.
 
-## Policy as code
-
-Define policies in JSON, validate against the schema, test in shadow mode, then promote:
-
-```bash
-# Validate bundle structure and rule semantics
-fairvisor validate ./policies.json
-
-# Replay real traffic without blocking anything
-fairvisor test --dry-run
-
-# Apply a new bundle (hot-reload, no restart)
-fairvisor connect --push ./policies.json
-```
-
-Policies are versioned JSON — commit them to Git, review changes in PRs, roll back with confidence.
-
 ## Performance
 
 ### Latest measured latency @ 10,000 RPS
@@ -355,8 +317,6 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll
 | Simple rate limit (1 rule) | 195,000 |
 | Complex policy (5 rules, JWT parsing, loop detection) | 195,000 |
 
-**No external datastore.** All enforcement state lives in in-process shared memory (`ngx.shared.dict`). No Redis, no Postgres, no network round-trips in the decision path.
-
 Reproduce: see [fairvisor/benchmark](https://github.com/fairvisor/benchmark) — the canonical benchmark source of truth for Fairvisor Edge performance numbers.
 
 
@@ -407,7 +367,7 @@ cli/                     command-line tooling
 spec/                    unit and integration tests (busted)
 tests/e2e/               Docker-based E2E tests (pytest)
 examples/quickstart/     runnable quickstart (docker compose up -d)
-examples/recipes/        deployable policy recipes (team budgets, agent guard, failover)
+examples/recipes/        deployable policy recipes (team budgets, agent guard, circuit breaker)
 fixtures/                canonical request/response sample artifacts
 helm/                    Helm chart
 docker/                  Docker artifacts

From b4e2b4d6bf32790bcf6b5008de6a3960c63cd016 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 10:11:29 +0100
Subject: [PATCH 38/64] README: remove hero latency/RPS line

---
 README.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/README.md b/README.md
index 2da0365..153ad46 100644
--- a/README.md
+++ b/README.md
@@ -18,10 +18,6 @@
   <a href="https://docs.fairvisor.com/docs/quickstart/"><img src="https://img.shields.io/badge/docs-quickstart-informational" alt="Docs"></a>
 </p>
 
-<p align="center">
-  <b>Latency: &lt; 70 µs enforcement overhead · 195k RPS max throughput · No external state (no Redis / DB)</b>
-</p>
-
 ---
 
 ## Table of Contents

From c75e2a554e47b2c5b760a76d7029d7f8b56f6981 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 10:16:09 +0100
Subject: [PATCH 39/64] =?UTF-8?q?README:=20broaden=20hook=20paragraph=20?=
 =?UTF-8?q?=E2=80=94=20not=20LLM-only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 153ad46..44b9d4e 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@
 
 ---
 
-Every LLM call costs tokens — and when multiple teams, customers, or agents share the same API credentials, a single bad actor can exhaust the budget for everyone. Fairvisor is a lightweight enforcement engine that sits in front of your LLM API and gives each tenant, team, or user their own token budget. No Redis, no separate rate-limit service — one container, one JSON policy file, sub-millisecond enforcement overhead.
+When multiple tenants, agents, or services share an API, one misbehaving caller can exhaust the budget for everyone — whether that's LLM tokens, API credits, or request quotas. Fairvisor is a lightweight enforcement engine that gives each tenant isolated limits at the edge: token budgets, cost caps, rate limits, and kill switches — keyed on JWT claims, API keys, or IP. One container, one JSON policy file, no Redis.
 
 ## Why not nginx / Kong / Envoy?
 

From 51d45ae767e3442fcf175f22e370fe847b278296 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 10:57:19 +0100
Subject: [PATCH 40/64] docs: add 'Why we built this' section and ToC entry

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index 44b9d4e..48cac2b 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@
 
 ## Table of Contents
 
+- [Why we built this](#why-we-built-this)
 - [Why not nginx / Kong / Envoy?](#why-not-nginx--kong--envoy)
 - [Quick start](#quick-start)
 - [LLM token budget in 30 seconds](#llm-token-budget-in-30-seconds)
@@ -38,6 +39,20 @@
 
 When multiple tenants, agents, or services share an API, one misbehaving caller can exhaust the budget for everyone — whether that's LLM tokens, API credits, or request quotas. Fairvisor is a lightweight enforcement engine that gives each tenant isolated limits at the edge: token budgets, cost caps, rate limits, and kill switches — keyed on JWT claims, API keys, or IP. One container, one JSON policy file, no Redis.
 
+## Why we built this
+
+API gateways count requests. LLM providers bill by the token.
+
+When you serve multiple tenants — customers, teams, or agentic pipelines — that gap becomes a real problem. One runaway agent can consume a month's token budget overnight. Your gateway sees one request per second; your invoice shows 3 million tokens.
+
+We needed something that:
+- Understood token budgets, not just request counts
+- Could key limits on JWT claims (`org_id`, `plan`, `user_id`), not just IPs
+- Added no external state — no Redis, no network round-trip in the hot path
+- Could plug into nginx or Envoy *or* run standalone as a transparent LLM proxy
+
+We couldn't find it, so we built Fairvisor.
+
 ## Why not nginx / Kong / Envoy?
 
 If you have an existing gateway, the question is whether Fairvisor adds anything you can't get from the plugin ecosystem already installed. Here is the honest comparison:

From 0a9d1ed224ee8b8abd4eba72570379238496f67e Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 13:35:53 +0100
Subject: [PATCH 41/64] =?UTF-8?q?docs:=20README=20final=20polish=20?=
 =?UTF-8?q?=E2=80=94=20tagline,=20quickstart=20restructure,=20badge=20fix,?=
 =?UTF-8?q?=20ip:address,=20curl=20formatting,=20Architecture=20ToC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 62 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 48cac2b..d267450 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,12 @@
 
 <h1 align="center">FAIRVISOR</h1>
 
-<h3 align="center">Stop one tenant from burning everyone's LLM budget.</h3>
+<h3 align="center">Stop one tenant from exhausting everyone's budget.</h3>
 
 <p align="center">
   <a href="https://github.com/fairvisor/edge/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MPL--2.0-blue" alt="License: MPL-2.0"></a>
   <a href="https://github.com/fairvisor/edge/releases"><img src="https://img.shields.io/github/v/release/fairvisor/edge" alt="Latest release"></a>
-  <a href="https://github.com/fairvisor/-edge/actions"><img src="https://img.shields.io/github/actions/workflow/status/fairvisor/edge/ci.yml?label=CI" alt="CI"></a>
+  <a href="https://github.com/fairvisor/edge/actions"><img src="https://img.shields.io/github/actions/workflow/status/fairvisor/edge/ci.yml?label=CI" alt="CI"></a>
   <a href="https://fairvisor.github.io/edge/"><img src="https://img.shields.io/endpoint?url=https%3A%2F%2Ffairvisor.github.io%2Fedge%2Fcoverage-badge.json" alt="Lua coverage"></a>
   <a href="https://github.com/fairvisor/edge/pkgs/container/fairvisor-edge"><img src="https://img.shields.io/badge/ghcr.io-fairvisor--edge-blue?logo=docker" alt="GHCR image"></a>
   <img src="https://img.shields.io/badge/platform-linux%2Famd64%20·%20linux%2Farm64-lightgrey" alt="Platforms: linux/amd64 · linux/arm64">
@@ -27,6 +27,7 @@
 - [Quick start](#quick-start)
 - [LLM token budget in 30 seconds](#llm-token-budget-in-30-seconds)
 - [How a request flows](#how-a-request-flows)
+  - [Architecture](#architecture)
 - [Enforcement capabilities](#enforcement-capabilities)
 - [Performance](#performance)
 - [Deployment](#deployment)
@@ -81,17 +82,30 @@ Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacem
 ## Quick start
 
 > **Which mode is right for you?**
-> - **Wrapper** — your app calls OpenAI / Anthropic / Gemini directly → point your client at Fairvisor instead, no other code changes needed.
+> - **Wrapper** — your app calls OpenAI / Anthropic / Gemini directly → point your client at Fairvisor instead, no other code changes needed. *Fastest to try.*
 > - **Reverse proxy** — you have a single upstream service → Fairvisor sits in front and enforces before forwarding.
 > - **Decision service** — you already run nginx, Envoy, or Kong → call `POST /v1/decision` from `auth_request` / `ext_authz`.
 
-> **Runnable quickstart:** `examples/quickstart/` — `docker compose up -d` and run your first enforce/reject test in under a minute. See [`examples/quickstart/README.md`](examples/quickstart/README.md).
->
-> **Recipes:** `examples/recipes/` — deployable team budgets, runaway agent guard, and circuit-breaker examples.
->
-> **Sample artifacts:** `fixtures/` — canonical request/response fixtures for enforce, reject (TPM, TPD, prompt-too-large), and provider-native error bodies (OpenAI, Anthropic, Gemini).
+### Fastest path
 
-### 1. Create a policy
+```bash
+git clone https://github.com/fairvisor/edge.git
+cd edge/examples/quickstart
+docker compose up -d
+```
+
+Run your first enforce/reject test in under a minute — full walkthrough in [`examples/quickstart/README.md`](examples/quickstart/README.md).
+
+**Recipes:** `examples/recipes/` — team budgets, runaway agent guard, circuit-breaker.
+
+**Sample artifacts:** `fixtures/` — canonical enforce/reject fixtures (OpenAI, Anthropic, Gemini).
+
+### Minimal decision\_service example
+
+<details>
+<summary>Expand — manual setup with a single <code>docker run</code></summary>
+
+**1. Create a policy**
 
 ```bash
 mkdir fairvisor-demo && cd fairvisor-demo
@@ -124,7 +138,7 @@ mkdir fairvisor-demo && cd fairvisor-demo
 }
 ```
 
-### 2. Run the edge
+**2. Run the edge**
 
 ```bash
 docker run -d \
@@ -133,22 +147,35 @@ docker run -d \
   -v "$(pwd)/policy.json:/etc/fairvisor/policy.json:ro" \
   -e FAIRVISOR_CONFIG_FILE=/etc/fairvisor/policy.json \
   -e FAIRVISOR_MODE=decision_service \
-  ghcr.io/fairvisor/fairvisor-edge:v0.1.0
+  ghcr.io/fairvisor/fairvisor-edge:latest
 ```
 
-### 3. Verify
+**3. Verify**
 
 ```bash
 curl -sf http://localhost:8080/readyz
 # {"status":"ok"}
 
-curl -s -w "\nHTTP %{http_code}\n" \
+# Allowed request → HTTP 204
+curl -s -o /dev/null -w "HTTP %{http_code}\n" \
   -H "X-Original-Method: GET" \
   -H "X-Original-URI: /api/data" \
   -H "X-Forwarded-For: 10.0.0.1" \
   http://localhost:8080/v1/decision
+
+# Rejected request — exhaust the burst (>10 requests)
+for i in $(seq 1 12); do
+  curl -s -o /dev/null -w "HTTP %{http_code}\n" \
+    -H "X-Original-Method: GET" \
+    -H "X-Original-URI: /api/data" \
+    -H "X-Forwarded-For: 10.0.0.1" \
+    http://localhost:8080/v1/decision
+done
+# last requests → HTTP 429 (response carries X-Fairvisor-Reason: rate_limit_exceeded)
 ```
 
+</details>
+
 > Full walkthrough: [docs.fairvisor.com/docs/quickstart](https://docs.fairvisor.com/docs/quickstart/)
 
 ## LLM token budget in 30 seconds
@@ -182,7 +209,10 @@ The fastest path is **wrapper mode**: Fairvisor sits in front of the LLM API, en
 **2. Call the API** — token format `Bearer <client-jwt>:<upstream-key>`:
 
 ```bash
-curl https://your-fairvisor-host/openai/v1/chat/completions   -H "Authorization: Bearer eyJhbGc...:sk-proj-..."   -H "Content-Type: application/json"   -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}'
+curl https://your-fairvisor-host/openai/v1/chat/completions \
+  -H "Authorization: Bearer eyJhbGc...:sk-proj-..." \
+  -H "Content-Type: application/json" \
+  -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}'
 ```
 
 Fairvisor parses the JWT claims (no signature validation — the JWT is trusted as-is), extracts `org_id`, charges tokens against the budget, strips the `Authorization` header, and forwards with the upstream key. The upstream never sees the client JWT.
@@ -298,7 +328,7 @@ All three modes use the same policy bundle and produce the same rejection header
 
 | If you need to… | Algorithm | Typical identity keys | Reject reason |
 |---|---|---|---|
-| Cap request frequency | `token_bucket` | `jwt:user_id`, `header:x-api-key`, `ip:addr` | `rate_limit_exceeded` |
+| Cap request frequency | `token_bucket` | `jwt:user_id`, `header:x-api-key`, `ip:address` | `rate_limit_exceeded` |
 | Cap cumulative spend | `cost_based` | `jwt:org_id`, `jwt:plan` | `budget_exhausted` |
 | Cap LLM tokens (TPM/TPD) | `token_bucket_llm` | `jwt:org_id`, `jwt:user_id` | `tpm_exceeded`, `tpd_exceeded` |
 | Instantly block a segment | kill switch | any descriptor | `kill_switch_active` |
@@ -306,7 +336,7 @@ All three modes use the same policy bundle and produce the same rejection header
 | Stop runaway agent loops | loop detection | request fingerprint | `loop_detected` |
 | Clamp spend spikes | circuit breaker | global or policy scope | `circuit_breaker_open` |
 
-Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:addr`, `ip:country`). Combine multiple keys per rule for compound matching.
+Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:address`, `ip:country`). Combine multiple keys per rule for compound matching.
 
 ## Performance
 

From c7d13454645096d891de3a3b9c20257a64cc01f3 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 14:37:06 +0100
Subject: [PATCH 42/64] fix(recipes): replace unsupported loop_detector rule
 with spec-level loop_detection config

---
 .../recipes/runaway-agent-guard/policy.json   | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/examples/recipes/runaway-agent-guard/policy.json b/examples/recipes/runaway-agent-guard/policy.json
index 38de248..b3facab 100644
--- a/examples/recipes/runaway-agent-guard/policy.json
+++ b/examples/recipes/runaway-agent-guard/policy.json
@@ -8,23 +8,24 @@
       "spec": {
         "selector": {
           "pathPrefix": "/",
-          "methods": ["POST"]
+          "methods": [
+            "POST"
+          ]
         },
         "mode": "enforce",
+        "loop_detection": {
+          "enabled": true,
+          "window_seconds": 60,
+          "threshold_identical_requests": 30,
+          "action": "reject",
+          "similarity": "exact"
+        },
         "rules": [
-          {
-            "name": "loop-detection",
-            "limit_keys": ["jwt:agent_id"],
-            "algorithm": "loop_detector",
-            "algorithm_config": {
-              "window_seconds": 60,
-              "max_requests": 30,
-              "cooldown_seconds": 120
-            }
-          },
           {
             "name": "agent-tpm-guard",
-            "limit_keys": ["jwt:agent_id"],
+            "limit_keys": [
+              "jwt:agent_id"
+            ],
             "algorithm": "token_bucket_llm",
             "algorithm_config": {
               "tokens_per_minute": 50000,

From a7f935d0ddffa00091db4f6af1ee1d0064995d60 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 14:48:28 +0100
Subject: [PATCH 43/64] =?UTF-8?q?fix(recipes):=20change=20cost=5Fbased=20p?=
 =?UTF-8?q?eriod=20"30d"=20=E2=86=92=20"7d"=20(only=205m/1h/1d/7d=20are=20?=
 =?UTF-8?q?valid)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/recipes/team-budgets/policy.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/recipes/team-budgets/policy.json b/examples/recipes/team-budgets/policy.json
index 87d7c63..7a87d2c 100644
--- a/examples/recipes/team-budgets/policy.json
+++ b/examples/recipes/team-budgets/policy.json
@@ -29,7 +29,7 @@
             "algorithm": "cost_based",
             "algorithm_config": {
               "budget": 50000,
-              "period": "30d",
+              "period": "7d",
               "cost_key": "fixed",
               "fixed_cost": 1,
               "staged_actions": [

From 8566c15e52bd551ad6a5538a177eb028bfbbe640 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 17:35:20 +0100
Subject: [PATCH 44/64] docs: rewrite 'no external state' bullet to focus on
 request latency

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d267450..5b01bdd 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ When you serve multiple tenants — customers, teams, or agentic pipelines — t
 We needed something that:
 - Understood token budgets, not just request counts
 - Could key limits on JWT claims (`org_id`, `plan`, `user_id`), not just IPs
-- Added no external state — no Redis, no network round-trip in the hot path
+- Kept every request fast — no Redis round-trip, no extra network call in the hot path
 - Could plug into nginx or Envoy *or* run standalone as a transparent LLM proxy
 
 We couldn't find it, so we built Fairvisor.

From 1e9ffc69eaeba78b2a1589643e023f755bd1e789 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 17:53:04 +0100
Subject: [PATCH 45/64] =?UTF-8?q?docs:=20reword=20'does=20not=20replace'?=
 =?UTF-8?q?=20=E2=80=94=20Fairvisor=20can=20run=20standalone=20or=20alongs?=
 =?UTF-8?q?ide=20existing=20gateways?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5b01bdd..0811f7d 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ If you have an existing gateway, the question is whether Fairvisor adds anything
 
 **If you are on Cloudflare or Akamai**, per-JWT-claim limits, LLM token budgets, and cost caps are not in the platform's model. If your limits are tenant-aware or cost-aware, you need something that runs in your own stack.
 
-Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacement. See [nginx auth_request →](https://docs.fairvisor.com/docs/gateway/nginx/) · [Envoy ext_authz →](https://docs.fairvisor.com/docs/gateway/envoy/) · [Kong / Traefik →](https://docs.fairvisor.com/docs/gateway/) for integration patterns.
+Fairvisor can run alongside Kong, nginx, and Envoy — or as a standalone reverse proxy if you don't need a separate gateway. See [nginx auth_request →](https://docs.fairvisor.com/docs/gateway/nginx/) · [Envoy ext_authz →](https://docs.fairvisor.com/docs/gateway/envoy/) · [Kong / Traefik →](https://docs.fairvisor.com/docs/gateway/) for integration patterns.
 
 ## Quick start
 
@@ -372,7 +372,7 @@ Reproduce: see [fairvisor/benchmark](https://github.com/fairvisor/benchmark) —
 | Envoy `ext_authz` | [docs/gateway/envoy](https://docs.fairvisor.com/docs/gateway/envoy/) |
 | Kong / Traefik | [docs/gateway](https://docs.fairvisor.com/docs/gateway/) |
 
-Fairvisor integrates **alongside** Kong, nginx, Envoy, and Traefik — it does not replace them.
+Fairvisor works alongside Kong, nginx, Envoy, and Traefik — or runs standalone as a reverse proxy when you don't need a separate gateway.
 
 ## CLI
 

From 96de2a1593e067d9abb140cbfcca8fae20350eae Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:06:50 +0000
Subject: [PATCH 46/64] docs(readme): fix quickstart clone path

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0811f7d..5c586f6 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ Fairvisor can run alongside Kong, nginx, and Envoy — or as a standalone revers
 
 ```bash
 git clone https://github.com/fairvisor/edge.git
-cd examples/quickstart
+cd edge/examples/quickstart
 docker compose up -d
 ```
 
@@ -422,4 +422,3 @@ docs/                    reference documentation
 ---
 
 **Docs:** [docs.fairvisor.com](https://docs.fairvisor.com/docs/) · **Website:** [fairvisor.com](https://fairvisor.com) · **Quickstart:** [5 minutes to enforcement](https://docs.fairvisor.com/docs/quickstart/)
-

From a4b166305671b2c538033d1f5f8d10ce5a1a351e Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:20:54 +0000
Subject: [PATCH 47/64] fix(quickstart): build local image instead of ghcr

---
 examples/quickstart/README.md          | 3 +++
 examples/quickstart/docker-compose.yml | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md
index 3acb943..375fa34 100644
--- a/examples/quickstart/README.md
+++ b/examples/quickstart/README.md
@@ -13,6 +13,9 @@ Go from `git clone` to working policy enforcement in one step.
 docker compose up -d
 ```
 
+The first run builds the `edge` image locally from `docker/Dockerfile`, so no
+GHCR login is required.
+
 Wait for the edge service to report healthy:
 
 ```bash
diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
index 870812d..092b7a5 100644
--- a/examples/quickstart/docker-compose.yml
+++ b/examples/quickstart/docker-compose.yml
@@ -24,7 +24,9 @@
 
 services:
   edge:
-    image: ghcr.io/fairvisor/fairvisor-edge:latest
+    build:
+      context: ../..
+      dockerfile: docker/Dockerfile
     ports:
       - "8080:8080"
     environment:

From 4a6a52cf76364467d143bee8965eeca247871a50 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:24:17 +0000
Subject: [PATCH 48/64] docs(quickstart): rename compose service to fairvisor

---
 examples/quickstart/README.md          | 6 +++---
 examples/quickstart/docker-compose.yml | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md
index 375fa34..d9b637f 100644
--- a/examples/quickstart/README.md
+++ b/examples/quickstart/README.md
@@ -13,14 +13,14 @@ Go from `git clone` to working policy enforcement in one step.
 docker compose up -d
 ```
 
-The first run builds the `edge` image locally from `docker/Dockerfile`, so no
+The first run builds the `fairvisor` image locally from `docker/Dockerfile`, so no
 GHCR login is required.
 
-Wait for the edge service to report healthy:
+Wait for the `fairvisor` service to report healthy:
 
 ```bash
 docker compose ps
-# edge should show "healthy"
+# fairvisor should show "healthy"
 ```
 
 ## Verify enforcement
diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
index 092b7a5..2efc128 100644
--- a/examples/quickstart/docker-compose.yml
+++ b/examples/quickstart/docker-compose.yml
@@ -19,11 +19,10 @@
 # cannot be demonstrated with this mock stack.
 #
 # This file is also the base for the e2e-smoke CI check.
-# CI extends it via tests/e2e/docker-compose.test.yml; do not diverge the
-# service name, port, or volume contract without updating CI as well.
+# CI expects the same port and volume contract; update CI too if those change.
 
 services:
-  edge:
+  fairvisor:
     build:
       context: ../..
       dockerfile: docker/Dockerfile

From a739b5bc28f28016b064445a73625d382fedd61c Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:26:14 +0000
Subject: [PATCH 49/64] chore(quickstart): reduce readyz healthcheck frequency

---
 examples/quickstart/docker-compose.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
index 2efc128..51cacf6 100644
--- a/examples/quickstart/docker-compose.yml
+++ b/examples/quickstart/docker-compose.yml
@@ -42,7 +42,7 @@ services:
         condition: service_healthy
     healthcheck:
       test: ["CMD", "curl", "-sf", "http://127.0.0.1:8080/readyz"]
-      interval: 2s
+      interval: 10s  # first probe runs one full interval after start
       timeout: 2s
       retries: 15
       start_period: 5s

From 3ba11ceee66fef8bdb8ea0c45840249c32fb6ed0 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:29:27 +0000
Subject: [PATCH 50/64] chore(quickstart): remove mock_llm healthcheck

---
 examples/quickstart/docker-compose.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
index 51cacf6..60a9950 100644
--- a/examples/quickstart/docker-compose.yml
+++ b/examples/quickstart/docker-compose.yml
@@ -51,9 +51,3 @@ services:
     image: nginx:1.27-alpine
     volumes:
       - ./mock-llm.conf:/etc/nginx/nginx.conf:ro
-    healthcheck:
-      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:80/"]
-      interval: 2s
-      timeout: 2s
-      retries: 10
-      start_period: 5s

From 78dc693d9117568b48bdc79b1a3e6632221e1c52 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:30:46 +0000
Subject: [PATCH 51/64] chore(quickstart): restore mock_llm healthcheck at low
 frequency

---
 examples/quickstart/docker-compose.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
index 60a9950..59a2487 100644
--- a/examples/quickstart/docker-compose.yml
+++ b/examples/quickstart/docker-compose.yml
@@ -51,3 +51,9 @@ services:
     image: nginx:1.27-alpine
     volumes:
       - ./mock-llm.conf:/etc/nginx/nginx.conf:ro
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:80/"]
+      interval: 10s  # first probe runs one full interval after start
+      timeout: 2s
+      retries: 10
+      start_period: 5s

From cfd558e5efceafc96f6942472d81a9164d42ce60 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 17:34:31 +0000
Subject: [PATCH 52/64] fix(docker): add maxminddb dev lib and tune map hashes

---
 docker/Dockerfile          | 1 +
 docker/nginx.conf.template | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index d1812e7..3c608c4 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -5,6 +5,7 @@ RUN apt-get update && apt-get upgrade -y --no-install-recommends \
   gettext-base \
   python3 \
   libmaxminddb0 \
+  libmaxminddb-dev \
   mmdb-bin \
   && rm -rf /var/lib/apt/lists/*
 
diff --git a/docker/nginx.conf.template b/docker/nginx.conf.template
index c0d4184..7786b79 100644
--- a/docker/nginx.conf.template
+++ b/docker/nginx.conf.template
@@ -25,6 +25,8 @@ worker_shutdown_timeout 35s;
 http {
   resolver 127.0.0.11 ipv6=off valid=30s;
   resolver_timeout 2s;
+  map_hash_max_size 4096;
+  map_hash_bucket_size 64;
 
   geo $is_tor_exit {
     default 0;

From 111d14d6977a1630324a481ff4aad562f8fb5ee1 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sun, 22 Mar 2026 18:11:39 +0000
Subject: [PATCH 53/64] fix(nginx): increase map hash size for quickstart

---
 docker/nginx.conf.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/nginx.conf.template b/docker/nginx.conf.template
index 7786b79..c5c77df 100644
--- a/docker/nginx.conf.template
+++ b/docker/nginx.conf.template
@@ -25,7 +25,7 @@ worker_shutdown_timeout 35s;
 http {
   resolver 127.0.0.11 ipv6=off valid=30s;
   resolver_timeout 2s;
-  map_hash_max_size 4096;
+  map_hash_max_size 16384;
   map_hash_bucket_size 64;
 
   geo $is_tor_exit {

From a03320419e39b94c774b355b40f550a8dfe582a8 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 19:30:24 +0100
Subject: [PATCH 54/64] fix(nginx): set map_hash_max_size 131072 to cover all
 85k ASN entries

---
 docker/nginx.conf.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/nginx.conf.template b/docker/nginx.conf.template
index c5c77df..866981c 100644
--- a/docker/nginx.conf.template
+++ b/docker/nginx.conf.template
@@ -25,7 +25,7 @@ worker_shutdown_timeout 35s;
 http {
   resolver 127.0.0.11 ipv6=off valid=30s;
   resolver_timeout 2s;
-  map_hash_max_size 16384;
+  map_hash_max_size 131072;
   map_hash_bucket_size 64;
 
   geo $is_tor_exit {

From 7772219a4c2f003d9f48345a4ebb116a828c7155 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 20:01:35 +0100
Subject: [PATCH 55/64] =?UTF-8?q?fix(nginx):=20map=5Fhash=5Fmax=5Fsize=202?=
 =?UTF-8?q?62144=20=E2=80=94=20need=20~3x=20entries=20for=20collision-free?=
 =?UTF-8?q?=20hash=20(85k=20ASNs)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docker/nginx.conf.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/nginx.conf.template b/docker/nginx.conf.template
index 866981c..c745a65 100644
--- a/docker/nginx.conf.template
+++ b/docker/nginx.conf.template
@@ -25,7 +25,7 @@ worker_shutdown_timeout 35s;
 http {
   resolver 127.0.0.11 ipv6=off valid=30s;
   resolver_timeout 2s;
-  map_hash_max_size 131072;
+  map_hash_max_size 262144;
   map_hash_bucket_size 64;
 
   geo $is_tor_exit {

From 8a7377705afda51968c5272ee30d26c646a87587 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 21:32:07 +0100
Subject: [PATCH 56/64] fix(cli): add bin/fairvisor-cli with corrected -I path
 for cli.* modules

---
 bin/fairvisor-cli | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100755 bin/fairvisor-cli

diff --git a/bin/fairvisor-cli b/bin/fairvisor-cli
new file mode 100755
index 0000000..1b1a490
--- /dev/null
+++ b/bin/fairvisor-cli
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Fairvisor CLI launcher: runs cli/main.lua under OpenResty's `resty`.
+# Requires: OpenResty 'resty' in PATH (e.g. openresty package or OPENRESTY_HOME)
+#
+# -I <root>/src makes fairvisor.* modules resolvable; -I <root> does the same
+# for cli.* modules.
+
+# Repo root is the parent of this script's directory. Abort if it cannot be
+# resolved, otherwise resty would be started with bogus "/src" include paths.
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" || exit 1
+
+exec resty -I "${ROOT_DIR}/src" -I "${ROOT_DIR}" \
+  "${ROOT_DIR}/cli/main.lua" "$@"

From 60927038d6376b36b226a4b10e13fb5c946bc0c5 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 21:32:31 +0100
Subject: [PATCH 57/64] =?UTF-8?q?fix(cli):=20Dockerfile.cli=20=E2=86=92=20?=
 =?UTF-8?q?bin/fairvisor-cli,=20ENTRYPOINT=20updated?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docker/Dockerfile.cli | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile.cli b/docker/Dockerfile.cli
index bf52315..97f263f 100644
--- a/docker/Dockerfile.cli
+++ b/docker/Dockerfile.cli
@@ -9,8 +9,8 @@ WORKDIR /opt/fairvisor
 
 COPY src /opt/fairvisor/src
 COPY cli /opt/fairvisor/cli
-COPY bin/fairvisor /opt/fairvisor/bin/fairvisor
+COPY bin/fairvisor-cli /opt/fairvisor/bin/fairvisor-cli
 
-RUN chmod +x /opt/fairvisor/bin/fairvisor
+RUN chmod +x /opt/fairvisor/bin/fairvisor-cli
 
-ENTRYPOINT ["/opt/fairvisor/bin/fairvisor"]
+ENTRYPOINT ["/opt/fairvisor/bin/fairvisor-cli"]

From fac3632bc36bb895aca85c373b2775ef94e37676 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 21:32:31 +0100
Subject: [PATCH 58/64] =?UTF-8?q?docs(cli):=20rename=20fairvisor=20?=
 =?UTF-8?q?=E2=86=92=20fairvisor-cli,=20fix=20resty=20-I=20path=20in=20doc?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cli/README.md | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/cli/README.md b/cli/README.md
index bd18187..cd3cbb8 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -12,40 +12,40 @@ Command-line tool for scaffolding policies, validating configs, dry-run testing,
 From the repo root:
 
 ```bash
-./bin/fairvisor <command> [options]
+./bin/fairvisor-cli <command> [options]
 ```
 
 Or with `resty` directly (e.g. from another directory, adjusting `-I` paths):
 
 ```bash
-resty -I /path/to/fv-oss/src -I /path/to/fv-oss/cli /path/to/fv-oss/cli/main.lua <command> [options]
+resty -I /path/to/fv-oss/src -I /path/to/fv-oss /path/to/fv-oss/cli/main.lua <command> [options]
 ```
 
-`bin/fairvisor` sets `-I` to the repo's `src` and `cli` so that `require("cli.commands.init")` and `require("fairvisor.bundle_loader")` resolve correctly.
+`bin/fairvisor-cli` sets `-I` to the repo's `src` and root (for `cli.*` modules) so that `require("cli.commands.init")` and `require("fairvisor.bundle_loader")` resolve correctly.
 
 ## Commands
 
 | Command | Description |
 |--------|-------------|
-| `fairvisor init [--template=api\|llm\|webhook]` | Generate `policy.json` and `edge.env.example` in the current directory. |
-| `fairvisor validate <file\|->` | Validate policy JSON; exit 0 if valid, non-zero with errors otherwise. |
-| `fairvisor test <file> [--requests=<file>] [--format=table\|json]` | Dry-run mock requests through the rule engine. |
-| `fairvisor connect [--token=TOKEN] [--url=URL] [--output=PATH]` | Write credentials, verify SaaS connection, optionally download initial bundle. |
-| `fairvisor status [--edge-url=URL] [--format=table\|json]` | Show policy version, SaaS connection, counters. |
-| `fairvisor logs [--action=ACTION] [--reason=REASON]` | Stream structured logs with optional filters. |
-| `fairvisor version` | Print CLI version. |
-| `fairvisor help` | Print command list and usage. |
+| `fairvisor-cli init [--template=api\|llm\|webhook]` | Generate `policy.json` and `edge.env.example` in the current directory. |
+| `fairvisor-cli validate <file\|->` | Validate policy JSON; exit 0 if valid, non-zero with errors otherwise. |
+| `fairvisor-cli test <file> [--requests=<file>] [--format=table\|json]` | Dry-run mock requests through the rule engine. |
+| `fairvisor-cli connect [--token=TOKEN] [--url=URL] [--output=PATH]` | Write credentials, verify SaaS connection, optionally download initial bundle. |
+| `fairvisor-cli status [--edge-url=URL] [--format=table\|json]` | Show policy version, SaaS connection, counters. |
+| `fairvisor-cli logs [--action=ACTION] [--reason=REASON]` | Stream structured logs with optional filters. |
+| `fairvisor-cli version` | Print CLI version. |
+| `fairvisor-cli help` | Print command list and usage. |
 
 ## Examples
 
 ```bash
-fairvisor init
-fairvisor init --template=llm
-fairvisor validate policy.json
-fairvisor test policy.json
-fairvisor connect --token=eyJ...
-fairvisor version
-fairvisor help
+fairvisor-cli init
+fairvisor-cli init --template=llm
+fairvisor-cli validate policy.json
+fairvisor-cli test policy.json
+fairvisor-cli connect --token=eyJ...
+fairvisor-cli version
+fairvisor-cli help
 ```
 
 ## Tests

From f2059fc4728681b05c10ef3aa80c3176cbe89309 Mon Sep 17 00:00:00 2001
From: Lev <lev@leontiev.me>
Date: Sun, 22 Mar 2026 21:45:19 +0100
Subject: [PATCH 59/64] =?UTF-8?q?docs(readme):=20fairvisor=20=E2=86=92=20f?=
 =?UTF-8?q?airvisor-cli=20in=20CLI=20section?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 5c586f6..2f5c0b0 100644
--- a/README.md
+++ b/README.md
@@ -377,12 +377,12 @@ Fairvisor works alongside Kong, nginx, Envoy, and Traefik — or runs standalone
 ## CLI
 
 ```bash
-fairvisor init --template=api    # scaffold a policy bundle
-fairvisor validate policy.json   # validate before deploying
-fairvisor test --dry-run         # shadow-mode replay
-fairvisor status                 # edge health and loaded bundle info
-fairvisor logs                   # tail rejection events
-fairvisor connect                # connect to SaaS control plane
+fairvisor-cli init --template=api    # scaffold a policy bundle
+fairvisor-cli validate policy.json   # validate before deploying
+fairvisor-cli test --dry-run         # shadow-mode replay
+fairvisor-cli status                 # edge health and loaded bundle info
+fairvisor-cli logs                   # tail rejection events
+fairvisor-cli connect                # connect to SaaS control plane
 ```
 
 ## SaaS control plane (optional)

From ade005ae88a29e10b921501d926d129c5566add4 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Mon, 23 Mar 2026 11:40:48 +0000
Subject: [PATCH 60/64] feat(limiter): improve token count accuracy and
 max_completion_tokens support

- Fix _simple_word_estimate to properly skip false-positive "content" key
  matches (e.g. when key appears inside a string value) by verifying
  the value separator (: "...") before counting characters
- Add _extract_max_tokens() to parse max_tokens / max_completion_tokens
  from raw request body when request_context.max_tokens is not set,
  enabling accurate budget reservation for OpenAI-compatible payloads

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/fairvisor/llm_limiter.lua | 49 ++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/fairvisor/llm_limiter.lua b/src/fairvisor/llm_limiter.lua
index 3b29bc0..d184bc7 100644
--- a/src/fairvisor/llm_limiter.lua
+++ b/src/fairvisor/llm_limiter.lua
@@ -189,23 +189,32 @@ local function _simple_word_estimate(request_context)
     local array_end = array_start and string_find(body, "]", array_start, true)
     if array_start and array_end then
       local segment = string_sub(body, array_start, array_end)
-      local marker = "\"content\":\""
-      local marker_len = #marker
       local position = 1
       local char_count = 0
 
       while true do
-        local start_pos = string_find(segment, marker, position, true)
-        if not start_pos then
+        -- Find "content" key
+        local key_start = string_find(segment, "\"content\"", position, true)
+        if not key_start then
           break
         end
-        local content_start = start_pos + marker_len
-        local content_end = string_find(segment, "\"", content_start, true)
-        if not content_end then
-          break
+        
+        -- Look for value start: : "..."
+        -- pattern: ^%s*:%s*"
+        local val_marker_start, val_marker_end = string_find(segment, "^%s*:%s*\"", key_start + 9)
+        
+        if not val_marker_start then
+           -- False positive (e.g. key was in a string), skip it
+           position = key_start + 7
+        else
+           local content_start = val_marker_end + 1
+           local content_end = string_find(segment, "\"", content_start, true)
+           if not content_end then
+             break
+           end
+           char_count = char_count + (content_end - content_start)
+           position = content_end + 1
         end
-        char_count = char_count + (content_end - content_start)
-        position = content_end + 1
       end
 
       return ceil(char_count / 4)
@@ -215,6 +224,21 @@ local function _simple_word_estimate(request_context)
   return ceil(#body / 4)
 end
 
+local function _extract_max_tokens(body)
+  if type(body) ~= "string" or body == "" then
+    return nil
+  end
+  -- simple regex scan for "max_tokens": 123
+  -- or "max_completion_tokens": 123
+  local s, e, val = string_find(body, '"max_tokens"%s*:%s*(%d+)')
+  if val then return tonumber(val) end
+
+  s, e, val = string_find(body, '"max_completion_tokens"%s*:%s*(%d+)')
+  if val then return tonumber(val) end
+
+  return nil
+end
+
 local function _check_tpd_budget(dict, key, config, cost, now)
   local ttl = _seconds_until_midnight_utc(now)
   local new_total, incr_err = dict:incr(key, cost, 0, ttl)
@@ -365,6 +389,11 @@ function _M.check(dict, key, config, request_context, now)
   local max_completion = config.default_max_completion
   if request_context and type(request_context.max_tokens) == "number" and request_context.max_tokens > 0 then
     max_completion = request_context.max_tokens
+  elseif request_context and request_context.body then
+    local extracted = _extract_max_tokens(request_context.body)
+    if extracted and extracted > 0 then
+      max_completion = extracted
+    end
   end
   if config.max_completion_tokens and max_completion > config.max_completion_tokens then
     max_completion = config.max_completion_tokens

From 96fbdc11d21230b9fa9b7b7a04b89d6cf30be983 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Mon, 23 Mar 2026 11:51:00 +0000
Subject: [PATCH 61/64] =?UTF-8?q?fix(limiter):=20fix=20luacheck=20warnings?=
 =?UTF-8?q?=20=E2=80=94=20trailing=20whitespace=20and=20unused=20vars?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove trailing whitespace on blank lines (lines 201, 205)
- Replace unused s, e variables with _ in _extract_max_tokens()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/fairvisor/llm_limiter.lua | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fairvisor/llm_limiter.lua b/src/fairvisor/llm_limiter.lua
index d184bc7..ab3d3d4 100644
--- a/src/fairvisor/llm_limiter.lua
+++ b/src/fairvisor/llm_limiter.lua
@@ -198,11 +198,11 @@ local function _simple_word_estimate(request_context)
         if not key_start then
           break
         end
-        
+
         -- Look for value start: : "..."
         -- pattern: ^%s*:%s*"
         local val_marker_start, val_marker_end = string_find(segment, "^%s*:%s*\"", key_start + 9)
-        
+
         if not val_marker_start then
            -- False positive (e.g. key was in a string), skip it
            position = key_start + 7
@@ -230,10 +230,10 @@ local function _extract_max_tokens(body)
   end
   -- simple regex scan for "max_tokens": 123
   -- or "max_completion_tokens": 123
-  local s, e, val = string_find(body, '"max_tokens"%s*:%s*(%d+)')
+  local _, _, val = string_find(body, '"max_tokens"%s*:%s*(%d+)')
   if val then return tonumber(val) end
 
-  s, e, val = string_find(body, '"max_completion_tokens"%s*:%s*(%d+)')
+  _, _, val = string_find(body, '"max_completion_tokens"%s*:%s*(%d+)')
   if val then return tonumber(val) end
 
   return nil

From 843a26c2f2c90b2e4226d265cba561ada0706d0d Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Mon, 23 Mar 2026 11:53:32 +0000
Subject: [PATCH 62/64] test(limiter): add scenarios for max_completion_tokens
 and improved JSON parsing

---
 spec/unit/features/llm_limiter.feature | 38 ++++++++++++++++++++++++++
 spec/unit/llm_limiter_spec.lua         |  5 ++++
 2 files changed, 43 insertions(+)

diff --git a/spec/unit/features/llm_limiter.feature b/spec/unit/features/llm_limiter.feature
index e9d3828..5519044 100644
--- a/spec/unit/features/llm_limiter.feature
+++ b/spec/unit/features/llm_limiter.feature
@@ -190,3 +190,41 @@ Feature: LLM limiter module behavior
       And the llm limiter config is validated
       When I build error response for reason "tpm_exceeded"
       Then error response has OpenAI rate limit shape
+
+  Rule: New features — max_completion_tokens and improved JSON parsing
+    Scenario: max_completion_tokens is extracted from body when max_tokens is missing
+      Given the nginx mock environment is reset
+      And a valid llm limiter config with tokens_per_minute 10000
+      And the config has default_max_completion 1000
+      And the llm limiter config is validated
+      And the request body is '{"messages":[{"role":"user","content":"hello"}],"max_completion_tokens":2000}'
+      When I run llm check at now 1700000000
+      Then check is allowed
+      And reserved equals estimated_total 2002
+
+    Scenario: improved JSON parsing handles spaces and false positives
+      Given the nginx mock environment is reset
+      And a valid llm limiter config with tokens_per_minute 10000
+      And the config uses estimator "simple_word"
+      And the llm limiter config is validated
+      And the request body is '{"messages":[{"role":"content", "content" : "12345678"}]}'
+      When I estimate prompt tokens
+      Then prompt estimate equals 2
+
+    Scenario: simple_word parsing multiple messages
+      Given the nginx mock environment is reset
+      And a valid llm limiter config with tokens_per_minute 10000
+      And the config uses estimator "simple_word"
+      And the llm limiter config is validated
+      And the request body is '{"messages":[{"role":"user","content":"hello"},{"role":"assistant","content":"world!"}]}'
+      When I estimate prompt tokens
+      Then prompt estimate equals 3
+
+    Scenario: simple_word fallback when no messages key
+      Given the nginx mock environment is reset
+      And a valid llm limiter config with tokens_per_minute 10000
+      And the config uses estimator "simple_word"
+      And the llm limiter config is validated
+      And the request body is '{"input":"test"}'
+      When I estimate prompt tokens
+      Then prompt estimate equals 4
diff --git a/spec/unit/llm_limiter_spec.lua b/spec/unit/llm_limiter_spec.lua
index aecf8e1..362b3ae 100644
--- a/spec/unit/llm_limiter_spec.lua
+++ b/spec/unit/llm_limiter_spec.lua
@@ -129,6 +129,11 @@ runner:given("^the request body is empty$", function(ctx)
   ctx.request_context.body = ""
 end)
 
+runner:given("^the request body is '([^']+)'$", function(ctx, body)
+  ctx.request_context = ctx.request_context or {}
+  ctx.request_context.body = body
+end)
+
 runner:given("^the request body has (%d+) prompt characters in messages$", function(ctx, chars)
   local char_count = tonumber(chars)
   local content = string.rep("a", char_count)

From 64b3b5824d92ebbaa5192aa6a8c211f837174f47 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Mon, 23 Mar 2026 12:01:21 +0000
Subject: [PATCH 63/64] test(limiter): cover max_tokens body field and default
 fallback paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add two scenarios to exercise the uncovered branches in _extract_max_tokens:
- body contains "max_tokens" field → uses it (covers return tonumber(val) branch)
- body has no max_tokens/max_completion_tokens → falls back to default_max_completion
  (covers return nil branch)

Fixes coverage regression (91.61% < 91.63% threshold).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 spec/unit/features/llm_limiter.feature | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/spec/unit/features/llm_limiter.feature b/spec/unit/features/llm_limiter.feature
index 5519044..d33e4b1 100644
--- a/spec/unit/features/llm_limiter.feature
+++ b/spec/unit/features/llm_limiter.feature
@@ -228,3 +228,23 @@ Feature: LLM limiter module behavior
       And the request body is '{"input":"test"}'
       When I estimate prompt tokens
       Then prompt estimate equals 4
+
+    Scenario: max_tokens field in body is used when request_context.max_tokens is absent
+      Given the nginx mock environment is reset
+      And a valid llm limiter config with tokens_per_minute 10000
+      And the config has default_max_completion 1000
+      And the llm limiter config is validated
+      And the request body is '{"messages":[{"role":"user","content":"hello"}],"max_tokens":500}'
+      When I run llm check at now 1700000000
+      Then check is allowed
+      And reserved equals estimated_total 502
+
+    Scenario: body with no max_tokens field falls back to default_max_completion
+      Given the nginx mock environment is reset
+      And a valid llm limiter config with tokens_per_minute 10000
+      And the config has default_max_completion 800
+      And the llm limiter config is validated
+      And the request body is '{"messages":[{"role":"user","content":"hi"}]}'
+      When I run llm check at now 1700000000
+      Then check is allowed
+      And reserved equals estimated_total 801

From fa72ccd38903eaa1b3cb18a4ffa07e41d1190a10 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Thu, 26 Mar 2026 12:24:35 +0000
Subject: [PATCH 64/64] docs: update README subtitle to sharper tagline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace "Stop one tenant from exhausting everyone's budget." with
"The LLM rate limiter your multi-tenant product was missing." —
more direct, audience-specific, and immediately recognisable for
the developer reading on GitHub.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2f5c0b0..1c68b7f 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 <h1 align="center">FAIRVISOR</h1>
 
-<h3 align="center">Stop one tenant from exhausting everyone's budget.</h3>
+<h3 align="center">The LLM rate limiter your multi-tenant product was missing.</h3>
 
 <p align="center">
   <a href="https://github.com/fairvisor/edge/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MPL--2.0-blue" alt="License: MPL-2.0"></a>