Buffden · Buffden · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -89,7 +89,7 @@ jobs:
     needs: [build-test, compose-smoke]
     runs-on: ubuntu-latest
     permissions:
-      id-token: write   # Required for OIDC
+      id-token: write   # Required for OIDC and cosign keyless signing
       contents: read
       packages: write   # Required for GHCR push
 
@@ -126,12 +126,26 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Install cosign
+        uses: sigstore/cosign-installer@v3   # provides the cosign CLI invoked by the signing step below; NOTE(review): consider pinning to a commit SHA
+
       - name: Build and push Docker image
+        id: build-push   # exposes outputs.digest: the pushed image reference as repo@sha256:...
         env:
           IMAGE_TAG: ${{ github.sha }}
         run: |
           docker build -t ghcr.io/buffden/tinyurl-api:$IMAGE_TAG tinyurl/
           docker push ghcr.io/buffden/tinyurl-api:$IMAGE_TAG
+          DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' "ghcr.io/buffden/tinyurl-api:$IMAGE_TAG")
+          echo "digest=$DIGEST" >> "$GITHUB_OUTPUT"
+
+      - name: Sign Docker image with cosign (keyless)
+        env:
+          IMAGE_REF: ${{ steps.build-push.outputs.digest }}
+        run: |
+          # Sign the immutable repo@sha256 reference recorded by build-push, never the mutable tag.
+          : "${IMAGE_REF:?build-push step did not output an image digest}"
+          cosign sign --yes "$IMAGE_REF"
 
       - name: Wait for SSM agent to come online
         run: |

diff --git a/README.md b/README.md
@@ -9,27 +9,43 @@ A single-region, production-oriented URL shortener built with Spring Boot and An
 
 ---
 
+## Request Flow
+
+Every request to `go.buffden.com` passes through up to six layers; the first four run before any application code does:
+
+| Layer | Component | Role |
+| --- | --- | --- |
+| 1 | **Cloudflare** | DDoS mitigation, bot protection, WAF rate limiting, global anycast routing. Kills attack traffic before it reaches AWS — a financial decision as much as a security one. EC2 security group accepts inbound only from Cloudflare's published IP ranges. |
+| 2 | **CloudFront** | Routes by path: `/api/*` and `/{code}` forward to the ALB; everything else serves the Angular SPA from S3. Frontend deploys never touch EC2. |
+| 3 | **ALB** | Port 80 → 301 to HTTPS. Port 443 terminates TLS, forwards plain HTTP to EC2:80. Health checks on `/actuator/health` — unhealthy instances leave rotation automatically. |
+| 4 | **Nginx** | Reverse proxy with per-IP rate limit zones (`create_url`: 40r/m, `redirect`: 30r/m). Known vulnerability scanners (sqlmap, nikto, nuclei, etc.) are blocked by User-Agent and receive `444` — TCP connection closed with zero bytes sent. |
+| 5 | **Spring Boot** | Stateless application layer. Input validation, short code generation, redirect logic, Bucket4j per-IP hourly rate cap. |
+| 6 | **PostgreSQL** | Single source of truth. Two DB users: Flyway (DDL rights for migrations), application user (SELECT, INSERT, UPDATE, DELETE only — no ALTER, no DROP). |
+
+---
+
 ## Architecture
 
 ### v1 — Baseline (Implemented)
 
-- Base62 encoded short codes (6–8 chars)
-- DB-backed ID generation via PostgreSQL sequence
+- Base62 encoded short codes (6–8 chars) generated from a PostgreSQL `bigint` sequence — no UUIDs, no hashing, no collision resolution. The sequence guarantees uniqueness; Base62 keeps codes short. Trade-off: codes are enumerable. Acceptable in v1 — no private content exists.
+- HTTP **301** (permanent redirect) when no expiry is set — browser caches it, zero round trip on repeat visits.
+- HTTP **302** (temporary redirect) when expiry is set — forces revalidation every time.
+- HTTP **410 Gone** on expiry — not 404. Tells browsers, crawlers, and clients the link existed and was intentionally removed.
 - Stateless Spring Boot application server
-- HTTP 301 (permanent) or 302 (expiring) redirects
-- Optional expiration (default 180 days)
 - Flyway-managed schema migrations
-- Prometheus metrics + structured JSON logging
+- Prometheus metrics + structured JSON logging to CloudWatch
 
 [![v1 HLD](diagrams/docs/architecture/00-baseline/v1/url-shortener-v1-hld.svg)](diagrams/docs/architecture/00-baseline/v1/url-shortener-v1-hld.svg)
 
 ### v2 — Scale & Abuse Resistance (Planned)
 
-- Redis cache (cache-aside pattern)
-- Negative caching for invalid codes
-- Rate limiting (token bucket)
-- Soft delete support
-- Custom aliases (feature-flagged)
+- Redis cache-aside on the redirect path — target >90% of redirects skip the DB. Cache warming on write, TTL jitter to prevent thundering herd, negative caching for unknown codes.
+- Bucket4j upgraded from in-process (v1) to Redis-backed distributed rate limiting — shared state across instances when autoscaling is introduced.
+- Cloudflare Turnstile CAPTCHA on URL creation — stops distributed bots that stay under per-IP rate limits.
+- Soft delete for malicious link takedowns — full audit trail preserved.
+- Custom aliases (4–32 chars, Base62, rate-limited tighter than normal creates).
+- Micrometer + Grafana: P95/P99 latency, cache hit ratio, error rate, QPS per endpoint.
 
 [![v2 HLD](diagrams/docs/architecture/00-baseline/v2/url-shortener-v2-hld.svg)](diagrams/docs/architecture/00-baseline/v2/url-shortener-v2-hld.svg)
 
@@ -44,13 +60,63 @@ A single-region, production-oriented URL shortener built with Spring Boot and An
 | Database | PostgreSQL 16 |
 | Migrations | Flyway |
 | Reverse proxy | Nginx |
+| Rate limiting | Nginx zones + Bucket4j (in-process token bucket, Caffeine cache) |
 | Containerization | Docker, Docker Compose |
-| Cloud | AWS (EC2, RDS, ALB, S3, CloudFront) |
-| CI/CD | GitHub Actions → GHCR → EC2 via SSM |
+| Edge | Cloudflare (DDoS, WAF, anycast) |
+| Cloud | AWS (EC2, RDS, ALB, S3, CloudFront, SSM, KMS, CloudWatch) |
+| CI/CD | GitHub Actions → GHCR → EC2 via SSM RunCommand |
+| Image signing | cosign (Sigstore keyless, OIDC-tied) |
 | Observability | Micrometer, Prometheus, CloudWatch |
 
 ---
 
+## Security
+
+Security is a design constraint, not a checklist. Every decision is cross-referenced against OWASP documentation — see [`docs/security/owasp-compliance.md`](docs/security/owasp-compliance.md).
+
+**Credentials and secrets**
+- All credentials in AWS SSM Parameter Store as `SecureString` + KMS. No plaintext secrets in environment variables, config files, or Docker Compose.
+
+**Zero-credential CI/CD**
+- GitHub Actions authenticates to AWS via OIDC — no long-lived access keys anywhere. EC2 has an IAM role with SSM access. Deployment issues an SSM `RunCommand` to pull the new image and restart containers. No SSH, no port 22 open.
+
+**Supply chain**
+- Every Gradle dependency download is verified against a committed SHA-256 checksum file (`gradle/verification-metadata.xml`). Gradle rejects any JAR that doesn't match — protects against compromised Maven mirrors.
+- Docker images are signed after every push using cosign keyless signing (Sigstore). The signature is tied to the GitHub OIDC identity — no signing keys to manage or rotate.
+
+**Response headers (OWASP Secure Headers Project)**
+- Full header set applied at Nginx: `Content-Security-Policy` (`default-src 'none'`), `Strict-Transport-Security` (with preload), `Cross-Origin-Opener-Policy`, `Cross-Origin-Resource-Policy`, `Permissions-Policy`, `X-Frame-Options`, `X-Content-Type-Options`, `Referrer-Policy`. `server_tokens off` — Nginx version never disclosed.
+- Spring Security 6 sets some of these headers by default. Nginx strips the backend's copies via `proxy_hide_header` before adding its own — prevents duplicate or conflicting headers.
+
+**Defense-in-depth rate limiting**
+- Layer 1 — Cloudflare WAF: request rate limiting at the edge.
+- Layer 2 — Nginx: per-IP rate limit zones per endpoint; returns `444` (TCP close, no response) for malicious scanners.
+- Layer 3 — Application: Bucket4j token bucket per IP (20 URL creations/hour), backed by Caffeine in-process cache.
+
+**Input validation**
+- URL scheme whitelist (`http://`, `https://` only) — blocks `javascript:` injection.
+- 2048-character max enforced at DTO layer and DB `CHECK` constraint.
+- Parameterized queries via JPA/Hibernate everywhere — no string concatenation in SQL.
+- Standardised error responses — no stack traces in HTTP responses.
+
+**Least privilege**
+- Non-root Docker containers, minimal base image (`eclipse-temurin:21-jre-alpine`).
+- Two DB users: Flyway gets DDL rights; the application user gets only DML on the URL table. No `ALTER`, `DROP`, or `TRUNCATE`.
+
+---
+
+## CI/CD Pipeline
+
+Three stages run on every push to `main` — all must pass before anything reaches EC2:
+
+1. **Test** — JUnit 5 unit tests + Testcontainers integration tests against a real PostgreSQL instance (not mocks).
+2. **Smoke** — Docker Compose spins up the full stack with ephemeral randomised credentials, hits the health endpoint, tears down. If this fails, deploy never runs.
+3. **Deploy** — polls for SSM agent availability (handles EC2 cold starts), issues `RunCommand` to pull the new image from GHCR and restart containers. Waits for command completion and exits non-zero on failure.
+
+Within the Deploy stage, the image is built, pushed to GHCR, and signed with cosign before the SSM `RunCommand` is issued.
+
+---
+
 ## API
 
 | Method | Path | Description |
@@ -102,7 +168,8 @@ infra/
   nginx/                # Nginx configs (dev + prod)
   postgres/             # DB init scripts
 docs/
-  architecture/         # ADRs and architecture docs
+  architecture/         # ADRs, v1/v2 architecture docs, security hardening backlog
+  security/             # OWASP compliance checklist, threat model, DB least privilege
   deployment/           # AWS deployment runbook (phases A–F)
 diagrams/               # Architecture diagrams (SVG)
 docker-compose.yml      # Local dev stack

diff --git a/docs/architecture/00-baseline/v2/security-hardening-v2.md b/docs/architecture/00-baseline/v2/security-hardening-v2.md
@@ -0,0 +1,78 @@
+# Security Hardening — v2 Backlog
+
+> Items deferred from v1. Each is justified: either requires new infrastructure (Redis, frontend changes), or the v1 controls already reduce risk to an acceptable level for launch.
+
+---
+
+## 1. Cloudflare Turnstile CAPTCHA
+
+**Why deferred:** Requires Angular frontend changes (Turnstile widget), a new backend validation call to Cloudflare's `/siteverify` API, and a new SSM secret (`/tinyurl/app/turnstile-secret-key`). The v1 dual-layer rate limiting (nginx 40r/m + Bucket4j 20/hour per IP) is sufficient to block single-IP and slow-drip abuse for launch traffic.
+
+**Why it matters in v2:** Turnstile is the only control that stops a distributed botnet (many IPs, each staying under per-IP rate limits) from mass-creating spam/phishing URLs.
+
+**Implementation plan:**
+- Add `ngx-turnstile` (or equivalent) to the Angular form. On submit, pass `cf-turnstile-response` token alongside the URL.
+- Add `captchaToken` field to `CreateUrlRequest` DTO (required in prod profile, optional in dev).
+- Create `CaptchaVerificationService` — calls `https://challenges.cloudflare.com/turnstile/v0/siteverify` with the secret key and token. Use `RestClient` (Spring Boot 3.2+).
+- Call `captchaVerificationService.verify(token)` in `UrlServiceImpl.shortenUrl()` before any DB work. Throw `CaptchaException` → 400 if invalid.
+- Load secret key from SSM (`/tinyurl/app/turnstile-secret-key`) via existing AWS Parameter Store config.
+- Gate the requirement on a `tinyurl.captcha.enabled` property (false in dev, true in prod).
+
+**Note on SSRF:** The backend calls a fixed Cloudflare endpoint, not a user-supplied URL — no new SSRF surface.
+
+---
+
+## 2. Distributed Rate Limiting (Bucket4j + Redis)
+
+**Why deferred:** v1 runs on a single EC2 instance. `IpRateLimitFilter` uses Caffeine (in-process) to store per-IP token buckets. This is correct and sufficient for a single instance.
+
+**Why it matters in v2:** v2 introduces autoscaling (multiple app instances). Per-IP state stored in-process is not shared across instances — a bot can hit 3 instances and create 3× the intended quota. Redis-backed Bucket4j solves this with atomic bucket operations via Redis scripts.
+
+**Implementation plan:**
+- Add `com.bucket4j:bucket4j-redis` dependency alongside the existing `bucket4j-core`.
+- Replace `Caffeine.newBuilder().build(key -> newBucket())` in `IpRateLimitFilter` with a Redis-backed `ProxyManager` from `bucket4j-redis` (e.g. `LettuceBasedProxyManager`, depending on the Redis client chosen), using the same Redis instance added for the cache layer.
+- No change to the bucket policy (20 creations/hour/IP) or filter logic.
+- The existing `IpRateLimitFilter` structure is designed for this migration: the only change is the backing store.
+
+---
+
+## 3. CloudWatch Log Retention Policy
+
+**Why deferred:** No IaC (Terraform/CloudFormation) exists in the repository to set the CloudWatch log group retention period. Logs default to indefinite retention, which is a cost and compliance concern but not a security vulnerability.
+
+**Why it matters in v2:** Unbounded log retention accumulates cost and may conflict with data retention policies.
+
+**Implementation plan:**
+- When IaC is introduced in v2, set `retention_in_days = 90` on the CloudWatch log group for the application and nginx logs.
+- If IaC is not added before v2, add a one-time CLI step to the deployment runbook:
+  ```
+  aws logs put-retention-policy \
+    --log-group-name /tinyurl/api \
+    --retention-in-days 90
+  ```
+
+---
+
+## 4. CloudWatch Anomaly Alerting (4xx / 5xx Rate Spikes)
+
+**Why deferred:** CloudWatch alarms for infrastructure health exist (CPU, ALB 5xx). Application-level alerting on anomalous request patterns (sudden spike in 400s suggesting scanning, or 429s suggesting a rate-limit bypass attempt) requires a CloudWatch Metric Filter on the structured log stream and an alarm with a meaningful threshold.
+
+**Why it matters in v2:** v1 has observability (structured logs, Prometheus metrics) but no automated paging when attack patterns emerge. Without this, an ongoing abuse incident may go unnoticed until it impacts latency or DB load.
+
+**Implementation plan:**
+- Add a CloudWatch Metric Filter on the application log group for `status_class=4xx` and `status_class=5xx`.
+- Create a CloudWatch alarm: alert if `4xx rate > 15%` of total requests over a 5-minute window.
+- Create a CloudWatch alarm: alert if `5xx rate > 1%` over a 5-minute window.
+- Route alarms to an SNS topic → email or Slack webhook.
+- This integrates with the broader v2 observability work (Micrometer + Grafana).
+
+---
+
+## Status in OWASP Compliance
+
+| Item | OWASP Ref | Deferred Until |
+|---|---|---|
+| CAPTCHA (Turnstile) | API6 | v2 |
+| Distributed rate limiting | API6 | v2 (requires autoscaling) |
+| CloudWatch log retention | A09 | v2 (requires IaC) |
+| CloudWatch anomaly alerting | A09 | v2 (observability milestone) |
diff --git a/docs/architecture/00-baseline/v2/url-shortener-v2.md b/docs/architecture/00-baseline/v2/url-shortener-v2.md
@@ -306,7 +306,20 @@ Adding enterprise/global features now would increase complexity faster than it i
 
 ---
 
-## 11) Open Questions and Possible Future Enhancements
+## 11) Security Hardening Backlog
+
+The following security items are deferred from v1. v1 controls are sufficient for launch; these become mandatory when autoscaling and higher traffic are introduced. Full implementation details in [`security-hardening-v2.md`](./security-hardening-v2.md).
+
+| Item | Trigger for v2 |
+|---|---|
+| Cloudflare Turnstile CAPTCHA | Stops distributed bots bypassing per-IP rate limits |
+| Distributed rate limiting (Bucket4j + Redis) | Required when autoscaling introduces multiple app instances |
+| CloudWatch log retention policy (90 days) | Cost + compliance; requires IaC |
+| CloudWatch anomaly alerting (4xx/5xx spikes) | Automated incident detection; part of v2 observability milestone |
+
+---
+
+## 12) Open Questions and Possible Future Enhancements
 
 - Should custom aliases be enabled by default or gated by feature flag?
 - Should redirects be guess-resistant (to reduce enumeration) by moving from sequential IDs to randomized IDs?

diff --git a/infra/nginx/nginx.prod.conf b/infra/nginx/nginx.prod.conf
@@ -37,6 +37,9 @@ http {
     limit_req_status 429;
     limit_conn_status 429;
 
+    # Do not disclose nginx version in error pages and Server response header.
+    server_tokens off;
+
     # -----------------------------------------------------------------------
     # Malicious Scanner / Attack Tool User-Agent Blocking
     #
@@ -168,6 +171,33 @@ http {
         add_header Referrer-Policy "strict-origin-when-cross-origin" always;
         add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
 
+        # -------------------------------------------------------------------
+        # Content-Security-Policy
+        #
+        # This is a pure JSON API — it serves no HTML, scripts, styles, or
+        # media. default-src 'none' denies all resource loading, which is the
+        # correct policy for an API. frame-ancestors 'none' is belt-and-braces
+        # alongside X-Frame-Options DENY (CSP takes precedence in modern browsers).
+        # -------------------------------------------------------------------
+        add_header Content-Security-Policy "default-src 'none'; frame-ancestors 'none'" always;
+
+        # -------------------------------------------------------------------
+        # Cross-Origin isolation headers
+        #
+        # Cross-Origin-Opener-Policy: same-origin — isolates this origin's
+        #   browsing context group, blocking cross-origin window references.
+        #   (Full cross-origin isolation would also need Cross-Origin-Embedder-Policy.)
+        #
+        # Cross-Origin-Resource-Policy: same-origin — prevents other origins
+        #   from reading this API's responses via no-cors fetches.
+        #
+        # Permissions-Policy: disables all browser feature APIs. An API
+        #   endpoint has no use for camera, microphone, geolocation, etc.
+        # -------------------------------------------------------------------
+        add_header Cross-Origin-Opener-Policy "same-origin" always;
+        add_header Cross-Origin-Resource-Policy "same-origin" always;
+        add_header Permissions-Policy "camera=(), microphone=(), geolocation=(), payment=(), usb=(), interest-cohort=()" always;
+
         # -------------------------------------------------------------------
         # Block common vulnerability scan paths
         #
@@ -204,7 +234,7 @@ http {
         # Anyone flooding POST /api/urls gets 429 before hitting the database.
         # -------------------------------------------------------------------
         location = /api/urls {
-            limit_req zone=create_url burst=10 nodelay;
+            limit_req zone=create_url burst=5 nodelay;
             limit_conn conn_limit 5;
 
             proxy_pass http://app:8080;

diff --git a/tinyurl/build.gradle.kts b/tinyurl/build.gradle.kts
@@ -35,6 +35,8 @@ dependencies {
 	implementation("org.flywaydb:flyway-database-postgresql")
 	implementation("io.micrometer:micrometer-registry-prometheus")
 	implementation("net.logstash.logback:logstash-logback-encoder:7.4")
+	implementation("com.bucket4j:bucket4j-core:8.9.0")
+	implementation("com.github.ben-manes.caffeine:caffeine")
 	compileOnly("org.projectlombok:lombok")
 	developmentOnly("org.springframework.boot:spring-boot-devtools")
 	runtimeOnly("org.postgresql:postgresql")