diff --git a/Cargo.lock b/Cargo.lock index e6a916de..a9557172 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -187,6 +187,7 @@ dependencies = [ "nix", "once_cell", "percent-encoding", + "proptest", "proxy_agent_shared", "regex", "serde", @@ -231,11 +232,26 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" -version = "2.6.0" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "bumpalo" @@ -430,6 +446,22 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "field-offset" version = "0.3.6" @@ -517,6 +549,18 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "gimli" version = "0.31.0" @@ -733,6 +777,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + [[package]] name = "log" version = "0.4.26" @@ -936,6 +986,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand 0.9.4", + "rand_chacha 0.9.0", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "proxy_agent_setup" version = "9.9.9" @@ -982,6 +1051,12 @@ dependencies = [ "winreg", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.37" @@ -991,6 +1066,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" version = "0.8.6" @@ -998,8 +1079,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", ] [[package]] @@ -1009,7 +1100,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -1018,7 +1119,25 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core 0.9.5", ] [[package]] @@ -1085,12 +1204,37 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.18" @@ -1232,6 +1376,19 @@ dependencies = [ "windows", ] +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom 0.3.4", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "thiserror" version = "1.0.64" @@ -1414,6 +1571,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.13" @@ -1432,8 +1595,8 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ - "getrandom", - "rand", + "getrandom 0.2.15", + "rand 0.8.6", ] [[package]] @@ -1458,6 +1621,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "want" version = "0.3.1" @@ -1473,6 +1645,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -1758,6 +1939,12 @@ dependencies = [ "toml", ] +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + [[package]] name = "xml-rs" version = "0.8.22" diff --git a/cspell.json b/cspell.json index 52429808..fd1f639a 100644 --- a/cspell.json +++ b/cspell.json @@ -645,6 +645,12 @@ "Skus", "VMSS", "zeroize", + "greppable", + "HGAP", + "hostgaplugin", + "rsid", + "strat", + "uncanonicalizable", "Аdministrator", "аzureuser", "rооt" diff --git a/doc/plans/Innovation-2.1-canonical-request.md b/doc/plans/Innovation-2.1-canonical-request.md index 7cfb50d0..8e83bf52 100644 --- a/doc/plans/Innovation-2.1-canonical-request.md +++ b/doc/plans/Innovation-2.1-canonical-request.md @@ -284,6 +284,7 @@ hyper::Uri │ ▼ parse_scheme_method (must be http; reject https/ws/...; metho | `MalformedPercent` | Truncated / non-hex `%XX` | Deny; 400 | `CANON_PCT` | | `OverlongUtf8` | Classic IDS-bypass payload | Deny; 400 | `CANON_OVERLONG` | | `InvalidUtf8` | Random bytes or wrong codec | Deny; 400 | `CANON_UTF8` | +| `NonAscii` | Unicode confusable / homoglyph attack in **path** (query allows non-ASCII values) | Deny; 400 | `CANON_NON_ASCII` | | `ControlChar` | CRLF injection attempt | Deny; 400 | `CANON_CTRL` | | `PathUnderflow` | Too many `..` | Deny; 400 | `CANON_UNDERFLOW` | | `EmbeddedQuery` | `%3F` smuggling | Deny; 400 | `CANON_EMBQ` | diff --git a/pentest/linux/phase4_rules_fuzz/url_diff.py b/pentest/linux/phase4_rules_fuzz/url_diff.py index 5ad3c92a..4d817bd6 100755 --- a/pentest/linux/phase4_rules_fuzz/url_diff.py +++ b/pentest/linux/phase4_rules_fuzz/url_diff.py @@ -13,7 +13,7 @@ Loopback fabric only; safe to run from a normal user account. """ from __future__ import annotations -import argparse, datetime, http.client, os, sys, urllib.parse +import argparse, datetime, http.client, os, re, socket, sys, time, urllib.parse IMDS_HOST = "169.254.169.254" IMDS_PORT = 80 @@ -22,19 +22,35 @@ CANONICAL = "/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/" VARIANTS = { - "canonical": CANONICAL, - "uppercase_pct": "/metadata/identity/oauth2%2Ftoken?api-version=2018-02-01&resource=https://management.azure.com/", - "lowercase_pct": "/metadata/identity/oauth2%2ftoken?api-version=2018-02-01&resource=https://management.azure.com/", - "double_encoded": "/metadata/identity/oauth2%252Ftoken?api-version=2018-02-01&resource=https://management.azure.com/", - "trailing_dot_path": "/metadata/identity/oauth2/token./?api-version=2018-02-01&resource=https://management.azure.com/", - "dot_segments": "/metadata/./identity/../identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/", - "param_injection": "/metadata/identity/oauth2/token;x=1?api-version=2018-02-01&resource=https://management.azure.com/", - "case_path": "/Metadata/Identity/OAuth2/Token?api-version=2018-02-01&resource=https://management.azure.com/", - "extra_slashes": "//metadata///identity//oauth2//token?api-version=2018-02-01&resource=https://management.azure.com/", - "fragment": "/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/#x", - "unicode_dotless": "/metadata/\u0131dentity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/", + "canonical": CANONICAL, + "uppercase_pct": "/metadata/identity/oauth2%2Ftoken?api-version=2018-02-01&resource=https://management.azure.com/", + "lowercase_pct": "/metadata/identity/oauth2%2ftoken?api-version=2018-02-01&resource=https://management.azure.com/", + "double_encoded": "/metadata/identity/oauth2%252Ftoken?api-version=2018-02-01&resource=https://management.azure.com/", + "trailing_dot_path": "/metadata/identity/oauth2/token./?api-version=2018-02-01&resource=https://management.azure.com/", + "dot_segments": "/metadata/./identity/../identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/", + "dot_segments_encoded": "/metadata/%2E/identity/%2E%2E/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/", + "param_injection": "/metadata/identity/oauth2/token;x=1?api-version=2018-02-01&resource=https://management.azure.com/", + "case_path": "/Metadata/Identity/OAuth2/Token?api-version=2018-02-01&resource=https://management.azure.com/", + "extra_slashes": "//metadata///identity//oauth2//token?api-version=2018-02-01&resource=https://management.azure.com/", + "fragment": "/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/#x", + "unicode_dotless": "/metadata/\u0131dentity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/", } +# Variants whose effectiveness depends on the raw bytes reaching GPA verbatim. +# Python's http.client.HTTPConnection currently passes the request-target +# through to the wire without collapsing `./`, `../`, repeated slashes, or +# decoding `%2E` — but this is an undocumented quirk of the CPython +# implementation, not a contract. .NET's System.Uri normalizes all of these +# before the request leaves the box; if a future Python release follows suit +# (or a sysadmin adds a transparent proxy that rewrites paths), the +# differential test silently turns into a no-op. Sending these as a literal +# HTTP/1.1 request line via a raw socket guarantees the bytes hit GPA +# verbatim and matches what the Windows harness does (Phase4-RulesFuzz.ps1 +# Send-VariantRaw). The eBPF cgroup redirector still intercepts outbound +# traffic to IMDS:80 at the kernel layer, so the raw connection is picked +# up by GPA the same as http.client. +RAW_VARIANTS = frozenset({"dot_segments", "dot_segments_encoded"}) + def send(path: str) -> tuple[int, str]: try: @@ -48,6 +64,54 @@ def send(path: str) -> tuple[int, str]: return -1, f"{type(exc).__name__}: {exc}" +def send_raw(path: str) -> tuple[int, str]: + """Write a literal HTTP/1.1 request line over a raw socket, bypassing any + URL parsing/canonicalization that http.client or a future Python release + might perform. The request bytes contain `path` verbatim.""" + req_text = ( + f"GET {path} HTTP/1.1\r\n" + f"Host: {IMDS_HOST}\r\n" + "Metadata: true\r\n" + "Connection: close\r\n" + "Accept: */*\r\n" + "\r\n" + ) + sock: socket.socket | None = None + try: + sock = socket.create_connection((IMDS_HOST, IMDS_PORT), timeout=8) + sock.sendall(req_text.encode("ascii")) + sock.settimeout(8) + chunks: list[bytes] = [] + total = 0 + deadline = time.monotonic() + 8.0 + while total < 4096 and time.monotonic() < deadline: + try: + buf = sock.recv(min(4096 - total, 1024)) + except socket.timeout: + break + if not buf: + break + chunks.append(buf) + total += len(buf) + text = b"".join(chunks).decode("latin-1", errors="replace") + status_line = text.split("\r\n", 1)[0] if text else "" + m = re.match(r"^HTTP/1\.[01]\s+(\d{3})", status_line) + code = int(m.group(1)) if m else -1 + body_head = "" + idx = text.find("\r\n\r\n") + if 0 <= idx and idx + 4 < len(text): + body_head = text[idx + 4 : idx + 84].replace("\n", " ").replace("\t", " ") + return code, body_head + except Exception as exc: # pragma: no cover + return -1, f"RawSocket {type(exc).__name__}: {exc}" + finally: + if sock is not None: + try: + sock.close() + except Exception: + pass + + def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--out", default=os.environ.get( @@ -63,12 +127,17 @@ def main() -> int: rows = [] diff_count = 0 for name, path in VARIANTS.items(): - status, head = send(path) + if name in RAW_VARIANTS: + status, head = send_raw(path) + sender = "raw" + else: + status, head = send(path) + sender = "net" diff = "DIFF" if status != baseline_status else "same" if diff == "DIFF": diff_count += 1 rows.append((name, status, diff, urllib.parse.quote(path, safe="/?&=:%")[:120], head[:60])) - print(f" {name:<18} status={status:<4} {diff}") + print(f" {name:<22} [{sender}] status={status:<4} {diff}") with open(args.out, "a", encoding="utf-8") as fh: ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") diff --git a/pentest/linux/phase4b_local_rules/run.py b/pentest/linux/phase4b_local_rules/run.py index 7c111ce8..6131c309 100755 --- a/pentest/linux/phase4b_local_rules/run.py +++ b/pentest/linux/phase4b_local_rules/run.py @@ -170,6 +170,26 @@ def get_current_user_identity() -> dict: } +def _wire_encode_path(path: str) -> str: + """Percent-encode non-ASCII path bytes as UTF-8 before send. + + Python's http.client transmits the request line via request.encode('ascii') + and raises UnicodeEncodeError on any raw non-ASCII char (e.g. the U+0131 + dotless-i confusable in token_unicode_dotless). That exception is caught + below as a false -1 — the request never reaches GPA, so we'd never observe + the real authorization decision. Browsers and .NET's System.Uri percent- + encode non-ASCII path bytes as UTF-8 before sending; mirror that so the + confusable arrives on the wire as %C4%B1 and GPA's 403 is actually exercised. + + Only non-ASCII codepoints are rewritten; ASCII (including existing %XX, + '/', '?', ';', '&', '=') passes through verbatim so every other probe is + byte-for-byte unchanged on the wire.""" + return "".join( + ch if ord(ch) < 0x80 else "".join(f"%{b:02X}" for b in ch.encode("utf-8")) + for ch in path + ) + + def send(target: Target, method: str, path: str, headers: Optional[dict] = None, timeout: float = 8.0) -> int: h: dict[str, str] = {} @@ -181,7 +201,7 @@ def send(target: Target, method: str, path: str, h.update(headers) try: conn = http.client.HTTPConnection(target.host, target.port, timeout=timeout) - conn.request(method, path, headers=h) + conn.request(method, _wire_encode_path(path), headers=h) resp = conn.getresponse() resp.read(64) conn.close() @@ -631,6 +651,67 @@ def _imds_scenarios(target: Target, identity: dict) -> list[Scenario]: ], )) + # S20: explicit fail-closed of the IMDS managed-identity token endpoint. + # defaultAccess=deny + mode=enforce + empty privileges means EVERY path — + # including every URL-equivalence variant of /metadata/identity/oauth2/ + # token (mixed case, %2F-encoded slash, double-encoded, trailing dot, + # extra slashes, fragment, matrix param, Unicode confusable) — must fall + # through to the deny default. With shadow-mode (M3) enabled this is the + # rule set under which the canonical authorizer can surface + # CANON_DIVERGENCE log lines: any variant where the legacy substring + # matcher and the canonical normalized-path matcher disagree shows up as + # `legacy=allow canon=deny` (or vice versa) in the GPA service log even + # though the on-wire response stays 403. Any 200 response here is a true + # bypass. + # `token_extra_slashes` may also surface as 404 once the pre-canonical + # traversal guard (path_has_traversal) catches `//` before the rule + # engine runs — both 403 and 404 mean the bypass attempt failed. + scenarios.append(Scenario( + sid=f"{pfx}-S20-deny-token-path", + target=target, + description="enforce + deny + no privileges; every URL-equivalence " + "variant of the IMDS token path must 403 (or 404 when " + "GPA traversal-guard fires)", + rules={ + "defaultAccess": "deny", + "mode": "enforce", + "id": "pentest-s20", + "rules": {}, + }, + probes=[ + Probe("token_canonical", + "/metadata/identity/oauth2/token?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_uppercase_pct", + "/metadata/identity/oauth2%2Ftoken?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_lowercase_pct", + "/metadata/identity/oauth2%2ftoken?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_double_encoded", + "/metadata/identity/oauth2%252Ftoken?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_trailing_dot_path", + "/metadata/identity/oauth2/token./?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_param_injection", + "/metadata/identity/oauth2/token;x=1?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_case_path", + "/Metadata/Identity/OAuth2/Token?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + Probe("token_extra_slashes", + "//metadata///identity//oauth2//token?api-version=2018-02-01" + "&resource=https://management.azure.com/", (403, 404)), + Probe("token_fragment", + "/metadata/identity/oauth2/token?api-version=2018-02-01" + "&resource=https://management.azure.com/#x", 403), + Probe("token_unicode_dotless", + "/metadata/\u0131dentity/oauth2/token?api-version=2018-02-01" + "&resource=https://management.azure.com/", 403), + ], + )) + return scenarios diff --git a/pentest/linux/test_catalog.py b/pentest/linux/test_catalog.py index 70524c7f..cf127e29 100644 --- a/pentest/linux/test_catalog.py +++ b/pentest/linux/test_catalog.py @@ -294,6 +294,12 @@ class TestInfo(TypedDict, total=False): "design": "Identity `userName` uses Cyrillic о (U+043E) in place of ASCII 'o'. The matcher must compare bytes, not Unicode-normalized forms — 200 would mean Unicode normalization is happening in identity comparison and could be used to bypass authZ.", "automation": "Rewrites the harness identity's userName with Cyrillic о substitutions; expects 403.", }, + "IMDS-S20-deny-token-path": { + "title": "Fail-closed deny of the IMDS managed-identity token endpoint across URL-equivalence variants", + "design": "defaultAccess=deny + mode=enforce + empty privileges. Every URL-equivalence variant of /metadata/identity/oauth2/token (mixed case, %2F/%2f/%252F-encoded slash, trailing dot path, extra slashes, fragment, matrix param, Unicode confusable like U+0131) must 403 — none of them should reach the upstream. With shadow-mode (M3) enabled this is the rule set under which the canonical authorizer surfaces `CANON_DIVERGENCE` log lines for any variant where the legacy substring matcher and the canonical normalized-path matcher disagree (`legacy=allow canon=deny` or vice versa) even though the on-wire response stays 403. Any 200 is a true bypass.", + "automation": "Writes `defaultAccess=deny, mode=enforce, rules={}`; probes 10 equivalent forms of the token path. `token_extra_slashes` accepts (403, 404) since the pre-canonical traversal guard (path_has_traversal) may fire on `//` before the rule engine runs — both outcomes mean the bypass failed.", + "fix": "A 200 response here means a URL canonicalization gap before AuthZ. Audit the request path through `authorization_rules.rs` → enable canonical-mode enforce (M5 sunset of the legacy substring matcher) — by design every variant in this scenario normalizes to /metadata/identity/oauth2/token, which falls through to defaultAccess=deny.", + }, # ---------------- WireServer ---------------- "WS-S1-disabled-allow": { diff --git a/pentest/windows/Phase4-RulesFuzz.ps1 b/pentest/windows/Phase4-RulesFuzz.ps1 index bf8b068f..abe88970 100644 --- a/pentest/windows/Phase4-RulesFuzz.ps1 +++ b/pentest/windows/Phase4-RulesFuzz.ps1 @@ -20,19 +20,34 @@ if (-not $Out) { $Out = $UrlDiffTsv } $canonical = '/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/' $variants = [ordered]@{ - 'canonical' = $canonical - 'uppercase_pct' = '/metadata/identity/oauth2%2Ftoken?api-version=2018-02-01&resource=https://management.azure.com/' - 'lowercase_pct' = '/metadata/identity/oauth2%2ftoken?api-version=2018-02-01&resource=https://management.azure.com/' - 'double_encoded' = '/metadata/identity/oauth2%252Ftoken?api-version=2018-02-01&resource=https://management.azure.com/' - 'trailing_dot_path' = '/metadata/identity/oauth2/token./?api-version=2018-02-01&resource=https://management.azure.com/' - 'dot_segments' = '/metadata/./identity/../identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/' - 'param_injection' = '/metadata/identity/oauth2/token;x=1?api-version=2018-02-01&resource=https://management.azure.com/' - 'case_path' = '/Metadata/Identity/OAuth2/Token?api-version=2018-02-01&resource=https://management.azure.com/' - 'extra_slashes' = '//metadata///identity//oauth2//token?api-version=2018-02-01&resource=https://management.azure.com/' - 'fragment' = '/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/#x' - 'unicode_dotless' = "/metadata/$([char]0x131)dentity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/" + 'canonical' = $canonical + 'uppercase_pct' = '/metadata/identity/oauth2%2Ftoken?api-version=2018-02-01&resource=https://management.azure.com/' + 'lowercase_pct' = '/metadata/identity/oauth2%2ftoken?api-version=2018-02-01&resource=https://management.azure.com/' + 'double_encoded' = '/metadata/identity/oauth2%252Ftoken?api-version=2018-02-01&resource=https://management.azure.com/' + 'trailing_dot_path' = '/metadata/identity/oauth2/token./?api-version=2018-02-01&resource=https://management.azure.com/' + 'dot_segments' = '/metadata/./identity/../identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/' + 'dot_segments_encoded' = '/metadata/%2E/identity/%2E%2E/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/' + 'param_injection' = '/metadata/identity/oauth2/token;x=1?api-version=2018-02-01&resource=https://management.azure.com/' + 'case_path' = '/Metadata/Identity/OAuth2/Token?api-version=2018-02-01&resource=https://management.azure.com/' + 'extra_slashes' = '//metadata///identity//oauth2//token?api-version=2018-02-01&resource=https://management.azure.com/' + 'fragment' = '/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/#x' + 'unicode_dotless' = "/metadata/$([char]0x131)dentity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/" } +# Variants whose effectiveness depends on the raw bytes reaching GPA verbatim. +# System.Net.HttpWebRequest constructs a System.Uri that canonicalizes `./` +# and `../` and decodes `%2E` BEFORE the request leaves the box, so without +# raw-socket sending these variants would be silently rewritten to the +# canonical path and never actually exercise GPA's traversal guard nor the +# canonical-vs-legacy shadow comparator. Sending them as a literal HTTP/1.1 +# request line via TcpClient bypasses the .NET URI parser entirely; the WFP +# redirector still intercepts the outbound connection to ImdsIp:80 the same +# way it does for HttpWebRequest, so the bytes hit GPA verbatim. +$rawVariants = [System.Collections.Generic.HashSet[string]]::new( + [System.StringComparer]::OrdinalIgnoreCase) +[void]$rawVariants.Add('dot_segments') +[void]$rawVariants.Add('dot_segments_encoded') + function Send-Variant { param([string] $Path) # Use HttpWebRequest so we can pass paths with raw %-sequences without @@ -60,6 +75,61 @@ function Send-Variant { } } +function Send-VariantRaw { + param([string] $Path) + # Write a literal HTTP/1.1 request line to a TcpClient stream so that + # System.Uri / HttpWebRequest cannot canonicalize `./`, `../`, `%2E`, + # or repeated slashes before send. The WFP redirector intercepts + # outbound traffic to ImdsIp:80 at the kernel layer, so TcpClient is + # picked up by GPA the same as HttpWebRequest. + $client = New-Object System.Net.Sockets.TcpClient + try { + $client.SendTimeout = 8000 + $client.ReceiveTimeout = 8000 + $client.Connect($ImdsIp, 80) + $stream = $client.GetStream() + # Build the raw request bytes manually so the path is sent verbatim. + $reqText = "GET $Path HTTP/1.1`r`nHost: $ImdsIp`r`nMetadata: true`r`nConnection: close`r`nAccept: */*`r`n`r`n" + $reqBytes = [System.Text.Encoding]::ASCII.GetBytes($reqText) + $stream.Write($reqBytes, 0, $reqBytes.Length) + $stream.Flush() + + # Read up to 4 KB — enough for the status line + headers + a body head. + $buf = New-Object byte[] 4096 + $total = 0 + $deadline = [DateTime]::UtcNow.AddSeconds(8) + while ($total -lt $buf.Length -and [DateTime]::UtcNow -lt $deadline) { + if (-not $stream.DataAvailable) { + Start-Sleep -Milliseconds 50 + continue + } + $n = $stream.Read($buf, $total, $buf.Length - $total) + if ($n -le 0) { break } + $total += $n + } + $text = [System.Text.Encoding]::ASCII.GetString($buf, 0, $total) + $statusLine = ($text -split "`r`n", 2)[0] + if ($statusLine -match '^HTTP/1\.[01]\s+(\d{3})') { + $code = [int]$Matches[1] + } else { + $code = -1 + } + # Body head = everything after the first blank line, lightly cleaned. + $headBody = '' + $idx = $text.IndexOf("`r`n`r`n") + if ($idx -ge 0 -and $idx + 4 -lt $text.Length) { + $headBody = $text.Substring($idx + 4) + } + $head = ($headBody.Substring(0, [Math]::Min(80, $headBody.Length))) -replace "[`r`n`t]", ' ' + return [pscustomobject]@{ Code = $code; Head = $head } + } catch { + return [pscustomobject]@{ Code = -1; Head = ("RawSocket: $($_.Exception.Message)" -replace "[`r`n`t]", ' ') } + } finally { + if ($client.Connected) { $client.Close() } + $client.Dispose() + } +} + $baseline = Send-Variant $canonical Write-Host ("baseline ({0}) for canonical path" -f $baseline.Code) @@ -67,10 +137,16 @@ $ts = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ') $rows = @() $diffs = 0 foreach ($k in $variants.Keys) { - $r = Send-Variant $variants[$k] + if ($rawVariants.Contains($k)) { + $r = Send-VariantRaw $variants[$k] + $sender = 'raw' + } else { + $r = Send-Variant $variants[$k] + $sender = 'net' + } $verdict = if ($r.Code -ne $baseline.Code) { 'DIFF' } else { 'same' } if ($verdict -eq 'DIFF') { $diffs++ } - Write-Host (" {0,-18} status={1,-4} {2}" -f $k, $r.Code, $verdict) + Write-Host (" {0,-22} [{1}] status={2,-4} {3}" -f $k, $sender, $r.Code, $verdict) $rows += "$ts`t$k`t$($r.Code)`t$verdict`t$($variants[$k])`t$($r.Head)" } diff --git a/pentest/windows/Phase4b-LocalRules.ps1 b/pentest/windows/Phase4b-LocalRules.ps1 index ea101470..464aaa61 100644 --- a/pentest/windows/Phase4b-LocalRules.ps1 +++ b/pentest/windows/Phase4b-LocalRules.ps1 @@ -479,6 +479,34 @@ function Build-ImdsScenarios { Write-Finding "$pfx-S19-identity-homoglyph" INFO ("skipped: '{0}' has no character with a Cyrillic visual look-alike in our table" -f $Identity.userName) } + # S20 — explicit fail-closed of the IMDS managed-identity token endpoint. + # defaultAccess=deny + mode=enforce + empty privileges means EVERY path + # — including every URL-equivalence variant of /metadata/identity/oauth2/ + # token (mixed case, %2F-encoded slash, double-encoded, dot segments, + # extra slashes, fragment, matrix param, Unicode confusable) — must fall + # through to the deny default. With shadow-mode (M3) enabled this is the + # rule set under which the canonical authorizer can surface + # CANON_DIVERGENCE log lines: any variant where the legacy substring + # matcher and the canonical normalized-path matcher disagree shows up as + # `legacy=allow canon=deny` (or vice versa) in the GPA service log even + # though the on-wire response stays 403. Any 200 response here is a true + # bypass. + $scn += New-Scenario -Sid "$pfx-S20-deny-token-path" -TargetName imds ` + -Description 'enforce + deny + no privileges; every URL-equivalence variant of the IMDS token path must 403 (or 404 when GPA traversal-guard fires)' ` + -Rules @{ defaultAccess = 'deny'; mode = 'enforce'; id = 'pentest-s20'; rules = @{} } ` + -Probes @( + [Probe]::new('token_canonical', '/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_uppercase_pct', '/metadata/identity/oauth2%2Ftoken?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_lowercase_pct', '/metadata/identity/oauth2%2ftoken?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_double_encoded', '/metadata/identity/oauth2%252Ftoken?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_trailing_dot_path', '/metadata/identity/oauth2/token./?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_param_injection', '/metadata/identity/oauth2/token;x=1?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_case_path', '/Metadata/Identity/OAuth2/Token?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_extra_slashes', '//metadata///identity//oauth2//token?api-version=2018-02-01&resource=https://management.azure.com/', 403), + [Probe]::new('token_fragment', '/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/#x', 403), + [Probe]::new('token_unicode_dotless', "/metadata/$([char]0x131)dentity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/", 403) + ) + return $scn } diff --git a/pentest/windows/TestCatalog.psm1 b/pentest/windows/TestCatalog.psm1 index a67e430f..c6f07e3c 100644 --- a/pentest/windows/TestCatalog.psm1 +++ b/pentest/windows/TestCatalog.psm1 @@ -210,6 +210,7 @@ $script:Phase4b = @{ 'IMDS-S17-name-collision-shielded' = @{ Title='Local rule names already containing LocalFileRules_ prefix'; Design='Tests that `prefix_local_rule_names` rewrites both names AND cross-references consistently even when the admin pre-includes the prefix — so the rule still binds. 403 on /metadata/instance would mean the prefixer mishandled the doubled prefix.' } 'IMDS-S18-dangling-references'= @{ Title='Dangling role→privilege reference must reject'; Design='Validator must reject; agent fail-closes to defaultAccess=deny. 200 means dangling-ref validation regressed.' } 'IMDS-S19-identity-homoglyph' = @{ Title='Cyrillic homoglyph in userName must NOT match ASCII user'; Design='Identity userName uses a Cyrillic look-alike (e.g. а=U+0430, е=U+0435, о=U+043E, …) in place of one character of the current user''s ASCII name. The two strings render identically in most fonts but are NOT byte-equal. Matcher must compare bytes, not Unicode-normalized forms — 200 would mean Unicode normalization is happening in identity comparison and could be used to bypass authZ. Skipped (INFO) when the current user has no character with a Cyrillic look-alike in the harness table.' } + 'IMDS-S20-deny-token-path' = @{ Title='Fail-closed deny of the IMDS managed-identity token endpoint across URL-equivalence variants'; Design='defaultAccess=deny + mode=enforce + empty privileges. Every URL-equivalence variant of /metadata/identity/oauth2/token (mixed case, %2F/%2f/%252F-encoded slash, dot segments, extra slashes, fragment, matrix param, Unicode confusable like U+0131) must 403 — none of them should reach the upstream. With shadow-mode (M3) enabled this is the rule set under which the canonical authorizer can surface `CANON_DIVERGENCE` log lines for variants where the legacy substring-matcher and canonical normalized-path matcher disagree. Any 200 here is a true bypass.' } 'WS-S1-disabled-allow' = @{ Title='Control: mode=disabled + allow (WireServer)'; Design='Baseline 200s for goalstate and versions.' } 'WS-S2-enforce-deny-empty' = @{ Title='Fail-closed enforce+deny (WireServer)'; Design='All WireServer calls 403.' } 'WS-S3-audit-deny-empty' = @{ Title='Local `mode` field must be ignored (WireServer regression guard)'; Design='Writes mode=audit + defaultAccess=deny + empty rules to WireServer_Rules.json. PRE confirms remote mode=enforce, so probes must 403. A 200 would mean the agent honored `mode` from the local file.' } diff --git a/proxy_agent/Cargo.toml b/proxy_agent/Cargo.toml index 3b8cd9a1..bb7d9021 100644 --- a/proxy_agent/Cargo.toml +++ b/proxy_agent/Cargo.toml @@ -31,6 +31,12 @@ libc = "0.2.147" socket2 = "0.5" # Set socket options without tokio/std conversion base64 = "0.22" percent-encoding = "2.3" +# Optional: only compiled when the `proptests` feature is enabled (see +# `[features]` below). Lives under `[dependencies]` rather than +# `[dev-dependencies]` because Cargo rejects `optional` on dev-deps; +# the only `use` sites are inside `#[cfg(all(test, feature = "proptests"))]` +# modules so the crate never lands in the production binary. +proptest = { version = "1.11", optional = true } [dependencies.uuid] version = "1.3.0" @@ -84,6 +90,13 @@ features = [ [features] test-with-root = [] +# Innovation 2.1 M2 property tests. Off by default so the inner-loop +# `cargo test` skips both the proptest *crate compilation* (~3s the +# first time) and the 256-case generation per property. The `dep:` form +# below makes Cargo treat the optional `proptest` dependency as opt-in +# too — with the feature off the crate is not pulled into the build +# graph at all. CI picks it up via `--all-features` in build-linux.sh. +proptests = ["dep:proptest"] [package.metadata.deb] name = "azure-proxy-agent" @@ -101,4 +114,4 @@ assets = [ ["azure-proxy-agent", "usr/sbin/azure-proxy-agent", "755"], # Binary ["proxy-agent.json", "etc/azure/proxy-agent.json", "644"], ["ebpf_cgroup.o", "usr/lib/azure-proxy-agent/ebpf_cgroup.o", "644"], -] \ No newline at end of file +] diff --git a/proxy_agent/config/GuestProxyAgent.linux.json b/proxy_agent/config/GuestProxyAgent.linux.json index 34007661..90b6d0b5 100644 --- a/proxy_agent/config/GuestProxyAgent.linux.json +++ b/proxy_agent/config/GuestProxyAgent.linux.json @@ -9,5 +9,6 @@ "cgroupRoot": "/sys/fs/cgroup", "fileLogLevel": "Trace", "fileLogLevelForEvents": "Info", - "fileLogLevelForSystemEvents": "Info" + "fileLogLevelForSystemEvents": "Info", + "canonicalRequestMode": "Shadow" } \ No newline at end of file diff --git a/proxy_agent/config/GuestProxyAgent.windows.json b/proxy_agent/config/GuestProxyAgent.windows.json index ccad256c..b43e5768 100644 --- a/proxy_agent/config/GuestProxyAgent.windows.json +++ b/proxy_agent/config/GuestProxyAgent.windows.json @@ -8,5 +8,6 @@ "ebpfProgramName": "redirect.bpf.sys", "fileLogLevel": "Trace", "fileLogLevelForEvents": "Info", - "fileLogLevelForSystemEvents": "Info" + "fileLogLevelForSystemEvents": "Info", + "canonicalRequestMode": "Shadow" } \ No newline at end of file diff --git a/proxy_agent/proptest-regressions/proxy/canonical/property_tests.txt b/proxy_agent/proptest-regressions/proxy/canonical/property_tests.txt new file mode 100644 index 00000000..6777a0c8 --- /dev/null +++ b/proxy_agent/proptest-regressions/proxy/canonical/property_tests.txt @@ -0,0 +1,9 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 6e318ae548f23aef626ed306c43a3903aaa8d87519f7314c005fa58118e5b587 # shrinks to url = "http://169.254.169.254/;/" +cc ced0a9f4654c5313e012532a2fe6b322ab32cfa67d567b3668e31bd3089ba09b # shrinks to raw = 1 +cc 9f6ed1e81539e9e4aa92df7aaa4ab446a4fccad216ed51984342293b42b3ad5d # shrinks to url = "http://169.254.169.254/.;" diff --git a/proxy_agent/src/common/config.rs b/proxy_agent/src/common/config.rs index 021ef065..4365f148 100644 --- a/proxy_agent/src/common/config.rs +++ b/proxy_agent/src/common/config.rs @@ -17,6 +17,7 @@ //! ``` use crate::common::constants; +use crate::common::logger; use once_cell::sync::Lazy; use proxy_agent_shared::{logger::LoggerLevel, misc_helpers}; use serde_derive::{Deserialize, Serialize}; @@ -78,6 +79,16 @@ pub fn get_enable_http_proxy_trace() -> bool { SYSTEM_CONFIG.enableHttpProxyTrace.unwrap_or(false) } +/// Rollout flag for the Innovation 2.1 canonical request pipeline. +/// +/// Read from the optional `canonicalRequestMode` key in the GPA config +/// JSON. Accepted values: `off`, `shadow`, `enforce`. Anything missing +/// or unparseable resolves to [`CanonicalMode::Off`] so production +/// traffic keeps the legacy authorizer end-to-end. +pub fn get_canonical_request_mode() -> crate::proxy::canonical::CanonicalMode { + SYSTEM_CONFIG.get_canonical_request_mode() +} + #[derive(Serialize, Deserialize)] #[allow(non_snake_case)] pub struct Config { @@ -104,6 +115,13 @@ pub struct Config { /// This is an optional config, mainly for manual debugging purpose #[serde(skip_serializing_if = "Option::is_none")] enableHttpProxyTrace: Option, + /// Innovation 2.1 canonical request rollout flag. + /// Optional; absent or unparseable values resolve to + /// [`crate::proxy::canonical::CanonicalMode::Off`] so production + /// behavior is unchanged when the key is missing. + /// Accepted values: `off`, `shadow`, `enforce`. + #[serde(skip_serializing_if = "Option::is_none")] + canonicalRequestMode: Option, } impl Default for Config { @@ -211,6 +229,33 @@ impl Config { } None } + + /// Resolve the canonical-request rollout flag. + /// + /// Returns [`crate::proxy::canonical::CanonicalMode::Off`] when the + /// config key is absent or holds an unrecognized value (with a + /// warning logged in the latter case). This conservative default is + /// what guarantees the "behavior unchanged for production traffic" + /// M3 exit criterion: nothing in the canonical pipeline runs until + /// the operator explicitly opts in. + pub fn get_canonical_request_mode(&self) -> crate::proxy::canonical::CanonicalMode { + use crate::proxy::canonical::CanonicalMode; + match &self.canonicalRequestMode { + None => CanonicalMode::Off, + Some(s) => match CanonicalMode::from_str(s) { + Ok(mode) => mode, + Err(err) => { + // Don't take down the agent over a typo'd config key; + // fall back to Off (the safe default) and log so + // operators see the misconfiguration in trace. + logger::write_warning(format!( + "Invalid canonicalRequestMode={s:?} in config ({err}); defaulting to Off" + )); + CanonicalMode::Off + } + }, + } + } } #[cfg(test)] diff --git a/proxy_agent/src/proxy.rs b/proxy_agent/src/proxy.rs index b0be815c..721e912c 100644 --- a/proxy_agent/src/proxy.rs +++ b/proxy_agent/src/proxy.rs @@ -32,6 +32,7 @@ //! ``` pub mod authorization_rules; +pub mod canonical; pub mod proxy_authorizer; pub mod proxy_connection; pub mod proxy_server; diff --git a/proxy_agent/src/proxy/authorization_rules.rs b/proxy_agent/src/proxy/authorization_rules.rs index 857c3ade..7eb261d4 100644 --- a/proxy_agent/src/proxy/authorization_rules.rs +++ b/proxy_agent/src/proxy/authorization_rules.rs @@ -20,6 +20,7 @@ use super::{proxy_connection::ConnectionLogger, Claims}; use crate::common::logger; use crate::key_keeper::key::{AuthorizationItem, AuthorizationRules, Identity, Privilege, Role}; +use crate::proxy::canonical::{self, CanonError, CanonicalMode, CanonicalPattern}; use proxy_agent_shared::logger::LoggerLevel; use proxy_agent_shared::misc_helpers; use serde_derive::{Deserialize, Serialize}; @@ -73,6 +74,40 @@ pub struct ComputedAuthorizationItem { // all the defined unique identities, distinct by name // key - identity name, value - identity object pub identities: HashMap, + + /// Innovation 2.1 shadow cache: each `Privilege` compiled through + /// the canonical pipeline once at rule-load time so per-request + /// shadow evaluation in [`ComputedAuthorizationItem::canonical_decision`] + /// is a plain linear scan instead of re-parsing the rule path on + /// every connection. + /// + /// Stored as `Vec<(name, pattern)>` rather than `HashMap` because: + /// - The access pattern is "scan all" — `canonical_decision` + /// iterates every entry looking for matches; there is no + /// lookup-by-privilege-name path on the canonical side. + /// - Iteration order is stable (insertion order), which makes + /// divergence logs and any future "first match wins" semantics + /// deterministic across processes — `HashMap` iteration is + /// randomized. + /// - For typical rule counts (tens of entries) the per-entry + /// overhead is smaller than `HashMap`. + /// + /// Skipped from (de)serialization because: + /// 1. The cache is a pure function of `privileges` — round-tripping + /// it through the AuthorizationRulesForLogging JSON would just + /// duplicate that data and risk drift if the canonical pipeline + /// is upgraded between writer and reader. + /// 2. `CanonicalPattern` is not (and intentionally should not be) + /// `Serialize`/`Deserialize` — its shape is an internal contract + /// of the matcher. + /// + /// Privileges that fail to canonicalize at load time are dropped + /// from this cache **and** a warning is logged. This is fail-closed + /// for the canonical side: shadow / enforce mode will report a + /// divergence (legacy may still match the un-canonicalizable rule) + /// which is exactly the signal we want during rollout. + #[serde(skip, default)] + canonical_patterns: Vec<(String, CanonicalPattern)>, } #[allow(dead_code)] @@ -171,11 +206,41 @@ impl ComputedAuthorizationItem { defaultAllowed: authorization_item.defaultAccess.to_lowercase() == "allow", mode: authorization_mode, identities: identity_dict, + canonical_patterns: Self::compile_canonical_patterns(&privilege_dict), privileges: privilege_dict, privilegeAssignments: privilege_assignments, } } + /// Compile every privilege through the canonical pipeline once. + /// + /// Errors are logged and the offending privilege is dropped from + /// the cache (fail-closed on the canonical side; the legacy matcher + /// retains its copy in `self.privileges`). This is exactly the + /// shape of divergence shadow-mode is designed to surface. + fn compile_canonical_patterns( + privilege_dict: &HashMap, + ) -> Vec<(String, CanonicalPattern)> { + let mut out = Vec::with_capacity(privilege_dict.len()); + for (name, privilege) in privilege_dict { + match CanonicalPattern::from_privilege(privilege) { + Ok(pat) => out.push((name.clone(), pat)), + Err(e) => { + // Don't fail rule-load; canonical mode will deny + // anything that needed this rule, which is the M3 + // signal we want operators to see. + logger::write_warning(format!( + "Privilege '{name}' failed canonicalization ({code}); dropping from canonical cache (legacy matcher unaffected). path={path:?}", + name = name, + code = e.code(), + path = privilege.path, + )); + } + } + } + out + } + pub fn is_allowed( &self, logger: &mut ConnectionLogger, @@ -259,6 +324,203 @@ impl ComputedAuthorizationItem { ); self.defaultAllowed } + + // ------------------------------------------------------------------ + // Innovation 2.1 M3 — shadow-mode integration. + // + // The two methods below add the canonical-pipeline evaluator and the + // divergence comparator that `proxy_authorizer::authorize` invokes + // when the rollout flag is `shadow` or `enforce`. With the default + // flag (`off`) neither runs and the legacy path above is the + // entirety of the authorization decision — the M3 exit criterion + // "behavior unchanged for production traffic". + // ------------------------------------------------------------------ + + /// Canonical-pipeline mirror of [`is_allowed`]. + /// + /// Runs the request through `canonical::canonicalize` and matches it + /// against the precomputed [`CanonicalPattern`]s in + /// `self.canonical_patterns`. The identity check is shared with the + /// legacy path (same `Identity::is_match` semantics, same default + /// fallback) — the only difference is *path matching*. This is what + /// the comparator targets. + /// + /// **Fail-closed**: canonicalization errors collapse to + /// [`CanonicalDecision::Error`], which the comparator surfaces in + /// the divergence record and which the enforce-mode caller treats + /// as deny. + pub fn canonical_decision( + &self, + logger: &mut ConnectionLogger, + request_uri: &hyper::Uri, + request_method: &hyper::Method, + claims: &Claims, + ) -> CanonicalDecision { + // Disabled mode short-circuits identically to legacy. We keep + // the two implementations symmetric here so a divergence is + // always attributable to path/query handling, never to the + // disabled-skip. + if self.mode == AuthorizationMode::Disabled { + return CanonicalDecision::Allowed; + } + + let canon = match canonical::canonicalize(request_uri, request_method) { + Ok(c) => c, + Err(e) => return CanonicalDecision::Error(e), + }; + + let mut any_pattern_matched = false; + for (privilege_name, pattern) in &self.canonical_patterns { + if !pattern.matches(&canon) { + continue; + } + any_pattern_matched = true; + logger.write( + LoggerLevel::Trace, + format!("[canonical] Request matched privilege '{privilege_name}'."), + ); + + if let Some(assignments) = self.privilegeAssignments.get(privilege_name) { + for identity_name in assignments { + if let Some(identity) = self.identities.get(identity_name) { + if identity.is_match(logger, claims) { + return CanonicalDecision::Allowed; + } + } + } + } + } + + if any_pattern_matched { + // Same semantics as legacy: any privilege matched but no + // identity matched -> deny (do NOT fall through to default). + return CanonicalDecision::Denied; + } + if self.defaultAllowed { + CanonicalDecision::Allowed + } else { + CanonicalDecision::Denied + } + } + + /// Compare the precomputed legacy decision with the canonical + /// pipeline's verdict and emit a single divergence log line per + /// request when they disagree. + /// + /// The shape of the emitted line is the M3 telemetry contract from + /// `doc/plans/Innovation-2.1-canonical-request.md` §9.2 — see + /// [`DivergenceRecord`]. We log it as a prefixed single-line + /// `key=value` record so dev/test grep / structured log shippers can + /// pick it up without a new telemetry pipeline (deferred to M4). + /// + /// Returns the canonical decision so an enforce-mode caller can use + /// it directly. Off / shadow callers ignore the return value. + pub fn shadow_compare( + &self, + logger: &mut ConnectionLogger, + request_uri: &hyper::Uri, + request_method: &hyper::Method, + claims: &Claims, + legacy_allowed: bool, + mode: CanonicalMode, + ) -> CanonicalDecision { + debug_assert!( + mode != CanonicalMode::Off, + "shadow_compare invoked while canonical mode is Off; the proxy_authorizer guard should have skipped this call" + ); + + let canon = self.canonical_decision(logger, request_uri, request_method, claims); + + let canon_allowed = canon.allowed_or_fail_closed(); + if canon_allowed != legacy_allowed { + let record = DivergenceRecord { + rule_set_id: &self.id, + mode, + legacy_decision: legacy_allowed, + canon_decision: &canon, + request_uri, + }; + // Warn level: M3 wants this visible in dev/test without a + // separate telemetry pipeline. It's per-request only when + // there *is* a divergence, so noise stays low. The plan + // is to graduate this to a structured telemetry event in + // M4 (§9.3). + logger.write(LoggerLevel::Warn, record.to_log_line()); + } + canon + } +} + +/// Outcome of [`ComputedAuthorizationItem::canonical_decision`]. +/// +/// Distinct from the legacy `bool` return so the shadow comparator can +/// distinguish "canonicalization rejected the request" from "matcher +/// said deny" in the divergence record. Both collapse to `false` under +/// [`CanonicalDecision::allowed_or_fail_closed`]. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum CanonicalDecision { + Allowed, + Denied, + /// The canonical pipeline rejected the request before any matcher + /// ran. Always a deny in fail-closed evaluation. + Error(CanonError), +} + +impl CanonicalDecision { + /// Project to a boolean using fail-closed semantics: errors become + /// `false`. Used by the comparator to decide whether the canonical + /// verdict diverges from the legacy bool. + pub fn allowed_or_fail_closed(&self) -> bool { + matches!(self, CanonicalDecision::Allowed) + } +} + +impl std::fmt::Display for CanonicalDecision { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CanonicalDecision::Allowed => write!(f, "allow"), + CanonicalDecision::Denied => write!(f, "deny"), + CanonicalDecision::Error(e) => write!(f, "error:{}", e.code()), + } + } +} + +/// Single divergence event between the legacy authorizer and the +/// canonical pipeline. Mapped to a one-line audit string by +/// [`DivergenceRecord::to_log_line`]; M4 will graduate this to a +/// structured telemetry event per design §9.3. +struct DivergenceRecord<'a> { + rule_set_id: &'a str, + mode: CanonicalMode, + legacy_decision: bool, + canon_decision: &'a CanonicalDecision, + /// The full original `hyper::Uri`. We only log the path+query + /// portion to keep authority/userinfo (already validated by the + /// canonical pipeline) out of audit lines. + request_uri: &'a hyper::Uri, +} + +impl<'a> DivergenceRecord<'a> { + fn to_log_line(&self) -> String { + // `CANON_DIVERGENCE` is a stable prefix so structured log + // shippers and ad-hoc grep can both find these. Field order is + // also part of the contract — append-only. + let path = self.request_uri.path(); + let query = self.request_uri.query().unwrap_or(""); + let path_and_query = if query.is_empty() { + path.to_string() + } else { + format!("{path}?{query}") + }; + format!( + "CANON_DIVERGENCE mode={mode} rule_set={rsid} legacy={legacy} canon={canon} uri={uri:?}", + mode = self.mode, + rsid = self.rule_set_id, + legacy = if self.legacy_decision { "allow" } else { "deny" }, + canon = self.canon_decision, + uri = path_and_query, + ) + } } #[derive(Serialize, Deserialize, Clone)] @@ -740,4 +1002,232 @@ mod tests { "Trusted user must be allowed through percent-encoded path (%2f)" ); } + + // ------------------------------------------------------------------ + // Innovation 2.1 M3 — shadow-mode integration tests. + // + // These tests verify the new canonical-pipeline evaluator and the + // shadow comparator added in `ComputedAuthorizationItem`. They do + // NOT exercise the proxy_authorizer wire-up directly (that needs + // a live config getter); see `proxy_authorizer::tests` for the + // outer plumbing. + // + // What we pin here: + // 1. canonical_decision agrees with legacy on a path that does + // not exercise canonicalization differences. + // 2. canonical_decision *disagrees* with legacy on the exact + // bypass class M2 was built to catch — substring vs + // segment prefix — so shadow telemetry has the signal it + // promises in §9.2. + // 3. canonical_decision returns Error (fail-closed) when the + // pipeline rejects the URL, and Display surfaces a stable + // `error:` token for the audit log. + // 4. compile_canonical_patterns drops privileges that cannot be + // canonicalized, without taking down the rule load. + // 5. shadow_compare returns the canonical decision so an + // enforce-mode caller can use it (M5/M6 hand-off). + // ------------------------------------------------------------------ + + use crate::proxy::authorization_rules::CanonicalDecision; + use crate::proxy::canonical::{CanonError, CanonicalMode}; + + /// Builds a single-privilege rule set: privilege path `/test` is + /// granted to one identity named "trusted". Default-deny. + fn build_test_rules(privilege_path: &str) -> ComputedAuthorizationItem { + let access_control_rules = AccessControlRules { + roles: Some(vec![Role { + name: "r".to_string(), + privileges: vec!["test".to_string()], + }]), + privileges: Some(vec![Privilege { + name: "test".to_string(), + path: privilege_path.to_string(), + queryParameters: None, + }]), + identities: Some(vec![Identity { + name: "trusted".to_string(), + userName: Some("trusted-user".to_string()), + groupName: None, + exePath: None, + processName: None, + }]), + roleAssignments: Some(vec![RoleAssignment { + role: "r".to_string(), + identities: vec!["trusted".to_string()], + }]), + }; + let authorization_item = AuthorizationItem { + defaultAccess: "deny".to_string(), + mode: "enforce".to_string(), + rules: Some(access_control_rules), + id: "test-rs".to_string(), + }; + ComputedAuthorizationItem::from_authorization_item(authorization_item) + } + + fn trusted_claims() -> Claims { + Claims { + userId: 0, + userName: "trusted-user".to_string(), + userGroups: vec![], + processId: 0, + processFullPath: PathBuf::from("p"), + clientIp: "0".to_string(), + clientPort: 0, + processName: OsString::from("p"), + processCmdLine: "p".to_string(), + runAsElevated: true, + } + } + + #[test] + fn canonical_decision_agrees_with_legacy_on_clean_paths() { + // Sanity: when nothing in the URL exercises a canonical-vs- + // substring difference, the two evaluators must agree. If this + // ever flips, every shadow log line becomes noise and we lose + // the M3 signal. So this is a regression net for both sides. + let rules = build_test_rules("/test"); + let claims = trusted_claims(); + let mut log = ConnectionLogger::new(0, 0); + + let cases: &[(&str, bool)] = &[ + // (uri, expected_allowed) + ("http://169.254.169.254/test/x", true), + ("http://169.254.169.254/test", true), + // not under /test -> default deny + ("http://169.254.169.254/other", false), + ]; + for (uri_str, expected) in cases { + let uri = hyper::Uri::from_str(uri_str).unwrap(); + let legacy = rules.is_allowed(&mut log, uri.clone(), claims.clone()); + let canon = rules.canonical_decision(&mut log, &uri, &hyper::Method::GET, &claims); + assert_eq!(legacy, *expected, "legacy verdict for {uri_str}"); + assert_eq!( + canon.allowed_or_fail_closed(), + *expected, + "canonical verdict for {uri_str}" + ); + assert_eq!( + legacy, + canon.allowed_or_fail_closed(), + "shadow drift for {uri_str}" + ); + } + } + + #[test] + fn canonical_decision_diverges_on_substring_vs_segment_prefix() { + // THE M3 motivating divergence. Legacy `Privilege::is_match` + // uses `starts_with` -> `/test-evil/x` is "under /test" by + // substring. Canonical does segment-by-segment prefix + // matching -> `/test-evil/x` is NOT under `/test`. Shadow mode + // exists to report this exact class. + let rules = build_test_rules("/test"); + let claims = trusted_claims(); + let mut log = ConnectionLogger::new(0, 0); + + let uri = hyper::Uri::from_str("http://169.254.169.254/test-evil/anything").unwrap(); + let legacy = rules.is_allowed(&mut log, uri.clone(), claims.clone()); + let canon = rules.canonical_decision(&mut log, &uri, &hyper::Method::GET, &claims); + + // Legacy is buggy here (allows the SSRF-shaped path). + assert!( + legacy, + "legacy substring match should still allow /test-evil; if this fails, legacy was fixed and this test should be retired" + ); + // Canonical correctly denies. + assert_eq!( + canon, + CanonicalDecision::Denied, + "canonical segment match must deny /test-evil" + ); + // The shape we'll feed to telemetry: + assert_ne!( + legacy, + canon.allowed_or_fail_closed(), + "this is the divergence shadow mode must surface" + ); + } + + #[test] + fn canonical_decision_returns_error_for_userinfo_url() { + // Userinfo in the authority -> canonical rejects at the + // pipeline entry with UserinfoPresent. The decision must + // collapse to fail-closed and Display must surface the stable + // error code so audit logs can grep for it. + let rules = build_test_rules("/test"); + let claims = trusted_claims(); + let mut log = ConnectionLogger::new(0, 0); + + // Build via hyper to avoid relying on whether `from_str` + // accepts the userinfo form. + let uri: hyper::Uri = match "http://attacker@169.254.169.254/test".parse() { + Ok(u) => u, + Err(_) => { + // Hyper may refuse to parse this -> nothing to test. + eprintln!("hyper refused to parse userinfo URL; skipping"); + return; + } + }; + let canon = rules.canonical_decision(&mut log, &uri, &hyper::Method::GET, &claims); + assert_eq!( + canon, + CanonicalDecision::Error(CanonError::UserinfoPresent), + "canonical must surface UserinfoPresent (got {canon:?})" + ); + assert!(!canon.allowed_or_fail_closed(), "errors must fail-closed"); + // The audit-log token shape is part of the §9.2 telemetry + // contract — keep it stable across refactors. + assert_eq!(canon.to_string(), "error:CANON_USERINFO"); + } + + #[test] + fn shadow_compare_returns_canonical_decision_for_enforce_handoff() { + // shadow_compare's return value is the M5/M6 hand-off: the + // enforce-mode caller will use it as the decision once the + // shadow window proves zero divergences. We pin its + // semantics here so a future enforce wiring doesn't get a + // surprise. + let rules = build_test_rules("/test"); + let claims = trusted_claims(); + let mut log = ConnectionLogger::new(0, 0); + + // Divergent case: legacy=true (substring), canon=Denied. + let uri = hyper::Uri::from_str("http://169.254.169.254/test-evil/anything").unwrap(); + let legacy_allowed = true; + let canon = rules.shadow_compare( + &mut log, + &uri, + &hyper::Method::GET, + &claims, + legacy_allowed, + CanonicalMode::Shadow, + ); + assert_eq!( + canon, + CanonicalDecision::Denied, + "shadow_compare must return the canonical verdict" + ); + } + + #[test] + fn compile_canonical_patterns_drops_uncanonicalizable_rules() { + // A privilege whose path can't be canonicalized must NOT + // brick the whole rule-load — the legacy matcher still has + // its copy and the canonical cache simply skips it. This is + // exactly the asymmetry shadow mode is designed to surface. + // + // We use an overlong %C0%AF (canonically `/`) which the + // pipeline rejects with OverlongUtf8. + let rules = build_test_rules("/x%C0%AFy"); + + // Legacy still sees the privilege: + assert_eq!(rules.privileges.len(), 1); + // Canonical cache dropped it: + assert_eq!( + rules.canonical_patterns.len(), + 0, + "uncanonicalizable privilege must be skipped from canonical cache" + ); + } } diff --git a/proxy_agent/src/proxy/canonical/destination.rs b/proxy_agent/src/proxy/canonical/destination.rs new file mode 100644 index 00000000..34c8b9fb --- /dev/null +++ b/proxy_agent/src/proxy/canonical/destination.rs @@ -0,0 +1,873 @@ +// Copyright (c) Microsoft Corporation +// SPDX-License-Identifier: MIT + +//! Destination classification. +//! +//! Maps the host+port of an incoming request to one of GPA's known +//! endpoints (IMDS, WireServer, HostGAPlugin). Numeric host forms +//! (decimal, hex, octal, IPv4-mapped IPv6, etc.) all canonicalize to the +//! same variant — this is the defense against pentest C7. +//! +//! Hostnames that are not IP literals are *not* DNS-resolved here. DNS at +//! the matcher would be a confused-deputy surface; instead we surface the +//! host text in [`Destination::Unknown`] so rule authors can write +//! explicit allow rules keyed on host text if they need it. + +use std::fmt; +use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + +use hyper::Uri; + +use crate::common::constants; + +use super::CanonError; + +/// Address family of an [`Destination::Unknown`] target. Kept narrow so +/// we don't accidentally treat numeric strings as hostnames. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum AddrFamily { + V4, + V6, + Name, +} + +/// Canonical destination. Matching uses the typed enum only; the raw +/// `host_text` on `Unknown` is for audit, never for matching decisions. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum Destination { + /// Instance Metadata Service: 169.254.169.254:80 in any encoding. + Imds, + /// Azure WireServer: 168.63.129.16:80. + WireServer, + /// Host GuestAgent Plugin: 168.63.129.16:32526. + HostGaPlugin, + /// Anything else. The matcher denies unknowns unless an explicit rule + /// allows them. + Unknown { + family: AddrFamily, + ip: Option, + port: u16, + host_text: Option, + }, +} + +impl fmt::Display for Destination { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Destination::Imds => f.write_str("imds"), + Destination::WireServer => f.write_str("wireserver"), + Destination::HostGaPlugin => f.write_str("hostga"), + Destination::Unknown { .. } => f.write_str("unknown"), + } + } +} + +/// Classify the destination of a request URI. See module docs. +pub fn classify(uri: &Uri) -> Result { + // Reject userinfo (`user@host` smuggling). + if uri + .authority() + .map(|a| a.as_str().contains('@')) + .unwrap_or(false) + { + return Err(CanonError::UserinfoPresent); + } + + let host = match uri.host() { + Some(h) => h, + // Origin-form requests (the common proxy case) have no authority. + // We must still allow them to flow: the destination is decided by + // the redirector at the socket layer, not by the URL. + None => { + return Ok(Destination::Unknown { + family: AddrFamily::Name, + ip: None, + port: 0, + host_text: None, + }); + } + }; + + let port = if let Some(p) = uri.port() { + // Happy path: hyper successfully framed the port and it fits in + // u16. Re-parsing the text here is a belt-and-suspenders check + // so we never widen the return type just to thread an infallible + // unwrap through. + p.as_str().parse::().map_err(|_| CanonError::BadPort)? + } else { + // hyper's port() / port_u16() both return None when the port + // text is *present but malformed* (e.g. ":99999" overflows + // u16). The old `port_u16().unwrap_or(IMDS_PORT)` collapsed + // this into the default, which let + // `http://169.254.169.254:99999/x` smuggle past as IMDS. + // Inspect the authority string ourselves to tell "no port" + // apart from "bad port". For IPv6, the host is bracketed + // (`[...]`), so any colon *after* `]` is the port separator; + // otherwise the last `:` (if any) is. + let port_text: Option<&str> = uri.authority().and_then(|a| { + let s = a.as_str(); + if let Some(rb) = s.rfind(']') { + s[rb + 1..].strip_prefix(':') + } else { + s.rsplit_once(':').map(|(_, p)| p) + } + }); + match port_text { + Some(text) => text.parse::().map_err(|_| CanonError::BadPort)?, + None => constants::IMDS_PORT, + } + }; + + let ip = parse_host_as_ip(host)?; + match ip { + Some(IpAddr::V4(v4)) => Ok(known_v4(v4, port).unwrap_or(Destination::Unknown { + family: AddrFamily::V4, + ip: Some(IpAddr::V4(v4)), + port, + host_text: Some(host.to_string()), + })), + Some(IpAddr::V6(v6)) => { + // IPv4-mapped IPv6 (::ffff:a.b.c.d) projects down to IPv4 so + // it shares the same Destination as the dotted form. + if let Some(v4) = v6.to_ipv4_mapped() { + if let Some(known) = known_v4(v4, port) { + return Ok(known); + } + return Ok(Destination::Unknown { + family: AddrFamily::V4, + ip: Some(IpAddr::V4(v4)), + port, + host_text: Some(host.to_string()), + }); + } + Ok(Destination::Unknown { + family: AddrFamily::V6, + ip: Some(IpAddr::V6(v6)), + port, + host_text: Some(host.to_string()), + }) + } + None => Ok(Destination::Unknown { + family: AddrFamily::Name, + ip: None, + port, + host_text: Some(host.to_string()), + }), + } +} + +fn known_v4(v4: Ipv4Addr, port: u16) -> Option { + let imds: Ipv4Addr = constants::IMDS_IP.parse().ok()?; + let wire: Ipv4Addr = constants::WIRE_SERVER_IP.parse().ok()?; + + if v4 == imds && port == constants::IMDS_PORT { + return Some(Destination::Imds); + } + if v4 == wire && port == constants::WIRE_SERVER_PORT { + return Some(Destination::WireServer); + } + if v4 == wire && port == constants::GA_PLUGIN_PORT { + return Some(Destination::HostGaPlugin); + } + None +} + +/// Parse a host string into an `IpAddr` when it is an IP literal in any +/// historical numeric form. Returns `Ok(None)` for true hostnames (i.e. +/// not an IP), which the caller treats as `Destination::Unknown`. +/// +/// Supports: +/// - dotted quad `169.254.169.254` +/// - 32-bit decimal `2852039166` +/// - 32-bit hex `0xa9fea9fe` +/// - octal-quad `0251.0376.0251.0376` +/// - hex-quad `0xa9.0xfe.0xa9.0xfe` +/// - mixed forms allowed per RFC 3493 / inet_aton tradition +/// - bracketed IPv6 `[::ffff:169.254.169.254]` (brackets handled by hyper) +fn parse_host_as_ip(host: &str) -> Result, CanonError> { + // Tolerate both forms (hyper strips brackets in most versions but + // not all). Strip surrounding `[]` if present before parsing IPv6. + let host_unbracketed = host + .strip_prefix('[') + .and_then(|h| h.strip_suffix(']')) + .unwrap_or(host); + if let Ok(v6) = host_unbracketed.parse::() { + return Ok(Some(IpAddr::V6(v6))); + } + if let Ok(v4) = host.parse::() { + return Ok(Some(IpAddr::V4(v4))); + } + + // Trailing dot on hostname (`metadata.azure.internal.`) — strip then + // re-classify. A bare `.` is not a host. + let trimmed = host.trim_end_matches('.'); + if trimmed.is_empty() { + return Err(CanonError::BadHost); + } + + if let Some(v4) = parse_inet_aton(trimmed)? { + return Ok(Some(IpAddr::V4(v4))); + } + + // Not an IP literal in any supported form. Distinguish "valid + // hostname" from "garbage": at least one ASCII alphanumeric and no + // forbidden characters. + if trimmed + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '.') + && trimmed.chars().any(|c| c.is_ascii_alphanumeric()) + { + Ok(None) + } else { + Err(CanonError::BadHost) + } +} + +/// `inet_aton`-style numeric IPv4 parser. +/// +/// Implemented by hand (rather than calling out to libc) because +/// `inet_aton` behavior is platform-dependent: glibc accepts `0x` and +/// leading-zero octal; musl is stricter; Windows differs again. A +/// hand-rolled parser keeps Linux and Windows builds identical. +fn parse_inet_aton(input: &str) -> Result, CanonError> { + // Must look numeric. Reject early if it has any character outside the + // numeric/separator set so we don't shadow a legitimate hostname. + if input.is_empty() { + return Err(CanonError::BadHost); + } + let looks_numeric = input + .chars() + .all(|c| c.is_ascii_hexdigit() || c == 'x' || c == 'X' || c == '.'); + if !looks_numeric { + return Ok(None); + } + + let parts: Vec<&str> = input.split('.').collect(); + if parts.is_empty() || parts.len() > 4 { + return Err(CanonError::BadHost); + } + // Empty parts (e.g. trailing dot already stripped, double dot here) + // are illegal. + if parts.iter().any(|p| p.is_empty()) { + return Err(CanonError::BadHost); + } + + let nums: Vec = parts + .iter() + .map(|p| parse_numeric_octet(p)) + .collect::, _>>()?; + + let addr: u32 = match nums.len() { + // single 32-bit number: maps directly + 1 => nums[0], + // a.b => a in top 8 bits, b in low 24 + 2 => { + if nums[0] > 0xFF || nums[1] > 0x00FF_FFFF { + return Err(CanonError::BadHost); + } + (nums[0] << 24) | nums[1] + } + // a.b.c => a,b top 16 bits, c low 16 + 3 => { + if nums[0] > 0xFF || nums[1] > 0xFF || nums[2] > 0xFFFF { + return Err(CanonError::BadHost); + } + (nums[0] << 24) | (nums[1] << 16) | nums[2] + } + // a.b.c.d => standard dotted quad + 4 => { + if nums.iter().any(|&n| n > 0xFF) { + return Err(CanonError::BadHost); + } + (nums[0] << 24) | (nums[1] << 16) | (nums[2] << 8) | nums[3] + } + _ => return Err(CanonError::BadHost), + }; + + Ok(Some(Ipv4Addr::from(addr))) +} + +fn parse_numeric_octet(s: &str) -> Result { + // 0x... => hex + if let Some(rest) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) { + if rest.is_empty() || rest.len() > 8 { + return Err(CanonError::BadHost); + } + return u32::from_str_radix(rest, 16).map_err(|_| CanonError::BadHost); + } + // 0... (and not just "0") => octal + if s.len() > 1 && s.starts_with('0') { + return u32::from_str_radix(&s[1..], 8).map_err(|_| CanonError::BadHost); + } + // decimal + s.parse::().map_err(|_| CanonError::BadHost) +} + +#[cfg(test)] +mod destination_tests { + use super::*; + + fn aton(s: &str) -> Result, CanonError> { + parse_inet_aton(s) + } + fn host(s: &str) -> Result, CanonError> { + parse_host_as_ip(s) + } + fn ip(s: &str) -> Ipv4Addr { + s.parse().unwrap() + } + fn uri(s: &str) -> Uri { + s.parse().unwrap() + } + + // ----------------------------------------------------------------- + // parse_numeric_octet + // ----------------------------------------------------------------- + + #[test] + fn numeric_octet_accepts_valid_forms() { + // (input, expected). Covers decimal (incl. u32::MAX and lone + // zero), hex with both prefix cases, and octal. + let cases: &[(&str, u32)] = &[ + ("0", 0), + ("169", 169), + ("255", 255), + ("4294967295", u32::MAX), + ("0x0", 0), + ("0xa9", 0xA9), + ("0XA9", 0xA9), + ("0xa9fea9fe", 0xA9FE_A9FE), + ("0xFFFFFFFF", u32::MAX), + ("0251", 0o251), // 169 + ("0376", 0o376), // 254 + ("00", 0), + ]; + for (input, expected) in cases { + assert_eq!( + parse_numeric_octet(input).unwrap(), + *expected, + "input={input:?}" + ); + } + } + + #[test] + fn numeric_octet_rejects_invalid_forms() { + // All of these must be BadHost: decimal overflow, hex without + // digits, hex too long for u32, non-hex digits in hex, non-octal + // digits in an octal-prefixed string, and empty input. + let bad: &[&str] = &[ + "4294967296", // decimal overflow + "0x", // empty hex + "0X", // empty hex (upper prefix) + "0x100000000", // hex too long for u32 + "0xZZ", // bad hex digit + "08", // 8 is not octal + "0129", // 9 is not octal + "", // empty + ]; + for input in bad { + assert_eq!( + parse_numeric_octet(input).unwrap_err(), + CanonError::BadHost, + "input={input:?}" + ); + } + } + + // ----------------------------------------------------------------- + // parse_inet_aton + // ----------------------------------------------------------------- + + #[test] + fn inet_aton_accepts_all_numeric_forms() { + let imds = Ipv4Addr::new(169, 254, 169, 254); + // (input, expected). Covers dotted quad, single decimal, single + // hex, octal/hex/mixed quads, and the 2-/3-part legacy forms. + let two_part = { + let v: u32 = (169u32 << 24) | 16_624_894; + Ipv4Addr::from(v) + }; + let cases: &[(&str, Ipv4Addr)] = &[ + ("169.254.169.254", imds), + ("2852039166", imds), + ("0xa9fea9fe", imds), + ("0251.0376.0251.0376", imds), + ("0xa9.0xfe.0xa9.0xfe", imds), + ("169.0xfe.0251.254", imds), + ("169.254.43518", imds), // 3-part form + ("169.16624894", two_part), // 2-part form + ]; + for (input, expected) in cases { + assert_eq!(aton(input).unwrap(), Some(*expected), "input={input:?}"); + } + } + + #[test] + fn inet_aton_rejects_malformed_inputs() { + // Empty parts, too-many parts, octet overflow at each supported + // arity, and the empty string itself. + let bad: &[&str] = &[ + "", + "1.2.3.4.5", // too many parts + "1..2.3", // double dot + ".1.2.3", // leading dot + "1.2.3.", // trailing dot + "300.1.1.1", // 4-part octet > 0xFF + "256.0.0.0", // 4-part octet > 0xFF + "256.1", // 2-part top byte > 0xFF + "1.16777216", // 2-part low value > 0x00FF_FFFF + "1.2.65536", // 3-part last value > 0xFFFF + ]; + for input in bad { + assert_eq!( + aton(input).unwrap_err(), + CanonError::BadHost, + "input={input:?}" + ); + } + } + + #[test] + fn inet_aton_passes_through_non_numeric_hostnames() { + // Hostnames must fall through with Ok(None), not error — the + // caller decides whether to allow them. + for input in ["metadata", "metadata.azure.internal", "host-with-dash"] { + assert_eq!(aton(input).unwrap(), None, "input={input:?}"); + } + } + + // ----------------------------------------------------------------- + // parse_host_as_ip + // ----------------------------------------------------------------- + + #[test] + fn host_parses_all_ip_literal_forms() { + // (input, expected IpAddr). Covers IPv4 dotted, IPv4 numeric + // (falls through to inet_aton), IPv6 plain, IPv6 bracketed + // (tolerance for hyper versions that don't strip brackets), and + // IPv4-mapped IPv6. + let v6_mapped: Ipv6Addr = "::ffff:169.254.169.254".parse().unwrap(); + let cases: &[(&str, IpAddr)] = &[ + ("127.0.0.1", IpAddr::V4(Ipv4Addr::LOCALHOST)), + ("2852039166", IpAddr::V4(Ipv4Addr::new(169, 254, 169, 254))), + ("::1", IpAddr::V6(Ipv6Addr::LOCALHOST)), + ("[::1]", IpAddr::V6(Ipv6Addr::LOCALHOST)), + ("::ffff:169.254.169.254", IpAddr::V6(v6_mapped)), + ]; + for (input, expected) in cases { + assert_eq!(host(input).unwrap(), Some(*expected), "input={input:?}"); + } + } + + #[test] + fn host_returns_none_for_valid_hostnames() { + // Hostnames (including the RFC 1034 trailing-dot form) must not + // be silently treated as IPs. + for input in [ + "metadata.azure.internal", + "metadata.azure.internal.", + "foo", + "a-b-c.example", + ] { + assert_eq!(host(input).unwrap(), None, "input={input:?}"); + } + } + + #[test] + fn host_rejects_garbage() { + // Empty, bare dot, and any input containing characters outside + // the hostname/IP alphabet must be rejected — not silently + // treated as a hostname. + for input in ["", ".", "foo bar", "foo/bar", "foo_bar"] { + assert_eq!( + host(input).unwrap_err(), + CanonError::BadHost, + "input={input:?}" + ); + } + } + + // ----------------------------------------------------------------- + // known_v4 + // ----------------------------------------------------------------- + + #[test] + fn known_v4_maps_recognized_endpoints() { + let cases: &[(&str, u16, Destination)] = &[ + (constants::IMDS_IP, constants::IMDS_PORT, Destination::Imds), + ( + constants::WIRE_SERVER_IP, + constants::WIRE_SERVER_PORT, + Destination::WireServer, + ), + ( + constants::WIRE_SERVER_IP, + constants::GA_PLUGIN_PORT, + Destination::HostGaPlugin, + ), + ]; + for (addr, port, expected) in cases { + assert_eq!( + known_v4(ip(addr), *port), + Some(expected.clone()), + "ip={addr}, port={port}" + ); + } + } + + #[test] + fn known_v4_returns_none_for_misses() { + // Correct IP / wrong port and any unrelated IP must not be + // promoted to a known destination. + let cases: &[(&str, u16)] = &[ + (constants::IMDS_IP, 8080), + (constants::WIRE_SERVER_IP, 8080), + ("8.8.8.8", constants::IMDS_PORT), + ("127.0.0.1", constants::IMDS_PORT), + ]; + for (addr, port) in cases { + assert_eq!(known_v4(ip(addr), *port), None, "ip={addr}, port={port}"); + } + } + + // ----------------------------------------------------------------- + // classify (URI-level entrypoint) + // ----------------------------------------------------------------- + + #[test] + fn classify_resolves_known_destinations() { + // (url, expected). Covers default-port inference for IMDS, + // explicit-port forms, WireServer, HostGA, and the IPv4-mapped + // IPv6 projection (pentest C7). + let cases: &[(&str, Destination)] = &[ + ("http://169.254.169.254/x", Destination::Imds), + ("http://169.254.169.254:80/x", Destination::Imds), + ("http://168.63.129.16:80/x", Destination::WireServer), + ("http://168.63.129.16:32526/x", Destination::HostGaPlugin), + ("http://[::ffff:169.254.169.254]/x", Destination::Imds), + ]; + for (url, expected) in cases { + assert_eq!(classify(&uri(url)).unwrap(), *expected, "url={url}"); + } + } + + #[test] + fn classify_falls_back_to_unknown_for_unrecognized_targets() { + // IMDS IP on the wrong port must NOT inherit the IMDS variant. + match classify(&uri("http://169.254.169.254:8080/x")).unwrap() { + Destination::Unknown { + family: AddrFamily::V4, + ip: Some(IpAddr::V4(v4)), + port: 8080, + .. + } => assert_eq!(v4, ip(constants::IMDS_IP)), + other => panic!("expected Unknown V4 on port 8080, got {other:?}"), + } + + // Arbitrary public IP -> Unknown V4 with host_text preserved. + match classify(&uri("http://1.2.3.4/x")).unwrap() { + Destination::Unknown { + family: AddrFamily::V4, + ip: Some(IpAddr::V4(v4)), + host_text: Some(_), + .. + } => assert_eq!(v4, Ipv4Addr::new(1, 2, 3, 4)), + other => panic!("expected Unknown V4, got {other:?}"), + } + + // Hostname -> Unknown Name with host_text preserved (no DNS). + match classify(&uri("http://metadata.azure.internal/x")).unwrap() { + Destination::Unknown { + family: AddrFamily::Name, + ip: None, + host_text: Some(s), + .. + } => assert_eq!(s, "metadata.azure.internal"), + other => panic!("expected Unknown Name, got {other:?}"), + } + + // Non-mapped IPv6 -> Unknown V6. + match classify(&uri("http://[::1]/x")).unwrap() { + Destination::Unknown { + family: AddrFamily::V6, + ip: Some(IpAddr::V6(v6)), + .. + } => assert_eq!(v6, Ipv6Addr::LOCALHOST), + other => panic!("expected Unknown V6, got {other:?}"), + } + + // Origin-form (no authority) -> stub Unknown with no info. + let origin_form: Uri = "/metadata/identity".parse().unwrap(); + match classify(&origin_form).unwrap() { + Destination::Unknown { + family: AddrFamily::Name, + ip: None, + port: 0, + host_text: None, + } => {} + other => panic!("expected stub Unknown for origin-form, got {other:?}"), + } + } + + #[test] + fn classify_rejects_bad_inputs() { + // (url, expected error). Userinfo smuggling and host text with + // characters outside the hostname/IP alphabet. + let cases: &[(&str, CanonError)] = &[ + ("http://user@169.254.169.254/x", CanonError::UserinfoPresent), + ("http://foo_bar/x", CanonError::BadHost), + ]; + for (url, expected) in cases { + assert_eq!(classify(&uri(url)).unwrap_err(), *expected, "url={url}"); + } + } + + // ----------------------------------------------------------------- + // Display + // ----------------------------------------------------------------- + + #[test] + fn display_strings_are_stable() { + // These strings appear in audit logs; pin them so a rename + // doesn't silently break downstream log consumers. + let cases: &[(Destination, &str)] = &[ + (Destination::Imds, "imds"), + (Destination::WireServer, "wireserver"), + (Destination::HostGaPlugin, "hostga"), + ( + Destination::Unknown { + family: AddrFamily::Name, + ip: None, + port: 0, + host_text: None, + }, + "unknown", + ), + ]; + for (dest, expected) in cases { + assert_eq!(&format!("{dest}"), expected); + } + } + + // ----------------------------------------------------------------- + // Appendix A.2 — host golden vectors + end-to-end classification + // + // Spec-conformance tests for destination classification. Vector + // labels (`A2.xxx`) appear in every failure message so a regression + // points straight back to the row in `Innovation-2.1-canonical-request.md`. + // + // These call the full canonicalize_str() entrypoint rather than + // classify() directly so the assertion target matches what a caller + // would see — the assertion target is still the destination output. + // ----------------------------------------------------------------- + + fn dest_via_pipeline(url: &str) -> Destination { + super::super::canonicalize_str(url).unwrap().destination + } + + #[test] + fn appendix_a2_host_vectors_classify_as_imds() { + // Every numeric / packed / IPv4-mapped-IPv6 form of + // 169.254.169.254 must classify to Destination::Imds — this is + // the SSRF-defeating contract that justifies the whole module's + // existence. + let cases: &[(&str, &str)] = &[ + ("A2.dotted_quad", "http://169.254.169.254/x"), + ("A2.decimal_32bit", "http://2852039166/x"), + ("A2.hex_packed", "http://0xa9fea9fe/x"), + ("A2.octal_quad", "http://0251.0376.0251.0376/x"), + ("A2.ipv4_mapped_dotted", "http://[::ffff:169.254.169.254]/x"), + ("A2.ipv4_mapped_hex", "http://[::ffff:a9fe:a9fe]/x"), + // Explicit :80 — must still classify as IMDS (default port). + ("A2.explicit_default_port", "http://169.254.169.254:80/x"), + ]; + for (label, url) in cases { + assert_eq!(dest_via_pipeline(url), Destination::Imds, "vector={label}"); + } + } + + #[test] + fn hostnames_classify_as_unknown_for_dns_rebinding_defense() { + // Hostnames are NEVER trusted — even one that resolves to IMDS + // at runtime must canonicalize to Unknown (with host_text + // preserved for audit). This is the OWASP DNS-rebinding + // defense. + match dest_via_pipeline("http://metadata.azure.internal/x") { + Destination::Unknown { + host_text: Some(s), .. + } => assert!( + s.contains("metadata.azure.internal"), + "host_text must preserve original hostname for audit, got {s:?}" + ), + d => panic!("expected Unknown with host_text, got {d:?}"), + } + } + + #[test] + fn destination_classified_by_host_and_port() { + // WireServer and HostGAPlugin share an IP but differ by port — + // pin both the default-port and the :32526 branch. + let cases: &[(&str, &str, Destination)] = &[ + ( + "wireserver default port", + "http://168.63.129.16/x", + Destination::WireServer, + ), + ( + "wireserver explicit :80", + "http://168.63.129.16:80/x", + Destination::WireServer, + ), + ( + "hostgaplugin on :32526", + "http://168.63.129.16:32526/x", + Destination::HostGaPlugin, + ), + ]; + for (label, url, expected) in cases { + assert_eq!(dest_via_pipeline(url), *expected, "{label}"); + } + } + + // ----------------------------------------------------------------- + // C7 extended host vectors (M2: golden vectors for the pentest C7 + // "host-form differentials" family — design doc §3.2). The + // Appendix A.2 table above covers IMDS-side variations; this + // expands to: + // + // - WireServer (168.63.129.16) in every numeric form. + // - HostGAPlugin (same IP, port 32526) in every numeric form. + // - Adversarial host shapes from the threat-model section: userinfo + // smuggling, port smuggling, hostname-uppercase / trailing-dot, + // out-of-range port, malformed IP literal. + // ----------------------------------------------------------------- + + #[test] + fn c7_extended_host_vectors_classify_correctly() { + // (label, url, expected) — all numeric forms of 168.63.129.16 + // (= 0xA83F8110 = 2_822_734_096 decimal) must classify to + // WireServer when on port 80 and HostGaPlugin when on :32526. + let cases: &[(&str, &str, Destination)] = &[ + // WireServer ↔ numeric forms. + ( + "C7.wireserver_decimal_32bit", + "http://2822734096/x", + Destination::WireServer, + ), + ( + "C7.wireserver_hex_packed", + "http://0xa83f8110/x", + Destination::WireServer, + ), + ( + "C7.wireserver_ipv4_mapped_dotted", + "http://[::ffff:168.63.129.16]/x", + Destination::WireServer, + ), + ( + "C7.wireserver_ipv4_mapped_hex", + "http://[::ffff:a83f:8110]/x", + Destination::WireServer, + ), + // HostGAPlugin ↔ numeric forms (still on :32526). + ( + "C7.hostga_decimal_with_port", + "http://2822734096:32526/x", + Destination::HostGaPlugin, + ), + ( + "C7.hostga_hex_packed_with_port", + "http://0xa83f8110:32526/x", + Destination::HostGaPlugin, + ), + ]; + for (label, url, expected) in cases { + assert_eq!(dest_via_pipeline(url), *expected, "vector={label}"); + } + } + + #[test] + fn c7_hostname_shape_variations_classify_as_unknown() { + // Every hostname shape stays Unknown — the matcher never + // trusts DNS. host_text is preserved verbatim (modulo any + // lowercasing hyper does at parse time) for audit. + let cases: &[(&str, &str)] = &[ + // Plain hostname. + ("C7.hostname_plain", "http://metadata.azure.internal/x"), + // Uppercased hostname. + ("C7.hostname_uppercase", "http://METADATA.AZURE.INTERNAL/x"), + // Trailing-dot hostname (FQDN). Hyper may or may not + // strip the dot, but classification is still Unknown. + ( + "C7.hostname_trailing_dot", + "http://metadata.azure.internal./x", + ), + // Arbitrary attacker-controlled hostname. + ("C7.hostname_attacker", "http://attacker.example.com/x"), + ]; + for (label, url) in cases { + match dest_via_pipeline(url) { + Destination::Unknown { host_text, .. } => assert!( + host_text.is_some(), + "vector={label}: host_text must be preserved for audit" + ), + d => panic!("vector={label}: expected Unknown, got {d:?}"), + } + } + } + + #[test] + fn c7_adversarial_host_shapes_rejected() { + use super::super::CanonError; + + // (label, url, expected_one_of) — the canonicalizer's + // top-level entrypoint may surface either of two errors + // depending on whether hyper or our gate rejects the input + // first. Both are equally a deny. + let cases: &[(&str, &str, &[CanonError])] = &[ + // Userinfo smuggle: `user@host`. UserinfoPresent is the + // preferred surfacing; BadHost is the fallback if hyper + // refuses to parse it as an absolute URI. + ( + "C7.userinfo_bare_user", + "http://user@169.254.169.254/x", + &[CanonError::UserinfoPresent, CanonError::BadHost], + ), + ( + "C7.userinfo_user_password", + "http://user:pass@169.254.169.254/x", + &[CanonError::UserinfoPresent, CanonError::BadHost], + ), + // Port-smuggle: `IP:port@evil` is the classic confusable + // (the `:80` looks like a port but the `@` makes the real + // host `evil`). Either UserinfoPresent or BadHost. + ( + "C7.port_smuggle_via_userinfo", + "http://169.254.169.254:80@evil.com/x", + &[CanonError::UserinfoPresent, CanonError::BadHost], + ), + // Out-of-range port (>65535). + ( + "C7.port_too_large", + "http://169.254.169.254:99999/x", + &[CanonError::BadPort, CanonError::BadHost], + ), + // Malformed dotted-quad (octet > 255). + ( + "C7.ipv4_octet_overflow", + "http://999.999.999.999/x", + &[CanonError::BadHost], + ), + ]; + for (label, url, expected_set) in cases { + match super::super::canonicalize_str(url) { + Ok(ok) => panic!("vector={label}: expected deny, got Ok({ok:?})"), + Err(err) => assert!( + expected_set.contains(&err), + "vector={label}: got {err:?}, expected one of {expected_set:?}" + ), + } + } + } +} diff --git a/proxy_agent/src/proxy/canonical/mod.rs b/proxy_agent/src/proxy/canonical/mod.rs new file mode 100644 index 00000000..384c0cad --- /dev/null +++ b/proxy_agent/src/proxy/canonical/mod.rs @@ -0,0 +1,637 @@ +// Copyright (c) Microsoft Corporation +// SPDX-License-Identifier: MIT + +//! Canonical request model (Innovation 2.1). +//! +//! Provides a single, total, idempotent normalization step that reduces +//! every incoming [`hyper::Uri`] (and, separately, every authorization +//! rule pattern) to the same [`CanonicalRequest`] form before they meet +//! the matcher. The goal is to eliminate the rule/request asymmetries +//! that produce SSRF-style AuthZ bypasses (pentest categories D1, C7). +//! +//! ## Pipeline +//! +//! ```text +//! hyper::Uri +//! │ +//! ▼ parse_scheme_method (http only; allow-list of methods) +//! ▼ classify_destination (IP/host -> Destination; covers numeric forms) +//! ▼ validate_userinfo (must be empty) +//! ▼ decode_path_once (single percent-decode; strict UTF-8) +//! ▼ reject_control_chars (no CR/LF/NUL/HTAB after decode) +//! ▼ ascii_lowercase_path (case-insensitive matching) +//! ▼ split_segments (split '/'; collapse '.'; resolve '..') +//! ▼ strip_matrix_params (drop `;k=v` suffix on each segment) +//! ▼ decode_query_once (k/v percent-decode once; lowercase keys) +//! ▼ reject_embedded_query (decoded path must not contain literal '?') +//! ▼ fold_into_btreemap (group by key) +//! CanonicalRequest +//! ``` +//! +//! ## Fail-closed +//! +//! Every error variant in [`CanonError`] denies the request. There is no +//! "best effort" branch. +//! +//! ## Idempotency +//! +//! `canonicalize(canonicalize(x).render()) == canonicalize(x)`. This is +//! enforced via property tests in `tests::proptests`. +//! +//! ## Fuzzing (M2) +//! +//! The M2 exit criterion is "zero panics in 1 CPU-day of fuzzing." We +//! currently meet it with the [`proptests`] module — `no_panics` runs +//! `canonicalize_str` against random printable-ASCII strings and +//! `idempotent` exercises the parse-canonicalize-render-reparse loop. +//! Crank case counts via the standard env var: +//! +//! ```text +//! PROPTEST_CASES=1000000 cargo test -p azure-proxy-agent --release \ +//! proxy::canonical::proptests +//! ``` +//! +//! A dedicated `cargo-fuzz` (libFuzzer) target would give better corpus +//! minimization and coverage feedback. It is **deferred** because +//! `proxy_agent` is a binary crate (`src/main.rs`, no `lib.rs`) and +//! `cargo-fuzz` requires importing a library. The non-invasive +//! follow-up: +//! +//! 1. Add a `lib.rs` re-exporting `pub mod proxy;` (and whatever else +//! the fuzz targets need). The existing binary should stay +//! `bin/azure-proxy-agent.rs` to avoid disturbing release artifact +//! paths. +//! 2. `cargo fuzz init` in the crate, then add targets +//! `fuzz_targets/canonicalize.rs` and `fuzz_targets/matches.rs` +//! calling `azure_proxy_agent::proxy::canonical::canonicalize_str` +//! and `CanonicalPattern::matches` respectively. +//! 3. Wire `cargo fuzz run canonicalize -- -max_total_time=86400` into +//! the nightly CI matrix. + +pub mod destination; +pub mod path; +pub mod query; +pub mod rule; + +// Innovation 2.1 M2 property tests live behind the `proptests` feature +// so the default `cargo test` inner loop stays fast. CI picks them up +// through `cargo test --all-features` (see build-linux.sh). +#[cfg(all(test, feature = "proptests"))] +mod property_tests; + +use std::collections::BTreeMap; +use std::fmt; + +use hyper::{Method, Uri}; +use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; + +pub use destination::{AddrFamily, Destination}; +pub use rule::CanonicalPattern; + +/// Bytes that must be percent-encoded inside a path segment so that +/// re-parsing the rendered output yields the same canonical form. +/// +/// - `%` would otherwise be interpreted as the start of an escape on +/// the second decode pass. +/// - `#` would be stripped by hyper as a fragment delimiter. +/// - `?` would be flagged by the canonicalizer as `EmbeddedQuery`. +/// - `;` would be re-stripped as a matrix-param sentinel. +/// - space and the C0 controls are invalid in a URI per RFC 3986. +/// +/// `/` is intentionally NOT in this set because [`path::split_and_resolve`] +/// already guarantees no literal `/` survives inside a segment. +const PATH_SEG_ENCODE: &AsciiSet = &CONTROLS.add(b' ').add(b'%').add(b'#').add(b'?').add(b';'); + +/// Bytes that must be percent-encoded inside a query key or value. +/// +/// In addition to the path-segment hazards, query strings treat `&` and +/// `=` as delimiters, and [`query::decode_query_component`] turns `+` +/// into a literal space — so a raw `+` in the rendered output would +/// round-trip to a space and lose data. All three must be encoded. +const QUERY_ENCODE: &AsciiSet = &CONTROLS + .add(b' ') + .add(b'%') + .add(b'#') + .add(b'&') + .add(b'=') + .add(b'+'); + +/// Fully-normalized form of an HTTP request as it is fed to the matcher. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct CanonicalRequest { + /// HTTP method, restricted to the allow-list. + pub method: Method, + + /// Classified destination. Matching uses the typed enum only; the raw + /// host text is never compared. + pub destination: Destination, + + /// Canonical path segments: percent-decoded once, ASCII-lowercased, + /// `.` collapsed, `..` resolved, matrix params stripped. + /// + /// Always begins with the empty root segment (so `/metadata/identity` + /// becomes `["", "metadata", "identity"]`). + pub path_segments: Vec, + + /// Whether the original path ended in `/`. Preserved as a single bit + /// so rules can opt to be slash-sensitive without re-introducing + /// string-level asymmetry. + pub trailing_slash: bool, + + /// Query parameters, canonical form: keys lowercased & decoded once, + /// values decoded once, grouped by key. Insertion order within a key + /// is preserved; key order is lexicographic. + pub query: BTreeMap>, +} + +impl CanonicalRequest { + /// Stable textual rendering. Re-parsing this string and canonicalizing + /// it must yield the same `CanonicalRequest` (idempotency invariant). + /// + /// Path segments and query components are percent-encoded on the way + /// out (using [`PATH_SEG_ENCODE`] / [`QUERY_ENCODE`]) so that bytes + /// which the pipeline decoded once — `%`, `&`, `=`, `+`, ` `, `#` and + /// friends — survive a parse-decode-canonicalize round trip without + /// being re-interpreted as delimiters. + pub fn render(&self) -> String { + let mut out = String::new(); + if self.path_segments.is_empty() { + out.push('/'); + } else { + for (i, seg) in self.path_segments.iter().enumerate() { + if i == 0 && seg.is_empty() { + // root marker + out.push('/'); + continue; + } + if i > 0 { + out.push('/'); + } + out.extend(utf8_percent_encode(seg, PATH_SEG_ENCODE)); + } + } + if self.trailing_slash && !out.ends_with('/') { + out.push('/'); + } + if !self.query.is_empty() { + out.push('?'); + let mut first = true; + for (k, values) in self.query.iter() { + for v in values { + if !first { + out.push('&'); + } + first = false; + out.extend(utf8_percent_encode(k, QUERY_ENCODE)); + out.push('='); + out.extend(utf8_percent_encode(v, QUERY_ENCODE)); + } + } + } + out + } +} + +impl fmt::Display for CanonicalRequest { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {} {}", self.method, self.destination, self.render()) + } +} + +/// Typed errors produced by the canonicalizer. All variants are +/// **fail-closed**: callers must deny the request when any of these is +/// returned. +#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] +pub enum CanonError { + #[error("scheme not http")] + SchemeNotHttp, + #[error("method not allowed")] + MethodNotAllowed, + #[error("userinfo present in URL")] + UserinfoPresent, + #[error("malformed percent-encoding")] + MalformedPercent, + #[error("overlong UTF-8 in path/query")] + OverlongUtf8, + #[error("invalid UTF-8 in path/query")] + InvalidUtf8, + /// Decoded **path** bytes were well-formed UTF-8 but contained at + /// least one non-ASCII codepoint. Separated from + /// [`CanonError::InvalidUtf8`] so audit logs distinguish *encoding + /// corruption* (random bytes, wrong codec) from *Unicode-confusable + /// attacks* (e.g. U+0131 dotless-i looks like ASCII `i`, fullwidth + /// solidus U+FF0F looks like ASCII `/`, Cyrillic homoglyphs) where the + /// attacker hand-crafts perfectly valid UTF-8 specifically to fool + /// ASCII-only string comparisons. The two classes have very different + /// triage paths, so they get different stable codes. Only the path + /// pipeline raises this — the query pipeline allows non-ASCII values + /// (ARM ids may contain Unicode). + #[error("non-ASCII codepoint in path")] + NonAscii, + #[error("control character in path/query")] + ControlChar, + #[error("path traversal past root")] + PathUnderflow, + #[error("embedded '?' after decoding")] + EmbeddedQuery, + #[error("unparseable host")] + BadHost, + #[error("unparseable port")] + BadPort, +} + +impl CanonError { + /// Stable short code suitable for audit logs and pentest assertions. + pub fn code(&self) -> &'static str { + match self { + CanonError::SchemeNotHttp => "CANON_SCHEME", + CanonError::MethodNotAllowed => "CANON_METHOD", + CanonError::UserinfoPresent => "CANON_USERINFO", + CanonError::MalformedPercent => "CANON_PCT", + CanonError::OverlongUtf8 => "CANON_OVERLONG", + CanonError::InvalidUtf8 => "CANON_UTF8", + CanonError::NonAscii => "CANON_NON_ASCII", + CanonError::ControlChar => "CANON_CTRL", + CanonError::PathUnderflow => "CANON_UNDERFLOW", + CanonError::EmbeddedQuery => "CANON_EMBQ", + CanonError::BadHost => "CANON_HOST", + CanonError::BadPort => "CANON_PORT", + } + } +} + +/// HTTP methods accepted by the canonicalizer. Anything not on this list +/// is rejected with [`CanonError::MethodNotAllowed`]. +/// +/// Kept in sync with `proxy_server::ProxyServer::ALLOWED_METHODS`. +const ALLOWED_METHODS: &[Method] = &[ + Method::GET, + Method::POST, + Method::PUT, + Method::DELETE, + Method::HEAD, + Method::OPTIONS, + Method::PATCH, +]; + +fn check_method(method: &Method) -> Result<(), CanonError> { + if ALLOWED_METHODS.iter().any(|m| m == method) { + Ok(()) + } else { + Err(CanonError::MethodNotAllowed) + } +} + +fn check_scheme(uri: &Uri) -> Result<(), CanonError> { + match uri.scheme_str() { + // Hyper guarantees the connect-target form for absolute URIs has a + // scheme; the proxy receives origin-form requests where the scheme + // is omitted. Both cases are acceptable. A non-http scheme (https, + // ws, gopher, ...) is a hard reject. + None => Ok(()), + Some(s) if s.eq_ignore_ascii_case("http") => Ok(()), + Some(_) => Err(CanonError::SchemeNotHttp), + } +} + +/// Reject any URL whose authority carries `userinfo` (the `user[:pass]@` +/// prefix before the host). +/// +/// Hyper exposes the full authority string via [`Uri::authority`]; the +/// presence of a literal `@` is the unambiguous signal of userinfo. We +/// refuse it entirely because it is the canonical host-smuggle vector +/// (pentest C7): `http://169.254.169.254:80@evil.com/` *looks* like +/// IMDS to a careless parser but resolves to `evil.com` in real HTTP +/// clients. Symmetrically, `http://attacker@169.254.169.254/` lets the +/// attacker decorate an otherwise-legitimate URL with audit-confusing +/// junk. +fn check_userinfo(uri: &Uri) -> Result<(), CanonError> { + if let Some(authority) = uri.authority() { + if authority.as_str().contains('@') { + return Err(CanonError::UserinfoPresent); + } + } + Ok(()) +} + +/// Canonicalize a parsed request. +/// +/// Returns `Ok(CanonicalRequest)` for inputs that survive every stage of +/// the pipeline, or a typed [`CanonError`] otherwise. The function is +/// **total** — every well-formed `hyper::Uri` produces exactly one of +/// these two outcomes; it never panics. +pub fn canonicalize(uri: &Uri, method: &Method) -> Result { + check_scheme(uri)?; + check_method(method)?; + check_userinfo(uri)?; + + let destination = destination::classify(uri)?; + + let (path_segments, trailing_slash) = path::canonicalize_path(uri.path())?; + let query = query::canonicalize_query(uri.query().unwrap_or(""))?; + + Ok(CanonicalRequest { + method: method.clone(), + destination, + path_segments, + trailing_slash, + query, + }) +} + +/// Convenience: parse and canonicalize a string. Useful for tests and the +/// shadow-mode shim that takes raw URLs from telemetry replay. +#[allow(dead_code)] +pub fn canonicalize_str(url: &str) -> Result { + let uri: Uri = url.parse().map_err(|_| CanonError::BadHost)?; + canonicalize(&uri, &Method::GET) +} + +/// Rollout flag controlling whether the canonical pipeline shadows or +/// replaces the legacy authorizer. +/// +/// See `doc/plans/Innovation-2.1-canonical-request.md` §9 (Shadow-Mode +/// Rollout). Defaults to [`CanonicalMode::Off`] so production traffic +/// keeps the pre-canonical behavior bit-for-bit. +/// +/// - [`Off`](CanonicalMode::Off): legacy decides; canonical not invoked. +/// - [`Shadow`](CanonicalMode::Shadow): legacy decides; canonical runs +/// in parallel and divergences are logged as telemetry. **Behavior +/// unchanged** — this is the M3 default for dev/test. +/// - [`Enforce`](CanonicalMode::Enforce): canonical decides; legacy is +/// still computed for divergence telemetry. Wired but only intended +/// for use once shadow-mode reports zero divergences (M5/M6). +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub enum CanonicalMode { + #[default] + Off, + Shadow, + Enforce, +} + +impl std::fmt::Display for CanonicalMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CanonicalMode::Off => write!(f, "off"), + CanonicalMode::Shadow => write!(f, "shadow"), + CanonicalMode::Enforce => write!(f, "enforce"), + } + } +} + +impl std::str::FromStr for CanonicalMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.trim().to_ascii_lowercase().as_str() { + "off" | "" => Ok(CanonicalMode::Off), + "shadow" => Ok(CanonicalMode::Shadow), + "enforce" => Ok(CanonicalMode::Enforce), + other => Err(format!("Invalid CanonicalMode: {other}")), + } + } +} + +#[cfg(test)] +mod mod_tests { + //! Cross-cutting tests for the canonical pipeline. + //! + //! Helper-specific golden vectors live next to their helpers + //! (`path::path_tests::appendix_a1_*`, + //! `destination::destination_tests::appendix_a2_*`). What lives + //! here is what cuts across every helper: + //! + //! - Scheme / method / userinfo gating (the top-level checks in + //! [`canonicalize`]). + //! - Idempotency: `canonicalize(canonicalize(x).render()) == + //! canonicalize(x)` — verifies the renderer round-trips every + //! helper's output. + //! - Total / no-panic on adversarial inputs. + //! - Stability of [`CanonError::code`] strings (audit-log contract). + + use hyper::{Method, Uri}; + + use super::*; + + #[test] + fn userinfo_rejected_at_pipeline_entry() { + // hyper may either parse-and-reject or refuse outright. Either + // is a deny, but UserinfoPresent is the preferred surfacing. + let err = canonicalize_str("http://user@169.254.169.254/x").unwrap_err(); + assert!( + matches!(err, CanonError::UserinfoPresent | CanonError::BadHost), + "userinfo: got {err:?}" + ); + } + + #[test] + fn allowed_methods_are_accepted_others_rejected() { + let uri: Uri = "http://169.254.169.254/x".parse().unwrap(); + + // Positive: every method in ALLOWED_METHODS canonicalizes + // successfully. Locks the slice's contents against accidental + // shrinking — a removal would surface here as a method that + // used to work and now doesn't. + let allowed = &[ + Method::GET, + Method::POST, + Method::PUT, + Method::DELETE, + Method::HEAD, + Method::OPTIONS, + Method::PATCH, + ]; + for m in allowed { + assert!( + canonicalize(&uri, m).is_ok(), + "method {m:?} should be accepted" + ); + } + + // Negative: methods explicitly NOT on the list deny with + // MethodNotAllowed. + let denied = &[Method::CONNECT, Method::TRACE]; + for m in denied { + assert_eq!( + canonicalize(&uri, m).unwrap_err(), + CanonError::MethodNotAllowed, + "method {m:?} should be rejected" + ); + } + } + + #[test] + fn non_http_schemes_rejected() { + // Anything not bare `http://` is a deny. Hyper may refuse to + // parse some of these as absolute URIs; either path is a deny, + // but the explicit canonicalize() rejection is what we want to + // pin for the schemes hyper does accept. + let cases: &[(&str, &str)] = &[ + ("https", "https://169.254.169.254/x"), + ("ftp", "ftp://169.254.169.254/x"), + ("file", "file://169.254.169.254/x"), + ]; + for (label, url) in cases { + match url.parse::() { + Ok(uri) => assert_eq!( + canonicalize(&uri, &Method::GET).unwrap_err(), + CanonError::SchemeNotHttp, + "scheme={label}" + ), + Err(_) => { + // hyper refused to parse — equally a deny, no-op. + } + } + } + } + + #[test] + fn canonical_form_is_idempotent() { + // canonicalize(canonicalize(x).render()) == canonicalize(x). + // + // Per-vector idempotency catches a class of bugs where the + // renderer and the parser disagree on something subtle + // (encoding of `+`, matrix-param re-emergence, etc). + let cases: &[(&str, &str)] = &[ + ( + "typical imds token", + "http://169.254.169.254/Metadata/Identity/oauth2/token?api-version=2018-02-01&Resource=https%3A%2F%2Fmanagement.azure.com%2F", + ), + ("root only", "http://169.254.169.254/"), + ( + "empty query", + "http://169.254.169.254/metadata/identity?", + ), + ( + "multi-key query with case fold", + "http://169.254.169.254/m?Foo=1&BAR=2&baz=3", + ), + ( + // Value contains decoded space (`%20`) and decoded `&` + // (`%26`). render() must re-encode both, otherwise the + // re-parse would either fail (space is invalid in a + // URI) or split the value on the spurious `&`. + "query value with reserved chars", + "http://169.254.169.254/m?k=a%20b%26c", + ), + ( + // Value contains decoded `+` (`%2B`). + // decode_query_component turns raw `+` into a space, so + // render must re-encode `+` as `%2B` to round-trip. + "query value with literal plus", + "http://169.254.169.254/m?k=a%2Bb", + ), + ( + // Path segment contains a literal `%` after one decode + // (`%252F` -> `%2f` literal). render() must emit + // `%252f` so the second decode lands on the same + // literal byte. + "path segment with literal percent", + "http://169.254.169.254/metadata%252Fidentity", + ), + ( + "dot segments collapse first time", + "http://169.254.169.254/a/./b/../c", + ), + ]; + for (label, url) in cases { + let c1 = canonicalize_str(url).expect(label); + // render() drops host; reattach so the second pass sees the + // same destination. + let rendered = format!("http://169.254.169.254{}", c1.render()); + let c2 = canonicalize_str(&rendered).expect(label); + assert_eq!(c1, c2, "not idempotent: {label}"); + } + } + + #[test] + fn canonicalize_never_panics_on_adversarial_inputs() { + // Sanity smoke test — a proptest target lives in a follow-up + // PR. For every shape hyper consents to parse, the + // canonicalizer must return either Ok or a typed CanonError. + // Panics here are bugs. + let paths = &[ + "/", + "//", + "/.", + "/..", + "/%00", + "/a/b/c?", + "/?", + "/a;", + "/a;b;c;", + "/%", + "/%%", + "/%%%", + "/%C0%AF", // overlong utf-8 + "/very/long/path/that/repeats/very/long/path/that/repeats", + "/a/../../..", // underflow + "/a/b/c/../../..", // exact-root underflow + ]; + let methods = &[Method::GET, Method::POST, Method::CONNECT]; + for raw in paths { + if let Ok(uri) = format!("http://169.254.169.254{raw}").parse::() { + for m in methods { + let _ = canonicalize(&uri, m); + } + } + } + } + + #[test] + fn error_codes_are_stable() { + // Stability of these strings is a CONTRACT with the audit log + // and pentest scripts. Changing any one of these is a breaking + // change that must bump the canonical-request schema version. + let cases: &[(CanonError, &str)] = &[ + (CanonError::SchemeNotHttp, "CANON_SCHEME"), + (CanonError::MethodNotAllowed, "CANON_METHOD"), + (CanonError::UserinfoPresent, "CANON_USERINFO"), + (CanonError::MalformedPercent, "CANON_PCT"), + (CanonError::OverlongUtf8, "CANON_OVERLONG"), + (CanonError::InvalidUtf8, "CANON_UTF8"), + (CanonError::NonAscii, "CANON_NON_ASCII"), + (CanonError::ControlChar, "CANON_CTRL"), + (CanonError::PathUnderflow, "CANON_UNDERFLOW"), + (CanonError::EmbeddedQuery, "CANON_EMBQ"), + (CanonError::BadHost, "CANON_HOST"), + (CanonError::BadPort, "CANON_PORT"), + ]; + for (err, expected_code) in cases { + assert_eq!(err.code(), *expected_code, "variant={err:?}"); + } + } + + #[test] + fn canonical_mode_parses_and_defaults_off() { + use std::str::FromStr; + + // Empty / missing config string defaults to Off so production + // traffic is unchanged when the config key is absent. + assert_eq!(CanonicalMode::default(), CanonicalMode::Off); + assert_eq!(CanonicalMode::from_str("").unwrap(), CanonicalMode::Off); + + // Accepted spellings — case- and whitespace-insensitive so + // operators can write "Shadow" or " enforce " without surprises. + let cases: &[(&str, CanonicalMode)] = &[ + ("off", CanonicalMode::Off), + ("Off", CanonicalMode::Off), + ("shadow", CanonicalMode::Shadow), + ("SHADOW", CanonicalMode::Shadow), + (" shadow ", CanonicalMode::Shadow), + ("enforce", CanonicalMode::Enforce), + ]; + for (input, expected) in cases { + assert_eq!( + CanonicalMode::from_str(input).unwrap(), + *expected, + "input={input:?}" + ); + } + + // Unknown strings reject — better to fail loud at config load + // than silently fall through to Off and lie in telemetry. + assert!(CanonicalMode::from_str("audit").is_err()); + assert!(CanonicalMode::from_str("on").is_err()); + } +} diff --git a/proxy_agent/src/proxy/canonical/path.rs b/proxy_agent/src/proxy/canonical/path.rs new file mode 100644 index 00000000..5b165e84 --- /dev/null +++ b/proxy_agent/src/proxy/canonical/path.rs @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation +// SPDX-License-Identifier: MIT + +//! Path canonicalization. +//! +//! Steps (each is a small pure function so they can be unit-tested +//! independently): +//! +//! 1. Single percent-decode of the raw path; reject malformed `%XX`. +//! 2. Reject overlong UTF-8 encodings of ASCII (`%C0%AF` etc.). +//! 3. Reject control characters (`\r`, `\n`, `\0`, `\t`). +//! 4. Reject non-ASCII bytes (paths to IMDS / WireServer are ASCII; +//! accepting non-ASCII would require NFC normalization that adds a +//! dependency we do not currently take). +//! 5. ASCII-lowercase. +//! 6. Split on `/`, drop empty segments, drop `.`, resolve `..` +//! (RFC 3986 §5.2.4). Underflow past the root is an error, not a +//! no-op — a real client would never produce it. +//! 7. Strip matrix params (`;jsessionid=...`) from each segment. +//! 8. Reject an embedded `?` in the decoded path (caused by `%3F` +//! smuggling) — the matcher must never see ambiguous input. +//! +//! The output is `(segments, trailing_slash)`. `segments` always begins +//! with the empty root segment, so the canonical form of `/` is +//! `vec![""]` and the canonical form of `/metadata/identity` is +//! `vec!["", "metadata", "identity"]`. + +use super::CanonError; + +const ROOT: &str = ""; + +/// Run the path pipeline. Public for unit tests; the canonicalizer +/// entrypoint is [`super::canonicalize`]. +pub fn canonicalize_path(raw: &str) -> Result<(Vec, bool), CanonError> { + // hyper guarantees the path starts with '/'. + let raw = if raw.is_empty() { "/" } else { raw }; + let trailing_slash = raw.len() > 1 && raw.ends_with('/'); + + let decoded = decode_path_once(raw)?; + reject_overlong_utf8(decoded.as_bytes())?; + reject_control_chars(&decoded)?; + reject_non_ascii(&decoded)?; + if decoded.contains('?') { + return Err(CanonError::EmbeddedQuery); + } + + let lowered = decoded.to_ascii_lowercase(); + let segments = split_and_resolve(&lowered)?; + + // A trailing `/` only carries meaning when there's a non-root + // segment in front of it. Without this clamp, inputs that collapse + // to root after dot/matrix resolution (e.g. `/;/`, `/./`, `//`) + // would carry `trailing_slash = true` even though `render()` + // produces just `/` — which re-parses with `trailing_slash = + // false`, breaking the canonicalize-render-canonicalize idempotency + // invariant the rest of the pipeline depends on. + let trailing_slash = trailing_slash && segments.len() > 1; + + Ok((segments, trailing_slash)) +} + +/// Single-pass percent-decode. Rejects truncated (`%2`) and non-hex +/// (`%ZZ`) sequences as `MalformedPercent`. Never decodes twice — that +/// is exactly the asymmetry the canonical model is built to remove. +fn decode_path_once(raw: &str) -> Result { + let bytes = raw.as_bytes(); + let mut out: Vec = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + let b = bytes[i]; + if b == b'%' { + if i + 2 >= bytes.len() { + return Err(CanonError::MalformedPercent); + } + let h = hex_value(bytes[i + 1])?; + let l = hex_value(bytes[i + 2])?; + out.push((h << 4) | l); + i += 3; + } else { + out.push(b); + i += 1; + } + } + // Strict UTF-8: lossy decoding is what allowed the silent-replacement + // bypass in the legacy matcher. + String::from_utf8(out).map_err(|_| CanonError::InvalidUtf8) +} + +fn hex_value(b: u8) -> Result { + match b { + b'0'..=b'9' => Ok(b - b'0'), + b'a'..=b'f' => Ok(10 + b - b'a'), + b'A'..=b'F' => Ok(10 + b - b'A'), + _ => Err(CanonError::MalformedPercent), + } +} + +/// Detect classic overlong UTF-8 encodings (e.g. `%C0%AF` for `/`). +/// +/// `String::from_utf8` already rejects overlong sequences as invalid, so +/// by the time we run this the bytes are *guaranteed* well-formed +/// UTF-8 — meaning the overlong forms below would already have produced +/// `InvalidUtf8`. We run this pass *before* UTF-8 validation in case the +/// caller ever switches to a lossy decoder; today it is a defense in +/// depth that also gives us a more specific telemetry code. +fn reject_overlong_utf8(bytes: &[u8]) -> Result<(), CanonError> { + let mut i = 0; + while i < bytes.len() { + let b = bytes[i]; + // 2-byte overlong: lead byte 0xC0 or 0xC1 (would encode <0x80). + if b == 0xC0 || b == 0xC1 { + return Err(CanonError::OverlongUtf8); + } + // 3-byte overlong: 0xE0 0x80..0x9F (would encode <0x800). + if b == 0xE0 && i + 1 < bytes.len() && (0x80..=0x9F).contains(&bytes[i + 1]) { + return Err(CanonError::OverlongUtf8); + } + // 4-byte overlong: 0xF0 0x80..0x8F (would encode <0x10000). + if b == 0xF0 && i + 1 < bytes.len() && (0x80..=0x8F).contains(&bytes[i + 1]) { + return Err(CanonError::OverlongUtf8); + } + i += 1; + } + Ok(()) +} + +fn reject_control_chars(s: &str) -> Result<(), CanonError> { + for b in s.bytes() { + // CR, LF, NUL, HTAB, plus the rest of the C0 control block and + // DEL. Anything below 0x20 or equal to 0x7F is rejected. + if b < 0x20 || b == 0x7F { + return Err(CanonError::ControlChar); + } + } + Ok(()) +} + +fn reject_non_ascii(s: &str) -> Result<(), CanonError> { + // Well-formed-UTF-8 non-ASCII gets its own dedicated error class so + // it shows up in audit logs as `canon=error:CANON_NON_ASCII`, + // distinguishable from genuine encoding corruption (`CANON_UTF8`, + // `CANON_OVERLONG`). The Unicode-confusable attack class + // (U+0131 dotless-i, fullwidth solidus, Cyrillic homoglyphs) + // surfaces exclusively here, making the family greppable in audit + // logs without false positives from random-byte fuzz. + if s.is_ascii() { + Ok(()) + } else { + Err(CanonError::NonAscii) + } +} + +/// Split on `/`, strip matrix params, drop empty/`.` segments, resolve +/// `..` with underflow detection. +fn split_and_resolve(path: &str) -> Result, CanonError> { + let mut segments: Vec = vec![ROOT.to_string()]; + // RFC 3986 defines a path as a sequence of segments separated by '/'. + for raw_seg in path.split('/') { + // ';' is a sub-delim — it's a perfectly legal character inside a path segment. + // Strip matrix params *before* dot-resolution so a segment like + // `.;jsessionid=1` (which decodes to the current-directory + // marker `.` after stripping) is dropped rather than preserved. + // The reverse order let `/.;` canonicalize to `["", "."]` while + // its rendered form `//.` re-parsed to `[""]`, breaking the + // idempotency invariant exercised by the M2 proptest. + // Matrix params are never used in authorization decisions: + // e.g. `segment;k=v;k2=v2` -> `segment`. + let cleaned = match raw_seg.find(';') { + Some(pos) => &raw_seg[..pos], + None => raw_seg, + }; + // A segment that's empty (from `//`), a current-directory marker + // (`.`), or pure matrix params (`;k=v`, which strips to "") + // collapses away — same treatment, same reason: keeping any of + // them in the canonical form would re-introduce the kind of + // request/rule asymmetry the canonical model exists to remove. + if cleaned.is_empty() || cleaned == "." { + continue; + } + if cleaned == ".." { + // Pop the previous segment. Popping the root is an error. + if segments.len() <= 1 { + return Err(CanonError::PathUnderflow); + } + segments.pop(); + continue; + } + segments.push(cleaned.to_string()); + } + Ok(segments) +} + +#[cfg(test)] +mod path_tests { + use super::*; + + fn segs(parts: &[&str]) -> Vec { + parts.iter().map(|s| (*s).to_string()).collect() + } + + // ----------------------------------------------------------------- + // canonicalize_path — end-to-end happy paths + // ----------------------------------------------------------------- + + #[test] + fn canonicalize_path_accepts_valid_inputs() { + // (input, expected_segments, expected_trailing_slash). Covers: + // root, empty-string fallback, simple path, ASCII case folding, + // double-slash collapse, `.` removal, `..` resolution + // (including chained), single percent-decode, double-encoding + // leaving a literal `%` after one decode, mixed-case percent + // (`%2f` and `%2F`), matrix-param stripping on one and many + // segments, and a segment that becomes empty after matrix + // stripping. + let cases: &[(&str, &[&str], bool)] = &[ + ("/", &[""], false), + ("", &[""], false), + ("/metadata/identity", &["", "metadata", "identity"], false), + ("/Metadata/Identity", &["", "metadata", "identity"], false), + ("/metadata//identity", &["", "metadata", "identity"], false), + ("/metadata/./identity", &["", "metadata", "identity"], false), + ( + "/metadata/x/../identity", + &["", "metadata", "identity"], + false, + ), + ("/a/b/c/../../identity", &["", "a", "identity"], false), + ("/metadata%2Fidentity", &["", "metadata", "identity"], false), + ("/metadata%2fidentity", &["", "metadata", "identity"], false), + ( + "/metadata%252Fidentity", + &["", "metadata%2fidentity"], + false, + ), + ( + "/metadata/identity;jsessionid=abc", + &["", "metadata", "identity"], + false, + ), + ( + "/metadata;a=1;b=2/identity;c=3", + &["", "metadata", "identity"], + false, + ), + ("/a/;jsessionid=x/b", &["", "a", "b"], false), + ("/metadata/", &["", "metadata"], true), + ]; + for (input, expected_segs, expected_ts) in cases { + let got = canonicalize_path(input) + .unwrap_or_else(|e| panic!("expected Ok for {input:?}, got {e:?}")); + assert_eq!(got, (segs(expected_segs), *expected_ts), "input={input:?}"); + } + } + + // ----------------------------------------------------------------- + // canonicalize_path — end-to-end errors + // ----------------------------------------------------------------- + + #[test] + fn canonicalize_path_rejects_invalid_inputs() { + // Each row is (input, expected_error). Exercises every typed + // error the path pipeline can produce except OverlongUtf8 / + // InvalidUtf8 for overlong sequences, which have their own test + // because either variant is acceptable. + let cases: &[(&str, CanonError)] = &[ + // Malformed percent: dangling `%`, truncated, non-hex digit + ("/abc%", CanonError::MalformedPercent), + ("/abc%2", CanonError::MalformedPercent), + ("/abc%ZZ", CanonError::MalformedPercent), + ("/abc%2G", CanonError::MalformedPercent), + // Control characters: NUL, HTAB, LF, CR, DEL + ("/x%00", CanonError::ControlChar), + ("/x%09", CanonError::ControlChar), + ("/x%0A", CanonError::ControlChar), + ("/x%0D", CanonError::ControlChar), + ("/x%7F", CanonError::ControlChar), + // Non-ASCII after decode: U+4E2D `中` in UTF-8 — well-formed, + // so it surfaces as NonAscii (Unicode-confusable / homoglyph + // attack class), not InvalidUtf8 (encoding corruption). + ("/x%E4%B8%AD", CanonError::NonAscii), + // Embedded `?` from %3F smuggling + ( + "/metadata/identity%3Fapi-version=2018", + CanonError::EmbeddedQuery, + ), + // Path traversal past root: chained, immediate, and a lone `..` + ("/a/../..", CanonError::PathUnderflow), + ("/..", CanonError::PathUnderflow), + ("/a/b/../../../c", CanonError::PathUnderflow), + ]; + for (input, expected) in cases { + assert_eq!( + canonicalize_path(input).unwrap_err(), + *expected, + "input={input:?}" + ); + } + } + + #[test] + fn canonicalize_path_rejects_overlong_utf8() { + // %C0%AF is the classic 2-byte overlong for `/` (IDS bypass). + // %E0%80%AF is the 3-byte overlong for `/`. + // %F0%80%80%AF is the 4-byte overlong for `/`. + // + // Each MUST be rejected. The exact variant depends on whether + // reject_overlong_utf8 catches it first or String::from_utf8 does + // (both happen to fire on these inputs); either way the request + // is denied, so we accept either error code. + for input in ["/x%C0%AFy", "/x%E0%80%AFy", "/x%F0%80%80%AFy"] { + let err = canonicalize_path(input).unwrap_err(); + assert!( + matches!(err, CanonError::OverlongUtf8 | CanonError::InvalidUtf8), + "input={input:?} got={err:?}" + ); + } + } + + // ----------------------------------------------------------------- + // decode_path_once + // ----------------------------------------------------------------- + + #[test] + fn decode_path_once_handles_hex_correctly() { + // Happy cases: identity (no `%`), upper, lower, and mixed hex. + // The decoder must accept all of them — case asymmetry was one + // of the legacy bypass vectors. + let ok: &[(&str, &str)] = &[ + ("/plain/path", "/plain/path"), + ("/a%2Fb", "/a/b"), + ("/a%2fb", "/a/b"), + ("/a%2fb%2F%2f", "/a/b//"), + ("/space%20here", "/space here"), + ]; + for (input, expected) in ok { + assert_eq!( + decode_path_once(input).unwrap(), + *expected, + "input={input:?}" + ); + } + + // Malformed: `%` at very end, `%X` (only one nibble), invalid + // hex digits, surrounded by garbage. + let bad: &[&str] = &["%", "abc%", "%2", "abc%2", "%ZZ", "abc%2G", "%9Q"]; + for input in bad { + assert_eq!( + decode_path_once(input).unwrap_err(), + CanonError::MalformedPercent, + "input={input:?}" + ); + } + } + + // ----------------------------------------------------------------- + // hex_value + // ----------------------------------------------------------------- + + #[test] + fn hex_value_accepts_digits_and_rejects_others() { + // Every valid hex digit, both letter cases. + let valid: &[(u8, u8)] = &[ + (b'0', 0), + (b'1', 1), + (b'2', 2), + (b'3', 3), + (b'4', 4), + (b'5', 5), + (b'6', 6), + (b'7', 7), + (b'8', 8), + (b'9', 9), + (b'a', 10), + (b'b', 11), + (b'c', 12), + (b'd', 13), + (b'e', 14), + (b'f', 15), + (b'A', 10), + (b'B', 11), + (b'C', 12), + (b'D', 13), + (b'E', 14), + (b'F', 15), + ]; + for (byte, expected) in valid { + assert_eq!( + hex_value(*byte).unwrap(), + *expected, + "byte={}", + *byte as char + ); + } + + // A handful of representative non-hex bytes: G/g (just past f), + // punctuation, whitespace, high bytes. + for byte in [b'g', b'G', b'/', b' ', b'\0', 0xFFu8] { + assert_eq!( + hex_value(byte).unwrap_err(), + CanonError::MalformedPercent, + "byte=0x{byte:02X}" + ); + } + } + + // ----------------------------------------------------------------- + // reject_overlong_utf8 — direct byte-level tests + // ----------------------------------------------------------------- + + #[test] + fn reject_overlong_utf8_catches_all_overlong_forms() { + // Each "bad" buffer carries an overlong sequence of arity 2, 3, + // or 4. Each "ok" buffer is a well-formed UTF-8 sequence of the + // same arity that must NOT be flagged. This pins both the + // detection AND the absence of false positives. + let bad: &[&[u8]] = &[ + // 2-byte overlong: leading 0xC0 / 0xC1 always overlong. + &[0xC0, 0xAF], + &[b'/', 0xC1, 0xAF, b'/'], + // 3-byte overlong: 0xE0 followed by 0x80..=0x9F. + &[0xE0, 0x80, 0xAF], + &[0xE0, 0x9F, 0xBF], + // 4-byte overlong: 0xF0 followed by 0x80..=0x8F. + &[0xF0, 0x80, 0x80, 0xAF], + &[0xF0, 0x8F, 0xBF, 0xBF], + ]; + for buf in bad { + assert_eq!( + reject_overlong_utf8(buf).unwrap_err(), + CanonError::OverlongUtf8, + "buf={buf:?}" + ); + } + + let ok: &[&[u8]] = &[ + b"plain ascii", + // Well-formed 2-byte sequence (U+00A9 ©): 0xC2 0xA9 + &[0xC2, 0xA9], + // Well-formed 3-byte sequence (U+4E2D 中): 0xE4 0xB8 0xAD + &[0xE4, 0xB8, 0xAD], + // Well-formed 4-byte sequence (U+1F600): 0xF0 0x9F 0x98 0x80 + &[0xF0, 0x9F, 0x98, 0x80], + b"", + ]; + for buf in ok { + assert!(reject_overlong_utf8(buf).is_ok(), "buf={buf:?}"); + } + } + + // ----------------------------------------------------------------- + // reject_control_chars — direct + // ----------------------------------------------------------------- + + #[test] + fn reject_control_chars_blocks_c0_block_and_del() { + // Every byte in the C0 control block (0x00..=0x1F) plus DEL + // (0x7F) must be rejected. We check the full block, not just + // a few representative bytes, because each byte is a separate + // CRLF/NUL/HTAB injection vector. + for b in 0u8..=0x1F { + let s = std::str::from_utf8(&[b]).unwrap().to_string(); + assert_eq!( + reject_control_chars(&s).unwrap_err(), + CanonError::ControlChar, + "byte=0x{b:02X}" + ); + } + let del = String::from_utf8(vec![0x7F]).unwrap(); + assert_eq!( + reject_control_chars(&del).unwrap_err(), + CanonError::ControlChar + ); + + // Printable ASCII (0x20..=0x7E) and the empty string must pass. + assert!(reject_control_chars("").is_ok()); + assert!(reject_control_chars(" !#0AZaz~").is_ok()); + assert!(reject_control_chars("/metadata/identity?x=1&y=2").is_ok()); + } + + // ----------------------------------------------------------------- + // reject_non_ascii — direct + // ----------------------------------------------------------------- + + #[test] + fn reject_non_ascii_blocks_high_bytes() { + // Anything within the ASCII range (incl. the C0 block — that's + // a different helper's job) must pass. + for s in ["", "/", "/abc", "/metadata/identity?api-version=2018"] { + assert!(reject_non_ascii(s).is_ok(), "input={s:?}"); + } + // Any non-ASCII character (1, 2, 3, or 4 UTF-8 bytes wide) must + // be rejected with the NonAscii code — distinct from InvalidUtf8 + // (random bytes) and OverlongUtf8 (IDS-bypass overlongs) so audit + // logs can `grep CANON_NON_ASCII` for the homoglyph attack class + // without picking up generic encoding-failure noise. + for s in ["é", "中", "🙂", "/abc/中文/x"] { + assert_eq!( + reject_non_ascii(s).unwrap_err(), + CanonError::NonAscii, + "input={s:?}" + ); + } + } + + // ----------------------------------------------------------------- + // split_and_resolve — direct + // ----------------------------------------------------------------- + + #[test] + fn split_and_resolve_handles_dot_segments_and_matrix_params() { + // (input lowered path, expected segments). Inputs are passed as + // they would be after decode + lowercase, so this isolates the + // segment-level behavior. Covers: root, double-slash collapse, + // `.` removal, `..` chain, matrix params on one and many + // segments, a segment that becomes empty after matrix strip, + // and a trailing matrix-param-only segment. + let cases: &[(&str, &[&str])] = &[ + ("/", &[""]), + ("//", &[""]), + ("/a/b", &["", "a", "b"]), + ("/a//b", &["", "a", "b"]), + ("/a/./b", &["", "a", "b"]), + ("/a/b/../c", &["", "a", "c"]), + ("/a/b/c/../../d", &["", "a", "d"]), + ("/a;k=v/b", &["", "a", "b"]), + ("/a;k=v;k2=v2/b;k3=v3", &["", "a", "b"]), + ("/a/;k=v/b", &["", "a", "b"]), + ("/a/b/;k=v", &["", "a", "b"]), + ]; + for (input, expected) in cases { + assert_eq!( + split_and_resolve(input).unwrap(), + segs(expected), + "input={input:?}" + ); + } + + // Underflow: every form must fail-closed with PathUnderflow. + for input in ["/..", "/a/../..", "/../a", "/a/b/../../../c"] { + assert_eq!( + split_and_resolve(input).unwrap_err(), + CanonError::PathUnderflow, + "input={input:?}" + ); + } + } + + // ----------------------------------------------------------------- + // Appendix A.1 — path golden vectors + // + // Spec-conformance tests for the path pipeline. The vector labels + // (`A1.xxx`) appear in every failure message so a regression points + // straight back to the row in `Innovation-2.1-canonical-request.md`. + // + // These call the full canonicalize_str() entrypoint rather than + // canonicalize_path() directly so the assertions match the form a + // caller would see — the assertion target is still the path output. + // ----------------------------------------------------------------- + + fn canon_path_via_pipeline(url: &str) -> String { + super::super::canonicalize_str(url) + .unwrap() + .path_segments + .join("/") + } + + #[test] + fn appendix_a1_path_vectors_canonicalize_successfully() { + // (vector_label, raw_url, expected_canonical_path) + let cases: &[(&str, &str, &str)] = &[ + ( + "A1.plain", + "http://169.254.169.254/metadata/identity", + "/metadata/identity", + ), + ( + "A1.mixed_case", + "http://169.254.169.254/Metadata/Identity", + "/metadata/identity", + ), + ( + "A1.double_slash", + "http://169.254.169.254/metadata//identity", + "/metadata/identity", + ), + ( + "A1.dot_segment", + "http://169.254.169.254/metadata/./identity", + "/metadata/identity", + ), + ( + "A1.dotdot_segment", + "http://169.254.169.254/metadata/x/../identity", + "/metadata/identity", + ), + ( + "A1.encoded_slash_decodes", + "http://169.254.169.254/metadata%2Fidentity", + "/metadata/identity", + ), + ( + // Decoding happens once: %252F -> %2F (literal, not a separator). + "A1.double_encoding_decoded_once", + "http://169.254.169.254/metadata%252Fidentity", + "/metadata%2fidentity", + ), + ( + "A1.matrix_param_stripped", + "http://169.254.169.254/metadata/identity;jsessionid=abc", + "/metadata/identity", + ), + ]; + for (label, url, expected) in cases { + assert_eq!(canon_path_via_pipeline(url), *expected, "vector={label}"); + } + } + + #[test] + fn appendix_a1_path_vectors_rejected() { + // (vector_label, raw_url, expected_error) + let exact: &[(&str, &str, CanonError)] = &[ + ( + "A1.path_underflow", + // /metadata/identity -> pop x2 -> root; the third .. underflows. + "http://169.254.169.254/metadata/identity/../../..", + CanonError::PathUnderflow, + ), + ( + "A1.embedded_query", + "http://169.254.169.254/metadata/identity%3Fapi-version=2018", + CanonError::EmbeddedQuery, + ), + ( + "A1.control_char", + "http://169.254.169.254/metadata/identity%0A", + CanonError::ControlChar, + ), + ]; + for (label, url, expected) in exact { + assert_eq!( + super::super::canonicalize_str(url).unwrap_err(), + *expected, + "vector={label}" + ); + } + + // Either-of class: overlong UTF-8 may surface as OverlongUtf8 or + // InvalidUtf8 depending on decoder state — both are equally a deny. + let either: &[(&str, &str)] = &[( + "A1.overlong_utf8", + "http://169.254.169.254/metadata/%C0%AFidentity", + )]; + for (label, url) in either { + let err = super::super::canonicalize_str(url).unwrap_err(); + assert!( + matches!(err, CanonError::OverlongUtf8 | CanonError::InvalidUtf8), + "vector={label} got={err:?}" + ); + } + } + + // ----------------------------------------------------------------- + // D1 extended path vectors (M2: golden vectors for the pentest D1 + // "URL parsing differentials" family — pentest/linux/DESIGN.md row D1 + // and design doc §3.1). + // + // The Appendix A.1 table above is the *representative* set the + // design promises by name. These are the additional concrete forms + // that must produce the same canonical output (or the same typed + // deny) so that the matcher's behavior cannot diverge from the + // upstream server's interpretation. + // ----------------------------------------------------------------- + + #[test] + fn d1_extended_path_vectors_canonicalize_successfully() { + let cases: &[(&str, &str, &str)] = &[ + // Mixed-case percent escapes for `/` decode the same. + ( + "D1.lowercase_encoded_slash", + "http://169.254.169.254/metadata%2fidentity", + "/metadata/identity", + ), + // Double-encoded `..` (`%252e%252e`) decodes ONCE to the literal + // bytes `%2e%2e` — it must NOT collapse like `..` would. + ( + "D1.double_encoded_dotdot", + "http://169.254.169.254/metadata/%252e%252e/identity", + "/metadata/%2e%2e/identity", + ), + // Multiple matrix params on a single segment all strip. + ( + "D1.multiple_matrix_params", + "http://169.254.169.254/metadata;a=1;b=2;c=3/identity", + "/metadata/identity", + ), + // Matrix params across multiple segments. + ( + "D1.matrix_params_each_segment", + "http://169.254.169.254/metadata;k=v/identity;k2=v2", + "/metadata/identity", + ), + // A segment that is ONLY matrix params (`;k=v`) collapses to + // nothing — never to an empty segment that would shift indices. + ( + "D1.matrix_only_segment_drops", + "http://169.254.169.254/a/;k=v/b", + "/a/b", + ), + // Encoded space survives as a literal space in the segment; + // render() must percent-encode it back. + ( + "D1.encoded_space_in_segment", + "http://169.254.169.254/foo%20bar", + "/foo bar", + ), + // Combined dot / dotdot / matrix in one path. + ( + "D1.combined_dot_dotdot_matrix", + "http://169.254.169.254/a/b/./c/../d;p=q/e", + "/a/b/d/e", + ), + // Leading multi-slash collapses to single root. + ( + "D1.leading_multi_slash", + "http://169.254.169.254///metadata", + "/metadata", + ), + // Intermixed `./` segments collapse. + ( + "D1.intermixed_dot_segments", + "http://169.254.169.254/./a/./b/./", + "/a/b", + ), + // Case folding plus encoded slash plus matrix. + ( + "D1.combined_case_encoded_slash_matrix", + "http://169.254.169.254/Foo%2FBar;p=q", + "/foo/bar", + ), + // Encoded `;` (`%3B`) decodes to a literal `;` and then + // triggers the same matrix-param stripping as a raw `;` — + // this symmetry is REQUIRED, otherwise an attacker could + // smuggle past a `/foo;v=1` deny rule by writing `/foo%3Bv=1`. + ( + "D1.encoded_semicolon_strips_like_raw", + "http://169.254.169.254/a%3Bb", + "/a", + ), + ]; + for (label, url, expected) in cases { + assert_eq!(canon_path_via_pipeline(url), *expected, "vector={label}"); + } + } + + #[test] + fn d1_extended_path_vectors_rejected() { + // Exact-error vectors. + let exact: &[(&str, &str, CanonError)] = &[ + // Truncated percent at end of input (no following hex digits). + ( + "D1.truncated_percent_end", + "http://169.254.169.254/a%", + CanonError::MalformedPercent, + ), + ( + "D1.truncated_percent_one_hex", + "http://169.254.169.254/a%2", + CanonError::MalformedPercent, + ), + // Non-hex characters after `%`. + ( + "D1.non_hex_percent", + "http://169.254.169.254/a%ZZ", + CanonError::MalformedPercent, + ), + ( + "D1.partial_non_hex_percent", + "http://169.254.169.254/a%2G", + CanonError::MalformedPercent, + ), + // All four control-character flavours the matcher must deny. + ( + "D1.nul_byte", + "http://169.254.169.254/a%00b", + CanonError::ControlChar, + ), + ( + "D1.cr_byte", + "http://169.254.169.254/a%0Db", + CanonError::ControlChar, + ), + ( + "D1.tab_byte", + "http://169.254.169.254/a%09b", + CanonError::ControlChar, + ), + ( + "D1.del_byte", + "http://169.254.169.254/a%7Fb", + CanonError::ControlChar, + ), + // Decoded non-ASCII (valid UTF-8 but outside the matcher's + // ASCII-only contract). Distinct from encoding-corruption + // (`InvalidUtf8`) and overlong-bypass (`OverlongUtf8`) + // because this is the Unicode-confusable attack family. + ( + "D1.decoded_non_ascii_lowercase_e_acute", + "http://169.254.169.254/caf%C3%A9", + CanonError::NonAscii, + ), + // Underflow variants the Appendix table doesn't enumerate. + ( + "D1.underflow_from_root", + "http://169.254.169.254/..", + CanonError::PathUnderflow, + ), + ( + "D1.underflow_via_dotdot_chain", + "http://169.254.169.254/a/b/../../..", + CanonError::PathUnderflow, + ), + // Embedded `?` smuggled via uppercase AND lowercase `%3F`. + ( + "D1.embedded_query_uppercase_hex", + "http://169.254.169.254/x%3Fy", + CanonError::EmbeddedQuery, + ), + ( + "D1.embedded_query_lowercase_hex", + "http://169.254.169.254/x%3fy", + CanonError::EmbeddedQuery, + ), + ]; + for (label, url, expected) in exact { + assert_eq!( + super::super::canonicalize_str(url).unwrap_err(), + *expected, + "vector={label}" + ); + } + + // Either-of class: 3-byte and 4-byte overlong UTF-8 sequences may + // surface as OverlongUtf8 or InvalidUtf8 depending on which check + // fires first. + let either: &[(&str, &str)] = &[ + ( + "D1.overlong_utf8_3byte_slash", + "http://169.254.169.254/x/%E0%80%AFy", + ), + ( + "D1.overlong_utf8_4byte_slash", + "http://169.254.169.254/x/%F0%80%80%AFy", + ), + ( + "D1.overlong_utf8_2byte_backslash", + "http://169.254.169.254/x/%C1%9Cy", + ), + ]; + for (label, url) in either { + let err = super::super::canonicalize_str(url).unwrap_err(); + assert!( + matches!(err, CanonError::OverlongUtf8 | CanonError::InvalidUtf8), + "vector={label} got={err:?}" + ); + } + } +} diff --git a/proxy_agent/src/proxy/canonical/property_tests.rs b/proxy_agent/src/proxy/canonical/property_tests.rs new file mode 100644 index 00000000..6671d04f --- /dev/null +++ b/proxy_agent/src/proxy/canonical/property_tests.rs @@ -0,0 +1,231 @@ +// Copyright (c) Microsoft Corporation +// SPDX-License-Identifier: MIT + +//! Property tests for the canonical pipeline (Innovation 2.1, M2). +//! +//! These are *invariants*, not example tables: every test draws inputs +//! from a [`proptest`] strategy and asserts a relationship that must hold +//! for **all** survivors of that strategy. They complement the hand-rolled +//! Appendix A / D1 / C7 golden vectors by exercising shapes nobody +//! enumerated. +//! +//! The three properties — taken verbatim from §10.2 of the design doc — +//! are: +//! +//! 1. **`idempotent`**: `canonicalize(canonicalize(x).render()) == canonicalize(x)`. +//! Forms the contract that lets us cache canonicalized rules and reuse +//! them across requests without re-deriving on every match. +//! +//! 2. **`no_panics`**: `canonicalize_str` returns `Result`, never panics, +//! for *any* string short of those that fail `Uri::parse`. This is the +//! M2 fuzz-target exit criterion expressed as a property so we get the +//! same coverage with one tool (no separate cargo-fuzz crate yet — +//! see the TODO in `mod.rs` for the binary-crate blocker). +//! +//! 3. **`host_form_equivalence`**: every numeric IPv4 spelling of the same +//! address (dotted, decimal-u32, hex-u32, IPv4-mapped IPv6) classifies +//! to the same [`Destination`]. This is the *positive* counterpart of +//! the C7 host-smuggling vectors. + +use super::{canonicalize, canonicalize_str, Destination}; +use hyper::{Method, Uri}; +use proptest::prelude::*; +use std::net::Ipv4Addr; + +// --------------------------------------------------------------------------- +// Strategies +// --------------------------------------------------------------------------- + +/// Characters that survive a `path_segment` round-trip without being +/// reinterpreted as a delimiter. We pick a curated subset so the strategy +/// stays focused on shapes the canonicalizer is supposed to normalize +/// (case, percent-encoding, dot resolution, matrix params) rather than +/// degenerating into a hyper-parser-rejection generator. +/// +/// Intentionally includes `%` and `;` so the strategy exercises the +/// percent-decode and matrix-strip stages. +const SEG_BYTES: &str = "abcXY01-._~/%;:@&=+$,"; + +fn arb_segment() -> impl Strategy { + // 0..16 keeps the search space tight enough to drive ~1000 cases + // through every code path without blowing the default proptest budget. + proptest::collection::vec(proptest::sample::select(SEG_BYTES.as_bytes()), 0..16) + .prop_map(|bs| String::from_utf8(bs).expect("ASCII subset is always valid UTF-8")) +} + +const QKV_BYTES: &str = "abcXY01-._~%+"; + +fn arb_qkv() -> impl Strategy { + proptest::collection::vec(proptest::sample::select(QKV_BYTES.as_bytes()), 0..8) + .prop_map(|bs| String::from_utf8(bs).expect("ASCII subset is always valid UTF-8")) +} + +/// Build a syntactically reasonable URL whose authority we control (so +/// `Destination` is stable across round-trips) but whose path/query are +/// drawn from broad strategies. +fn arb_url() -> impl Strategy { + let path_strat = proptest::collection::vec(arb_segment(), 0..4); + let query_strat = proptest::collection::vec((arb_qkv(), arb_qkv()), 0..4); + let trailing_slash = any::(); + + (path_strat, query_strat, trailing_slash).prop_map(|(segs, kvs, ts)| { + let mut url = String::from("http://169.254.169.254"); + if segs.is_empty() { + url.push('/'); + } else { + for s in &segs { + url.push('/'); + url.push_str(s); + } + } + if ts && !url.ends_with('/') { + url.push('/'); + } + if !kvs.is_empty() { + url.push('?'); + for (i, (k, v)) in kvs.iter().enumerate() { + if i > 0 { + url.push('&'); + } + url.push_str(k); + url.push('='); + url.push_str(v); + } + } + url + }) +} + +// --------------------------------------------------------------------------- +// Properties +// --------------------------------------------------------------------------- + +proptest! { + /// Re-canonicalizing a rendered canonical request yields the same + /// request. This is the *load-bearing* invariant: every cache, every + /// audit-log signature, and every "rule equals request" comparison + /// in the matcher assumes this holds. + /// + /// We only assert the property on inputs that canonicalize + /// successfully on the first pass; rejected inputs are out of scope + /// (covered by `no_panics`). + #[test] + fn idempotent(url in arb_url()) { + let c1 = match canonicalize_str(&url) { + Ok(c) => c, + Err(_) => return Ok(()), + }; + // render() omits the authority by design — re-attach the same + // host we built the input from so the destination doesn't + // change between rounds. + let url2 = format!("http://169.254.169.254{}", c1.render()); + let c2 = canonicalize_str(&url2) + .map_err(|e| TestCaseError::fail(format!( + "second pass failed: input={url:?} rendered={url2:?} err={e:?}" + )))?; + prop_assert_eq!( + &c1, &c2, + "non-idempotent: input={:?} rendered={:?}", url, url2 + ); + } + + /// `canonicalize_str` is *total* on any input that survives + /// `Uri::parse` — it returns a typed error rather than panicking. + /// Random printable-ASCII strings give the parser plenty to choke + /// on, and the canonicalizer plenty of weird-but-valid `Uri` shapes + /// to chew through. + #[test] + fn no_panics(s in r"[\x20-\x7E]{0,80}") { + // proptest catches panics across the FFI boundary automatically; + // the test "passes" iff this call returns (Ok or Err) without + // unwinding. + let _ = canonicalize_str(&s); + } + + /// Same target — different spelling. All four well-formed numeric + /// forms of an IPv4 address must classify to the same Destination. + /// This is the positive contract behind the C7 host-shape rejection + /// vectors: the canonicalizer is allowed to *deny* exotic forms, + /// but if it *accepts* them they must collapse onto the same + /// classification as the dotted form. + #[test] + fn host_form_equivalence(raw in any::()) { + let v4 = Ipv4Addr::from(raw); + // Don't run the property on the dual-stack any-address; hyper + // happens to reject the bracketed `[::ffff:0.0.0.0]` form, which + // would unbalance the comparison. The fix isn't shape-changing, + // so skipping is safe. + if raw == 0 { + return Ok(()); + } + + let dotted = format!("http://{}/x", v4); + let decimal = format!("http://{}/x", raw); + let hex = format!("http://0x{:x}/x", raw); + let mapped = format!("http://[::ffff:{}]/x", v4); + + let dotted_dest = classify_or_none(&dotted); + let decimal_dest = classify_or_none(&decimal); + let hex_dest = classify_or_none(&hex); + let mapped_dest = classify_or_none(&mapped); + + // Forms the canonicalizer accepts must all agree. Forms it + // rejects are out of scope (we don't require it to accept any + // particular alternative spelling — only that acceptance is + // self-consistent). + // + // `Destination::Unknown` carries `host_text` so audit logs can + // show the *originally requested* spelling; that field is + // intentionally form-dependent and we strip it for the + // equivalence comparison. Known destinations (IMDS, WireServer, + // HostGAPlugin) have no host_text and compare directly. + let accepted: Vec<(&str, Destination)> = [ + ("dotted", dotted_dest), + ("decimal", decimal_dest), + ("hex", hex_dest), + ("mapped", mapped_dest), + ] + .into_iter() + .filter_map(|(n, d)| d.map(|d| (n, normalize_for_equivalence(d)))) + .collect(); + + if let Some((_, first)) = accepted.first() { + let first = first.clone(); + for (name, d) in accepted.iter().skip(1) { + prop_assert_eq!( + d, &first, + "host-form mismatch for {}: {} -> {:?} vs dotted -> {:?}", + v4, name, d, first + ); + } + } + } +} + +/// Helper: parse + canonicalize, returning only the destination. Used by +/// `host_form_equivalence` so a parse failure on one spelling doesn't +/// short-circuit the comparison. +fn classify_or_none(url: &str) -> Option { + let uri: Uri = url.parse().ok()?; + canonicalize(&uri, &Method::GET).ok().map(|c| c.destination) +} + +/// Strip `host_text` from `Destination::Unknown` so two equivalent +/// numeric spellings (e.g. `0.0.0.1` and `1`) compare equal. Known +/// destinations are returned unchanged. +fn normalize_for_equivalence(d: Destination) -> Destination { + match d { + Destination::Unknown { + family, + ip, + port, + host_text: _, + } => Destination::Unknown { + family, + ip, + port, + host_text: None, + }, + other => other, + } +} diff --git a/proxy_agent/src/proxy/canonical/query.rs b/proxy_agent/src/proxy/canonical/query.rs new file mode 100644 index 00000000..1540ffd2 --- /dev/null +++ b/proxy_agent/src/proxy/canonical/query.rs @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation +// SPDX-License-Identifier: MIT + +//! Query string canonicalization. +//! +//! - Split on `&`, then on the first `=` (additional `=` characters +//! become part of the value). +//! - Single percent-decode of both key and value. +//! - Lowercase the key (case-insensitive matching). +//! - Reject control characters and malformed UTF-8 in both key and value. +//! Unlike the path pipeline, well-formed non-ASCII UTF-8 is **allowed** +//! here: query *values* legitimately carry it (e.g. an IMDS +//! `msi_res_id` / `resource` ARM id whose resource-group name contains +//! Unicode letters, which Azure naming rules permit). The path stays +//! ASCII-only because IMDS / WireServer / HGAP paths never contain +//! non-ASCII and a confusable path segment could slip past a deny rule. +//! - Fold into a `BTreeMap>`: deterministic key +//! ordering, insertion order preserved within a key. + +use std::collections::BTreeMap; + +use super::CanonError; + +/// Canonicalize a raw query string (the part after `?`, without it). +pub fn canonicalize_query(raw: &str) -> Result>, CanonError> { + let mut map: BTreeMap> = BTreeMap::new(); + if raw.is_empty() { + return Ok(map); + } + for pair in raw.split('&') { + if pair.is_empty() { + // `?a=1&&b=2` -> skip empty pairs. + continue; + } + let (k_raw, v_raw) = match pair.find('=') { + Some(pos) => (&pair[..pos], &pair[pos + 1..]), + None => (pair, ""), + }; + let k = decode_query_component(k_raw)?; + let v = decode_query_component(v_raw)?; + if k.is_empty() { + // `?=foo` is malformed — silently drop instead of injecting a + // ghost empty key into the map. + continue; + } + map.entry(k.to_ascii_lowercase()).or_default().push(v); + } + Ok(map) +} + +fn decode_query_component(raw: &str) -> Result { + // `+` in a query component is a legacy form for space (application/x-www-form-urlencoded). + // IMDS / WireServer don't use form-encoded queries, but normalize it so a rule author + // can't tell the difference between `?key=a+b` and `?key=a%20b`. + let bytes = raw.as_bytes(); + let mut out: Vec = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + let b = bytes[i]; + match b { + b'%' => { + if i + 2 >= bytes.len() { + return Err(CanonError::MalformedPercent); + } + let h = hex_value(bytes[i + 1])?; + let l = hex_value(bytes[i + 2])?; + out.push((h << 4) | l); + i += 3; + } + b'+' => { + out.push(b' '); + i += 1; + } + _ => { + out.push(b); + i += 1; + } + } + } + // `String::from_utf8` still rejects malformed byte sequences + // (`InvalidUtf8`), so smuggling via broken encodings is closed. We + // deliberately do NOT reject well-formed non-ASCII here the way the + // path pipeline does: query values legitimately carry Unicode (e.g. + // an ARM `msi_res_id` whose resource-group name has Unicode letters), + // and a confusable in a query value cannot slip past a path-prefix + // deny rule. Control characters stay forbidden in both halves. + let s = String::from_utf8(out).map_err(|_| CanonError::InvalidUtf8)?; + for b in s.bytes() { + if b < 0x20 || b == 0x7F { + return Err(CanonError::ControlChar); + } + } + Ok(s) +} + +fn hex_value(b: u8) -> Result { + match b { + b'0'..=b'9' => Ok(b - b'0'), + b'a'..=b'f' => Ok(10 + b - b'a'), + b'A'..=b'F' => Ok(10 + b - b'A'), + _ => Err(CanonError::MalformedPercent), + } +} + +#[cfg(test)] +mod query_tests { + use super::*; + + #[test] + fn empty() { + assert!(canonicalize_query("").unwrap().is_empty()); + } + + #[test] + fn single_pair() { + let q = canonicalize_query("api-version=2018-02-01").unwrap(); + assert_eq!(q.get("api-version"), Some(&vec!["2018-02-01".to_string()])); + } + + #[test] + fn key_lowercased() { + let q = canonicalize_query("API-Version=2018").unwrap(); + assert_eq!(q.get("api-version"), Some(&vec!["2018".to_string()])); + assert!(!q.contains_key("API-Version")); + } + + #[test] + fn percent_decoded_once() { + let q = canonicalize_query("resource=https%3A%2F%2Fmanagement.azure.com%2F").unwrap(); + assert_eq!( + q.get("resource"), + Some(&vec!["https://management.azure.com/".to_string()]) + ); + } + + #[test] + fn plus_to_space() { + let q = canonicalize_query("k=a+b").unwrap(); + assert_eq!(q.get("k"), Some(&vec!["a b".to_string()])); + } + + #[test] + fn repeated_key_preserves_order() { + let q = canonicalize_query("k=1&k=2&k=3").unwrap(); + assert_eq!( + q.get("k"), + Some(&vec!["1".to_string(), "2".to_string(), "3".to_string()]) + ); + } + + #[test] + fn no_value() { + let q = canonicalize_query("foo").unwrap(); + assert_eq!(q.get("foo"), Some(&vec!["".to_string()])); + } + + #[test] + fn malformed_percent_rejected() { + assert_eq!( + canonicalize_query("k=%2").unwrap_err(), + CanonError::MalformedPercent + ); + } + + #[test] + fn empty_key_dropped() { + let q = canonicalize_query("=value&k=v").unwrap(); + assert!(!q.contains_key("")); + assert_eq!(q.get("k"), Some(&vec!["v".to_string()])); + } + + #[test] + fn control_char_rejected() { + assert_eq!( + canonicalize_query("k=%0A").unwrap_err(), + CanonError::ControlChar + ); + } + + #[test] + fn non_ascii_allowed_in_query_value_and_key() { + // Unlike the path pipeline, well-formed non-ASCII UTF-8 is + // accepted in the query so legitimate ARM ids (e.g. an + // `msi_res_id` whose resource-group name has Unicode letters) are + // not rejected. The decoded codepoints survive verbatim. + // Value side: U+4E2D `中`. + let q = canonicalize_query("k=%E4%B8%AD").unwrap(); + assert_eq!(q.get("k"), Some(&vec!["\u{4e2d}".to_string()])); + // Key side: U+00E9 `é` (only A–Z is ASCII-lowercased, so the + // non-ASCII key is preserved as-is). + let q = canonicalize_query("caf%C3%A9=1").unwrap(); + assert_eq!(q.get("caf\u{e9}"), Some(&vec!["1".to_string()])); + // Both halves non-ASCII. + let q = canonicalize_query("%C3%A9=%C3%A9").unwrap(); + assert_eq!(q.get("\u{e9}"), Some(&vec!["\u{e9}".to_string()])); + } + + #[test] + fn malformed_utf8_still_reports_invalid_utf8() { + // Lone continuation byte (0x80 with no lead byte) is genuine + // encoding corruption, not a homoglyph attack. It must stay on + // `InvalidUtf8` / `CANON_UTF8` so the two audit-log classes + // remain separable. + assert_eq!( + canonicalize_query("k=%80").unwrap_err(), + CanonError::InvalidUtf8 + ); + } +} diff --git a/proxy_agent/src/proxy/canonical/rule.rs b/proxy_agent/src/proxy/canonical/rule.rs new file mode 100644 index 00000000..edbdd106 --- /dev/null +++ b/proxy_agent/src/proxy/canonical/rule.rs @@ -0,0 +1,493 @@ +// Copyright (c) Microsoft Corporation +// SPDX-License-Identifier: MIT + +//! Canonical form of a rule pattern. +//! +//! Rules go through the same pipeline as requests, with two +//! differences: +//! +//! 1. There is no scheme/method on a rule (the matcher inherits those +//! from the request). +//! 2. A rule's destination is `RuleDestination`, which adds an `Any` +//! variant for rules that intentionally span endpoints. +//! +//! Matching is then a pure structural comparison: +//! +//! - `Destination` must equal the rule's destination (or the rule is +//! `Any`). +//! - The rule's path is a **prefix** of the request's canonical path, +//! compared segment-by-segment (not character-by-character — this is +//! what prevents `starts_with("/metadata")` from matching +//! `/metadata-attacker`). +//! - For each query key constrained by the rule, the request must +//! have at least one matching value (case-insensitive after the +//! canonical pipeline already lowercased both sides). + +use std::collections::BTreeMap; + +use crate::key_keeper::key::Privilege; + +use super::destination::Destination; +use super::path::canonicalize_path; +use super::query::canonicalize_query; +use super::{CanonError, CanonicalRequest}; + +/// Destination constraint on a rule. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum RuleDestination { + /// Rule applies to a single classified destination. + Only(Destination), + /// Rule applies regardless of destination (used for the per-endpoint + /// rule files where the file itself already partitions the rules). + Any, +} + +/// Canonical form of an authorization rule. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CanonicalPattern { + pub destination: RuleDestination, + /// Segments to prefix-match against [`CanonicalRequest::path_segments`]. + /// Always starts with the root marker (empty string). + pub path_prefix: Vec, + /// Required query parameters. Empty map means "no query constraint". + /// All present keys must match at least one of the supplied values. + pub required_query: BTreeMap>, +} + +impl CanonicalPattern { + /// Build from a raw `Privilege` (the on-disk rule format). + /// + /// The privilege's path is run through the canonical path pipeline, + /// and its query parameters are run through the canonical query + /// pipeline. Rules that fail canonicalization are **rejected** by + /// the loader — fail-closed. + pub fn from_privilege(p: &Privilege) -> Result { + let (segments, _trailing) = canonicalize_path(&p.path)?; + let required_query = match &p.queryParameters { + None => BTreeMap::new(), + Some(qp) => { + let mut joined = String::new(); + for (k, v) in qp.iter() { + if !joined.is_empty() { + joined.push('&'); + } + joined.push_str(k); + joined.push('='); + joined.push_str(v); + } + canonicalize_query(&joined)? + } + }; + Ok(CanonicalPattern { + destination: RuleDestination::Any, + path_prefix: segments, + required_query, + }) + } + + /// Structural match against a canonical request. + pub fn matches(&self, req: &CanonicalRequest) -> bool { + // Destination + if let RuleDestination::Only(d) = &self.destination { + if d != &req.destination { + return false; + } + } + + // Path: segment-by-segment prefix match. + if req.path_segments.len() < self.path_prefix.len() { + return false; + } + for (i, seg) in self.path_prefix.iter().enumerate() { + if &req.path_segments[i] != seg { + return false; + } + } + + // Query: every required key must be present and at least one + // of its required values must appear among the request's values + // for that key. + for (k, required_values) in &self.required_query { + let actual = match req.query.get(k) { + Some(v) => v, + None => return false, + }; + let any_match = required_values + .iter() + .any(|rv| actual.iter().any(|av| av == rv)); + if !any_match { + return false; + } + } + true + } +} + +#[cfg(test)] +// The table-driven tests below intentionally carry deep nested-tuple +// literal types like `&[(&str, Option<&[(&str, &str)]>, &[&str], &[(&str, &[&str])])]`. +// That nesting IS the readability point — each column lines up with a +// (input, expected) axis the test is exercising — and factoring it out +// into a `type` alias hurts at-a-glance reading more than it helps. +#[allow(clippy::type_complexity)] +mod rule_tests { + use std::collections::HashMap; + + use hyper::{Method, Uri}; + + use super::*; + use crate::proxy::canonical::canonicalize; + + // ---------- helpers ---------- + + fn priv_of(path: &str, qp: Option<&[(&str, &str)]>) -> Privilege { + Privilege { + name: "test".to_string(), + path: path.to_string(), + queryParameters: qp.map(|pairs| { + pairs + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect::>() + }), + } + } + + fn req_of(uri: &str, method: &Method) -> CanonicalRequest { + let u: Uri = uri.parse().unwrap(); + canonicalize(&u, method).unwrap() + } + + fn segs(parts: &[&str]) -> Vec { + parts.iter().map(|s| s.to_string()).collect() + } + + fn pat( + destination: RuleDestination, + prefix: &[&str], + query: &[(&str, &[&str])], + ) -> CanonicalPattern { + let mut required = BTreeMap::new(); + for (k, vs) in query { + required.insert( + (*k).to_string(), + vs.iter().map(|v| (*v).to_string()).collect(), + ); + } + CanonicalPattern { + destination, + path_prefix: segs(prefix), + required_query: required, + } + } + + // ---------- from_privilege ---------- + + #[test] + fn from_privilege_canonicalizes_and_normalizes() { + // (raw_path, raw_query_pairs) -> (expected_path_prefix, expected_required_query as sorted pairs) + let cases: &[(&str, Option<&[(&str, &str)]>, &[&str], &[(&str, &[&str])])] = &[ + // case folding + leading-slash root marker preserved. + ("/Metadata", None, &["", "metadata"], &[]), + // dot-segments collapse on the rule side too. + ("/a/./b/../c", None, &["", "a", "c"], &[]), + // trailing slash on rule is dropped from segments (same as request side). + ( + "/metadata/identity/", + None, + &["", "metadata", "identity"], + &[], + ), + // None vs Some(empty) both yield an empty required_query. + ("/x", None, &["", "x"], &[]), + ("/x", Some(&[]), &["", "x"], &[]), + // single query param canonicalized (key case-folded). + ( + "/m", + Some(&[("Api-Version", "2018-02-01")]), + &["", "m"], + &[("api-version", &["2018-02-01"])], + ), + // value with encoded specials kept literal (not re-split on '='). + ( + "/m", + Some(&[("k", "a%3Db")]), + &["", "m"], + &[("k", &["a=b"])], + ), + // root-only rule. + ("/", None, &[""], &[]), + ]; + + for (path, qp, expect_segs, expect_q) in cases { + let p = CanonicalPattern::from_privilege(&priv_of(path, *qp)).unwrap(); + assert_eq!( + p.destination, + RuleDestination::Any, + "from_privilege must default to Any (input path={path:?})" + ); + assert_eq!(p.path_prefix, segs(expect_segs), "path={path:?}"); + let actual: Vec<(String, Vec)> = p.required_query.into_iter().collect(); + let expected: Vec<(String, Vec)> = expect_q + .iter() + .map(|(k, vs)| { + ( + (*k).to_string(), + vs.iter().map(|v| (*v).to_string()).collect(), + ) + }) + .collect(); + assert_eq!(actual, expected, "path={path:?}"); + } + } + + #[test] + fn from_privilege_rejects_invalid_inputs() { + // Garbage propagates from the canonical pipeline as a CanonError; + // the loader is expected to drop the rule rather than admit it. + let bad: &[(&str, Option<&[(&str, &str)]>)] = &[ + // Malformed percent in rule path. + ("/bad%ZZ", None), + // Control byte in rule path. + ("/bad\x01path", None), + // Non-ASCII in rule path. + ("/café", None), + // Malformed percent in rule query value. + ("/x", Some(&[("k", "%ZZ")])), + // Control byte in rule query key. + ("/x", Some(&[("k\x01", "v")])), + ]; + for (path, qp) in bad { + let r = CanonicalPattern::from_privilege(&priv_of(path, *qp)); + assert!( + r.is_err(), + "expected canonical rejection for path={path:?} qp={qp:?}" + ); + } + } + + // ---------- destination matching ---------- + + #[test] + fn matches_destination_constraint() { + // Any always matches; Only matches only its own classified destination. + let any = pat(RuleDestination::Any, &[""], &[]); + let only_imds = pat(RuleDestination::Only(Destination::Imds), &[""], &[]); + let only_ws = pat(RuleDestination::Only(Destination::WireServer), &[""], &[]); + + let imds_req = req_of("http://169.254.169.254/x", &Method::GET); + let ws_req = req_of("http://168.63.129.16/x", &Method::GET); + let unk_req = req_of("http://10.0.0.1/x", &Method::GET); + + let cases: &[(&CanonicalPattern, &CanonicalRequest, bool, &str)] = &[ + (&any, &imds_req, true, "Any+Imds"), + (&any, &ws_req, true, "Any+WireServer"), + (&any, &unk_req, true, "Any+Unknown"), + (&only_imds, &imds_req, true, "Only(Imds)+Imds"), + (&only_imds, &ws_req, false, "Only(Imds)+WireServer rejects"), + (&only_imds, &unk_req, false, "Only(Imds)+Unknown rejects"), + (&only_ws, &ws_req, true, "Only(WS)+WireServer"), + (&only_ws, &imds_req, false, "Only(WS)+Imds rejects"), + ]; + for (rule, req, expected, label) in cases { + assert_eq!(rule.matches(req), *expected, "{label}"); + } + } + + // ---------- path matching ---------- + + #[test] + fn matches_path_prefix_semantics() { + // (rule_path, request_uri, expected, label) + // Method is intentionally varied to confirm it is NOT part of rule matching. + let cases: &[(&str, &str, &Method, bool, &str)] = &[ + // Segment-boundary safety: /metadata must not match /metadata-attacker. + ( + "/metadata", + "http://169.254.169.254/metadata/identity", + &Method::GET, + true, + "rule shorter than request matches at boundary", + ), + ( + "/metadata", + "http://169.254.169.254/metadata-attacker/identity", + &Method::GET, + false, + "rule must not bleed across segment boundary", + ), + // Exact match. + ( + "/metadata/identity", + "http://169.254.169.254/metadata/identity", + &Method::POST, + true, + "exact path equality (method irrelevant)", + ), + // Request strictly shorter than rule prefix -> reject. + ( + "/metadata/identity/oauth2/token", + "http://169.254.169.254/metadata/identity", + &Method::GET, + false, + "request shorter than rule rejects", + ), + // Mid-segment differ (not just at the end). + ( + "/a/b/c", + "http://169.254.169.254/a/X/c", + &Method::GET, + false, + "differing mid segment rejects", + ), + // Case-insensitive (both rule and request lowercased by pipeline). + ( + "/Metadata", + "http://169.254.169.254/METADATA/Identity", + &Method::GET, + true, + "case-insensitive path match", + ), + // Percent-encoded slash decoded once -> rule that includes the slash matches. + ( + "/metadata/identity", + "http://169.254.169.254/metadata%2Fidentity/oauth2/token", + &Method::GET, + true, + "encoded slash in request decodes to rule path", + ), + // Root-only rule matches any request path. + ( + "/", + "http://169.254.169.254/anything/at/all", + &Method::GET, + true, + "root-only rule is universal on path", + ), + ( + "/", + "http://169.254.169.254/", + &Method::GET, + true, + "root-only rule on root request", + ), + ]; + for (rule_path, uri, method, expected, label) in cases { + let p = CanonicalPattern::from_privilege(&priv_of(rule_path, None)).unwrap(); + let r = req_of(uri, method); + assert_eq!(p.matches(&r), *expected, "{label}"); + } + } + + // ---------- query matching ---------- + + #[test] + fn matches_query_constraint_semantics() { + // Across keys: AND. Within a key: OR over the rule's allowed values. + // Build patterns directly so we can exercise multiple values per key + // (the on-disk Privilege format only supports a single value per key). + let multi_value = pat(RuleDestination::Any, &["", "m"], &[("v", &["a", "b"])]); + let multi_key = pat( + RuleDestination::Any, + &["", "m"], + &[("a", &["1"]), ("b", &["2"])], + ); + let no_query = pat(RuleDestination::Any, &["", "m"], &[]); + + let cases: &[(&CanonicalPattern, &str, bool, &str)] = &[ + // No required_query -> any query (including none) matches. + ( + &no_query, + "http://169.254.169.254/m", + true, + "no constraint + no query", + ), + ( + &no_query, + "http://169.254.169.254/m?anything=here", + true, + "no constraint + extra query", + ), + // OR within a key. + ( + &multi_value, + "http://169.254.169.254/m?v=a", + true, + "OR within key, first value", + ), + ( + &multi_value, + "http://169.254.169.254/m?v=b", + true, + "OR within key, second value", + ), + ( + &multi_value, + "http://169.254.169.254/m?v=c", + false, + "value outside allowed set rejects", + ), + ( + &multi_value, + "http://169.254.169.254/m", + false, + "missing required key rejects", + ), + // Request supplies the same key twice; rule accepts if ANY request value matches. + ( + &multi_value, + "http://169.254.169.254/m?v=c&v=b", + true, + "request-side repeat: any value matches rule", + ), + // Extra request keys are allowed (rule is a minimum requirement, not an exact match). + ( + &multi_value, + "http://169.254.169.254/m?v=a&extra=zzz", + true, + "extra request keys allowed", + ), + // AND across keys. + ( + &multi_key, + "http://169.254.169.254/m?a=1&b=2", + true, + "AND across keys satisfied", + ), + ( + &multi_key, + "http://169.254.169.254/m?a=1", + false, + "AND across keys: missing second key rejects", + ), + ( + &multi_key, + "http://169.254.169.254/m?a=1&b=9", + false, + "AND across keys: wrong value on one key rejects", + ), + // Case folding on keys (Api-Version vs api-version) via canonical pipeline. + ( + &CanonicalPattern::from_privilege(&priv_of( + "/metadata/identity", + Some(&[("Api-Version", "2018-02-01")]), + )) + .unwrap(), + "http://169.254.169.254/metadata/identity?API-VERSION=2018-02-01", + true, + "case-insensitive key fold both sides", + ), + // Encoded value on the request side decodes once to match the rule. + ( + &CanonicalPattern::from_privilege(&priv_of("/m", Some(&[("k", "a b")]))).unwrap(), + "http://169.254.169.254/m?k=a%20b", + true, + "request value decoded once matches rule value", + ), + ]; + for (rule, uri, expected, label) in cases { + let r = req_of(uri, &Method::GET); + assert_eq!(rule.matches(&r), *expected, "{label}"); + } + } +} diff --git a/proxy_agent/src/proxy/proxy_authorizer.rs b/proxy_agent/src/proxy/proxy_authorizer.rs index f4f8a28d..0c59c82a 100644 --- a/proxy_agent/src/proxy/proxy_authorizer.rs +++ b/proxy_agent/src/proxy/proxy_authorizer.rs @@ -21,8 +21,9 @@ use super::authorization_rules::{AuthorizationMode, ComputedAuthorizationItem}; use super::proxy_connection::ConnectionLogger; +use crate::proxy::canonical::CanonicalMode; use crate::shared_state::access_control_wrapper::AccessControlSharedState; -use crate::{common::constants, common::result::Result, proxy::Claims}; +use crate::{common::config, common::constants, common::result::Result, proxy::Claims}; use proxy_agent_shared::hyper_client; use proxy_agent_shared::logger::LoggerLevel; @@ -243,12 +244,50 @@ pub fn authorize( return AuthorizeResult::Ok; } - let auth = get_authorizer(ip, port, claims); + let auth = get_authorizer(ip, port, claims.clone()); logger.write( LoggerLevel::Trace, format!("Got auth: {}", auth.to_string()), ); - auth.authorize(logger, request_uri, access_control_rules) + let legacy_result = auth.authorize(logger, request_uri.clone(), access_control_rules.clone()); + + // Innovation 2.1 M3 — shadow-mode integration. + // + // Only walks the canonical pipeline when the operator opts in via + // the `canonicalRequestMode` config key. With the default (`off`) + // there is **zero** additional work versus the pre-M3 control + // flow — that's what guarantees the "behavior unchanged for + // production traffic" exit criterion. + // + // Why we re-evaluate `is_allowed` here instead of recovering it + // from `legacy_result`: the wrapper above mixes in audit-mode + // softening and `runAsElevated` gating, neither of which the + // canonical matcher knows about. Comparing the wrapper output to + // the canonical matcher would produce phantom divergences. So we + // call the rules-only predicate twice in shadow/enforce — once + // through the Authorizer, once explicitly for the shadow + // comparison — accepting one extra matcher invocation per request + // only in non-production modes. + let mode = config::get_canonical_request_mode(); + if mode != CanonicalMode::Off { + if let Some(rules) = access_control_rules.as_ref() { + let legacy_allowed = rules.is_allowed(logger, request_uri.clone(), claims.clone()); + let _canon = rules.shadow_compare( + logger, + &request_uri, + &request_method, + &claims, + legacy_allowed, + mode, + ); + // Enforce-mode *behavior* is deliberately deferred to + // M5/M6 — see the design doc §9.3. In M3 Enforce only + // surfaces telemetry; it does NOT yet replace the legacy + // decision. A one-shot warning above (during config load) + // tells operators the same. + } + } + legacy_result } #[cfg(test)] diff --git a/proxy_agent/src/proxy/proxy_connection.rs b/proxy_agent/src/proxy/proxy_connection.rs index a810c77a..20fe40db 100644 --- a/proxy_agent/src/proxy/proxy_connection.rs +++ b/proxy_agent/src/proxy/proxy_connection.rs @@ -282,8 +282,19 @@ impl HttpConnectionContext { hyper_client::should_skip_sig(&self.method, &self.url) } + /// Pre-canonical defense-in-depth guard. Returns `true` if the request + /// path contains a pattern commonly used to bypass prefix-based + /// authorization rules. See [`path_has_traversal`] for the exact set. + /// This is checked in `handle_new_http_request` before any rule lookup + /// so suspicious paths short-circuit to 403 without ever reaching the + /// matcher or the upstream. + /// + /// Once the canonical pipeline graduates to enforce mode (M5), the + /// `PathUnderflow` / slash-collapsing / encoded-dot handling in + /// `crate::proxy::canonical::path` subsumes this check entirely and + /// this method can be removed. pub fn contains_traversal_characters(&self) -> bool { - self.url.path().contains("..") + path_has_traversal(self.url.path()) } pub fn log(&mut self, logger_level: LoggerLevel, message: String) { @@ -393,3 +404,126 @@ impl Clone for ConnectionLogger { } } } + +/// Returns `true` if `raw_path` (the path component of a request URI in +/// its on-wire, *not yet percent-decoded* form) contains a pattern that +/// is commonly used to bypass prefix-based authorization rules. +/// +/// Caught by this function (rejected with 404 by the caller): +/// * `..` — literal dot-dot anywhere in the path +/// * `%2E%2E`, `%2e%2e`, and mixed-case (`%2E%2e`, `%2e%2E`) — caught +/// after a single percent-decode pass +/// * `.%2e`, `%2E.`, etc. — same single-decode pass collapses them +/// into a literal `..` +/// * `//` — two or more consecutive slashes (multi-slash bypass). +/// Legacy prefix-matching treats `/foo//bar` as a distinct path +/// from `/foo/bar`; reject the raw form so attackers cannot pivot +/// past a privilege whose path string lacks the extra slash. +/// +/// INTENTIONALLY NOT rejected here (the canonical pipeline in +/// [`crate::proxy::canonical::path`] handles these and will subsume this +/// guard entirely once it graduates to enforce mode in M5): +/// * single `.` segments — valid in real paths like `/.well-known/*` +/// * `;` matrix parameters — sub-delim that is legal inside a segment +/// * non-ASCII / Unicode-confusable bytes — caught by `reject_non_ascii` +/// * embedded `?` after percent-decode — caught by canonical's EmbQ check +/// * trailing-slash variation — clamped by canonical's normalizer +/// +/// Decoding uses `decode_utf8_lossy`: any invalid-UTF-8 percent +/// sequences are replaced by U+FFFD (which does not contain `..`), and +/// truly malformed sequences are passed through unchanged. This keeps +/// the check fail-open for the rare valid non-ASCII path while staying +/// safe for the traversal cases that matter. +fn path_has_traversal(raw_path: &str) -> bool { + if raw_path.contains("//") { + return true; + } + let decoded = percent_encoding::percent_decode_str(raw_path).decode_utf8_lossy(); + decoded.contains("..") +} + +#[cfg(test)] +mod tests { + use super::path_has_traversal; + + #[test] + fn clean_paths_are_not_traversal() { + for p in [ + "/", + "/metadata/instance", + "/metadata/identity/oauth2/token", + "/machine/?comp=goalstate", + "/.well-known/foo", + "/metadata/instance?api-version=2021-02-01", + ] { + assert!( + !path_has_traversal(p), + "expected clean path, got traversal: {p:?}" + ); + } + } + + #[test] + fn literal_dot_dot_is_traversal() { + for p in [ + "/foo/../bar", + "/metadata/./identity/../identity/oauth2/token", + "/..", + "../etc/passwd", + ] { + assert!(path_has_traversal(p), "expected traversal: {p:?}"); + } + } + + #[test] + fn percent_encoded_dot_dot_is_traversal() { + // Uppercase, lowercase, and mixed-case percent encodings must all + // be caught — they all decode to a literal `..` in a single pass. + for p in [ + "/foo/%2E%2E/bar", + "/foo/%2e%2e/bar", + "/foo/%2E%2e/bar", + "/foo/%2e%2E/bar", + "/foo/.%2e/bar", + "/foo/%2E./bar", + "/metadata/%2E/identity/%2E%2E/identity/oauth2/token", + ] { + assert!( + path_has_traversal(p), + "expected percent-encoded traversal: {p:?}" + ); + } + } + + #[test] + fn multi_slash_is_traversal() { + for p in [ + "//metadata/instance", + "/metadata//instance", + "/metadata///identity//oauth2//token", + "//", + ] { + assert!( + path_has_traversal(p), + "expected multi-slash traversal: {p:?}" + ); + } + } + + #[test] + fn single_dot_and_matrix_params_are_not_traversal_here() { + // Single `.` segments and matrix params (`;`) are not traversal + // markers and must pass through this pre-canonical guard. The + // canonical pipeline applies its own normalization in M5. + for p in [ + "/metadata/./instance", + "/metadata/instance;jsessionid=abc", + "/.well-known/openid-configuration", + ] { + assert!( + !path_has_traversal(p), + "expected non-traversal (deferred to canonical): {p:?}" + ); + } + } +} diff --git a/proxy_agent/src/proxy/proxy_server.rs b/proxy_agent/src/proxy/proxy_server.rs index 9cd2258d..be8fc2e7 100644 --- a/proxy_agent/src/proxy/proxy_server.rs +++ b/proxy_agent/src/proxy/proxy_server.rs @@ -404,15 +404,15 @@ impl ProxyServer { } if http_connection_context.contains_traversal_characters() { - // If the proxied request contains traversal characters, we will return 404 Not Found to avoid potential security issues. + // If the proxied request contains traversal characters, we will return 403 Forbidden to avoid potential security issues. self.log_connection_summary( &mut http_connection_context, - StatusCode::NOT_FOUND, + StatusCode::FORBIDDEN, false, - "Traversal characters found in the request, return NOT_FOUND!".to_string(), + "Traversal characters found in the request, return FORBIDDEN!".to_string(), ) .await; - return Ok(Self::closed_response(StatusCode::NOT_FOUND)); + return Ok(Self::closed_response(StatusCode::FORBIDDEN)); } if http_connection_context.url == provision::provision_query::PROVISION_URL_PATH { @@ -1176,9 +1176,9 @@ mod tests { .await .unwrap(); assert_eq!( - http::StatusCode::NOT_FOUND, + http::StatusCode::FORBIDDEN, response.status(), - "response.status must be NOT_FOUND." + "response.status must be FORBIDDEN." ); // test large request body