From 08ee6b04d7c7dad9c060f0480134dd74a028f182 Mon Sep 17 00:00:00 2001
From: Tristen Pierson <tpierson@bitconcepts.tech>
Date: Tue, 2 Jun 2026 14:33:33 -0400
Subject: [PATCH] feat: cross-validation harness and performance regression
 tracking

Add cross-validation test suite that validates the Python evaluator
against 12 golden test vectors covering all major evaluation paths:
basic eval, safety guard ordering, expression opcodes (scale,
accumulate, clamp with saturation), div-by-zero, staleness, mode
transitions, fault raise/clear, delta operators, condition groups,
and INT32 saturation.

Tests include:
- Parametrised golden-vector evaluation (12 vectors)
- Determinism verification (100 identical runs of the first 3 vectors)
- Compile-to-C validation (each vector model compiles and the generated
  source contains the required symbols)

Also adds:
- tools/parse_benchmark.py for extracting ns/tick timing from
  Twister benchmark logs (PID and Kalman)
- CI workflow step to print benchmark timing summary after Twister

Co-Authored-By: Oz <oz-agent@warp.dev>
---
 .github/workflows/ci.yml                      |   6 +
 tests/python/test_cross_validation.py         | 196 ++++++++++++++++++
 tests/python/test_golden_vectors.py           |  39 ++--
 tests/vectors/01_basic_eval/vector.json       |  30 +++
 .../02_safety_guard_ordering/vector.json      |  40 ++++
 tests/vectors/03_expr_scale/vector.json       |  36 ++++
 tests/vectors/04_expr_accumulate/vector.json  |  37 ++++
 .../05_expr_clamp_saturation/vector.json      |  35 ++++
 tests/vectors/06_div_by_zero/vector.json      |  40 ++++
 tests/vectors/07_staleness/vector.json        |  38 ++++
 tests/vectors/08_mode_transition/vector.json  |  42 ++++
 .../vectors/09_fault_raise_clear/vector.json  |  38 ++++
 tests/vectors/10_delta_operators/vector.json  |  38 ++++
 tests/vectors/11_condition_groups/vector.json |  44 ++++
 tests/vectors/12_int32_saturation/vector.json |  47 +++++
 tools/parse_benchmark.py                      | 151 ++++++++++++++
 16 files changed, 845 insertions(+), 12 deletions(-)
 create mode 100644 tests/python/test_cross_validation.py
 create mode 100644 tests/vectors/01_basic_eval/vector.json
 create mode 100644 tests/vectors/02_safety_guard_ordering/vector.json
 create mode 100644 tests/vectors/03_expr_scale/vector.json
 create mode 100644 tests/vectors/04_expr_accumulate/vector.json
 create mode 100644 tests/vectors/05_expr_clamp_saturation/vector.json
 create mode 100644 tests/vectors/06_div_by_zero/vector.json
 create mode 100644 tests/vectors/07_staleness/vector.json
 create mode 100644 tests/vectors/08_mode_transition/vector.json
 create mode 100644 tests/vectors/09_fault_raise_clear/vector.json
 create mode 100644 tests/vectors/10_delta_operators/vector.json
 create mode 100644 tests/vectors/11_condition_groups/vector.json
 create mode 100644 tests/vectors/12_int32_saturation/vector.json
 create mode 100644 tools/parse_benchmark.py
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9c55c1e..1131ebb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -243,6 +243,12 @@ jobs:
             --inline-logs -v \
             -O twister-out/benchmarks
 
+      # Parse benchmark timing and print summary table.
+      # No fail threshold yet — we need baseline data first.
+      - name: Benchmark timing summary
+        if: always()
+        run: python app/tools/parse_benchmark.py twister-out/benchmarks/
+
       # All 17 samples are build_only; CI proves they compile clean.
       - name: Twister — samples
         run: |
diff --git a/tests/python/test_cross_validation.py b/tests/python/test_cross_validation.py
new file mode 100644
index 0000000..daedbc0
--- /dev/null
+++ b/tests/python/test_cross_validation.py
@@ -0,0 +1,196 @@
+# SPDX-License-Identifier: MIT
+"""Cross-validation harness — proves Python evaluator matches golden vectors.
+
+Each vector in tests/vectors/<name>/vector.json contains an inline ARB model,
+input facts/timestamps, and expected outputs.  The Python evaluator is the
+reference implementation; these vectors will also be consumed by the C engine
+under Zephyr to prove cross-platform equivalence.
+
+Tests:
+  1. Parametrised golden-vector evaluation (10+ vectors).
+  2. Determinism: same input, 100 runs → identical output.
+  3. Compile-to-C: each vector model compiles and the generated source
+     contains the required ARBITER_generated_model symbol.
+"""
+
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from arbiter.compiler import CompileOptions, compile_model
+from arbiter.evaluator import ArbiterEvaluator
+
+VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _discover_vectors() -> list[str]:
+    """Return sorted list of vector directory names that contain vector.json."""
+    if not VECTORS_DIR.exists():
+        return []
+    return sorted(
+        d.name
+        for d in VECTORS_DIR.iterdir()
+        if d.is_dir() and (d / "vector.json").exists()
+    )
+
+
+def _load_vector(name: str) -> dict:
+    """Load and parse a vector.json file."""
+    path = VECTORS_DIR / name / "vector.json"
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _run_vector(vec: dict) -> tuple[ArbiterEvaluator, dict]:
+    """Run the Python evaluator on a vector and return (evaluator, eval result)."""
+    model_data = vec["model"]
+    ev = ArbiterEvaluator(model_data)
+
+    # Set fact values
+    for fact_name, value in vec.get("facts", {}).items():
+        ev.set_fact(fact_name, value)
+
+    # Set timestamps
+    for fact_name, ms in vec.get("timestamps", {}).items():
+        ev.set_timestamp(fact_name, ms)
+
+    # Set snapshot timestamp
+    snap_ts = vec.get("snapshot_timestamp_ms", 0)
+    if snap_ts:
+        ev.set_snapshot_timestamp(snap_ts)
+
+    result = ev.eval()
+    return ev, result
+
+
+# ---------------------------------------------------------------------------
+# 1. Golden vector evaluation
+# ---------------------------------------------------------------------------
+
+_VECTOR_NAMES = _discover_vectors()
+
+
+@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
+def test_golden_vector(vector_name: str) -> None:
+    """Evaluate each golden vector and assert output matches expected."""
+    if vector_name == "_no_vectors_":
+        pytest.fail("No golden vectors found in tests/vectors/")
+
+    vec = _load_vector(vector_name)
+    expected = vec["expected"]
+
+    ev, result = _run_vector(vec)
+
+    # --- fired_rules: exact ordered list ---
+    assert result.fired_rules == expected["fired_rules"], (
+        f"[{vector_name}] fired_rules mismatch"
+    )
+
+    # --- current_mode ---
+    assert result.current_mode == expected.get("current_mode"), (
+        f"[{vector_name}] current_mode mismatch"
+    )
+
+    # --- raised_faults: order-insensitive comparison of sorted lists ---
+    assert sorted(result.raised_faults) == sorted(expected.get("raised_faults", [])), (
+        f"[{vector_name}] raised_faults mismatch"
+    )
+
+    # --- requested_actions: ordered list ---
+    assert result.requested_actions == expected.get("requested_actions", []), (
+        f"[{vector_name}] requested_actions mismatch"
+    )
+
+    # --- fact_values: spot-check only the facts listed in expected ---
+    expected_facts = expected.get("fact_values", {})
+    for fact_name, expected_val in expected_facts.items():
+        actual = ev._fact_values.get(fact_name)
+        assert actual == expected_val, (
+            f"[{vector_name}] fact {fact_name}: expected {expected_val}, got {actual}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 2. Determinism — first three vectors only: same input, 100 runs, identical output
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("vector_name", _VECTOR_NAMES[:3] or ["_no_vectors_"])
+def test_determinism(vector_name: str) -> None:
+    """Run the same vector 100 times and assert all outputs are identical."""
+    if vector_name == "_no_vectors_":
+        pytest.skip("No vectors for determinism test")
+
+    vec = _load_vector(vector_name)
+    results: list[dict] = []
+
+    for _ in range(100):
+        _, result = _run_vector(vec)
+        results.append(result.to_dict())
+
+    baseline = results[0]
+    for i, r in enumerate(results[1:], start=1):
+        assert r == baseline, (
+            f"[{vector_name}] Non-deterministic result on iteration {i}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 3. Compile-to-C — verify each vector model compiles and emits required symbols
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
+def test_compile_to_c(vector_name: str) -> None:
+    """Compile each vector model to C and verify the source contains required symbols."""
+    if vector_name == "_no_vectors_":
+        pytest.skip("No vectors for compile test")
+
+    vec = _load_vector(vector_name)
+    model_data = vec["model"]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmp = Path(tmpdir)
+        # Write model as YAML for the compiler
+        import yaml
+
+        model_path = tmp / "model.arb.yaml"
+        model_path.write_text(
+            yaml.dump(model_data, default_flow_style=False), encoding="utf-8"
+        )
+
+        opts = CompileOptions(
+            out_c=tmp / "model.c",
+            out_h=tmp / "model.h",
+        )
+        result = compile_model(model_path, opts)
+
+        assert result.success, (
+            f"[{vector_name}] Compilation failed: "
+            + "; ".join(
+                d.message
+                for d in result.diagnostics.errors
+            )
+        )
+
+        # Verify generated C source contains required symbols
+        c_source = (tmp / "model.c").read_text(encoding="utf-8")
+        h_source = (tmp / "model.h").read_text(encoding="utf-8")
+
+        assert "ARBITER_generated_model" in c_source, (
+            f"[{vector_name}] Missing ARBITER_generated_model in C source"
+        )
+        assert "ARBITER_generated_model" in h_source, (
+            f"[{vector_name}] Missing ARBITER_generated_model in header"
+        )
+        assert "ARBITER_MODEL_HASH" in h_source, (
+            f"[{vector_name}] Missing ARBITER_MODEL_HASH in header"
+        )
diff --git a/tests/python/test_golden_vectors.py b/tests/python/test_golden_vectors.py
index 96bbc8d..6eadc43 100644
--- a/tests/python/test_golden_vectors.py
+++ b/tests/python/test_golden_vectors.py
@@ -1,15 +1,17 @@
 # SPDX-License-Identifier: MIT
 """Golden vector tests — framework for verifying deterministic evaluation.
 
-Each model in tests/vectors/ should include:
-  - input_snapshot.json
-  - expected_result.json
-  - expected_trace.json
+Each subdirectory under tests/vectors/ contains a vector.json with an inline
+ARB model, input facts/timestamps, and expected results.
 
 The same vectors are tested by:
-  - Python reference evaluator (this file)
+  - Python reference evaluator (this file + test_cross_validation.py)
   - Generated C runtime under Zephyr
   - Blob runtime under Zephyr
+
+NOTE: The comprehensive cross-validation tests are in test_cross_validation.py.
+This file is kept as a lightweight smoke test: it discovers the same
+vector.json files but only asserts the fired_rules list.
 """
 
 import json
@@ -17,27 +19,40 @@
 
 import pytest
 
+from arbiter.evaluator import ArbiterEvaluator
+
 VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"
 
 
 def _load_vectors():
-    """Discover and load golden vector test cases."""
+    """Discover golden vector test cases using vector.json format."""
     if not VECTORS_DIR.exists():
         return []
     vectors = []
     for d in sorted(VECTORS_DIR.iterdir()):
-        if d.is_dir() and (d / "input_snapshot.json").exists():
+        if d.is_dir() and (d / "vector.json").exists():
             vectors.append(d.name)
     return vectors
 
 
 @pytest.mark.parametrize("vector_name", _load_vectors() or ["placeholder"])
 def test_golden_vector(vector_name):
-    """Verify golden vector produces expected result."""
+    """Verify golden vector produces expected result via Python evaluator."""
     if vector_name == "placeholder":
         pytest.skip("No golden vectors yet — add to tests/vectors/")
     vector_dir = VECTORS_DIR / vector_name
-    input_snap = json.loads((vector_dir / "input_snapshot.json").read_text())
-    expected = json.loads((vector_dir / "expected_result.json").read_text())
-    # TODO: implement Python reference evaluator and compare
-    pytest.skip("Python reference evaluator not yet implemented")
+    vec = json.loads((vector_dir / "vector.json").read_text(encoding="utf-8"))
+    model_data = vec["model"]
+    expected = vec["expected"]
+
+    ev = ArbiterEvaluator(model_data)
+    for fact_name, value in vec.get("facts", {}).items():
+        ev.set_fact(fact_name, value)
+    for fact_name, ms in vec.get("timestamps", {}).items():
+        ev.set_timestamp(fact_name, ms)
+    snap_ts = vec.get("snapshot_timestamp_ms", 0)
+    if snap_ts:
+        ev.set_snapshot_timestamp(snap_ts)
+
+    result = ev.eval()
+    assert result.fired_rules == expected["fired_rules"]
diff --git a/tests/vectors/01_basic_eval/vector.json b/tests/vectors/01_basic_eval/vector.json
new file mode 100644
index 0000000..8f1f129
--- /dev/null
+++ b/tests/vectors/01_basic_eval/vector.json
@@ -0,0 +1,30 @@
+{
+  "description": "Basic evaluation: unconditional rule always fires",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_basic_eval",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "sensor.value", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.always_on",
+        "class": "inference",
+        "then": {"explanation": "Unconditional rule fires every tick"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"sensor.value": 42},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.always_on"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"sensor.value": 42}
+  }
+}
diff --git a/tests/vectors/02_safety_guard_ordering/vector.json b/tests/vectors/02_safety_guard_ordering/vector.json
new file mode 100644
index 0000000..a5e838b
--- /dev/null
+++ b/tests/vectors/02_safety_guard_ordering/vector.json
@@ -0,0 +1,40 @@
+{
+  "description": "Safety guard ordering: safety_guard fires before inference regardless of declaration order",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_safety_ordering",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "temp_c", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.inference_first_alpha",
+        "class": "inference",
+        "then": {"explanation": "Inference rule"}
+      },
+      {
+        "id": "rule.advisory_z",
+        "class": "advisory",
+        "then": {"explanation": "Advisory rule"}
+      },
+      {
+        "id": "rule.safety_override",
+        "class": "safety_guard",
+        "then": {"explanation": "Safety guard fires first"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"temp_c": 100},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.safety_override", "rule.inference_first_alpha", "rule.advisory_z"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"temp_c": 100}
+  }
+}
diff --git a/tests/vectors/03_expr_scale/vector.json b/tests/vectors/03_expr_scale/vector.json
new file mode 100644
index 0000000..9b0c17e
--- /dev/null
+++ b/tests/vectors/03_expr_scale/vector.json
@@ -0,0 +1,36 @@
+{
+  "description": "Expression opcode: scale — target = (left * right) / scale",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_expr_scale",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "input_a", "type": "int32", "default": 0},
+      {"id": "input_b", "type": "int32", "default": 0},
+      {"id": "result", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.compute",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "result", "op": "scale", "left": "input_a", "right": "input_b", "scale": 1000}
+          ]
+        }
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"input_a": 5000, "input_b": 2500, "result": 0},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.compute"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"input_a": 5000, "input_b": 2500, "result": 12500}
+  }
+}
diff --git a/tests/vectors/04_expr_accumulate/vector.json b/tests/vectors/04_expr_accumulate/vector.json
new file mode 100644
index 0000000..473c8eb
--- /dev/null
+++ b/tests/vectors/04_expr_accumulate/vector.json
@@ -0,0 +1,37 @@
+{
+  "description": "Expression opcode: accumulate — target = target + (left * right) / scale",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_expr_accum",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "delta", "type": "int32", "default": 0},
+      {"id": "gain", "type": "int32", "default": 0},
+      {"id": "integrator", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.integrate",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "integrator", "op": "assign", "left_literal": 100},
+            {"target": "integrator", "op": "accumulate", "left": "delta", "right": "gain", "scale": 10}
+          ]
+        }
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"delta": 50, "gain": 3, "integrator": 0},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.integrate"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"delta": 50, "gain": 3, "integrator": 115}
+  }
+}
diff --git a/tests/vectors/05_expr_clamp_saturation/vector.json b/tests/vectors/05_expr_clamp_saturation/vector.json
new file mode 100644
index 0000000..a6f3ce2
--- /dev/null
+++ b/tests/vectors/05_expr_clamp_saturation/vector.json
@@ -0,0 +1,35 @@
+{
+  "description": "Expression opcode: clamp with saturation — value exceeds hi bound, clamped to 100",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_expr_clamp",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "output", "type": "int32", "default": 0},
+      {"id": "raw_value", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.clamp_output",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "output", "op": "clamp", "left": "raw_value", "right_literal": -100, "scale": 100}
+          ]
+        }
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"raw_value": 500, "output": 0},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.clamp_output"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"raw_value": 500, "output": 100}
+  }
+}
diff --git a/tests/vectors/06_div_by_zero/vector.json b/tests/vectors/06_div_by_zero/vector.json
new file mode 100644
index 0000000..9f74c01
--- /dev/null
+++ b/tests/vectors/06_div_by_zero/vector.json
@@ -0,0 +1,40 @@
+{
+  "description": "Division by zero: div and mod with zero divisor, and scale with zero scale, return 0 (safe default)",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_div_zero",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "divisor", "type": "int32", "default": 0},
+      {"id": "dividend", "type": "int32", "default": 0},
+      {"id": "div_result", "type": "int32", "default": -1},
+      {"id": "mod_result", "type": "int32", "default": -1},
+      {"id": "scale_result", "type": "int32", "default": -1}
+    ],
+    "rules": [
+      {
+        "id": "rule.divs",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "div_result", "op": "div", "left": "dividend", "right": "divisor"},
+            {"target": "mod_result", "op": "mod", "left": "dividend", "right": "divisor"},
+            {"target": "scale_result", "op": "scale", "left": "dividend", "right": "divisor", "scale": 0}
+          ]
+        }
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"dividend": 100, "divisor": 0, "div_result": -1, "mod_result": -1, "scale_result": -1},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.divs"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"dividend": 100, "divisor": 0, "div_result": 0, "mod_result": 0, "scale_result": 0}
+  }
+}
diff --git a/tests/vectors/07_staleness/vector.json b/tests/vectors/07_staleness/vector.json
new file mode 100644
index 0000000..0368237
--- /dev/null
+++ b/tests/vectors/07_staleness/vector.json
@@ -0,0 +1,38 @@
+{
+  "description": "Staleness: fact with old timestamp fires stale rule, fresh fact fires not_stale rule",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_staleness",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "sensor.fresh", "type": "int32", "default": 0, "stale_after_ms": 500},
+      {"id": "sensor.old", "type": "int32", "default": 0, "stale_after_ms": 500}
+    ],
+    "rules": [
+      {
+        "id": "rule.old_is_stale",
+        "class": "safety_guard",
+        "when": {"all": [{"fact": "sensor.old", "op": "stale", "value": 500}]},
+        "then": {"explanation": "Old sensor data is stale"}
+      },
+      {
+        "id": "rule.fresh_is_ok",
+        "class": "inference",
+        "when": {"all": [{"fact": "sensor.fresh", "op": "not_stale", "value": 500}]},
+        "then": {"explanation": "Fresh sensor data is valid"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"sensor.old": 42, "sensor.fresh": 99},
+  "timestamps": {"sensor.old": 0, "sensor.fresh": 900},
+  "snapshot_timestamp_ms": 1000,
+  "expected": {
+    "fired_rules": ["rule.old_is_stale", "rule.fresh_is_ok"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"sensor.old": 42, "sensor.fresh": 99}
+  }
+}
diff --git a/tests/vectors/08_mode_transition/vector.json b/tests/vectors/08_mode_transition/vector.json
new file mode 100644
index 0000000..5435cb6
--- /dev/null
+++ b/tests/vectors/08_mode_transition/vector.json
@@ -0,0 +1,37 @@
+{
+  "description": "Mode transition: condition triggers set_mode, last mode setter wins",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_mode_trans",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "level", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.go_active",
+        "class": "mode_guard",
+        "when": {"all": [{"fact": "level", "op": ">", "value": 50}]},
+        "then": {"set_mode": "mode.active"}
+      },
+      {
+        "id": "rule.go_critical",
+        "class": "safety_guard",
+        "when": {"all": [{"fact": "level", "op": ">", "value": 90}]},
+        "then": {"set_mode": "mode.critical"}
+      }
+    ],
+    "actions": [],
+    "modes": [{"id": "mode.idle"}, {"id": "mode.active"}, {"id": "mode.critical"}]
+  },
+  "facts": {"level": 95},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.go_critical", "rule.go_active"],
+    "current_mode": "mode.active",
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"level": 95}
+  }
+}
diff --git a/tests/vectors/09_fault_raise_clear/vector.json b/tests/vectors/09_fault_raise_clear/vector.json
new file mode 100644
index 0000000..f529259
--- /dev/null
+++ b/tests/vectors/09_fault_raise_clear/vector.json
@@ -0,0 +1,38 @@
+{
+  "description": "Fault management: raise_fault then clear_fault in same eval cycle",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_faults",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "error_flag", "type": "bool", "default": false},
+      {"id": "recovery_flag", "type": "bool", "default": false}
+    ],
+    "rules": [
+      {
+        "id": "rule.raise_error",
+        "class": "safety_guard",
+        "when": {"all": [{"fact": "error_flag", "op": "==", "value": true}]},
+        "then": {"raise_fault": "fault.hw_error", "explanation": "Hardware error detected"}
+      },
+      {
+        "id": "rule.clear_error",
+        "class": "inference",
+        "when": {"all": [{"fact": "recovery_flag", "op": "==", "value": true}]},
+        "then": {"clear_fault": "fault.hw_error", "explanation": "Recovery confirmed"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"error_flag": 1, "recovery_flag": 1},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.raise_error", "rule.clear_error"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": ["raise_fault:fault.hw_error", "clear_fault:fault.hw_error"],
+    "fact_values": {"error_flag": 1, "recovery_flag": 1}
+  }
+}
diff --git a/tests/vectors/10_delta_operators/vector.json b/tests/vectors/10_delta_operators/vector.json
new file mode 100644
index 0000000..e7fac92
--- /dev/null
+++ b/tests/vectors/10_delta_operators/vector.json
@@ -0,0 +1,38 @@
+{
+  "description": "Delta operators: delta_gt fires when |current - prev| > threshold, delta_lt when < threshold",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_delta_ops",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "sensor.fast", "type": "int32", "default": 0},
+      {"id": "sensor.slow", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.big_change",
+        "class": "inference",
+        "when": {"all": [{"fact": "sensor.fast", "op": "delta_gt", "value": 50}]},
+        "then": {"explanation": "Large change detected on fast sensor"}
+      },
+      {
+        "id": "rule.small_change",
+        "class": "inference",
+        "when": {"all": [{"fact": "sensor.slow", "op": "delta_lt", "value": 10}]},
+        "then": {"explanation": "Small change on slow sensor"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"sensor.fast": 100, "sensor.slow": 5},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.big_change", "rule.small_change"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"sensor.fast": 100, "sensor.slow": 5}
+  }
+}
diff --git a/tests/vectors/11_condition_groups/vector.json b/tests/vectors/11_condition_groups/vector.json
new file mode 100644
index 0000000..68e39b3
--- /dev/null
+++ b/tests/vectors/11_condition_groups/vector.json
@@ -0,0 +1,44 @@
+{
+  "description": "Condition groups: any (one of many) and not (inversion) groups",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_cond_groups",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "flag_a", "type": "bool", "default": false},
+      {"id": "flag_b", "type": "bool", "default": false},
+      {"id": "flag_c", "type": "bool", "default": false}
+    ],
+    "rules": [
+      {
+        "id": "rule.any_flag",
+        "class": "inference",
+        "when": {"any": [
+          {"fact": "flag_a", "op": "==", "value": true},
+          {"fact": "flag_b", "op": "==", "value": true}
+        ]},
+        "then": {"explanation": "At least one flag is set"}
+      },
+      {
+        "id": "rule.not_c",
+        "class": "inference",
+        "when": {"not": [
+          {"fact": "flag_c", "op": "==", "value": true}
+        ]},
+        "then": {"explanation": "flag_c is NOT set"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"flag_a": 0, "flag_b": 1, "flag_c": 0},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.any_flag", "rule.not_c"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"flag_a": 0, "flag_b": 1, "flag_c": 0}
+  }
+}
diff --git a/tests/vectors/12_int32_saturation/vector.json b/tests/vectors/12_int32_saturation/vector.json
new file mode 100644
index 0000000..9968a2d
--- /dev/null
+++ b/tests/vectors/12_int32_saturation/vector.json
@@ -0,0 +1,47 @@
+{
+  "description": "INT32 saturation: add overflow clamps to INT32_MAX, sub underflow clamps to INT32_MIN",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_saturation",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "big", "type": "int32", "default": 0},
+      {"id": "one", "type": "int32", "default": 0},
+      {"id": "overflow_result", "type": "int32", "default": 0},
+      {"id": "small", "type": "int32", "default": 0},
+      {"id": "underflow_result", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.overflow",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "overflow_result", "op": "add", "left": "big", "right": "one"}
+          ]
+        }
+      },
+      {
+        "id": "rule.underflow",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "underflow_result", "op": "sub", "left": "small", "right": "one"}
+          ]
+        }
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"big": 2147483647, "small": -2147483648, "one": 1, "overflow_result": 0, "underflow_result": 0},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.overflow", "rule.underflow"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"big": 2147483647, "small": -2147483648, "one": 1, "overflow_result": 2147483647, "underflow_result": -2147483648}
+  }
+}
diff --git a/tools/parse_benchmark.py b/tools/parse_benchmark.py
new file mode 100644
index 0000000..caafb2f
--- /dev/null
+++ b/tools/parse_benchmark.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+"""Parse benchmark timing from Twister log output.
+
+Extracts ns/tick for both hand-coded and engine implementations from
+PID and Kalman benchmark logs.  Prints a summary table suitable for
+CI log inspection and future regression gating.
+
+Usage::
+
+    python tools/parse_benchmark.py twister-out/benchmarks/
+
+Reads all handler.log files under the given directory tree.
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass
+class BenchmarkResult:
+    """Timing result from one benchmark variant."""
+
+    benchmark: str
+    variant: str
+    ns_per_tick: int
+
+
+# Patterns matching the LOG_INF lines from benchmark main.c files.
+# PID:    "  Total: 123456 ns  (42 ns/tick)"   or   "  Total: 123 ms  (42 ns/tick)"
+# Kalman: "  Total: 123 ms  (42 ns/tick)"
+_NS_PER_TICK_RE = re.compile(r"\((\d+)\s*ns/tick\)")
+
+# Section headers that identify which variant we're parsing.
+_HAND_CODED_RE = re.compile(r"---\s*Hand-coded\s+(\w+)\s*---")
+_ENGINE_RE = re.compile(r"---\s*arbiter\s+Engine\s+(\w+)\s*---")
+
+
+def parse_log(log_text: str, benchmark_name: str) -> list[BenchmarkResult]:
+    """Parse a single benchmark log and extract timing results."""
+    results: list[BenchmarkResult] = []
+    current_variant: str | None = None
+    current_algo: str | None = None
+
+    for line in log_text.splitlines():
+        # Check for section headers
+        m = _HAND_CODED_RE.search(line)
+        if m:
+            current_algo = m.group(1)
+            current_variant = "hand-coded"
+            continue
+
+        m = _ENGINE_RE.search(line)
+        if m:
+            current_algo = m.group(1)
+            current_variant = "engine"
+            continue
+
+        # Check for ns/tick value
+        m = _NS_PER_TICK_RE.search(line)
+        if m and current_variant and current_algo:
+            ns = int(m.group(1))
+            results.append(BenchmarkResult(
+                benchmark=f"{benchmark_name}/{current_algo}",
+                variant=current_variant,
+                ns_per_tick=ns,
+            ))
+            current_variant = None
+            current_algo = None
+
+    return results
+
+
+def find_and_parse(base_dir: Path) -> list[BenchmarkResult]:
+    """Find all handler.log files under base_dir and parse them."""
+    results: list[BenchmarkResult] = []
+
+    for log_path in sorted(base_dir.rglob("handler.log")):
+        # Infer benchmark name from path: .../pid_benchmark/... or .../kalman_benchmark/...
+        parts = log_path.parts
+        bench_name = "unknown"
+        for part in parts:
+            if "pid" in part.lower():
+                bench_name = "pid"
+                break
+            if "kalman" in part.lower():
+                bench_name = "kalman"
+                break
+
+        log_text = log_path.read_text(encoding="utf-8", errors="replace")
+        results.extend(parse_log(log_text, bench_name))
+
+    return results
+
+
+def print_summary(results: list[BenchmarkResult]) -> None:
+    """Print a formatted summary table of benchmark results."""
+    if not results:
+        print("No benchmark timing data found.")
+        return
+
+    # Header
+    print()
+    print("=" * 60)
+    print("  Benchmark Performance Summary")
+    print("=" * 60)
+    print(f"  {'Benchmark':<25} {'Variant':<15} {'ns/tick':>10}")
+    print("-" * 60)
+
+    for r in results:
+        print(f"  {r.benchmark:<25} {r.variant:<15} {r.ns_per_tick:>10}")
+
+    print("-" * 60)
+
+    # Compute overhead for each benchmark that has both variants
+    by_bench: dict[str, dict[str, int]] = {}
+    for r in results:
+        by_bench.setdefault(r.benchmark, {})[r.variant] = r.ns_per_tick
+
+    for bench, variants in sorted(by_bench.items()):
+        hand = variants.get("hand-coded")
+        engine = variants.get("engine")
+        if hand and engine and hand > 0:
+            overhead_pct = ((engine - hand) * 100) // hand
+            print(f"  {bench:<25} overhead: {overhead_pct}%")
+
+    print("=" * 60)
+    print()
+
+
+def main() -> int:
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} <twister-out-dir>", file=sys.stderr)
+        return 1
+
+    base_dir = Path(sys.argv[1])
+    if not base_dir.exists():
+        print(f"Error: directory not found: {base_dir}", file=sys.stderr)
+        return 1
+
+    results = find_and_parse(base_dir)
+    print_summary(results)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())