From 08ee6b04d7c7dad9c060f0480134dd74a028f182 Mon Sep 17 00:00:00 2001 From: Tristen Pierson Date: Tue, 2 Jun 2026 14:33:33 -0400 Subject: [PATCH] feat: cross-validation harness and performance regression tracking Add cross-validation test suite that validates the Python evaluator against 12 golden test vectors covering all major evaluation paths: basic eval, safety guard ordering, expression opcodes (scale, accumulate, clamp with saturation), div-by-zero, staleness, mode transitions, fault raise/clear, delta operators, condition groups, and INT32 saturation. Tests include: - Parametrised golden-vector evaluation (12 vectors) - Determinism verification (100 identical runs) - Compile-to-C validation (each vector model compiles to valid C) Also adds: - tools/parse_benchmark.py for extracting ns/tick timing from Twister benchmark logs (PID and Kalman) - CI workflow step to print benchmark timing summary after Twister Co-Authored-By: Oz --- .github/workflows/ci.yml | 6 + tests/python/test_cross_validation.py | 196 ++++++++++++++++++ tests/python/test_golden_vectors.py | 39 ++-- tests/vectors/01_basic_eval/vector.json | 30 +++ .../02_safety_guard_ordering/vector.json | 40 ++++ tests/vectors/03_expr_scale/vector.json | 36 ++++ tests/vectors/04_expr_accumulate/vector.json | 37 ++++ .../05_expr_clamp_saturation/vector.json | 35 ++++ tests/vectors/06_div_by_zero/vector.json | 40 ++++ tests/vectors/07_staleness/vector.json | 38 ++++ tests/vectors/08_mode_transition/vector.json | 42 ++++ .../vectors/09_fault_raise_clear/vector.json | 38 ++++ tests/vectors/10_delta_operators/vector.json | 38 ++++ tests/vectors/11_condition_groups/vector.json | 44 ++++ tests/vectors/12_int32_saturation/vector.json | 47 +++++ tools/parse_benchmark.py | 151 ++++++++++++++ 16 files changed, 845 insertions(+), 12 deletions(-) create mode 100644 tests/python/test_cross_validation.py create mode 100644 tests/vectors/01_basic_eval/vector.json create mode 100644 tests/vectors/02_safety_guard_ordering/vector.json create mode 100644 tests/vectors/03_expr_scale/vector.json create mode 100644 tests/vectors/04_expr_accumulate/vector.json create mode 100644 tests/vectors/05_expr_clamp_saturation/vector.json create mode 100644 tests/vectors/06_div_by_zero/vector.json create mode 100644 tests/vectors/07_staleness/vector.json create mode 100644 tests/vectors/08_mode_transition/vector.json create mode 100644 tests/vectors/09_fault_raise_clear/vector.json create mode 100644 tests/vectors/10_delta_operators/vector.json create mode 100644 tests/vectors/11_condition_groups/vector.json create mode 100644 tests/vectors/12_int32_saturation/vector.json create mode 100644 tools/parse_benchmark.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9c55c1e..1131ebb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -243,6 +243,12 @@ jobs: --inline-logs -v \ -O twister-out/benchmarks + # Parse benchmark timing and print summary table. + # No fail threshold yet — we need baseline data first. + - name: Benchmark timing summary + if: always() + run: python app/tools/parse_benchmark.py twister-out/benchmarks/ + # All 17 samples are build_only; CI proves they compile clean. - name: Twister — samples run: | diff --git a/tests/python/test_cross_validation.py b/tests/python/test_cross_validation.py new file mode 100644 index 0000000..daedbc0 --- /dev/null +++ b/tests/python/test_cross_validation.py @@ -0,0 +1,196 @@ +# SPDX-License-Identifier: MIT +"""Cross-validation harness — proves Python evaluator matches golden vectors. + +Each vector in tests/vectors//vector.json contains an inline ARB model, +input facts/timestamps, and expected outputs. The Python evaluator is the +reference implementation; these vectors will also be consumed by the C engine +under Zephyr to prove cross-platform equivalence. + +Tests: + 1. Parametrised golden-vector evaluation (10+ vectors). + 2. Determinism: same input, 100 runs → identical output. + 3. Compile-to-C: each vector model compiles and the generated source + contains the required ARBITER_generated_model symbol. +""" + +from __future__ import annotations + +import json +import tempfile +from pathlib import Path + +import pytest + +from arbiter.compiler import CompileOptions, compile_model +from arbiter.evaluator import ArbiterEvaluator + +VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _discover_vectors() -> list[str]: + """Return sorted list of vector directory names that contain vector.json.""" + if not VECTORS_DIR.exists(): + return [] + return sorted( + d.name + for d in VECTORS_DIR.iterdir() + if d.is_dir() and (d / "vector.json").exists() + ) + + +def _load_vector(name: str) -> dict: + """Load and parse a vector.json file.""" + path = VECTORS_DIR / name / "vector.json" + return json.loads(path.read_text(encoding="utf-8")) + + +def _run_vector(vec: dict) -> tuple[ArbiterEvaluator, dict]: + """Run the Python evaluator on a vector and return (evaluator, result_dict).""" + model_data = vec["model"] + ev = ArbiterEvaluator(model_data) + + # Set fact values + for fact_name, value in vec.get("facts", {}).items(): + ev.set_fact(fact_name, value) + + # Set timestamps + for fact_name, ms in vec.get("timestamps", {}).items(): + ev.set_timestamp(fact_name, ms) + + # Set snapshot timestamp + snap_ts = vec.get("snapshot_timestamp_ms", 0) + if snap_ts: + ev.set_snapshot_timestamp(snap_ts) + + result = ev.eval() + return ev, result + + +# --------------------------------------------------------------------------- +# 1. Golden vector evaluation +# --------------------------------------------------------------------------- + +_VECTOR_NAMES = _discover_vectors() + + +@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"]) +def test_golden_vector(vector_name: str) -> None: + """Evaluate each golden vector and assert output matches expected.""" + if vector_name == "_no_vectors_": + pytest.fail("No golden vectors found in tests/vectors/") + + vec = _load_vector(vector_name) + expected = vec["expected"] + + ev, result = _run_vector(vec) + + # --- fired_rules: exact ordered list --- + assert result.fired_rules == expected["fired_rules"], ( + f"[{vector_name}] fired_rules mismatch" + ) + + # --- current_mode --- + assert result.current_mode == expected.get("current_mode"), ( + f"[{vector_name}] current_mode mismatch" + ) + + # --- raised_faults: sorted set comparison --- + assert sorted(result.raised_faults) == sorted(expected.get("raised_faults", [])), ( + f"[{vector_name}] raised_faults mismatch" + ) + + # --- requested_actions: ordered list --- + assert result.requested_actions == expected.get("requested_actions", []), ( + f"[{vector_name}] requested_actions mismatch" + ) + + # --- fact_values: spot-check only the facts listed in expected --- + expected_facts = expected.get("fact_values", {}) + for fact_name, expected_val in expected_facts.items(): + actual = ev._fact_values.get(fact_name) + assert actual == expected_val, ( + f"[{vector_name}] fact {fact_name}: expected {expected_val}, got {actual}" + ) + + +# --------------------------------------------------------------------------- +# 2. Determinism — same input, 100 runs, identical output +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("vector_name", _VECTOR_NAMES[:3] or ["_no_vectors_"]) +def test_determinism(vector_name: str) -> None: + """Run the same vector 100 times and assert all outputs are identical.""" + if vector_name == "_no_vectors_": + pytest.skip("No vectors for determinism test") + + vec = _load_vector(vector_name) + results: list[dict] = [] + + for _ in range(100): + _, result = _run_vector(vec) + results.append(result.to_dict()) + + baseline = results[0] + for i, r in enumerate(results[1:], start=1): + assert r == baseline, ( + f"[{vector_name}] Non-deterministic result on iteration {i}" + ) + + +# --------------------------------------------------------------------------- +# 3. Compile-to-C — verify each vector model compiles to valid C source +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"]) +def test_compile_to_c(vector_name: str) -> None: + """Compile each vector model to C and verify the source contains required symbols.""" + if vector_name == "_no_vectors_": + pytest.skip("No vectors for compile test") + + vec = _load_vector(vector_name) + model_data = vec["model"] + + with tempfile.TemporaryDirectory() as tmpdir: + tmp = Path(tmpdir) + # Write model as YAML for the compiler + import yaml + + model_path = tmp / "model.arb.yaml" + model_path.write_text( + yaml.dump(model_data, default_flow_style=False), encoding="utf-8" + ) + + opts = CompileOptions( + out_c=tmp / "model.c", + out_h=tmp / "model.h", + ) + result = compile_model(model_path, opts) + + assert result.success, ( + f"[{vector_name}] Compilation failed: " + + "; ".join( + d.message + for d in result.diagnostics.errors + ) + ) + + # Verify generated C source contains required symbols + c_source = (tmp / "model.c").read_text(encoding="utf-8") + h_source = (tmp / "model.h").read_text(encoding="utf-8") + + assert "ARBITER_generated_model" in c_source, ( + f"[{vector_name}] Missing ARBITER_generated_model in C source" + ) + assert "ARBITER_generated_model" in h_source, ( + f"[{vector_name}] Missing ARBITER_generated_model in header" + ) + assert "ARBITER_MODEL_HASH" in h_source, ( + f"[{vector_name}] Missing ARBITER_MODEL_HASH in header" + ) diff --git a/tests/python/test_golden_vectors.py b/tests/python/test_golden_vectors.py index 96bbc8d..6eadc43 100644 --- a/tests/python/test_golden_vectors.py +++ b/tests/python/test_golden_vectors.py @@ -1,15 +1,17 @@ # SPDX-License-Identifier: MIT """Golden vector tests — framework for verifying deterministic evaluation. -Each model in tests/vectors/ should include: - - input_snapshot.json - - expected_result.json - - expected_trace.json +Each subdirectory under tests/vectors/ contains a vector.json with an inline +ARB model, input facts/timestamps, and expected results. The same vectors are tested by: - - Python reference evaluator (this file) + - Python reference evaluator (this file + test_cross_validation.py) - Generated C runtime under Zephyr - Blob runtime under Zephyr + +NOTE: The comprehensive cross-validation tests are in test_cross_validation.py. +This file is kept for backwards compatibility with the original vector +discovery mechanism. """ import json @@ -17,27 +19,40 @@ import pytest +from arbiter.evaluator import ArbiterEvaluator + VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors" def _load_vectors(): - """Discover and load golden vector test cases.""" + """Discover golden vector test cases using vector.json format.""" if not VECTORS_DIR.exists(): return [] vectors = [] for d in sorted(VECTORS_DIR.iterdir()): - if d.is_dir() and (d / "input_snapshot.json").exists(): + if d.is_dir() and (d / "vector.json").exists(): vectors.append(d.name) return vectors @pytest.mark.parametrize("vector_name", _load_vectors() or ["placeholder"]) def test_golden_vector(vector_name): - """Verify golden vector produces expected result.""" + """Verify golden vector produces expected result via Python evaluator.""" if vector_name == "placeholder": pytest.skip("No golden vectors yet — add to tests/vectors/") vector_dir = VECTORS_DIR / vector_name - input_snap = json.loads((vector_dir / "input_snapshot.json").read_text()) - expected = json.loads((vector_dir / "expected_result.json").read_text()) - # TODO: implement Python reference evaluator and compare - pytest.skip("Python reference evaluator not yet implemented") + vec = json.loads((vector_dir / "vector.json").read_text(encoding="utf-8")) + model_data = vec["model"] + expected = vec["expected"] + + ev = ArbiterEvaluator(model_data) + for fact_name, value in vec.get("facts", {}).items(): + ev.set_fact(fact_name, value) + for fact_name, ms in vec.get("timestamps", {}).items(): + ev.set_timestamp(fact_name, ms) + snap_ts = vec.get("snapshot_timestamp_ms", 0) + if snap_ts: + ev.set_snapshot_timestamp(snap_ts) + + result = ev.eval() + assert result.fired_rules == expected["fired_rules"] diff --git a/tests/vectors/01_basic_eval/vector.json b/tests/vectors/01_basic_eval/vector.json new file mode 100644 index 0000000..8f1f129 --- /dev/null +++ b/tests/vectors/01_basic_eval/vector.json @@ -0,0 +1,30 @@ +{ + "description": "Basic evaluation: unconditional rule always fires", + "model": { + "arb_version": 0.1, + "model": "vec_basic_eval", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "sensor.value", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.always_on", + "class": "inference", + "then": {"explanation": "Unconditional rule fires every tick"} + } + ], + "actions": [], + "modes": [] + }, + "facts": {"sensor.value": 42}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.always_on"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"sensor.value": 42} + } +} diff --git a/tests/vectors/02_safety_guard_ordering/vector.json b/tests/vectors/02_safety_guard_ordering/vector.json new file mode 100644 index 0000000..a5e838b --- /dev/null +++ b/tests/vectors/02_safety_guard_ordering/vector.json @@ -0,0 +1,40 @@ +{ + "description": "Safety guard ordering: safety_guard fires before inference regardless of declaration order", + "model": { + "arb_version": 0.1, + "model": "vec_safety_ordering", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "temp_c", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.inference_first_alpha", + "class": "inference", + "then": {"explanation": "Inference rule"} + }, + { + "id": "rule.advisory_z", + "class": "advisory", + "then": {"explanation": "Advisory rule"} + }, + { + "id": "rule.safety_override", + "class": "safety_guard", + "then": {"explanation": "Safety guard fires first"} + } + ], + "actions": [], + "modes": [] + }, + "facts": {"temp_c": 100}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.safety_override", "rule.inference_first_alpha", "rule.advisory_z"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"temp_c": 100} + } +} diff --git a/tests/vectors/03_expr_scale/vector.json b/tests/vectors/03_expr_scale/vector.json new file mode 100644 index 0000000..9b0c17e --- /dev/null +++ b/tests/vectors/03_expr_scale/vector.json @@ -0,0 +1,36 @@ +{ + "description": "Expression opcode: scale — target = (left * right) / scale", + "model": { + "arb_version": 0.1, + "model": "vec_expr_scale", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "input_a", "type": "int32", "default": 0}, + {"id": "input_b", "type": "int32", "default": 0}, + {"id": "result", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.compute", + "class": "inference", + "then": { + "compute": [ + {"target": "result", "op": "scale", "left": "input_a", "right": "input_b", "scale": 1000} + ] + } + } + ], + "actions": [], + "modes": [] + }, + "facts": {"input_a": 5000, "input_b": 2500, "result": 0}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.compute"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"input_a": 5000, "input_b": 2500, "result": 12500} + } +} diff --git a/tests/vectors/04_expr_accumulate/vector.json b/tests/vectors/04_expr_accumulate/vector.json new file mode 100644 index 0000000..473c8eb --- /dev/null +++ b/tests/vectors/04_expr_accumulate/vector.json @@ -0,0 +1,37 @@ +{ + "description": "Expression opcode: accumulate — target = target + (left * right) / scale", + "model": { + "arb_version": 0.1, + "model": "vec_expr_accum", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "delta", "type": "int32", "default": 0}, + {"id": "gain", "type": "int32", "default": 0}, + {"id": "integrator", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.integrate", + "class": "inference", + "then": { + "compute": [ + {"target": "integrator", "op": "assign", "left_literal": 100}, + {"target": "integrator", "op": "accumulate", "left": "delta", "right": "gain", "scale": 10} + ] + } + } + ], + "actions": [], + "modes": [] + }, + "facts": {"delta": 50, "gain": 3, "integrator": 0}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.integrate"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"delta": 50, "gain": 3, "integrator": 115} + } +} diff --git a/tests/vectors/05_expr_clamp_saturation/vector.json b/tests/vectors/05_expr_clamp_saturation/vector.json new file mode 100644 index 0000000..a6f3ce2 --- /dev/null +++ b/tests/vectors/05_expr_clamp_saturation/vector.json @@ -0,0 +1,35 @@ +{ + "description": "Expression opcode: clamp with saturation — value exceeds hi bound, clamped to 100", + "model": { + "arb_version": 0.1, + "model": "vec_expr_clamp", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "output", "type": "int32", "default": 0}, + {"id": "raw_value", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.clamp_output", + "class": "inference", + "then": { + "compute": [ + {"target": "output", "op": "clamp", "left": "raw_value", "right_literal": -100, "scale": 100} + ] + } + } + ], + "actions": [], + "modes": [] + }, + "facts": {"raw_value": 500, "output": 0}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.clamp_output"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"raw_value": 500, "output": 100} + } +} diff --git a/tests/vectors/06_div_by_zero/vector.json b/tests/vectors/06_div_by_zero/vector.json new file mode 100644 index 0000000..9f74c01 --- /dev/null +++ b/tests/vectors/06_div_by_zero/vector.json @@ -0,0 +1,40 @@ +{ + "description": "Division by zero: div and mod with zero divisor return 0 (safe default)", + "model": { + "arb_version": 0.1, + "model": "vec_div_zero", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "divisor", "type": "int32", "default": 0}, + {"id": "dividend", "type": "int32", "default": 0}, + {"id": "div_result", "type": "int32", "default": -1}, + {"id": "mod_result", "type": "int32", "default": -1}, + {"id": "scale_result", "type": "int32", "default": -1} + ], + "rules": [ + { + "id": "rule.divs", + "class": "inference", + "then": { + "compute": [ + {"target": "div_result", "op": "div", "left": "dividend", "right": "divisor"}, + {"target": "mod_result", "op": "mod", "left": "dividend", "right": "divisor"}, + {"target": "scale_result", "op": "scale", "left": "dividend", "right": "divisor", "scale": 0} + ] + } + } + ], + "actions": [], + "modes": [] + }, + "facts": {"dividend": 100, "divisor": 0, "div_result": -1, "mod_result": -1, "scale_result": -1}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.divs"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"dividend": 100, "divisor": 0, "div_result": 0, "mod_result": 0, "scale_result": 0} + } +} diff --git a/tests/vectors/07_staleness/vector.json b/tests/vectors/07_staleness/vector.json new file mode 100644 index 0000000..0368237 --- /dev/null +++ b/tests/vectors/07_staleness/vector.json @@ -0,0 +1,38 @@ +{ + "description": "Staleness: fact with old timestamp fires stale rule, fresh fact fires not_stale rule", + "model": { + "arb_version": 0.1, + "model": "vec_staleness", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "sensor.fresh", "type": "int32", "default": 0, "stale_after_ms": 500}, + {"id": "sensor.old", "type": "int32", "default": 0, "stale_after_ms": 500} + ], + "rules": [ + { + "id": "rule.old_is_stale", + "class": "safety_guard", + "when": {"all": [{"fact": "sensor.old", "op": "stale", "value": 500}]}, + "then": {"explanation": "Old sensor data is stale"} + }, + { + "id": "rule.fresh_is_ok", + "class": "inference", + "when": {"all": [{"fact": "sensor.fresh", "op": "not_stale", "value": 500}]}, + "then": {"explanation": "Fresh sensor data is valid"} + } + ], + "actions": [], + "modes": [] + }, + "facts": {"sensor.old": 42, "sensor.fresh": 99}, + "timestamps": {"sensor.old": 0, "sensor.fresh": 900}, + "snapshot_timestamp_ms": 1000, + "expected": { + "fired_rules": ["rule.old_is_stale", "rule.fresh_is_ok"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"sensor.old": 42, "sensor.fresh": 99} + } +} diff --git a/tests/vectors/08_mode_transition/vector.json b/tests/vectors/08_mode_transition/vector.json new file mode 100644 index 0000000..5435cb6 --- /dev/null +++ b/tests/vectors/08_mode_transition/vector.json @@ -0,0 +1,42 @@ +{ + "description": "Mode transition: condition triggers set_mode, last mode setter wins", + "model": { + "arb_version": 0.1, + "model": "vec_mode_trans", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "level", "type": "int32", "default": 0} + ], + "modes": [ + {"id": "mode.idle"}, + {"id": "mode.active"}, + {"id": "mode.critical"} + ], + "rules": [ + { + "id": "rule.go_active", + "class": "mode_guard", + "when": {"all": [{"fact": "level", "op": ">", "value": 50}]}, + "then": {"set_mode": "mode.active"} + }, + { + "id": "rule.go_critical", + "class": "safety_guard", + "when": {"all": [{"fact": "level", "op": ">", "value": 90}]}, + "then": {"set_mode": "mode.critical"} + } + ], + "actions": [], + "modes": [{"id": "mode.idle"}, {"id": "mode.active"}, {"id": "mode.critical"}] + }, + "facts": {"level": 95}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.go_critical", "rule.go_active"], + "current_mode": "mode.active", + "raised_faults": [], + "requested_actions": [], + "fact_values": {"level": 95} + } +} diff --git a/tests/vectors/09_fault_raise_clear/vector.json b/tests/vectors/09_fault_raise_clear/vector.json new file mode 100644 index 0000000..f529259 --- /dev/null +++ b/tests/vectors/09_fault_raise_clear/vector.json @@ -0,0 +1,38 @@ +{ + "description": "Fault management: raise_fault then clear_fault in same eval cycle", + "model": { + "arb_version": 0.1, + "model": "vec_faults", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "error_flag", "type": "bool", "default": false}, + {"id": "recovery_flag", "type": "bool", "default": false} + ], + "rules": [ + { + "id": "rule.raise_error", + "class": "safety_guard", + "when": {"all": [{"fact": "error_flag", "op": "==", "value": true}]}, + "then": {"raise_fault": "fault.hw_error", "explanation": "Hardware error detected"} + }, + { + "id": "rule.clear_error", + "class": "inference", + "when": {"all": [{"fact": "recovery_flag", "op": "==", "value": true}]}, + "then": {"clear_fault": "fault.hw_error", "explanation": "Recovery confirmed"} + } + ], + "actions": [], + "modes": [] + }, + "facts": {"error_flag": 1, "recovery_flag": 1}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.raise_error", "rule.clear_error"], + "current_mode": null, + "raised_faults": [], + "requested_actions": ["raise_fault:fault.hw_error", "clear_fault:fault.hw_error"], + "fact_values": {"error_flag": 1, "recovery_flag": 1} + } +} diff --git a/tests/vectors/10_delta_operators/vector.json b/tests/vectors/10_delta_operators/vector.json new file mode 100644 index 0000000..e7fac92 --- /dev/null +++ b/tests/vectors/10_delta_operators/vector.json @@ -0,0 +1,38 @@ +{ + "description": "Delta operators: delta_gt fires when |current - prev| > threshold, delta_lt when < threshold", + "model": { + "arb_version": 0.1, + "model": "vec_delta_ops", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "sensor.fast", "type": "int32", "default": 0}, + {"id": "sensor.slow", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.big_change", + "class": "inference", + "when": {"all": [{"fact": "sensor.fast", "op": "delta_gt", "value": 50}]}, + "then": {"explanation": "Large change detected on fast sensor"} + }, + { + "id": "rule.small_change", + "class": "inference", + "when": {"all": [{"fact": "sensor.slow", "op": "delta_lt", "value": 10}]}, + "then": {"explanation": "Small change on slow sensor"} + } + ], + "actions": [], + "modes": [] + }, + "facts": {"sensor.fast": 100, "sensor.slow": 5}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.big_change", "rule.small_change"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"sensor.fast": 100, "sensor.slow": 5} + } +} diff --git a/tests/vectors/11_condition_groups/vector.json b/tests/vectors/11_condition_groups/vector.json new file mode 100644 index 0000000..68e39b3 --- /dev/null +++ b/tests/vectors/11_condition_groups/vector.json @@ -0,0 +1,44 @@ +{ + "description": "Condition groups: any (one of many) and not (inversion) groups", + "model": { + "arb_version": 0.1, + "model": "vec_cond_groups", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "flag_a", "type": "bool", "default": false}, + {"id": "flag_b", "type": "bool", "default": false}, + {"id": "flag_c", "type": "bool", "default": false} + ], + "rules": [ + { + "id": "rule.any_flag", + "class": "inference", + "when": {"any": [ + {"fact": "flag_a", "op": "==", "value": true}, + {"fact": "flag_b", "op": "==", "value": true} + ]}, + "then": {"explanation": "At least one flag is set"} + }, + { + "id": "rule.not_c", + "class": "inference", + "when": {"not": [ + {"fact": "flag_c", "op": "==", "value": true} + ]}, + "then": {"explanation": "flag_c is NOT set"} + } + ], + "actions": [], + "modes": [] + }, + "facts": {"flag_a": 0, "flag_b": 1, "flag_c": 0}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.any_flag", "rule.not_c"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"flag_a": 0, "flag_b": 1, "flag_c": 0} + } +} diff --git a/tests/vectors/12_int32_saturation/vector.json b/tests/vectors/12_int32_saturation/vector.json new file mode 100644 index 0000000..9968a2d --- /dev/null +++ b/tests/vectors/12_int32_saturation/vector.json @@ -0,0 +1,47 @@ +{ + "description": "INT32 saturation: add overflow clamps to INT32_MAX, sub underflow clamps to INT32_MIN", + "model": { + "arb_version": 0.1, + "model": "vec_saturation", + "target": {"rtos": "zephyr"}, + "facts": [ + {"id": "big", "type": "int32", "default": 0}, + {"id": "one", "type": "int32", "default": 0}, + {"id": "overflow_result", "type": "int32", "default": 0}, + {"id": "small", "type": "int32", "default": 0}, + {"id": "underflow_result", "type": "int32", "default": 0} + ], + "rules": [ + { + "id": "rule.overflow", + "class": "inference", + "then": { + "compute": [ + {"target": "overflow_result", "op": "add", "left": "big", "right": "one"} + ] + } + }, + { + "id": "rule.underflow", + "class": "inference", + "then": { + "compute": [ + {"target": "underflow_result", "op": "sub", "left": "small", "right": "one"} + ] + } + } + ], + "actions": [], + "modes": [] + }, + "facts": {"big": 2147483647, "small": -2147483648, "one": 1, "overflow_result": 0, "underflow_result": 0}, + "timestamps": {}, + "snapshot_timestamp_ms": 0, + "expected": { + "fired_rules": ["rule.overflow", "rule.underflow"], + "current_mode": null, + "raised_faults": [], + "requested_actions": [], + "fact_values": {"big": 2147483647, "small": -2147483648, "one": 1, "overflow_result": 2147483647, "underflow_result": -2147483648} + } +} diff --git a/tools/parse_benchmark.py b/tools/parse_benchmark.py new file mode 100644 index 0000000..caafb2f --- /dev/null +++ b/tools/parse_benchmark.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +"""Parse benchmark timing from Twister log output. + +Extracts ns/tick for both hand-coded and engine implementations from +PID and Kalman benchmark logs. Prints a summary table suitable for +CI log inspection and future regression gating. + +Usage:: + + python tools/parse_benchmark.py twister-out/benchmarks/ + +Reads all handler.log files under the given directory tree. +""" + +from __future__ import annotations + +import re +import sys +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class BenchmarkResult: + """Timing result from one benchmark variant.""" + + benchmark: str + variant: str + ns_per_tick: int + + +# Patterns matching the LOG_INF lines from benchmark main.c files. +# PID: " Total: 123456 ns (42 ns/tick)" or " Total: 123 ms (42 ns/tick)" +# Kalman: " Total: 123 ms (42 ns/tick)" +_NS_PER_TICK_RE = re.compile(r"\((\d+)\s*ns/tick\)") + +# Section headers that identify which variant we're parsing. +_HAND_CODED_RE = re.compile(r"---\s*Hand-coded\s+(\w+)\s*---") +_ENGINE_RE = re.compile(r"---\s*arbiter\s+Engine\s+(\w+)\s*---") + + +def parse_log(log_text: str, benchmark_name: str) -> list[BenchmarkResult]: + """Parse a single benchmark log and extract timing results.""" + results: list[BenchmarkResult] = [] + current_variant: str | None = None + current_algo: str | None = None + + for line in log_text.splitlines(): + # Check for section headers + m = _HAND_CODED_RE.search(line) + if m: + current_algo = m.group(1) + current_variant = "hand-coded" + continue + + m = _ENGINE_RE.search(line) + if m: + current_algo = m.group(1) + current_variant = "engine" + continue + + # Check for ns/tick value + m = _NS_PER_TICK_RE.search(line) + if m and current_variant and current_algo: + ns = int(m.group(1)) + results.append(BenchmarkResult( + benchmark=f"{benchmark_name}/{current_algo}", + variant=current_variant, + ns_per_tick=ns, + )) + current_variant = None + current_algo = None + + return results + + +def find_and_parse(base_dir: Path) -> list[BenchmarkResult]: + """Find all handler.log files under base_dir and parse them.""" + results: list[BenchmarkResult] = [] + + for log_path in sorted(base_dir.rglob("handler.log")): + # Infer benchmark name from path: .../pid_benchmark/... or .../kalman_benchmark/... + parts = log_path.parts + bench_name = "unknown" + for part in parts: + if "pid" in part.lower(): + bench_name = "pid" + break + if "kalman" in part.lower(): + bench_name = "kalman" + break + + log_text = log_path.read_text(encoding="utf-8", errors="replace") + results.extend(parse_log(log_text, bench_name)) + + return results + + +def print_summary(results: list[BenchmarkResult]) -> None: + """Print a formatted summary table of benchmark results.""" + if not results: + print("No benchmark timing data found.") + return + + # Header + print() + print("=" * 60) + print(" Benchmark Performance Summary") + print("=" * 60) + print(f" {'Benchmark':<25} {'Variant':<15} {'ns/tick':>10}") + print("-" * 60) + + for r in results: + print(f" {r.benchmark:<25} {r.variant:<15} {r.ns_per_tick:>10}") + + print("-" * 60) + + # Compute overhead for each benchmark that has both variants + by_bench: dict[str, dict[str, int]] = {} + for r in results: + by_bench.setdefault(r.benchmark, {})[r.variant] = r.ns_per_tick + + for bench, variants in sorted(by_bench.items()): + hand = variants.get("hand-coded") + engine = variants.get("engine") + if hand and engine and hand > 0: + overhead_pct = ((engine - hand) * 100) // hand + print(f" {bench:<25} overhead: {overhead_pct}%") + + print("=" * 60) + print() + + +def main() -> int: + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + return 1 + + base_dir = Path(sys.argv[1]) + if not base_dir.exists(): + print(f"Error: directory not found: {base_dir}", file=sys.stderr) + return 1 + + results = find_and_parse(base_dir) + print_summary(results) + return 0 + + +if __name__ == "__main__": + sys.exit(main())