Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
85 commits
Select commit Hold shift + click to select a range
db388bd
few more udpates for new categories
haoranpb Apr 8, 2026
57c004e
Refactor evaluation and dataset operations for improved workspace setup
haoranpb Apr 9, 2026
8e2f216
enable skipping container setup in action
haoranpb Apr 9, 2026
69a8db8
fix missing implementation for MockEvaluationPipeline
haoranpb Apr 9, 2026
7549d92
Refactor evaluation result classes to be more generic
haoranpb Apr 11, 2026
f32dd00
Merge branch 'main' into fix/more-ready-for-categories
haoranpb Apr 12, 2026
a4089b9
Improve readabilty of GitHub Action summary
haoranpb Apr 12, 2026
99af6b2
fix failing tests
haoranpb Apr 12, 2026
e1b0b93
Code Review POC
haoranpb Apr 12, 2026
1a68d78
Merge branch 'main' into category/code-review
haoranpb Apr 13, 2026
3ec10a0
fix merge conflict resolution mistake
haoranpb Apr 13, 2026
4e52832
Merge branch 'main' into category/code-review
haoranpb Apr 13, 2026
a9f59d9
Make container parameters optional in evaluate and run commands
haoranpb Apr 13, 2026
065e1aa
Merge branch 'category/code-review' of https://github.com/microsoft/B…
haoranpb Apr 13, 2026
4ad4bd9
Enhance code review functionality by adding expected review comments …
haoranpb Apr 13, 2026
92951c4
better hanlding container for not required categories
haoranpb Apr 13, 2026
7902610
Merge branch 'main' into category/code-review
haoranpb Apr 20, 2026
dad9289
Merge branch 'main' of https://github.com/microsoft/BC-Bench into cat…
haoranpb May 5, 2026
f1c4894
Merge branch 'main' of https://github.com/microsoft/BC-Bench into cat…
haoranpb May 11, 2026
aa48a29
prefer copilot.exe executable
haoranpb May 12, 2026
a244503
Normalize code-review dataset and preserve eval outputs
WaelAbuSeada May 16, 2026
9f6c353
Fix code-review branch setup and workflow wiring
WaelAbuSeada May 20, 2026
1a58e44
Require review.json and add log-based recovery fallback
WaelAbuSeada May 20, 2026
d0e8076
Harden code-review prompt for Windows copilot.cmd parsing
WaelAbuSeada May 20, 2026
0b764ef
Experiment: use al-code-review skill template
WaelAbuSeada May 20, 2026
83a4b28
Add skip-container-setup option to evaluation workflows
WaelAbuSeada May 20, 2026
394d005
Fix codereview lint issues in pipeline helpers
WaelAbuSeada May 20, 2026
ae0f1d2
Revert "Fix codereview lint issues in pipeline helpers"
WaelAbuSeada May 20, 2026
143075b
Expand code-review detailed table metrics
WaelAbuSeada May 21, 2026
7411246
Expand code-review detailed table metrics
WaelAbuSeada May 21, 2026
0d6e7ad
Update config and container setup action
WaelAbuSeada May 21, 2026
c7131a4
Update config and container setup action
WaelAbuSeada May 21, 2026
213ce7f
Remove unused apply_patch import from code-review evaluate
WaelAbuSeada May 21, 2026
2e1ced0
Refactor code-review metrics into pipeline and split comment display …
WaelAbuSeada May 21, 2026
b9babe6
Merge category/code-review into experiment/code-review-al-skill
WaelAbuSeada May 21, 2026
558d8ad
Normalize code-review test-run instance IDs to valid pattern
WaelAbuSeada May 21, 2026
3ff6876
Normalize code-review test-run instance IDs to valid pattern
WaelAbuSeada May 21, 2026
be4ccd9
Use plain code-review IDs (security_001 style) and relax ID pattern
WaelAbuSeada May 21, 2026
6e68751
Use plain code-review IDs (security_001 style) and relax ID pattern
WaelAbuSeada May 21, 2026
9b4e5b1
Revert instance_id regex to original strict pattern
WaelAbuSeada May 21, 2026
d691d26
Revert instance_id regex to original strict pattern
WaelAbuSeada May 21, 2026
32e499b
Rename code-review test IDs to strict non-vsoadmin format
WaelAbuSeada May 21, 2026
4c7e03c
Rename code-review test IDs to strict non-vsoadmin format
WaelAbuSeada May 21, 2026
54c618f
fix: add dataset-path input to setup-bc-container action
WaelAbuSeada May 21, 2026
85503e8
fix: add dataset-path input to setup-bc-container action
WaelAbuSeada May 21, 2026
05673bc
feat: add precision and recall to detailed results table
WaelAbuSeada May 21, 2026
06ee0b9
feat: add precision and recall to detailed results table
WaelAbuSeada May 21, 2026
db9c805
fix: apply pre-commit lint and typing fixes
WaelAbuSeada May 21, 2026
f5ffe80
fix: apply pre-commit lint and typing fixes
WaelAbuSeada May 21, 2026
7b6f871
chore: remove UI instruction file
WaelAbuSeada May 21, 2026
8835a18
chore: remove UI instruction file
WaelAbuSeada May 21, 2026
b05a635
fix: review code changes from applied entry patch
WaelAbuSeada May 21, 2026
aabee80
fix: review code changes from applied entry patch
WaelAbuSeada May 21, 2026
0e11f9d
fix: support simplified code-review patch materialization
WaelAbuSeada May 21, 2026
1f7a5bd
fix: support simplified code-review patch materialization
WaelAbuSeada May 21, 2026
7d4ee94
fix: tighten code-review diff and parsing behavior
WaelAbuSeada May 21, 2026
6777610
fix: tighten code-review diff and parsing behavior
WaelAbuSeada May 21, 2026
f64ecdf
Merge branch 'main' of https://github.com/microsoft/BC-Bench into cat…
haoranpb May 29, 2026
ef84b18
cleanup after merge from main
haoranpb May 29, 2026
a711b3e
Refactor evaluation workflows to use dynamic runner and container req…
haoranpb May 29, 2026
0c58e8c
make run step OS indenpendent
haoranpb May 29, 2026
b076b98
fix score mismatch
haoranpb May 29, 2026
df11718
extract github action related commands
haoranpb May 29, 2026
541f6e4
test should not test runner name
haoranpb May 29, 2026
c9193e5
make code review patches proper git diff
haoranpb May 29, 2026
4408974
Merge branch 'main' of https://github.com/microsoft/BC-Bench into cat…
haoranpb May 29, 2026
859ec99
refactor to seperate the logics
haoranpb Jun 1, 2026
0feba63
make more steps OS independent
haoranpb Jun 1, 2026
db12ed4
skip leaderboard update and stricter field for codereview resutl
haoranpb Jun 1, 2026
820b767
simplify import/export
haoranpb Jun 1, 2026
7848f4b
move CodeReviewResultSummary into codereview result file
haoranpb Jun 1, 2026
64f37c0
strongly type CodeReviewResultSummary and reuse metrics util
haoranpb Jun 1, 2026
d216e42
saperate leaderboard from summary and make it generic
haoranpb Jun 1, 2026
49a5cef
fix failing tests
haoranpb Jun 1, 2026
35c5045
Potential fix for pull request finding 'Module imports itself'
haoranpb Jun 1, 2026
eaa1a2c
Merge branch 'main' of https://github.com/microsoft/BC-Bench into cat…
haoranpb Jun 1, 2026
e00e939
add CodeReview to mock tests
haoranpb Jun 1, 2026
13b568c
Merge branch 'main' of https://github.com/microsoft/BC-Bench into cat…
haoranpb Jun 3, 2026
0fef385
Merge category/code-review into experiment/code-review-al-skill
WaelAbuSeada Jun 4, 2026
d34742b
Remove skills and instructions from category branch
WaelAbuSeada Jun 4, 2026
f450ae2
Merge remote-tracking branch 'origin/category/code-review' into exper…
WaelAbuSeada Jun 4, 2026
6c2437b
Keep instructions/skills on experiment and enable skill-based code re…
WaelAbuSeada Jun 4, 2026
d063ac2
Add skill/instruction read diagnostics from hook logs
WaelAbuSeada Jun 4, 2026
a9b3e3f
Set instructions disabled in shared config
WaelAbuSeada Jun 4, 2026
d519e67
Add session-log skill diagnostics and enable custom instructions
WaelAbuSeada Jun 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/claude-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ on:
options:
- "bug-fix"
- "test-generation"
- "code-review"
test-run:
description: "Indicate this is a test run (with few entries)"
required: false
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/copilot-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ on:
options:
- "bug-fix"
- "test-generation"
- "code-review"
test-run:
description: "Indicate this is a test run (with few entries)"
required: false
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/summarize-results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ jobs:
--use-capi ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }}

- name: Update leaderboard in a new branch
if: ${{ !inputs.mock && !inputs.skip-leaderboard }}
# WIP for code-review category
if: ${{ !inputs.mock && !inputs.skip-leaderboard && inputs.category != 'code-review' }}
run: |
git fetch origin main

Expand Down
82 changes: 82 additions & 0 deletions dataset/codereview.jsonl

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions docs/_data/code-review.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"runs": [],
"aggregate": []
}
20 changes: 20 additions & 0 deletions evaluator/scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,23 @@ def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
class PostPatchPassedRate:
    """Scorer exposing the ``post_patch_passed`` entry of the metadata."""

    def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
        """Return ``metadata["post_patch_passed"]``, or ``False`` when the key is absent.

        Any additional keyword arguments are accepted and ignored.
        """
        outcome = metadata.get("post_patch_passed", False)
        return outcome


class PrecisionScore:
    """Scorer exposing the ``precision`` entry of the metadata as a float."""

    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
        """Return ``metadata["precision"]`` converted to ``float``.

        A missing key and an explicit ``None`` value both yield ``0.0``.
        Any additional keyword arguments are accepted and ignored.
        """
        value = metadata.get("precision")
        # The ``dict.get`` default only covers a missing key; a key stored
        # with ``None`` would otherwise make ``float()`` raise ``TypeError``.
        if value is None:
            return 0.0
        return float(value)


class RecallScore:
    """Scorer exposing the ``recall`` entry of the metadata as a float."""

    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
        """Return ``metadata["recall"]`` converted to ``float``.

        A missing key and an explicit ``None`` value both yield ``0.0``.
        Any additional keyword arguments are accepted and ignored.
        """
        value = metadata.get("recall")
        # The ``dict.get`` default only covers a missing key; a key stored
        # with ``None`` would otherwise make ``float()`` raise ``TypeError``.
        if value is None:
            return 0.0
        return float(value)


class F1Score:
    """Scorer exposing the ``f1`` entry of the metadata as a float."""

    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
        """Return ``metadata["f1"]`` converted to ``float``.

        A missing key and an explicit ``None`` value both yield ``0.0``.
        Any additional keyword arguments are accepted and ignored.
        """
        value = metadata.get("f1")
        # The ``dict.get`` default only covers a missing key; a key stored
        # with ``None`` would otherwise make ``float()`` raise ``TypeError``.
        if value is None:
            return 0.0
        return float(value)


class ValidReviewOutput:
    """Scorer exposing the truthiness of the ``valid_review_output`` metadata entry."""

    def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
        """Return ``True`` when ``metadata["valid_review_output"]`` is truthy.

        A missing key counts as ``False``. Any additional keyword arguments
        are accepted and ignored.
        """
        flag = metadata.get("valid_review_output", False)
        return True if flag else False
3 changes: 2 additions & 1 deletion scripts/BCBenchUtils.psm1
Original file line number Diff line number Diff line change
Expand Up @@ -490,13 +490,14 @@ function Get-BCBenchDatasetPath {
param(
[Parameter(Mandatory = $true)]
# Category validation lives only here: every caller resolves the dataset path through this function, so there's no need to duplicate ValidateSet on each caller.
[ValidateSet("bug-fix", "test-generation")]
[ValidateSet("bug-fix", "test-generation", "code-review")]
[string] $Category
)

switch ($Category) {
"bug-fix" { $DatasetName = "bcbench.jsonl" }
"test-generation" { $DatasetName = "bcbench.jsonl" }
"code-review" { $DatasetName = "codereview.jsonl" }
}

[string] $projectRoot = Split-Path $PSScriptRoot -Parent
Expand Down
21 changes: 18 additions & 3 deletions src/bcbench/agent/claude/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@
import yaml

from bcbench.agent.claude.metrics import parse_metrics
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared import (
build_al_lsp_plugin,
build_mcp_config,
build_prompt,
parse_skill_read_diagnostics_from_hooks,
parse_tool_usage_from_hooks,
)
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
Expand Down Expand Up @@ -111,8 +117,17 @@ def run_claude_code(
logger.warning(f"Skipping non-JSON line: {striped_line}")

tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
if metrics and tool_usage:
metrics = metrics.model_copy(update={"tool_usage": tool_usage})
skill_read_diagnostics: dict[str, bool] | None = None
if skills_enabled:
skill_read_diagnostics = parse_skill_read_diagnostics_from_hooks(tool_log_path, repo_path, AgentType.CLAUDE)

if metrics and (tool_usage or skill_read_diagnostics):
metrics = metrics.model_copy(
update={
"tool_usage": tool_usage,
"skill_read_diagnostics": skill_read_diagnostics,
}
)

return metrics, config
except subprocess.TimeoutExpired:
Expand Down
52 changes: 48 additions & 4 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""GitHub Copilot CLI Agent implementation."""

import json
import shutil
import subprocess
import sys
Expand All @@ -8,7 +9,14 @@
import yaml

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared import (
build_al_lsp_plugin,
build_mcp_config,
build_prompt,
parse_skill_read_diagnostics_from_hooks,
parse_skill_read_diagnostics_from_session_log,
parse_tool_usage_from_hooks,
)
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
Expand Down Expand Up @@ -88,17 +96,21 @@ def run_copilot_agent(
result = subprocess.run(
cmd_args,
cwd=str(repo_path),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE, # only capture stderr where metrics are printed
timeout=_config.timeout.agent_execution,
check=True,
)

stdout = result.stdout.decode("utf-8", errors="replace") if result.stdout else ""
stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
cli_output_log = output_dir / f"{entry.instance_id}.copilot-cli.log"

if result.stderr:
sys.stdout.buffer.write(result.stderr)
sys.stdout.buffer.flush()
logger.info(f"Copilot CLI run complete for: {entry.instance_id}")

stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
stderr_lines = stderr.splitlines()

# Find the most recent session log for turn count parsing
Expand All @@ -108,8 +120,40 @@ def run_copilot_agent(
metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)

tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
if metrics and tool_usage:
metrics = metrics.model_copy(update={"tool_usage": tool_usage})
skill_read_diagnostics: dict[str, bool] | None = None
skill_read_diagnostics_source = "none"
if skills_enabled:
skill_read_diagnostics = parse_skill_read_diagnostics_from_hooks(tool_log_path, repo_path, AgentType.COPILOT)
if skill_read_diagnostics is not None:
skill_read_diagnostics_source = "hooks"
if skill_read_diagnostics is None and session_log_path is not None:
skill_read_diagnostics = parse_skill_read_diagnostics_from_session_log(
session_log_path, repo_path, AgentType.COPILOT
)
if skill_read_diagnostics is not None:
skill_read_diagnostics_source = "session_log"
if skill_read_diagnostics is None:
logger.warning("skills_enabled=true but no skill read diagnostics were captured")

diagnostics_payload = {
"skills_enabled": skills_enabled,
"skill_read_diagnostics_source": skill_read_diagnostics_source,
"skill_read_diagnostics": skill_read_diagnostics,
"tool_usage": tool_usage,
}
cli_output_log.write_text(
f"[stdout]\n{stdout}\n\n[stderr]\n{stderr}\n\n[diagnostics]\n{json.dumps(diagnostics_payload, indent=2)}\n",
encoding="utf-8",
)
logger.info(f"Saved Copilot CLI output + diagnostics to: {cli_output_log}")

if metrics and (tool_usage or skill_read_diagnostics):
metrics = metrics.model_copy(
update={
"tool_usage": tool_usage,
"skill_read_diagnostics": skill_read_diagnostics,
}
)

return metrics, config
except subprocess.TimeoutExpired:
Expand Down
11 changes: 10 additions & 1 deletion src/bcbench/agent/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
"""Shared code for CLI-based agents (Claude, Copilot)."""

from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
from bcbench.agent.shared.hooks_parser import parse_skill_read_diagnostics_from_session_log
from bcbench.agent.shared.hooks_parser import parse_skill_read_diagnostics_from_hooks
from bcbench.agent.shared.lsp import build_al_lsp_plugin
from bcbench.agent.shared.mcp import build_mcp_config
from bcbench.agent.shared.prompt import build_prompt

__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
__all__ = [
"build_al_lsp_plugin",
"build_mcp_config",
"build_prompt",
"parse_skill_read_diagnostics_from_session_log",
"parse_skill_read_diagnostics_from_hooks",
"parse_tool_usage_from_hooks",
]
19 changes: 17 additions & 2 deletions src/bcbench/agent/shared/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,21 @@ prompt:
{{task}}
{% endif %}

code-review-template: |
Use the `al-code-review` skill to review the current working-tree AL file changes.

Run a full-domain review (do not pass a domain so all domains run).

Review ONLY the current working-tree AL file changes for this evaluation entry.
Do NOT compare commits (for example, do NOT use HEAD~1..HEAD or origin/main comparisons).
Use working tree diff only (git diff HEAD), and focus on changed *.al files.

Save findings to a file named "review.json" in the repository root.
The file must contain valid JSON with a top-level object named findings.
Each finding must include: filePath, lineNumber, severity, issue, recommendation.
Allowed severity values are: critical, high, medium, low.
If there are no findings, write an empty findings list.

# controls:
# 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`
# - Copilot: copies to repo/.github/ and renames AGENTS.md to copilot-instructions.md
Expand All @@ -59,14 +74,14 @@ prompt:
# NOTE: the canonical source file is AGENTS.md; it is automatically renamed
# to the agent-specific filename (AgentType.instruction_filename) during setup
instructions:
enabled: false
enabled: true

# controls:
# 1. whether to copy skills from `src/bcbench/agent/shared/instructions/<sanitized-repo>/skills/`
# - Copilot: copies to repo/.github/skills/
# - Claude: copies to repo/.claude/skills/
skills:
enabled: false
enabled: true

# controls:
# 1. whether to copy custom agents from `src/bcbench/agent/shared/instructions/<sanitized-repo>/agents/`
Expand Down
27 changes: 25 additions & 2 deletions src/bcbench/agent/shared/hooks/log-tool-usage.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ try {
$inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json
$toolName = if ($inputJson.tool_name) { $inputJson.tool_name } else { $inputJson.toolName }
$timestamp = $inputJson.timestamp
$toolPath = $null

# LSP calls share the tool name "lsp"; the specific operation (findReferences, goToDefinition, hover, ...) lives in the tool arguments.
# Capture it as an "lsp:<operation>" sub-label so usage stats stay meaningful.
$toolArgs = if ($null -ne $inputJson.toolArgs) { $inputJson.toolArgs } else { $inputJson.tool_input }

if ($toolName -eq "lsp") {
$toolArgs = if ($null -ne $inputJson.toolArgs) { $inputJson.toolArgs } else { $inputJson.tool_input }
if ($toolArgs -is [string]) {
try { $toolArgs = $toolArgs | ConvertFrom-Json } catch { $toolArgs = $null }
}
Expand All @@ -17,8 +19,29 @@ try {
}
}

# Capture target file path for read-like tools so diagnostics can verify
# whether skills/instructions were actually opened.
if ($toolName -in @("Read", "read", "read_file", "functions.read_file", "view")) {
if ($toolArgs -is [string]) {
try { $toolArgs = $toolArgs | ConvertFrom-Json } catch { $toolArgs = $null }
}

if ($toolArgs) {
if ($toolArgs.filePath) {
$toolPath = [string]$toolArgs.filePath
}
elseif ($toolArgs.path) {
$toolPath = [string]$toolArgs.path
}
}
}

if ($toolName -and $env:BCBENCH_TOOL_LOG) {
$entry = @{ tool_name = $toolName; timestamp = $timestamp } | ConvertTo-Json -Compress
$entryPayload = @{ tool_name = $toolName; timestamp = $timestamp }
if ($toolPath) {
$entryPayload["tool_path"] = $toolPath
}
$entry = $entryPayload | ConvertTo-Json -Compress
Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $entry -Encoding UTF8
}

Expand Down
81 changes: 81 additions & 0 deletions src/bcbench/agent/shared/hooks_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from collections import Counter
from pathlib import Path

from bcbench.types import AgentType


def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None:
if not hooks_output_path.exists():
Expand All @@ -17,3 +19,82 @@ def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | Non
continue

return dict(counts) or None


def parse_skill_read_diagnostics_from_hooks(
    hooks_output_path: Path,
    repo_path: Path,
    agent_type: AgentType,
) -> dict[str, bool] | None:
    """Report which skill/instruction files appear as read targets in the hook log.

    The hook log is read as JSON lines; every line that is a JSON object with a
    non-empty string ``tool_path`` contributes one read path. Those paths are
    compared (resolved, case-insensitively) against the expected
    ``skills/al-code-review/SKILL.md`` file and the per-domain
    ``instructions/<domain>.md`` files under ``agent_type.get_target_dir(repo_path)``.

    Args:
        hooks_output_path: JSON-lines file written by the tool-usage hook.
        repo_path: Repository root; relative ``tool_path`` values are resolved
            against it.
        agent_type: Provides ``get_target_dir(repo_path)``, the directory that
            holds the ``skills`` and ``instructions`` folders.

    Returns:
        ``None`` when the hook log does not exist. Otherwise a dict with
        ``skill_file_read``, one ``instruction_<domain>_read`` flag per domain,
        ``any_domain_instruction_read`` and ``all_domain_instructions_read``.
    """
    if not hooks_output_path.exists():
        return None

    target_dir = agent_type.get_target_dir(repo_path)
    expected_skill_path = (target_dir / "skills" / "al-code-review" / "SKILL.md").resolve()
    domain_files = ["security", "performance", "style", "accessibility", "upgrade", "privacy"]
    expected_instruction_paths = {
        domain: (target_dir / "instructions" / f"{domain}.md").resolve() for domain in domain_files
    }

    normalized_reads: set[str] = set()
    # "utf-8-sig" behaves like "utf-8" but also strips a leading BOM. Without
    # that, a BOM-prefixed first line fails json.loads and is silently skipped.
    for line in hooks_output_path.read_text(encoding="utf-8-sig").splitlines():
        try:
            entry = json.loads(line)
        except (json.JSONDecodeError, TypeError):
            continue

        if not isinstance(entry, dict):
            continue

        path_value = entry.get("tool_path")
        if not isinstance(path_value, str) or not path_value:
            continue

        try:
            # Joining onto repo_path anchors relative paths at the repository
            # root instead of this process's working directory; an absolute
            # path_value replaces repo_path entirely, so it is left unchanged.
            resolved_read = (repo_path / path_value).resolve()
        except (OSError, ValueError):
            # Best-effort diagnostics: one malformed path must not abort parsing.
            continue

        normalized_reads.add(str(resolved_read).lower())

    diagnostics: dict[str, bool] = {
        "skill_file_read": str(expected_skill_path).lower() in normalized_reads,
    }

    instruction_flags = {
        f"instruction_{domain}_read": str(path).lower() in normalized_reads
        for domain, path in expected_instruction_paths.items()
    }
    diagnostics.update(instruction_flags)
    diagnostics["any_domain_instruction_read"] = any(instruction_flags.values())
    diagnostics["all_domain_instructions_read"] = all(instruction_flags.values())

    return diagnostics


def parse_skill_read_diagnostics_from_session_log(
    session_log_path: Path,
    repo_path: Path,
    agent_type: AgentType,
) -> dict[str, bool] | None:
    """Report which skill/instruction file paths are mentioned in a session log.

    This is a plain substring search: the expected
    ``skills/al-code-review/SKILL.md`` path and the per-domain
    ``instructions/<domain>.md`` paths under
    ``agent_type.get_target_dir(repo_path)`` are looked up in the log text
    after both sides are normalised (forward slashes, lower case, repeated
    separators collapsed). A mention in the log is treated as a read.

    Args:
        session_log_path: Session log file to scan.
        repo_path: Repository root passed to ``agent_type.get_target_dir``.
        agent_type: Provides ``get_target_dir(repo_path)``, the directory that
            holds the ``skills`` and ``instructions`` folders.

    Returns:
        ``None`` when the session log does not exist. Otherwise a dict with
        ``skill_file_read``, one ``instruction_<domain>_read`` flag per domain,
        ``any_domain_instruction_read`` and ``all_domain_instructions_read``.
    """
    if not session_log_path.exists():
        return None

    def _normalize(text: str) -> str:
        # Backslashes become forward slashes, then runs of slashes are
        # collapsed. A JSON-escaped Windows path ("C:\\\\repo\\\\x" in the raw
        # text) would otherwise turn into "C://repo//x" and never match the
        # expected "C:/repo/x".
        unified = text.replace("\\", "/").lower()
        while "//" in unified:
            unified = unified.replace("//", "/")
        return unified

    target_dir = agent_type.get_target_dir(repo_path)
    expected_skill_path = (target_dir / "skills" / "al-code-review" / "SKILL.md").resolve()
    domain_files = ["security", "performance", "style", "accessibility", "upgrade", "privacy"]
    expected_instruction_paths = {
        domain: (target_dir / "instructions" / f"{domain}.md").resolve() for domain in domain_files
    }

    log_text = session_log_path.read_text(encoding="utf-8", errors="replace")
    normalized_log = _normalize(log_text)

    diagnostics: dict[str, bool] = {
        "skill_file_read": _normalize(str(expected_skill_path)) in normalized_log,
    }

    instruction_flags = {
        f"instruction_{domain}_read": _normalize(str(path)) in normalized_log
        for domain, path in expected_instruction_paths.items()
    }
    diagnostics.update(instruction_flags)
    diagnostics["any_domain_instruction_read"] = any(instruction_flags.values())
    diagnostics["all_domain_instructions_read"] = all(instruction_flags.values())

    return diagnostics
Loading
Loading