microsoft · WaelAbuSeada · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -23,6 +23,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false

diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -30,6 +30,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false

diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
@@ -108,7 +108,8 @@ jobs:
             --use-capi ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }}
 
       - name: Update leaderboard in a new branch
-        if: ${{ !inputs.mock && !inputs.skip-leaderboard }}
+        # Skip the leaderboard update for code-review runs; leaderboard support for that category is still WIP.
+        if: ${{ !inputs.mock && !inputs.skip-leaderboard && inputs.category != 'code-review' }}
         run: |
           git fetch origin main
 

diff --git a/dataset/codereview.jsonl b/dataset/codereview.jsonl
diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json
@@ -0,0 +1,4 @@
+{
+    "runs": [],
+    "aggregate": []
+}
diff --git a/evaluator/scores.py b/evaluator/scores.py
@@ -19,3 +19,34 @@ def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
 class PostPatchPassedRate:
     def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
         return metadata.get("post_patch_passed", False)
+
+
+class PrecisionScore:
+    """Scorer that returns the ``precision`` metadata value as a float (0.0 when missing or ``None``)."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        # ``or 0.0`` also covers a key that is present but set to None, which float() would reject.
+        return float(metadata.get("precision") or 0.0)
+
+
+class RecallScore:
+    """Scorer that returns the ``recall`` metadata value as a float (0.0 when missing or ``None``)."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        # ``or 0.0`` also covers a key that is present but set to None, which float() would reject.
+        return float(metadata.get("recall") or 0.0)
+
+
+class F1Score:
+    """Scorer that returns the ``f1`` metadata value as a float (0.0 when missing or ``None``)."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        # ``or 0.0`` also covers a key that is present but set to None, which float() would reject.
+        return float(metadata.get("f1") or 0.0)
+
+
+class ValidReviewOutput:
+    """Scorer that returns the truthiness of the ``valid_review_output`` metadata value (False when missing)."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
+        return bool(metadata.get("valid_review_output", False))
diff --git a/scripts/BCBenchUtils.psm1 b/scripts/BCBenchUtils.psm1
@@ -490,13 +490,14 @@ function Get-BCBenchDatasetPath {
     param(
         [Parameter(Mandatory = $true)]
         # Category validation lives only here: every caller resolves the dataset path through this function, so there's no need to duplicate ValidateSet on each caller.
-        [ValidateSet("bug-fix", "test-generation")]
+        [ValidateSet("bug-fix", "test-generation", "code-review")]
         [string] $Category
     )
 
     switch ($Category) {
         "bug-fix" { $DatasetName = "bcbench.jsonl" }
         "test-generation" { $DatasetName = "bcbench.jsonl" }
+        "code-review" { $DatasetName = "codereview.jsonl" }
     }
 
     [string] $projectRoot = Split-Path $PSScriptRoot -Parent

diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py
@@ -6,7 +6,13 @@
 import yaml
 
 from bcbench.agent.claude.metrics import parse_metrics
-from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
+from bcbench.agent.shared import (
+    build_al_lsp_plugin,
+    build_mcp_config,
+    build_prompt,
+    parse_skill_read_diagnostics_from_hooks,
+    parse_tool_usage_from_hooks,
+)
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
@@ -111,8 +117,17 @@ def run_claude_code(
                     logger.warning(f"Skipping non-JSON line: {striped_line}")
 
         tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
-        if metrics and tool_usage:
-            metrics = metrics.model_copy(update={"tool_usage": tool_usage})
+        skill_read_diagnostics: dict[str, bool] | None = None
+        if skills_enabled:
+            skill_read_diagnostics = parse_skill_read_diagnostics_from_hooks(tool_log_path, repo_path, AgentType.CLAUDE)
+
+        if metrics and (tool_usage or skill_read_diagnostics):
+            metrics = metrics.model_copy(
+                update={
+                    "tool_usage": tool_usage,
+                    "skill_read_diagnostics": skill_read_diagnostics,
+                }
+            )
 
         return metrics, config
     except subprocess.TimeoutExpired:

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
@@ -1,5 +1,6 @@
 """GitHub Copilot CLI Agent implementation."""
 
+import json
 import shutil
 import subprocess
 import sys
@@ -8,7 +9,14 @@
 import yaml
 
 from bcbench.agent.copilot.metrics import parse_metrics
-from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
+from bcbench.agent.shared import (
+    build_al_lsp_plugin,
+    build_mcp_config,
+    build_prompt,
+    parse_skill_read_diagnostics_from_hooks,
+    parse_skill_read_diagnostics_from_session_log,
+    parse_tool_usage_from_hooks,
+)
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
@@ -88,17 +96,21 @@ def run_copilot_agent(
         result = subprocess.run(
             cmd_args,
             cwd=str(repo_path),
+            stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,  # only capture stderr where metrics are printed
             timeout=_config.timeout.agent_execution,
             check=True,
         )
 
+        stdout = result.stdout.decode("utf-8", errors="replace") if result.stdout else ""
+        stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
+        cli_output_log = output_dir / f"{entry.instance_id}.copilot-cli.log"
+
         if result.stderr:
             sys.stdout.buffer.write(result.stderr)
             sys.stdout.buffer.flush()
         logger.info(f"Copilot CLI run complete for: {entry.instance_id}")
 
-        stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
         stderr_lines = stderr.splitlines()
 
         # Find the most recent session log for turn count parsing
@@ -108,8 +120,40 @@ def run_copilot_agent(
         metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
 
         tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
-        if metrics and tool_usage:
-            metrics = metrics.model_copy(update={"tool_usage": tool_usage})
+        skill_read_diagnostics: dict[str, bool] | None = None
+        skill_read_diagnostics_source = "none"
+        if skills_enabled:
+            skill_read_diagnostics = parse_skill_read_diagnostics_from_hooks(tool_log_path, repo_path, AgentType.COPILOT)
+            if skill_read_diagnostics is not None:
+                skill_read_diagnostics_source = "hooks"
+            if skill_read_diagnostics is None and session_log_path is not None:
+                skill_read_diagnostics = parse_skill_read_diagnostics_from_session_log(
+                    session_log_path, repo_path, AgentType.COPILOT
+                )
+                if skill_read_diagnostics is not None:
+                    skill_read_diagnostics_source = "session_log"
+            if skill_read_diagnostics is None:
+                logger.warning("skills_enabled=true but no skill read diagnostics were captured")
+
+        diagnostics_payload = {
+            "skills_enabled": skills_enabled,
+            "skill_read_diagnostics_source": skill_read_diagnostics_source,
+            "skill_read_diagnostics": skill_read_diagnostics,
+            "tool_usage": tool_usage,
+        }
+        cli_output_log.write_text(
+            f"[stdout]\n{stdout}\n\n[stderr]\n{stderr}\n\n[diagnostics]\n{json.dumps(diagnostics_payload, indent=2)}\n",
+            encoding="utf-8",
+        )
+        logger.info(f"Saved Copilot CLI output + diagnostics to: {cli_output_log}")
+
+        if metrics and (tool_usage or skill_read_diagnostics):
+            metrics = metrics.model_copy(
+                update={
+                    "tool_usage": tool_usage,
+                    "skill_read_diagnostics": skill_read_diagnostics,
+                }
+            )
 
         return metrics, config
     except subprocess.TimeoutExpired:

diff --git a/src/bcbench/agent/shared/__init__.py b/src/bcbench/agent/shared/__init__.py
@@ -1,8 +1,17 @@
 """Shared code for CLI-based agents (Claude, Copilot)."""
 
 from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
+from bcbench.agent.shared.hooks_parser import parse_skill_read_diagnostics_from_session_log
+from bcbench.agent.shared.hooks_parser import parse_skill_read_diagnostics_from_hooks
 from bcbench.agent.shared.lsp import build_al_lsp_plugin
 from bcbench.agent.shared.mcp import build_mcp_config
 from bcbench.agent.shared.prompt import build_prompt
 
-__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
+__all__ = [
+    "build_al_lsp_plugin",
+    "build_mcp_config",
+    "build_prompt",
+    "parse_skill_read_diagnostics_from_hooks",
+    "parse_skill_read_diagnostics_from_session_log",
+    "parse_tool_usage_from_hooks",
+]
diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -50,6 +50,21 @@ prompt:
     {{task}}
     {% endif %}
 
+  code-review-template: |
+    Use the `al-code-review` skill to review the current working-tree AL file changes.
+
+    Run a full-domain review (do not pass a domain so all domains run).
+
+    Review ONLY the current working-tree AL file changes for this evaluation entry.
+    Do NOT compare commits (for example, do NOT use HEAD~1..HEAD or origin/main comparisons).
+    Use working tree diff only (git diff HEAD), and focus on changed *.al files.
+
+    Save findings to a file named "review.json" in the repository root.
+    The file must contain valid JSON with a top-level object named findings.
+    Each finding must include: filePath, lineNumber, severity, issue, recommendation.
+    Allowed severity values are: critical, high, medium, low.
+    If there are no findings, write an empty findings list.
+
 # controls:
 # 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`
 #    - Copilot: copies to repo/.github/ and renames AGENTS.md to copilot-instructions.md
@@ -59,14 +74,14 @@ prompt:
 # NOTE: the canonical source file is AGENTS.md; it is automatically renamed
 #       to the agent-specific filename (AgentType.instruction_filename) during setup
 instructions:
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy skills from `src/bcbench/agent/shared/instructions/<sanitized-repo>/skills/`
 #    - Copilot: copies to repo/.github/skills/
 #    - Claude: copies to repo/.claude/skills/
 skills:
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy custom agents from `src/bcbench/agent/shared/instructions/<sanitized-repo>/agents/`

diff --git a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1
@@ -4,11 +4,13 @@ try {
     $inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json
     $toolName = if ($inputJson.tool_name) { $inputJson.tool_name } else { $inputJson.toolName }
     $timestamp = $inputJson.timestamp
+    $toolPath = $null
 
     # LSP calls share the tool name "lsp"; the specific operation (findReferences, goToDefinition, hover, ...) lives in the tool arguments.
     # Capture it as an "lsp:<operation>" sub-label so usage stats stay meaningful.
+    $toolArgs = if ($null -ne $inputJson.toolArgs) { $inputJson.toolArgs } else { $inputJson.tool_input }
+
     if ($toolName -eq "lsp") {
-        $toolArgs = if ($null -ne $inputJson.toolArgs) { $inputJson.toolArgs } else { $inputJson.tool_input }
         if ($toolArgs -is [string]) {
             try { $toolArgs = $toolArgs | ConvertFrom-Json } catch { $toolArgs = $null }
         }
@@ -17,8 +19,29 @@ try {
         }
     }
 
+    # Capture target file path for read-like tools so diagnostics can verify
+    # whether skills/instructions were actually opened.
+    if ($toolName -in @("Read", "read", "read_file", "functions.read_file", "view")) {
+        if ($toolArgs -is [string]) {
+            try { $toolArgs = $toolArgs | ConvertFrom-Json } catch { $toolArgs = $null }
+        }
+
+        if ($toolArgs) {
+            if ($toolArgs.filePath) {
+                $toolPath = [string]$toolArgs.filePath
+            }
+            elseif ($toolArgs.path) {
+                $toolPath = [string]$toolArgs.path
+            }
+        }
+    }
+
     if ($toolName -and $env:BCBENCH_TOOL_LOG) {
-        $entry = @{ tool_name = $toolName; timestamp = $timestamp } | ConvertTo-Json -Compress
+        $entryPayload = @{ tool_name = $toolName; timestamp = $timestamp }
+        if ($toolPath) {
+            $entryPayload["tool_path"] = $toolPath
+        }
+        $entry = $entryPayload | ConvertTo-Json -Compress
         Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $entry -Encoding UTF8
     }
 

diff --git a/src/bcbench/agent/shared/hooks_parser.py b/src/bcbench/agent/shared/hooks_parser.py
@@ -2,6 +2,8 @@
 from collections import Counter
 from pathlib import Path
 
+from bcbench.types import AgentType
+
 
 def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None:
     if not hooks_output_path.exists():
@@ -17,3 +19,118 @@ def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | Non
             continue
 
     return dict(counts) or None
+
+
+# Skill file and review domains whose reads are reported by the skill-read diagnostics below.
+_SKILL_RELATIVE_PATH = Path("skills") / "al-code-review" / "SKILL.md"
+_REVIEW_DOMAINS = ("security", "performance", "style", "accessibility", "upgrade", "privacy")
+
+
+def _expected_review_paths(repo_path: Path, agent_type: AgentType) -> tuple[Path, dict[str, Path]]:
+    """Return the resolved skill file path and the resolved instruction file path of each review domain."""
+    target_dir = agent_type.get_target_dir(repo_path)
+    skill_path = (target_dir / _SKILL_RELATIVE_PATH).resolve()
+    instruction_paths = {
+        domain: (target_dir / "instructions" / f"{domain}.md").resolve() for domain in _REVIEW_DOMAINS
+    }
+    return skill_path, instruction_paths
+
+
+def _build_skill_read_diagnostics(skill_read: bool, instruction_flags: dict[str, bool]) -> dict[str, bool]:
+    """Combine the skill flag and per-domain flags with the any/all aggregate flags into one dict."""
+    diagnostics: dict[str, bool] = {"skill_file_read": skill_read}
+    diagnostics.update(instruction_flags)
+    diagnostics["any_domain_instruction_read"] = any(instruction_flags.values())
+    diagnostics["all_domain_instructions_read"] = all(instruction_flags.values())
+    return diagnostics
+
+
+def parse_skill_read_diagnostics_from_hooks(
+    hooks_output_path: Path,
+    repo_path: Path,
+    agent_type: AgentType,
+) -> dict[str, bool] | None:
+    """Report which al-code-review skill and instruction files appear as ``tool_path`` in the hooks log.
+
+    Every line of the log is parsed as JSON. Lines that are not valid JSON, are not JSON objects,
+    or have no non-empty string ``tool_path`` are skipped. Paths are compared after resolving and
+    lower-casing them.
+
+    Args:
+        hooks_output_path: Log file with one JSON object per line.
+        repo_path: Repository root; relative ``tool_path`` values are resolved against it.
+        agent_type: Agent whose target directory contains the skill and instruction files.
+
+    Returns:
+        ``None`` if the log file does not exist. Otherwise a dict with ``skill_file_read``, one
+        ``instruction_<domain>_read`` flag per review domain, ``any_domain_instruction_read`` and
+        ``all_domain_instructions_read``; all flags are False when no matching path was logged.
+    """
+    if not hooks_output_path.exists():
+        return None
+
+    skill_path, instruction_paths = _expected_review_paths(repo_path, agent_type)
+
+    normalized_reads: set[str] = set()
+    for line in hooks_output_path.read_text(encoding="utf-8").splitlines():
+        try:
+            entry = json.loads(line)
+        except (json.JSONDecodeError, TypeError):
+            continue
+
+        if not isinstance(entry, dict):
+            continue
+
+        path_value = entry.get("tool_path")
+        if not isinstance(path_value, str) or not path_value:
+            continue
+
+        try:
+            # Joining onto repo_path keeps absolute paths as they are and anchors relative ones at
+            # the repository rather than at this process's current working directory.
+            resolved = (repo_path / path_value).resolve()
+        except (OSError, ValueError):
+            # An unresolvable path (e.g. one with an embedded NUL) is skipped like any malformed line.
+            continue
+        normalized_reads.add(str(resolved).lower())
+
+    instruction_flags = {
+        f"instruction_{domain}_read": str(path).lower() in normalized_reads
+        for domain, path in instruction_paths.items()
+    }
+    return _build_skill_read_diagnostics(str(skill_path).lower() in normalized_reads, instruction_flags)
+
+
+def parse_skill_read_diagnostics_from_session_log(
+    session_log_path: Path,
+    repo_path: Path,
+    agent_type: AgentType,
+) -> dict[str, bool] | None:
+    """Report which al-code-review skill and instruction paths are mentioned in the session log.
+
+    This is a substring search over the whole log text (backslashes turned into forward slashes,
+    everything lower-cased), so a flag is True whenever the resolved path occurs anywhere in the log.
+
+    Args:
+        session_log_path: Session log file; undecodable bytes are replaced while reading.
+        repo_path: Repository root passed to ``agent_type.get_target_dir``.
+        agent_type: Agent whose target directory contains the skill and instruction files.
+
+    Returns:
+        ``None`` if the session log does not exist, otherwise a dict with the same keys as
+        ``parse_skill_read_diagnostics_from_hooks``.
+    """
+    if not session_log_path.exists():
+        return None
+
+    skill_path, instruction_paths = _expected_review_paths(repo_path, agent_type)
+    log_text = session_log_path.read_text(encoding="utf-8", errors="replace")
+    normalized_log = log_text.replace("\\", "/").lower()
+
+    def mentioned(path: Path) -> bool:
+        return str(path).replace("\\", "/").lower() in normalized_log
+
+    instruction_flags = {
+        f"instruction_{domain}_read": mentioned(path) for domain, path in instruction_paths.items()
+    }
+    return _build_skill_read_diagnostics(mentioned(skill_path), instruction_flags)