From cfde540b76b9221f92e79df0f57931d97a54e902 Mon Sep 17 00:00:00 2001 From: Jared Wein Date: Thu, 28 May 2026 17:28:06 -0400 Subject: [PATCH 1/4] Gate automated review on risk/complexity scores and test coverage Adds a cheap pre-screen so the (expensive) review agent only runs on revisions that score low on risk and complexity, letting teams "ease in" by raising thresholds as confidence grows. Library (bugbug/tools/code_review/): - diff_analysis.py: classify the patch's in-diff test signal (absent / sparse / adequate / tests_only / no_code_change) and collect changed non-test paths. - test_coverage.py: probe mozilla-central via the async searchfox client for existing tests referencing the changed files (covered / partial / uncovered); degrades gracefully if searchfox is unavailable. - scoring.py: RiskComplexityScorer, a single structured Haiku call returning 0-10 risk/complexity plus factors, fed the precomputed block. - agent.py: inject the test-signals block into the review prompt and surface token usage + tool-call counts in the response details. - llms.py: DEFAULT_SCORING_MODEL and a shared usage_from_messages() helper. Service (reviewhelper-api): - reviewer_groups.py + reviewer_groups.yaml: per-group thresholds (with defaults) loaded from a checked-in YAML file; matching_groups() resolves a revision's reviewer groups via the new PhabricatorPatch plumbing. - review_processor.py: _score_and_gate() scores before reviewing and raises ReviewSkipped("above_threshold") when either score meets its threshold. - ReviewStatus.SKIPPED (terminal, no retry) + skipped_reason and risk/complexity/test-signal columns on review_requests, recorded even when skipped; alembic migration + status index. - internal.py records skips as SKIPPED (HTTP 204); request.py messaging. - pricing.py: per-model token pricing for later cost reporting. --- bugbug/tools/code_review/agent.py | 51 ++++- bugbug/tools/code_review/diff_analysis.py | 180 +++++++++++++++ bugbug/tools/code_review/scoring.py | 204 +++++++++++++++++ bugbug/tools/code_review/test_coverage.py | 161 +++++++++++++ bugbug/tools/core/llms.py | 28 +++ ...e5f6_add_risk_scores_and_skipped_status.py | 90 ++++++++ services/reviewhelper-api/app/config.py | 4 + .../reviewhelper-api/app/database/models.py | 12 + services/reviewhelper-api/app/enums.py | 9 +- services/reviewhelper-api/app/pricing.py | 63 ++++++ .../reviewhelper-api/app/review_processor.py | 120 +++++++++- .../reviewhelper-api/app/reviewer_groups.py | 105 +++++++++ .../reviewhelper-api/app/routers/internal.py | 9 + .../reviewhelper-api/app/routers/request.py | 3 + services/reviewhelper-api/pyproject.toml | 1 + .../reviewhelper-api/reviewer_groups.yaml | 22 ++ services/reviewhelper-api/tests/conftest.py | 23 ++ .../tests/test_review_gating.py | 90 ++++++++ .../tests/test_reviewer_groups.py | 57 +++++ tests/test_code_review_scoring.py | 212 ++++++++++++++++++ 20 files changed, 1429 insertions(+), 15 deletions(-) create mode 100644 bugbug/tools/code_review/diff_analysis.py create mode 100644 bugbug/tools/code_review/scoring.py create mode 100644 bugbug/tools/code_review/test_coverage.py create mode 100644 services/reviewhelper-api/alembic/versions/c1a2b3d4e5f6_add_risk_scores_and_skipped_status.py create mode 100644 services/reviewhelper-api/app/pricing.py create mode 100644 services/reviewhelper-api/app/reviewer_groups.py create mode 100644 services/reviewhelper-api/reviewer_groups.yaml create mode 100644 services/reviewhelper-api/tests/conftest.py create mode 100644 services/reviewhelper-api/tests/test_review_gating.py create mode 100644 services/reviewhelper-api/tests/test_reviewer_groups.py create mode 100644 tests/test_code_review_scoring.py diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py index badab3f528..b9d3674e9f 100644 --- a/bugbug/tools/code_review/agent.py +++ b/bugbug/tools/code_review/agent.py @@ -55,7 +55,11 @@ LargeDiffError, RecursionLimitError, ) -from bugbug.tools.core.llms import DEFAULT_ANTHROPIC_MODEL, get_tokenizer +from bugbug.tools.core.llms import ( + DEFAULT_ANTHROPIC_MODEL, + get_tokenizer, + usage_from_messages, +) from bugbug.tools.core.platforms.base import Patch logger = getLogger(__name__) @@ -204,11 +208,15 @@ def count_tokens(self, text): return len(self._tokenizer.encode(text)) def generate_initial_prompt( - self, patch: Patch, patch_summary: str, external_context: str = "" + self, + patch: Patch, + patch_summary: str, + external_context: str = "", + test_signals_block: str | None = None, ) -> str: created_before = patch.date_created if self.is_experiment_env else None - return FIRST_MESSAGE_TEMPLATE.format( + prompt = FIRST_MESSAGE_TEMPLATE.format( patch=format_patch_set(patch.patch_set), patch_summarization=patch_summary, external_context=external_context, @@ -216,9 +224,17 @@ def generate_initial_prompt( approved_examples=self._get_generated_examples(patch, created_before), ) + if test_signals_block: + prompt = f"{test_signals_block}\n\n{prompt}" + + return prompt + async def generate_review_comments( - self, patch: Patch, patch_summary: str - ) -> tuple[list[GeneratedReviewComment], list[dict]]: + self, + patch: Patch, + patch_summary: str, + test_signals_block: str | None = None, + ) -> tuple[dict, list[dict]]: external_context = "" manifest: list[dict] = [] if self._review_context_repo: @@ -240,7 +256,10 @@ async def generate_review_comments( "messages": [ HumanMessage( self.generate_initial_prompt( - patch, patch_summary, external_context + patch, + patch_summary, + external_context, + test_signals_block, ) ), ] @@ -253,7 +272,7 @@ async def generate_review_comments( except GraphRecursionError as e: raise RecursionLimitError("The model could not complete the review") from e - return result["structured_response"].comments, manifest + return result, manifest async def assess_patch_scope( self, patch: Patch, patch_summary: str @@ -270,16 +289,19 @@ async def assess_patch_scope( result = await self.scope_agent.ainvoke({"messages": [HumanMessage(prompt)]}) return result["structured_response"].comments[:1] - async def run(self, patch: Patch) -> CodeReviewToolResponse: + async def run( + self, patch: Patch, test_signals_block: str | None = None + ) -> CodeReviewToolResponse: if self.count_tokens(patch.raw_diff) > 21000: raise LargeDiffError("The diff is too large") patch_summary = self.patch_summarizer.run(patch) - ( - unfiltered_suggestions, - external_content_manifest, - ) = await self.generate_review_comments(patch, patch_summary) + result, external_content_manifest = await self.generate_review_comments( + patch, patch_summary, test_signals_block + ) + messages = result.get("messages", []) + unfiltered_suggestions = result["structured_response"].comments if not unfiltered_suggestions: logger.info("No suggestions were generated") @@ -305,6 +327,11 @@ async def run(self, patch: Patch) -> CodeReviewToolResponse: "num_unfiltered_suggestions": len(unfiltered_suggestions), "num_scope_suggestions": len(scope_suggestions), "external_content": external_content_manifest, + "usage": usage_from_messages(messages), + "tool_calls": sum( + len(getattr(message, "tool_calls", []) or []) + for message in messages + ), }, ) diff --git a/bugbug/tools/code_review/diff_analysis.py b/bugbug/tools/code_review/diff_analysis.py new file mode 100644 index 0000000000..cc831a718f --- /dev/null +++ b/bugbug/tools/code_review/diff_analysis.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Parse a unified diff into structured test signals used by risk scoring. + +The diff analyzer answers one question deterministically, so we don't have to +ask the model to eyeball it: did the patch include test changes, and how much? +The resulting signal feeds both the risk/complexity scorer and the review +prompt. The set of changed non-test paths is also returned so the existing-test +coverage lookup (``test_coverage``) knows what to probe for in mozilla-central. + +Ported from qreviews (qreviews/diff_analysis.py); the hunk-anchor tracking used +there to validate inline comments is dropped, since bugbug validates generated +comments against the patch set via ``utils.find_comment_scope``. +""" + +import re +from dataclasses import dataclass, field + +# Mozilla-flavoured test-path matchers. Applied to the new-side path +# of each `diff --git a/X b/Y` (or to X if the file was deleted). +_TEST_DIR_RE = re.compile( + r"(^|/)(tests?|xpcshell|mochitests?|gtests?|reftests?|crashtests?|" + r"jsapi-tests?|web-platform/tests?|googletest)(/|$)" +) +_TEST_FILENAME_RES = ( + re.compile(r"(^|/)test_[^/]+$"), + re.compile(r"[^/]*_test\.[A-Za-z0-9]+$"), + re.compile(r"[^/]*\.test\.[A-Za-z0-9]+$"), + re.compile(r"(^|/)head[^/]*\.js$"), + re.compile(r"(^|/)browser_[^/]+\.js$"), +) + + +def _is_test_path(path: str) -> bool: + if not path: + return False + if _TEST_DIR_RE.search(path): + return True + return any(r.search(path) for r in _TEST_FILENAME_RES) + + +_DIFF_GIT_RE = re.compile(r"^diff --git a/(\S+) b/(\S+)\s*$") +_HUNK_RE = re.compile( + r"^@@ -(?P\d+)(?:,(?P\d+))? " + r"\+(?P\d+)(?:,(?P\d+))? @@" +) + +# Possible values of DiffStats.in_diff_test_signal. +IN_DIFF_TEST_SIGNALS = ("absent", "sparse", "adequate", "tests_only", "no_code_change") + + +@dataclass(frozen=True) +class DiffStats: + files_changed: int + test_files_changed: int + non_test_files_changed: int + lines_added: int + test_lines_added: int + non_test_lines_added: int + # One of IN_DIFF_TEST_SIGNALS. + in_diff_test_signal: str + non_test_paths: list[str] = field(default_factory=list) + + +def _classify_signal( + *, non_test_files: int, test_files: int, non_test_added: int, test_added: int +) -> str: + if non_test_files == 0 and test_files == 0: + return "no_code_change" + if non_test_files == 0: + return "tests_only" + if test_files == 0: + return "absent" + # Both kinds present — ratio decides. + if non_test_added <= 0: + # All deletions on non-test side; test additions are still meaningful. + return "adequate" if test_added > 0 else "sparse" + ratio = test_added / non_test_added + return "adequate" if ratio >= 0.3 else "sparse" + + +def analyze_diff(raw_diff: str) -> DiffStats: + """Walk a unified diff once and emit a DiffStats.""" + files_changed = 0 + test_files = 0 + non_test_files = 0 + total_added = 0 + test_added = 0 + non_test_added = 0 + non_test_paths: list[str] = [] + + current_path: str | None = None + current_is_test = False + in_hunk = False + + for line in raw_diff.splitlines(): + m = _DIFF_GIT_RE.match(line) + if m: + files_changed += 1 + # Use the new-side path; on a deletion this still names the + # original file (Phabricator passes the same path on both sides). + current_path = m.group(2) + current_is_test = _is_test_path(current_path) + if current_is_test: + test_files += 1 + else: + non_test_files += 1 + non_test_paths.append(current_path) + in_hunk = False + continue + + if current_path is None: + continue + + if _HUNK_RE.match(line): + in_hunk = True + continue + + if not in_hunk or not line: + continue + + if line[0] == "+": + total_added += 1 + if current_is_test: + test_added += 1 + else: + non_test_added += 1 + elif line[0] not in "- \\": + # Unexpected line inside a hunk (e.g. a stray header from a + # malformed diff). Drop out of hunk mode to resync. + in_hunk = False + + signal = _classify_signal( + non_test_files=non_test_files, + test_files=test_files, + non_test_added=non_test_added, + test_added=test_added, + ) + + return DiffStats( + files_changed=files_changed, + test_files_changed=test_files, + non_test_files_changed=non_test_files, + lines_added=total_added, + test_lines_added=test_added, + non_test_lines_added=non_test_added, + in_diff_test_signal=signal, + non_test_paths=non_test_paths, + ) + + +def format_test_signal_block( + stats: DiffStats, + *, + coverage_block: str | None = None, +) -> str: + """Render the structured signal block prepended to scoring/review messages. + + The optional ``coverage_block`` is the existing-coverage layer rendered by + ``test_coverage.format_coverage_block``; the two-layer rendering lives here + so it isn't duplicated across scoring and review. + """ + lines = [ + "", + "Pre-computed from the diff (not the model):", + f" files_changed={stats.files_changed}", + f" test_files_changed={stats.test_files_changed}", + f" non_test_files_changed={stats.non_test_files_changed}", + f" lines_added={stats.lines_added} " + f"(test={stats.test_lines_added}, non_test={stats.non_test_lines_added})", + f" in_diff_test_signal={stats.in_diff_test_signal}", + ] + if coverage_block: + lines.append("") + lines.append(coverage_block.rstrip()) + lines.append("") + return "\n".join(lines) diff --git a/bugbug/tools/code_review/scoring.py b/bugbug/tools/code_review/scoring.py new file mode 100644 index 0000000000..4ef34a1278 --- /dev/null +++ b/bugbug/tools/code_review/scoring.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Risk + complexity scoring via a cheap Claude model. + +One call per fresh diff, returning two independent 0-10 integer scores plus the +factors behind them. The result gates whether the (expensive) review agent runs +at all, so this deliberately uses a small model. Token usage is captured so the +dashboard can show cost. + +Ported from qreviews (qreviews/scoring.py); the raw Anthropic SDK call is +replaced with LangChain structured output to match the rest of bugbug. +""" + +from dataclasses import dataclass, field +from logging import getLogger + +from langchain.chat_models import init_chat_model +from langchain.messages import HumanMessage, SystemMessage +from pydantic import BaseModel, Field + +from bugbug.tools.core.exceptions import ModelResultError +from bugbug.tools.core.llms import DEFAULT_SCORING_MODEL, usage_from_messages + +logger = getLogger(__name__) + + +SCORING_SYSTEM_PROMPT = """\ +You are a senior Mozilla Firefox reviewer. Rate an incoming Phabricator +revision on two integer 0-10 axes. You MUST use the full 0-10 range — +trivial changes deserve scores of 0 or 1, not 2 or 3. + + - RISK: blast radius if this change is wrong. Considers sensitive + components (security, IPC, prefs, permissions, crypto, sandbox, + network, auth), irreversibility (data migrations, schema changes), + breadth (number of touched modules), and dangerous patterns (eval, + raw HTML injection, regex on untrusted input, concurrency / async + races, file/network I/O, telemetry definitions, mots.yaml / + build / CI files). + + - COMPLEXITY: how hard the change is for a human reviewer to + understand and verify. Considers LOC added/removed, number of + files, control-flow density, new abstractions, refactors / renames, + presence vs absence of tests, and clarity of the diff. + +Both axes are independent. A simple 5-line patch to a security-critical +file is HIGH risk but LOW complexity. + +The user message includes a `` block computed +deterministically from the diff and from a searchfox lookup over +mozilla-central. Treat these as hints, not absolutes: + + - `in_diff_test_signal=absent` means the patch adds no test + changes. On non-trivial non-test code, this is mildly + risk-increasing; on docs/CSS/string changes it does not matter. + - `in_diff_test_signal=tests_only` means the patch is test-only. + This is risk-decreasing — broken tests fail loudly without + affecting production. + - `coverage_signal=uncovered` means searchfox found no existing + tests in mozilla-central that reference the changed non-test + files. Combined with `in_diff_test_signal=absent` on a + sensitive area, this materially raises risk. + - `coverage_signal=covered` or `partial` lowers risk slightly — + the touched code at least has some automated check around it. + - `coverage_signal=skipped_*` means the lookup wasn't run + (large diff or searchfox unavailable). Do not let it affect + your score in either direction. + +Continue to use the full 0-10 scale; do not let the test signals +overwhelm other risk factors (security, IPC, etc.) — they are one +input among many. + +Score anchors (use these as a calibration reference, do not be afraid +to score 0 or 1): + + RISK + 0 = pure docs / comments / strings, no executable code change + 1 = CSS-only, dead-code removal, isolated UI tweak in a leaf component, + a localization string addition + 2 = small JS/HTML change in a non-sensitive UI component, no network + or storage touched + 3-4 = moderate change to UI logic, or any touch of a moderately + sensitive area (telemetry, prefs reads) + 5-6 = multi-file change with cross-cutting effects, or touches + moderately sensitive subsystems + 7-8 = security-relevant code, IPC, sandbox, auth, crypto, network + protocols, build / CI / signing + 9-10 = data migrations, irreversible schema changes, security boundary + changes + + COMPLEXITY + 0 = whitespace, single-line value change, single-line string change + 1 = <10 LOC, single file, no control flow added + 2 = ~10-30 LOC, 1-2 files, simple straight-line additions + 3-4 = 30-100 LOC, new function/method, simple control flow + 5-6 = 100-300 LOC OR 5+ files OR new abstraction + 7-8 = significant refactor, renames, signature changes across many + callers, or non-trivial concurrency/async logic + 9-10 = large refactors with subtle invariants, new subsystems + +Provide 1-5 factors per axis. Each factor MUST be ONE short sentence, +not a paragraph. Cite specific file paths in `path:line` form when useful.\ +""" + + +class Scores(BaseModel): + """Structured risk/complexity assessment of a revision.""" + + risk: int = Field(ge=0, le=10, description="Blast radius if the change is wrong.") + complexity: int = Field( + ge=0, le=10, description="How hard the change is to review and verify." + ) + risk_factors: list[str] = Field( + default_factory=list, + description="1-5 short sentences justifying the risk score.", + ) + complexity_factors: list[str] = Field( + default_factory=list, + description="1-5 short sentences justifying the complexity score.", + ) + + +@dataclass +class ScoringResult: + scores: Scores + model: str + usage: dict[str, int] = field(default_factory=dict) + + +def _build_user_message( + *, + title: str, + summary: str | None, + revision_id: int, + author: str, + bug_id: str | int | None, + raw_diff: str, + test_signals_block: str | None = None, +) -> str: + header = ( + f"Revision: D{revision_id}\n" + f"Title: {title}\n" + f"Author: {author}\n" + f"Bug: {bug_id or '(none)'}\n" + f"\nSummary:\n{summary or '(no summary provided)'}\n" + ) + signals = f"\n{test_signals_block}\n" if test_signals_block else "" + return ( + f"{header}{signals}\n----- BEGIN DIFF -----\n{raw_diff}\n----- END DIFF -----\n" + ) + + +class RiskComplexityScorer: + """Scores a revision's risk and complexity with a single cheap LLM call.""" + + def __init__(self, llm, model_name: str) -> None: + self._model_name = model_name + self._structured = llm.with_structured_output(Scores, include_raw=True) + + @classmethod + def create( + cls, + model: str = DEFAULT_SCORING_MODEL, + max_tokens: int = 1024, + ) -> "RiskComplexityScorer": + llm = init_chat_model(model, max_tokens=max_tokens, temperature=0) + return cls(llm, model) + + async def run( + self, + *, + title: str, + summary: str | None, + revision_id: int, + author: str, + bug_id: str | int | None, + raw_diff: str, + test_signals_block: str | None = None, + ) -> ScoringResult: + """Call the model and return parsed scores plus token usage.""" + user_msg = _build_user_message( + title=title, + summary=summary, + revision_id=revision_id, + author=author, + bug_id=bug_id, + raw_diff=raw_diff, + test_signals_block=test_signals_block, + ) + result = await self._structured.ainvoke( + [SystemMessage(SCORING_SYSTEM_PROMPT), HumanMessage(user_msg)] + ) + + scores = result.get("parsed") + if scores is None: + raise ModelResultError( + f"Risk/complexity scoring did not return valid scores: " + f"{result.get('parsing_error')}" + ) + + usage = usage_from_messages([result["raw"]]) + return ScoringResult(scores=scores, model=self._model_name, usage=usage) diff --git a/bugbug/tools/code_review/test_coverage.py b/bugbug/tools/code_review/test_coverage.py new file mode 100644 index 0000000000..0e744c746a --- /dev/null +++ b/bugbug/tools/code_review/test_coverage.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Probe mozilla-central via searchfox for existing tests of a revision's files. + +This complements ``diff_analysis``. The diff analyzer tells us whether the patch +itself includes test changes; this module tells us whether existing tests in the +tree already reference the touched non-test files, even when the patch doesn't +add tests. + +The signal is intentionally a hint, not a guarantee — basename queries can hit +unrelated same-named files. Risk scoring treats it as one input among many. + +Ported from qreviews (qreviews/test_coverage.py), with the searchfox-cli +subprocess backend swapped for bugbug's async ``AsyncSearchfoxClient`` (the same +client the review agent's searchfox tools already use). +""" + +import os +from dataclasses import dataclass, field +from logging import getLogger + +from bugbug.tools.code_review.langchain_tools import _get_client + +logger = getLogger(__name__) + +# Cap how many non-test files we look up per revision. searchfox queries are +# cheap but not free; large refactors would otherwise issue dozens of requests. +DEFAULT_FILE_CAP = 10 + +# Per-query hit cap. We only need to know "are there any tests" — a small +# limit keeps the result bounded. +PER_QUERY_LIMIT = 5 + + +@dataclass(frozen=True) +class ExistingCoverage: + covered_paths: dict[str, list[str]] = field(default_factory=dict) + uncovered_paths: list[str] = field(default_factory=list) + # "covered" | "partial" | "uncovered" | "no_non_test_files" | + # "skipped_large_diff" | "skipped_searchfox_error" + coverage_signal: str = "skipped_searchfox_error" + # Total non-test files in the diff, regardless of cap. + candidate_count: int = 0 + + +def _basename_no_ext(path: str) -> str: + """Strip up to two extensions so a basename collapses to its module name. + + Two so Mozilla suffixes like ``foo.sys.mjs`` collapse to ``foo``, but no + further so single-dot names (``some.module.cpp``) keep ``some.module``. + """ + base = os.path.basename(path) + for _ in range(2): + name, ext = os.path.splitext(base) + if not ext: + break + base = name + return base + + +async def lookup_existing_coverage( + non_test_paths: list[str], + *, + file_cap: int = DEFAULT_FILE_CAP, +) -> ExistingCoverage: + """Run a small searchfox query per non-test path and aggregate. + + Returns an ExistingCoverage describing the per-file hit lists and an overall + ``coverage_signal``. Any searchfox failure degrades to a skipped signal + rather than raising — coverage is advisory, never required. + """ + if not non_test_paths: + return ExistingCoverage(coverage_signal="no_non_test_files", candidate_count=0) + + candidate_count = len(non_test_paths) + skipped_large = candidate_count > file_cap + paths = non_test_paths[:file_cap] if skipped_large else list(non_test_paths) + + client = _get_client() + covered: dict[str, list[str]] = {} + uncovered: list[str] = [] + + for path in paths: + query = _basename_no_ext(path) + if not query or len(query) < 3: + # Generic short names like 'x' or 'io' produce too much noise. + uncovered.append(path) + continue + try: + results = await client.search( + query=query, + tests="only", + limit=PER_QUERY_LIMIT, + ) + except Exception as e: # searchfox raises plain Exception + logger.warning("searchfox coverage lookup failed at %s: %s", path, e) + return ExistingCoverage( + covered_paths=covered, + uncovered_paths=uncovered + paths[len(covered) + len(uncovered) :], + coverage_signal="skipped_searchfox_error", + candidate_count=candidate_count, + ) + + # Drop self-references — the queried file can show up if it shares a + # basename with a test path by sheer regex. + hits: list[str] = [] + seen: set[str] = set() + for hit_path, _line, _content in results: + if hit_path == path or hit_path in seen: + continue + seen.add(hit_path) + hits.append(hit_path) + + if hits: + covered[path] = hits + else: + uncovered.append(path) + + if skipped_large: + signal = "skipped_large_diff" + elif covered and uncovered: + signal = "partial" + elif covered: + signal = "covered" + else: + signal = "uncovered" + + return ExistingCoverage( + covered_paths=covered, + uncovered_paths=uncovered, + coverage_signal=signal, + candidate_count=candidate_count, + ) + + +def format_coverage_block(coverage: ExistingCoverage) -> str | None: + """Render the existing-coverage layer for the ```` block. + + Returns None when there's nothing meaningful to show. + """ + if coverage.coverage_signal == "no_non_test_files": + return None + lines = [ + "Existing tests referencing changed non-test files (searchfox lookup):", + f" coverage_signal={coverage.coverage_signal}", + f" candidate_files={coverage.candidate_count}", + ] + if coverage.covered_paths: + lines.append(" covered:") + for src, tests in coverage.covered_paths.items(): + sample = ", ".join(tests[:3]) + more = "" if len(tests) <= 3 else f" (+{len(tests) - 3} more)" + lines.append(f" - {src} <- {sample}{more}") + if coverage.uncovered_paths: + lines.append(" uncovered:") + for src in coverage.uncovered_paths: + lines.append(f" - {src}") + return "\n".join(lines) diff --git a/bugbug/tools/core/llms.py b/bugbug/tools/core/llms.py index e247aad8b2..6da5dcbcf5 100644 --- a/bugbug/tools/core/llms.py +++ b/bugbug/tools/core/llms.py @@ -11,6 +11,34 @@ DEFAULT_OPENAI_MODEL = "gpt-5-2025-08-07" DEFAULT_ANTHROPIC_MODEL = "claude-opus-4-6" +# Cheap, fast model used for the risk/complexity pre-screen, separate from the +# review model so the gate stays inexpensive. +DEFAULT_SCORING_MODEL = "claude-haiku-4-5-20251001" + + +def usage_from_messages(messages) -> dict[str, int]: + """Aggregate LangChain ``usage_metadata`` across one or more AI messages. + + Returns a dict with input/output token totals and the cache-read / + cache-creation breakdown, suitable for cost estimation. Messages without + usage metadata (e.g. tool or human messages) are ignored. + """ + totals = { + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + } + for message in messages: + usage = getattr(message, "usage_metadata", None) + if not usage: + continue + totals["input_tokens"] += usage.get("input_tokens", 0) or 0 + totals["output_tokens"] += usage.get("output_tokens", 0) or 0 + details = usage.get("input_token_details") or {} + totals["cache_read_input_tokens"] += details.get("cache_read", 0) or 0 + totals["cache_creation_input_tokens"] += details.get("cache_creation", 0) or 0 + return totals def get_tokenizer(model_name): diff --git a/services/reviewhelper-api/alembic/versions/c1a2b3d4e5f6_add_risk_scores_and_skipped_status.py b/services/reviewhelper-api/alembic/versions/c1a2b3d4e5f6_add_risk_scores_and_skipped_status.py new file mode 100644 index 0000000000..7de42ebca9 --- /dev/null +++ b/services/reviewhelper-api/alembic/versions/c1a2b3d4e5f6_add_risk_scores_and_skipped_status.py @@ -0,0 +1,90 @@ +"""Add risk/complexity scores, test signals, and SKIPPED status. + +Revision ID: c1a2b3d4e5f6 +Revises: 714c920ab85b +Create Date: 2026-05-28 00:00:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from alembic_postgresql_enum import TableReference + +# revision identifiers, used by Alembic. +revision: str = "c1a2b3d4e5f6" +down_revision: Union[str, Sequence[str], None] = "714c920ab85b" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.sync_enum_values( + enum_schema="public", + enum_name="review_status_enum", + new_values=[ + "PENDING", + "PROCESSING", + "RETRY_PENDING", + "PUBLISHED", + "FAILED", + "SKIPPED", + ], + affected_columns=[ + TableReference( + table_schema="public", + table_name="review_requests", + column_name="status", + ) + ], + enum_values_to_rename=[], + ) + + op.add_column("review_requests", sa.Column("risk", sa.Integer(), nullable=True)) + op.add_column( + "review_requests", sa.Column("complexity", sa.Integer(), nullable=True) + ) + op.add_column( + "review_requests", sa.Column("in_diff_test_signal", sa.Text(), nullable=True) + ) + op.add_column( + "review_requests", sa.Column("coverage_signal", sa.Text(), nullable=True) + ) + op.add_column( + "review_requests", sa.Column("skipped_reason", sa.Text(), nullable=True) + ) + op.create_index( + "ix_review_requests_status", "review_requests", ["status"], unique=False + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_index("ix_review_requests_status", table_name="review_requests") + op.drop_column("review_requests", "skipped_reason") + op.drop_column("review_requests", "coverage_signal") + op.drop_column("review_requests", "in_diff_test_signal") + op.drop_column("review_requests", "complexity") + op.drop_column("review_requests", "risk") + + op.sync_enum_values( + enum_schema="public", + enum_name="review_status_enum", + new_values=[ + "PENDING", + "PROCESSING", + "RETRY_PENDING", + "PUBLISHED", + "FAILED", + ], + affected_columns=[ + TableReference( + table_schema="public", + table_name="review_requests", + column_name="status", + ) + ], + enum_values_to_rename=[], + ) diff --git a/services/reviewhelper-api/app/config.py b/services/reviewhelper-api/app/config.py index a0dd25afd1..a278cd15be 100644 --- a/services/reviewhelper-api/app/config.py +++ b/services/reviewhelper-api/app/config.py @@ -38,6 +38,10 @@ class Settings(BaseSettings): # Agent settings todo_enabled: bool = True + # Risk/complexity pre-screen (gates whether the review agent runs). + scoring_model: str = "claude-haiku-4-5-20251001" + scoring_max_tokens: int = 1024 + model_config = { "env_file": ".env", "env_file_encoding": "utf-8", diff --git a/services/reviewhelper-api/app/database/models.py b/services/reviewhelper-api/app/database/models.py index 2c8339dbd0..eb91dec843 100644 --- a/services/reviewhelper-api/app/database/models.py +++ b/services/reviewhelper-api/app/database/models.py @@ -74,6 +74,16 @@ class ReviewRequest(Base): error: Mapped[str | None] = mapped_column(Text) summary: Mapped[str | None] = mapped_column(Text) + # Risk/complexity pre-screen scores (0-10), recorded even when the review is + # skipped so thresholds can be tuned and the dashboard can chart them. The + # verbose factors and coverage detail live in `details`. + risk: Mapped[int | None] + complexity: Mapped[int | None] + in_diff_test_signal: Mapped[str | None] = mapped_column(Text) + coverage_signal: Mapped[str | None] = mapped_column(Text) + # Why the review was skipped (e.g. "above_threshold"); set with status=SKIPPED. + skipped_reason: Mapped[str | None] = mapped_column(Text) + # Relationships comments: Mapped[list["GeneratedComment"]] = relationship( "GeneratedComment", @@ -115,6 +125,8 @@ async def is_first_published_review(self) -> bool: unique=True, postgresql_where=(platform == Platform.GITHUB), ), + # Speeds up dashboard aggregations grouped by status / skip reason. + Index("ix_review_requests_status", "status"), ) diff --git a/services/reviewhelper-api/app/enums.py b/services/reviewhelper-api/app/enums.py index d392dccdb5..08d1e4b899 100644 --- a/services/reviewhelper-api/app/enums.py +++ b/services/reviewhelper-api/app/enums.py @@ -12,10 +12,17 @@ class ReviewStatus(str, Enum): RETRY_PENDING = "retry_pending" PUBLISHED = "published" FAILED = "failed" + # The revision was intentionally not reviewed (gated out by risk/complexity + # thresholds, reviewer-group rules, etc.). Terminal, like PUBLISHED/FAILED. + SKIPPED = "skipped" @property def is_final(self) -> bool: - return self is ReviewStatus.PUBLISHED or self is ReviewStatus.FAILED + return self in ( + ReviewStatus.PUBLISHED, + ReviewStatus.FAILED, + ReviewStatus.SKIPPED, + ) class FeedbackType(str, Enum): diff --git a/services/reviewhelper-api/app/pricing.py b/services/reviewhelper-api/app/pricing.py new file mode 100644 index 0000000000..d1078df506 --- /dev/null +++ b/services/reviewhelper-api/app/pricing.py @@ -0,0 +1,63 @@ +"""Per-model pricing for cost estimation. USD per 1M tokens. + +Ported from qreviews (qreviews/pricing.py). Used to turn the token usage recorded +on each review request into an estimated dollar cost for the dashboard. +""" + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ModelPrice: + input_per_mtok: float + output_per_mtok: float + cache_read_per_mtok: float + cache_write_per_mtok: float + + +# Anthropic published rates (USD / 1M tokens) as of late 2025/early 2026. +# Update this table when rates change. +PRICES: dict[str, ModelPrice] = { + # Claude Opus 4.x + "claude-opus-4-8": ModelPrice(15.00, 75.00, 1.50, 18.75), + "claude-opus-4-7": ModelPrice(15.00, 75.00, 1.50, 18.75), + "claude-opus-4-6": ModelPrice(15.00, 75.00, 1.50, 18.75), + # Claude Sonnet 4.x + "claude-sonnet-4-6": ModelPrice(3.00, 15.00, 0.30, 3.75), + "claude-sonnet-4-5": ModelPrice(3.00, 15.00, 0.30, 3.75), + # Claude Haiku 4.x + "claude-haiku-4-5": ModelPrice(1.00, 5.00, 0.10, 1.25), + "claude-haiku-4-5-20251001": ModelPrice(1.00, 5.00, 0.10, 1.25), +} + +# Generic fallback if a model ID isn't in the table — assume Sonnet-class. +FALLBACK_PRICE = ModelPrice(3.00, 15.00, 0.30, 3.75) + + +def price_for(model: str) -> ModelPrice: + if not model: + return FALLBACK_PRICE + if model in PRICES: + return PRICES[model] + # Try a prefix match (e.g. snapshot IDs). + for key, price in PRICES.items(): + if model.startswith(key): + return price + return FALLBACK_PRICE + + +def estimate_cost_usd( + model: str, + *, + input_tokens: int = 0, + output_tokens: int = 0, + cache_read: int = 0, + cache_write: int = 0, +) -> float: + p = price_for(model) + return ( + input_tokens * p.input_per_mtok + + output_tokens * p.output_per_mtok + + cache_read * p.cache_read_per_mtok + + cache_write * p.cache_write_per_mtok + ) / 1_000_000.0 diff --git a/services/reviewhelper-api/app/review_processor.py b/services/reviewhelper-api/app/review_processor.py index 84cffce308..3f002fc784 100644 --- a/services/reviewhelper-api/app/review_processor.py +++ b/services/reviewhelper-api/app/review_processor.py @@ -7,6 +7,16 @@ from app.config import settings from app.database.models import GeneratedComment, ReviewRequest from app.enums import Platform +from app.reviewer_groups import get_reviewer_groups_config, matching_groups +from bugbug.tools.code_review.diff_analysis import ( + analyze_diff, + format_test_signal_block, +) +from bugbug.tools.code_review.scoring import RiskComplexityScorer +from bugbug.tools.code_review.test_coverage import ( + format_coverage_block, + lookup_existing_coverage, +) from bugbug.tools.core.exceptions import LargeDiffError, RecursionLimitError from bugbug.tools.core.platforms.phabricator import ( PhabricatorPatch, @@ -29,6 +39,19 @@ class RevisionNotYetPublicError(Exception): """ +class ReviewSkipped(Exception): + """The revision was intentionally not reviewed (e.g. gated out by scores). + + Carries the reason and the details to persist. Not an error: the caller + records it as a terminal SKIPPED status without retrying. + """ + + def __init__(self, reason: str, *, details: dict | None = None): + super().__init__(reason) + self.reason = reason + self.details = details or {} + + @cache def get_code_review_tool(): from bugbug.tools.code_review import CodeReviewTool @@ -36,6 +59,14 @@ def get_code_review_tool(): return CodeReviewTool.create(todo_enabled=settings.todo_enabled) +@cache +def get_risk_scorer() -> RiskComplexityScorer: + return RiskComplexityScorer.create( + model=settings.scoring_model, + max_tokens=settings.scoring_max_tokens, + ) + + async def process_review( review_request: ReviewRequest, ) -> tuple[list[GeneratedComment], str, dict]: @@ -73,10 +104,12 @@ async def process_review( "the revision is private or has restricted visibility." ) + test_signals_block, scoring_details = await _score_and_gate(patch, review_request) + tool = get_code_review_tool() try: - result = await tool.run(patch) + result = await tool.run(patch, test_signals_block=test_signals_block) except LargeDiffError as e: raise ReviewProcessingError( "The diff size exceeds the current processing limits." @@ -99,7 +132,90 @@ async def process_review( for comment in result.review_comments ] - return generated_comments, result.patch_summary, result.details + details = {**result.details, **scoring_details} + + return generated_comments, result.patch_summary, details + + +async def _score_and_gate( + patch: PhabricatorPatch, review_request: ReviewRequest +) -> tuple[str, dict]: + """Score the revision's risk/complexity and decide whether to review it. + + Records the scores and test signals on ``review_request`` (so they're + persisted even when skipped) and returns the rendered ```` + block plus the scoring details to merge into the review details. Raises + ``ReviewSkipped`` when either score is at or above the group's threshold. + """ + config = get_reviewer_groups_config() + groups = matching_groups(patch) + # First matching group acts as the primary; otherwise fall back to defaults. + primary = groups[0] if groups else None + risk_threshold = ( + primary.effective_risk_threshold(config.defaults) + if primary + else config.defaults.risk_threshold + ) + complexity_threshold = ( + primary.effective_complexity_threshold(config.defaults) + if primary + else config.defaults.complexity_threshold + ) + + diff_stats = analyze_diff(patch.raw_diff) + coverage = await lookup_existing_coverage(diff_stats.non_test_paths) + test_signals_block = format_test_signal_block( + diff_stats, coverage_block=format_coverage_block(coverage) + ) + + scorer = get_risk_scorer() + scoring = await scorer.run( + title=patch.patch_title, + summary=patch.summary, + revision_id=patch.revision_id, + author=patch.author_phid, + bug_id=patch.bug_id if patch.has_bug else None, + raw_diff=patch.raw_diff, + test_signals_block=test_signals_block, + ) + scores = scoring.scores + + # Persist the scalar signals on the request so they survive a skip and feed + # the dashboard; verbose factors/coverage go in the details JSON. + review_request.risk = scores.risk + review_request.complexity = scores.complexity + review_request.in_diff_test_signal = diff_stats.in_diff_test_signal + review_request.coverage_signal = coverage.coverage_signal + + scoring_details = { + "scoring_model": scoring.model, + "scoring_usage": scoring.usage, + "risk_factors": scores.risk_factors, + "complexity_factors": scores.complexity_factors, + "coverage": { + "covered_paths": coverage.covered_paths, + "uncovered_paths": coverage.uncovered_paths, + "candidate_count": coverage.candidate_count, + }, + "thresholds": { + "risk": risk_threshold, + "complexity": complexity_threshold, + "group": primary.slug if primary else None, + }, + } + + if scores.risk >= risk_threshold or scores.complexity >= complexity_threshold: + logger.info( + "Skipping review for D%s: risk=%s complexity=%s (thresholds %s/%s)", + review_request.revision_id, + scores.risk, + scores.complexity, + risk_threshold, + complexity_threshold, + ) + raise ReviewSkipped("above_threshold", details=scoring_details) + + return test_signals_block, scoring_details def submit_review_to_platform( diff --git a/services/reviewhelper-api/app/reviewer_groups.py b/services/reviewhelper-api/app/reviewer_groups.py new file mode 100644 index 0000000000..377c732b7b --- /dev/null +++ b/services/reviewhelper-api/app/reviewer_groups.py @@ -0,0 +1,105 @@ +"""Per-reviewer-group configuration for automated code review. + +This is the shared config that risk/complexity gating (and, later, group-specific +skills and member-author restriction) reads. It is a checked-in YAML file rather +than env vars or a DB table: the config is a small, structured, version-controlled +list, and "easing in" a group is a reviewed change to the file. + +The file path defaults to ``reviewer_groups.yaml`` at the service root and can be +overridden with the ``REVIEWER_GROUPS_CONFIG`` environment variable. +""" + +import os +from functools import cache +from logging import getLogger +from pathlib import Path + +import yaml +from pydantic import BaseModel, Field + +logger = getLogger(__name__) + +_DEFAULT_CONFIG_PATH = Path(__file__).resolve().parent.parent / "reviewer_groups.yaml" + + +class Defaults(BaseModel): + """Fallback thresholds applied to groups that don't override them.""" + + # A review runs only when BOTH scores are strictly below their threshold. + risk_threshold: int = 3 + complexity_threshold: int = 3 + + +class ReviewerGroup(BaseModel): + """Configuration for a single Phabricator reviewer group (project).""" + + slug: str + enabled: bool = False + risk_threshold: int | None = None + complexity_threshold: int | None = None + + def effective_risk_threshold(self, defaults: Defaults) -> int: + return ( + self.risk_threshold + if self.risk_threshold is not None + else defaults.risk_threshold + ) + + def effective_complexity_threshold(self, defaults: Defaults) -> int: + return ( + self.complexity_threshold + if self.complexity_threshold is not None + else defaults.complexity_threshold + ) + + +class ReviewerGroupsConfig(BaseModel): + defaults: Defaults = Field(default_factory=Defaults) + groups: list[ReviewerGroup] = Field(default_factory=list) + + def by_slug(self, slug: str) -> ReviewerGroup | None: + for group in self.groups: + if group.slug == slug: + return group + return None + + +def _config_path() -> Path: + return Path(os.getenv("REVIEWER_GROUPS_CONFIG", str(_DEFAULT_CONFIG_PATH))) + + +@cache +def get_reviewer_groups_config() -> ReviewerGroupsConfig: + """Load and cache the reviewer-groups config for the process lifetime.""" + path = _config_path() + if not path.exists(): + logger.warning( + "reviewer groups config not found at %s; using empty config", path + ) + return ReviewerGroupsConfig() + data = yaml.safe_load(path.read_text()) or {} + return ReviewerGroupsConfig.model_validate(data) + + +def matching_groups(patch) -> list[ReviewerGroup]: + """Return the configured groups the patch is requesting review from. + + A patch matches a group when one of its reviewer-group PHIDs resolves to the + same project as the group's slug. The result preserves config order so the + first match can act as the "primary" group. + """ + # Imported here so the heavy Phabricator/searchfox import chain isn't pulled + # in just to parse the config (and so tests can monkeypatch the resolver). + from bugbug.tools.core.platforms import phabricator as phab + + config = get_reviewer_groups_config() + reviewer_projects = set(patch.reviewer_project_phids) + if not reviewer_projects: + return [] + + matched = [] + for group in config.groups: + phid = phab.resolve_project_phid(group.slug) + if phid and phid in reviewer_projects: + matched.append(group) + return matched diff --git a/services/reviewhelper-api/app/routers/internal.py b/services/reviewhelper-api/app/routers/internal.py index b83b68afdd..4886ae86c7 100644 --- a/services/reviewhelper-api/app/routers/internal.py +++ b/services/reviewhelper-api/app/routers/internal.py @@ -13,6 +13,7 @@ from app.enums import ReviewStatus from app.review_processor import ( ReviewProcessingError, + ReviewSkipped, RevisionNotYetPublicError, process_review, submit_review_to_platform, @@ -137,6 +138,14 @@ async def _set_retry_pending(db: AsyncSession): review_request.status = ReviewStatus.RETRY_PENDING await db.commit() return Response(status_code=status.HTTP_409_CONFLICT) + except ReviewSkipped as e: + logger.info("Review request %s skipped: %s", review_request_id, e.reason) + review_request.status = ReviewStatus.SKIPPED + review_request.skipped_reason = e.reason + review_request.details = e.details + await db.commit() + # Terminal, like a failure: 204 so Cloud Tasks doesn't retry. + return Response(status_code=status.HTTP_204_NO_CONTENT) except ReviewProcessingError as e: review_request.error = str(e) review_request.status = ReviewStatus.FAILED diff --git a/services/reviewhelper-api/app/routers/request.py b/services/reviewhelper-api/app/routers/request.py index e3ba4578b8..40955df049 100644 --- a/services/reviewhelper-api/app/routers/request.py +++ b/services/reviewhelper-api/app/routers/request.py @@ -117,6 +117,9 @@ def _build_response_message(review_request: ReviewRequest) -> str: if review_request.status == ReviewStatus.PUBLISHED: return f"Review Helper already posted its review for Diff {review_request.diff_id}." + if review_request.status == ReviewStatus.SKIPPED: + return f"Review Helper did not review Diff {review_request.diff_id} (reason: {review_request.skipped_reason})." + if review_request.status == ReviewStatus.FAILED: return f"The review processing for Diff {review_request.diff_id} failed with error: {review_request.error}" diff --git a/services/reviewhelper-api/pyproject.toml b/services/reviewhelper-api/pyproject.toml index 58c28a0ec5..71dfcb227e 100644 --- a/services/reviewhelper-api/pyproject.toml +++ b/services/reviewhelper-api/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "google-cloud-tasks>=2.22.0", "cloud-sql-python-connector[asyncpg]>=1.20.3", "python-dotenv>=1.0.0", + "pyyaml>=6.0", "bugbug", "sentry-sdk>=2.60.0", ] diff --git a/services/reviewhelper-api/reviewer_groups.yaml b/services/reviewhelper-api/reviewer_groups.yaml new file mode 100644 index 0000000000..ea802f68d1 --- /dev/null +++ b/services/reviewhelper-api/reviewer_groups.yaml @@ -0,0 +1,22 @@ +# Per-reviewer-group configuration for automated code review. +# +# `defaults` apply to any revision whose reviewer group isn't listed below, and +# to listed groups that don't override a value. A review runs only when BOTH the +# risk and complexity scores are strictly below their thresholds, so raising a +# threshold widens what gets auto-reviewed ("easing in"). +# +# Thresholds are 0-10. Start conservative (3/3 → only scores of 0,1,2 pass) and +# raise as the team grows comfortable. + +defaults: + risk_threshold: 3 + complexity_threshold: 3 + +groups: + - slug: ip-protection-reviewers + risk_threshold: 3 + complexity_threshold: 3 + + - slug: home-newtab-reviewers + risk_threshold: 3 + complexity_threshold: 3 diff --git a/services/reviewhelper-api/tests/conftest.py b/services/reviewhelper-api/tests/conftest.py new file mode 100644 index 0000000000..7bc0c21247 --- /dev/null +++ b/services/reviewhelper-api/tests/conftest.py @@ -0,0 +1,23 @@ +import os + +# app.config builds a pydantic Settings() at import time, which requires a set +# of env vars. Provide dummies so the app package is importable under test. +_DUMMY_ENV = { + "cloud_sql_instance": "test:test:test", + "db_user": "test", + "db_pass": "test", + "db_name": "test", + "external_api_key": "test", + "internal_api_key": "test", + "cloud_tasks_project": "test", + "cloud_tasks_location": "test", + "cloud_tasks_queue": "test", + "worker_url": "https://worker.test", + "phabricator_url": "https://phabricator.test", + "phabricator_api_key": "test", + "bugzilla_url": "https://bugzilla.test", + "bugzilla_api_key": "test", +} + +for key, value in _DUMMY_ENV.items(): + os.environ.setdefault(key, value) diff --git a/services/reviewhelper-api/tests/test_review_gating.py b/services/reviewhelper-api/tests/test_review_gating.py new file mode 100644 index 0000000000..f6abf61bee --- /dev/null +++ b/services/reviewhelper-api/tests/test_review_gating.py @@ -0,0 +1,90 @@ +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from app import review_processor +from app.review_processor import ReviewSkipped, _score_and_gate +from app.reviewer_groups import Defaults, ReviewerGroup, ReviewerGroupsConfig + +from bugbug.tools.code_review.scoring import Scores, ScoringResult +from bugbug.tools.code_review.test_coverage import ExistingCoverage + + +def _fake_patch(): + return SimpleNamespace( + raw_diff="diff --git a/foo.js b/foo.js\n--- a/foo.js\n+++ b/foo.js\n@@ -1 +1 @@\n+x\n", + patch_title="Tweak", + summary=None, + revision_id=123, + author_phid="PHID-USER-x", + has_bug=False, + bug_id=0, + reviewer_project_phids=[], + ) + + +def _patch_common(monkeypatch, *, risk, complexity, group=None): + config = ReviewerGroupsConfig( + defaults=Defaults(risk_threshold=3, complexity_threshold=3) + ) + monkeypatch.setattr(review_processor, "get_reviewer_groups_config", lambda: config) + monkeypatch.setattr( + review_processor, "matching_groups", lambda patch: [group] if group else [] + ) + monkeypatch.setattr( + review_processor, + "lookup_existing_coverage", + AsyncMock(return_value=ExistingCoverage(coverage_signal="uncovered")), + ) + scorer = SimpleNamespace( + run=AsyncMock( + return_value=ScoringResult( + scores=Scores(risk=risk, complexity=complexity), + model="claude-haiku-4-5", + usage={"input_tokens": 10}, + ) + ) + ) + monkeypatch.setattr(review_processor, "get_risk_scorer", lambda: scorer) + return config + + +@pytest.mark.asyncio +async def test_gate_skips_when_above_threshold(monkeypatch): + _patch_common(monkeypatch, risk=5, complexity=1) + review_request = SimpleNamespace(revision_id=123) + + with pytest.raises(ReviewSkipped) as exc: + await _score_and_gate(_fake_patch(), review_request) + + assert exc.value.reason == "above_threshold" + # Scores are recorded on the request even though we skipped. + assert review_request.risk == 5 + assert review_request.complexity == 1 + assert review_request.coverage_signal == "uncovered" + + +@pytest.mark.asyncio +async def test_gate_passes_when_below_threshold(monkeypatch): + _patch_common(monkeypatch, risk=1, complexity=2) + review_request = SimpleNamespace(revision_id=123) + + block, details = await _score_and_gate(_fake_patch(), review_request) + + assert "" in block + assert details["scoring_model"] == "claude-haiku-4-5" + assert review_request.risk == 1 + assert review_request.complexity == 2 + + +@pytest.mark.asyncio +async def test_gate_uses_group_threshold_override(monkeypatch): + group = ReviewerGroup(slug="ip-protection-reviewers", risk_threshold=6) + _patch_common(monkeypatch, risk=5, complexity=1, group=group) + review_request = SimpleNamespace(revision_id=123) + + # risk=5 would skip under the default threshold of 3, but the group raised + # it to 6, so the review proceeds. + block, details = await _score_and_gate(_fake_patch(), review_request) + assert details["thresholds"]["group"] == "ip-protection-reviewers" + assert details["thresholds"]["risk"] == 6 diff --git a/services/reviewhelper-api/tests/test_reviewer_groups.py b/services/reviewhelper-api/tests/test_reviewer_groups.py new file mode 100644 index 0000000000..da4787993a --- /dev/null +++ b/services/reviewhelper-api/tests/test_reviewer_groups.py @@ -0,0 +1,57 @@ +from types import SimpleNamespace + +from app.reviewer_groups import ( + Defaults, + ReviewerGroup, + ReviewerGroupsConfig, + matching_groups, +) + + +def test_effective_thresholds_fall_back_to_defaults(): + defaults = Defaults(risk_threshold=3, complexity_threshold=4) + group = ReviewerGroup(slug="g", risk_threshold=5) + assert group.effective_risk_threshold(defaults) == 5 + assert group.effective_complexity_threshold(defaults) == 4 + + +def test_by_slug(): + config = ReviewerGroupsConfig( + groups=[ReviewerGroup(slug="a"), ReviewerGroup(slug="b")] + ) + assert config.by_slug("b").slug == "b" + assert config.by_slug("missing") is None + + +def test_matching_groups_filters_to_requested_projects(monkeypatch): + config = ReviewerGroupsConfig( + groups=[ + ReviewerGroup(slug="ip-protection-reviewers"), + ReviewerGroup(slug="home-newtab-reviewers"), + ] + ) + monkeypatch.setattr( + "app.reviewer_groups.get_reviewer_groups_config", lambda: config + ) + + phid_by_slug = { + "ip-protection-reviewers": "PHID-PROJ-ip", + "home-newtab-reviewers": "PHID-PROJ-newtab", + } + monkeypatch.setattr( + "bugbug.tools.core.platforms.phabricator.resolve_project_phid", + lambda slug: phid_by_slug.get(slug), + ) + + patch = SimpleNamespace(reviewer_project_phids=["PHID-PROJ-ip"]) + matched = matching_groups(patch) + assert [g.slug for g in matched] == ["ip-protection-reviewers"] + + +def test_matching_groups_empty_when_no_reviewer_projects(monkeypatch): + monkeypatch.setattr( + "app.reviewer_groups.get_reviewer_groups_config", + lambda: ReviewerGroupsConfig(groups=[ReviewerGroup(slug="a")]), + ) + patch = SimpleNamespace(reviewer_project_phids=[]) + assert matching_groups(patch) == [] diff --git a/tests/test_code_review_scoring.py b/tests/test_code_review_scoring.py new file mode 100644 index 0000000000..acdc6f6228 --- /dev/null +++ b/tests/test_code_review_scoring.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from bugbug.tools.code_review.diff_analysis import ( + analyze_diff, + format_test_signal_block, +) +from bugbug.tools.code_review.scoring import RiskComplexityScorer, Scores +from bugbug.tools.code_review.test_coverage import ( + ExistingCoverage, + format_coverage_block, + lookup_existing_coverage, +) + + +def _diff(*files: str) -> str: + return "\n".join(files) + + +def _git_file(path: str, added: int = 1, removed: int = 0) -> str: + body = "\n".join(["+new line"] * added + ["-old line"] * removed) + return ( + f"diff --git a/{path} b/{path}\n" + f"--- a/{path}\n+++ b/{path}\n" + f"@@ -1,{removed or 1} +1,{added or 1} @@\n" + f"{body}" + ) + + +# --------------------------------------------------------------------------- +# diff_analysis +# --------------------------------------------------------------------------- + + +def test_no_code_change(): + assert analyze_diff("").in_diff_test_signal == "no_code_change" + + +def test_tests_only(): + diff = _git_file("browser/components/foo/test/browser_foo.js", added=5) + stats = analyze_diff(diff) + assert stats.in_diff_test_signal == "tests_only" + assert stats.non_test_files_changed == 0 + assert stats.test_files_changed == 1 + + +def test_absent_when_only_non_test_code(): + diff = _git_file("browser/components/foo/Foo.sys.mjs", added=20) + stats = analyze_diff(diff) + assert stats.in_diff_test_signal == "absent" + assert stats.non_test_paths == ["browser/components/foo/Foo.sys.mjs"] + + +def test_sparse_low_test_ratio(): + diff = _diff( + _git_file("browser/components/foo/Foo.sys.mjs", added=100), + _git_file("browser/components/foo/test/test_foo.js", added=5), + ) + assert analyze_diff(diff).in_diff_test_signal == "sparse" + + +def test_adequate_high_test_ratio(): + diff = _diff( + _git_file("browser/components/foo/Foo.sys.mjs", added=10), + _git_file("browser/components/foo/test/test_foo.js", added=10), + ) + assert analyze_diff(diff).in_diff_test_signal == "adequate" + + +def test_format_test_signal_block_includes_coverage(): + stats = analyze_diff(_git_file("a/b/C.cpp", added=3)) + block = format_test_signal_block(stats, coverage_block="coverage_signal=uncovered") + assert "" in block + assert "in_diff_test_signal=absent" in block + assert "coverage_signal=uncovered" in block + assert block.rstrip().endswith("") + + +# --------------------------------------------------------------------------- +# test_coverage +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_lookup_no_non_test_files(): + result = await lookup_existing_coverage([]) + assert result.coverage_signal == "no_non_test_files" + + +@pytest.mark.asyncio +async def test_lookup_partial_coverage(): + async def fake_search(query, tests, limit): + if query == "Covered": + return [("dom/tests/test_covered.js", 10, "...")] + return [] + + client = AsyncMock() + client.search.side_effect = fake_search + with patch( + "bugbug.tools.code_review.test_coverage._get_client", return_value=client + ): + result = await lookup_existing_coverage( + ["dom/base/Covered.cpp", "dom/base/Uncovered.cpp"] + ) + assert result.coverage_signal == "partial" + assert "dom/base/Covered.cpp" in result.covered_paths + assert "dom/base/Uncovered.cpp" in result.uncovered_paths + + +@pytest.mark.asyncio +async def test_lookup_searchfox_error_degrades(): + client = AsyncMock() + client.search.side_effect = Exception("searchfox down") + with patch( + "bugbug.tools.code_review.test_coverage._get_client", return_value=client + ): + result = await lookup_existing_coverage(["dom/base/Element.cpp"]) + assert result.coverage_signal == "skipped_searchfox_error" + + +@pytest.mark.asyncio +async def test_lookup_large_diff_skipped(): + client = AsyncMock() + client.search.return_value = [] + paths = [f"dom/base/File{i}.cpp" for i in range(15)] + with patch( + "bugbug.tools.code_review.test_coverage._get_client", return_value=client + ): + result = await lookup_existing_coverage(paths, file_cap=10) + assert result.coverage_signal == "skipped_large_diff" + assert result.candidate_count == 15 + + +def test_format_coverage_block_none_when_no_files(): + assert ( + format_coverage_block(ExistingCoverage(coverage_signal="no_non_test_files")) + is None + ) + + +# --------------------------------------------------------------------------- +# scoring +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_risk_scorer_parses_and_captures_usage(): + fake_llm = MagicMock() + structured = AsyncMock() + fake_llm.with_structured_output.return_value = structured + scorer = RiskComplexityScorer(fake_llm, "claude-haiku-4-5") + + raw = SimpleNamespace( + usage_metadata={ + "input_tokens": 100, + "output_tokens": 20, + "input_token_details": {"cache_read": 10, "cache_creation": 5}, + } + ) + structured.ainvoke.return_value = { + "parsed": Scores(risk=2, complexity=1, risk_factors=["leaf UI"]), + "raw": raw, + } + + result = await scorer.run( + title="Tweak button", + summary=None, + revision_id=42, + author="PHID-USER-x", + bug_id=None, + raw_diff="diff --git a/a b/a", + test_signals_block="...", + ) + + assert result.scores.risk == 2 + assert result.scores.complexity == 1 + assert result.model == "claude-haiku-4-5" + assert result.usage["input_tokens"] == 100 + assert result.usage["cache_read_input_tokens"] == 10 + assert result.usage["cache_creation_input_tokens"] == 5 + + +@pytest.mark.asyncio +async def test_risk_scorer_raises_on_parse_failure(): + from bugbug.tools.core.exceptions import ModelResultError + + fake_llm = MagicMock() + structured = AsyncMock() + fake_llm.with_structured_output.return_value = structured + scorer = RiskComplexityScorer(fake_llm, "claude-haiku-4-5") + structured.ainvoke.return_value = { + "parsed": None, + "raw": SimpleNamespace(), + "parsing_error": "bad", + } + + with pytest.raises(ModelResultError): + await scorer.run( + title="t", + summary=None, + revision_id=1, + author="a", + bug_id=None, + raw_diff="d", + ) From 11661611fcdf18437f5b9e15fb303390406362bd Mon Sep 17 00:00:00 2001 From: Jared Wein Date: Tue, 23 Jun 2026 16:39:26 -0400 Subject: [PATCH 2/4] Match reviewer groups via aliases and rotation history Consume PhabricatorPatch.historical_reviewer_project_phids so a group that was assigned via a rotation (added, then swapped for an individual and removed) still matches. Add an `aliases` field to ReviewerGroup so a rotation project (e.g. home-newtab-reviewers-rotation) counts as the same logical group as home-newtab-reviewers, and resolve every slug (own + aliases) when matching. Falls back to the current reviewer snapshot for patches without history. --- .../reviewhelper-api/app/reviewer_groups.py | 27 ++++++++++--- .../reviewhelper-api/reviewer_groups.yaml | 5 +++ .../tests/test_reviewer_groups.py | 40 +++++++++++++++++++ 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/services/reviewhelper-api/app/reviewer_groups.py b/services/reviewhelper-api/app/reviewer_groups.py index 377c732b7b..02b62bbe79 100644 --- a/services/reviewhelper-api/app/reviewer_groups.py +++ b/services/reviewhelper-api/app/reviewer_groups.py @@ -34,10 +34,18 @@ class ReviewerGroup(BaseModel): """Configuration for a single Phabricator reviewer group (project).""" slug: str + # Additional project slugs that count as this same logical group — e.g. a + # review rotation's project (`-rotation`), which is added as the + # reviewer and then swapped for an individual member. + aliases: list[str] = Field(default_factory=list) enabled: bool = False risk_threshold: int | None = None complexity_threshold: int | None = None + def all_slugs(self) -> list[str]: + """The group's own slug plus any alias slugs.""" + return [self.slug, *self.aliases] + def effective_risk_threshold(self, defaults: Defaults) -> int: return ( self.risk_threshold @@ -84,22 +92,29 @@ def get_reviewer_groups_config() -> ReviewerGroupsConfig: def matching_groups(patch) -> list[ReviewerGroup]: """Return the configured groups the patch is requesting review from. - A patch matches a group when one of its reviewer-group PHIDs resolves to the - same project as the group's slug. The result preserves config order so the - first match can act as the "primary" group. + A patch matches a group when any of the group's slugs (its own or an alias) + resolves to a project that was a reviewer of the revision. We match against + ``historical_reviewer_project_phids`` so a rotation group that was added and + later removed (replaced by an individual) still counts; falls back to the + current reviewer snapshot for patches that don't expose history. The result + preserves config order so the first match can act as the "primary" group. """ # Imported here so the heavy Phabricator/searchfox import chain isn't pulled # in just to parse the config (and so tests can monkeypatch the resolver). from bugbug.tools.core.platforms import phabricator as phab config = get_reviewer_groups_config() - reviewer_projects = set(patch.reviewer_project_phids) + reviewer_projects = set( + getattr(patch, "historical_reviewer_project_phids", None) + or patch.reviewer_project_phids + ) if not reviewer_projects: return [] matched = [] for group in config.groups: - phid = phab.resolve_project_phid(group.slug) - if phid and phid in reviewer_projects: + group_phids = {phab.resolve_project_phid(slug) for slug in group.all_slugs()} + group_phids.discard(None) + if group_phids & reviewer_projects: matched.append(group) return matched diff --git a/services/reviewhelper-api/reviewer_groups.yaml b/services/reviewhelper-api/reviewer_groups.yaml index ea802f68d1..635aba74f2 100644 --- a/services/reviewhelper-api/reviewer_groups.yaml +++ b/services/reviewhelper-api/reviewer_groups.yaml @@ -12,11 +12,16 @@ defaults: risk_threshold: 3 complexity_threshold: 3 +# `aliases` lists extra project slugs that count as the same logical group — +# notably a review rotation's `-rotation` project, which is added as the +# reviewer and then swapped for an individual member before review time. groups: - slug: ip-protection-reviewers + aliases: [ip-protection-reviewers-rotation] risk_threshold: 3 complexity_threshold: 3 - slug: home-newtab-reviewers + aliases: [home-newtab-reviewers-rotation] risk_threshold: 3 complexity_threshold: 3 diff --git a/services/reviewhelper-api/tests/test_reviewer_groups.py b/services/reviewhelper-api/tests/test_reviewer_groups.py index da4787993a..c0bd03b569 100644 --- a/services/reviewhelper-api/tests/test_reviewer_groups.py +++ b/services/reviewhelper-api/tests/test_reviewer_groups.py @@ -55,3 +55,43 @@ def test_matching_groups_empty_when_no_reviewer_projects(monkeypatch): ) patch = SimpleNamespace(reviewer_project_phids=[]) assert matching_groups(patch) == [] + + +def test_all_slugs_includes_aliases(): + group = ReviewerGroup( + slug="home-newtab-reviewers", aliases=["home-newtab-reviewers-rotation"] + ) + assert group.all_slugs() == [ + "home-newtab-reviewers", + "home-newtab-reviewers-rotation", + ] + + +def test_matching_groups_recovers_rotation_via_alias_and_history(monkeypatch): + # Models D308166: the rotation project was a reviewer, then removed and + # replaced by an individual — so it survives only in the historical list. + config = ReviewerGroupsConfig( + groups=[ + ReviewerGroup( + slug="home-newtab-reviewers", + aliases=["home-newtab-reviewers-rotation"], + ) + ] + ) + monkeypatch.setattr( + "app.reviewer_groups.get_reviewer_groups_config", lambda: config + ) + monkeypatch.setattr( + "bugbug.tools.core.platforms.phabricator.resolve_project_phid", + lambda slug: { + "home-newtab-reviewers": "PHID-PROJ-newtab", + "home-newtab-reviewers-rotation": "PHID-PROJ-newtabrotation", + }.get(slug), + ) + + patch = SimpleNamespace( + reviewer_project_phids=[], # group already removed from current reviewers + historical_reviewer_project_phids=["PHID-PROJ-newtabrotation"], + ) + matched = matching_groups(patch) + assert [g.slug for g in matched] == ["home-newtab-reviewers"] From 259cb76f02927c4500f47c0e4925d0a54103a6ce Mon Sep 17 00:00:00 2001 From: Jared Wein Date: Thu, 28 May 2026 17:31:17 -0400 Subject: [PATCH 3/4] Limit automated review to enabled groups and member authors Builds on the reviewer-groups config to control rollout per team: - Only revisions requesting review from a group marked `enabled: true` are auto-reviewed (skip reason "not_enabled"), so teams can be turned on one at a time (e.g. ip-protection-reviewers first, home-newtab-reviewers later). - `restrict_to_member_authors` (default true) reviews only patches whose author is a member of the group's Phabricator project (skip "author_not_in_group"); an empty membership lookup over-includes rather than dropping everything. - An optional per-group `opt_out` PHID list excludes individual members (skip "author_opted_out"). These cheap metadata gates run before the paid scoring call. Adds tests for each gate. --- .../reviewhelper-api/app/review_processor.py | 52 +++++--- .../reviewhelper-api/app/reviewer_groups.py | 7 ++ .../reviewhelper-api/reviewer_groups.yaml | 10 ++ .../tests/test_review_gating.py | 117 ++++++++++++++++-- 4 files changed, 162 insertions(+), 24 deletions(-) diff --git a/services/reviewhelper-api/app/review_processor.py b/services/reviewhelper-api/app/review_processor.py index 3f002fc784..c156e27459 100644 --- a/services/reviewhelper-api/app/review_processor.py +++ b/services/reviewhelper-api/app/review_processor.py @@ -21,6 +21,8 @@ from bugbug.tools.core.platforms.phabricator import ( PhabricatorPatch, get_phabricator_client, + get_project_members, + resolve_project_phid, ) logger = logging.getLogger(__name__) @@ -148,19 +150,20 @@ async def _score_and_gate( ``ReviewSkipped`` when either score is at or above the group's threshold. """ config = get_reviewer_groups_config() - groups = matching_groups(patch) - # First matching group acts as the primary; otherwise fall back to defaults. - primary = groups[0] if groups else None - risk_threshold = ( - primary.effective_risk_threshold(config.defaults) - if primary - else config.defaults.risk_threshold - ) - complexity_threshold = ( - primary.effective_complexity_threshold(config.defaults) - if primary - else config.defaults.complexity_threshold - ) + + # Only revisions requesting review from an enabled group are auto-reviewed. + enabled_groups = [group for group in matching_groups(patch) if group.enabled] + if not enabled_groups: + raise ReviewSkipped("not_enabled") + + # First enabled matching group acts as the primary. + primary = enabled_groups[0] + + # Cheap author gates run before the (paid) scoring call. + _enforce_author_gates(primary, patch) + + risk_threshold = primary.effective_risk_threshold(config.defaults) + complexity_threshold = primary.effective_complexity_threshold(config.defaults) diff_stats = analyze_diff(patch.raw_diff) coverage = await lookup_existing_coverage(diff_stats.non_test_paths) @@ -200,7 +203,7 @@ async def _score_and_gate( "thresholds": { "risk": risk_threshold, "complexity": complexity_threshold, - "group": primary.slug if primary else None, + "group": primary.slug, }, } @@ -218,6 +221,27 @@ async def _score_and_gate( return test_signals_block, scoring_details +def _enforce_author_gates(group, patch: PhabricatorPatch) -> None: + """Skip review based on the revision author's relationship to the group. + + Raises ``ReviewSkipped`` when the author has opted out, or when the group + restricts review to its own members and the author isn't one. + """ + author_phid = patch.author_phid + + if author_phid in group.opt_out: + raise ReviewSkipped("author_opted_out") + + if group.restrict_to_member_authors: + group_phid = resolve_project_phid(group.slug) + members = get_project_members(group_phid) if group_phid else frozenset() + # If the membership lookup is empty (transient failure / unresolved + # project), don't skip — we'd rather over-include than silently drop + # everything for the group. + if members and author_phid not in members: + raise ReviewSkipped("author_not_in_group") + + def submit_review_to_platform( review_request: ReviewRequest, generated_comments: Iterable[GeneratedComment] ) -> AsyncIterator[tuple[GeneratedComment, int]]: diff --git a/services/reviewhelper-api/app/reviewer_groups.py b/services/reviewhelper-api/app/reviewer_groups.py index 02b62bbe79..a035cece2c 100644 --- a/services/reviewhelper-api/app/reviewer_groups.py +++ b/services/reviewhelper-api/app/reviewer_groups.py @@ -41,6 +41,13 @@ class ReviewerGroup(BaseModel): enabled: bool = False risk_threshold: int | None = None complexity_threshold: int | None = None + # When True, only review revisions authored by a member of this group's + # Phabricator project. Lets a team dogfood automated review on its own + # patches before opening it to outside contributors. + restrict_to_member_authors: bool = True + # PHIDs of group members who have opted out of automated review on their + # own patches. + opt_out: list[str] = Field(default_factory=list) def all_slugs(self) -> list[str]: """The group's own slug plus any alias slugs.""" diff --git a/services/reviewhelper-api/reviewer_groups.yaml b/services/reviewhelper-api/reviewer_groups.yaml index 635aba74f2..64633a4b05 100644 --- a/services/reviewhelper-api/reviewer_groups.yaml +++ b/services/reviewhelper-api/reviewer_groups.yaml @@ -12,16 +12,26 @@ defaults: risk_threshold: 3 complexity_threshold: 3 +# Only revisions whose reviewer group is listed AND `enabled: true` get an +# automated review. Roll out one group at a time by flipping `enabled`. +# `restrict_to_member_authors` (default true) limits review to patches authored +# by a member of the group; add a member's PHID to `opt_out` to exclude them. # `aliases` lists extra project slugs that count as the same logical group — # notably a review rotation's `-rotation` project, which is added as the # reviewer and then swapped for an individual member before review time. groups: - slug: ip-protection-reviewers aliases: [ip-protection-reviewers-rotation] + enabled: true risk_threshold: 3 complexity_threshold: 3 + restrict_to_member_authors: true + opt_out: [] - slug: home-newtab-reviewers aliases: [home-newtab-reviewers-rotation] + enabled: false risk_threshold: 3 complexity_threshold: 3 + restrict_to_member_authors: true + opt_out: [] diff --git a/services/reviewhelper-api/tests/test_review_gating.py b/services/reviewhelper-api/tests/test_review_gating.py index f6abf61bee..509f80daf9 100644 --- a/services/reviewhelper-api/tests/test_review_gating.py +++ b/services/reviewhelper-api/tests/test_review_gating.py @@ -9,21 +9,31 @@ from bugbug.tools.code_review.scoring import Scores, ScoringResult from bugbug.tools.code_review.test_coverage import ExistingCoverage +AUTHOR = "PHID-USER-x" -def _fake_patch(): + +def _fake_patch(author_phid=AUTHOR): return SimpleNamespace( raw_diff="diff --git a/foo.js b/foo.js\n--- a/foo.js\n+++ b/foo.js\n@@ -1 +1 @@\n+x\n", patch_title="Tweak", summary=None, revision_id=123, - author_phid="PHID-USER-x", + author_phid=author_phid, has_bug=False, bug_id=0, - reviewer_project_phids=[], + reviewer_project_phids=["PHID-PROJ-x"], ) -def _patch_common(monkeypatch, *, risk, complexity, group=None): +def _patch_common(monkeypatch, *, risk, complexity, group=None, members=frozenset()): + if group is None: + # Default: an enabled group with no author restriction, so the gating + # tests exercise only the score threshold. + group = ReviewerGroup( + slug="ip-protection-reviewers", + enabled=True, + restrict_to_member_authors=False, + ) config = ReviewerGroupsConfig( defaults=Defaults(risk_threshold=3, complexity_threshold=3) ) @@ -31,6 +41,10 @@ def _patch_common(monkeypatch, *, risk, complexity, group=None): monkeypatch.setattr( review_processor, "matching_groups", lambda patch: [group] if group else [] ) + monkeypatch.setattr( + review_processor, "resolve_project_phid", lambda slug: "PHID-PROJ-x" + ) + monkeypatch.setattr(review_processor, "get_project_members", lambda phid: members) monkeypatch.setattr( review_processor, "lookup_existing_coverage", @@ -49,6 +63,11 @@ def _patch_common(monkeypatch, *, risk, complexity, group=None): return config +# --------------------------------------------------------------------------- +# score threshold gate +# --------------------------------------------------------------------------- + + @pytest.mark.asyncio async def test_gate_skips_when_above_threshold(monkeypatch): _patch_common(monkeypatch, risk=5, complexity=1) @@ -58,7 +77,6 @@ async def test_gate_skips_when_above_threshold(monkeypatch): await _score_and_gate(_fake_patch(), review_request) assert exc.value.reason == "above_threshold" - # Scores are recorded on the request even though we skipped. assert review_request.risk == 5 assert review_request.complexity == 1 assert review_request.coverage_signal == "uncovered" @@ -74,17 +92,96 @@ async def test_gate_passes_when_below_threshold(monkeypatch): assert "" in block assert details["scoring_model"] == "claude-haiku-4-5" assert review_request.risk == 1 - assert review_request.complexity == 2 @pytest.mark.asyncio async def test_gate_uses_group_threshold_override(monkeypatch): - group = ReviewerGroup(slug="ip-protection-reviewers", risk_threshold=6) + group = ReviewerGroup( + slug="ip-protection-reviewers", + enabled=True, + risk_threshold=6, + restrict_to_member_authors=False, + ) _patch_common(monkeypatch, risk=5, complexity=1, group=group) review_request = SimpleNamespace(revision_id=123) - # risk=5 would skip under the default threshold of 3, but the group raised - # it to 6, so the review proceeds. - block, details = await _score_and_gate(_fake_patch(), review_request) + _block, details = await _score_and_gate(_fake_patch(), review_request) assert details["thresholds"]["group"] == "ip-protection-reviewers" assert details["thresholds"]["risk"] == 6 + + +# --------------------------------------------------------------------------- +# enablement + author gates (run before scoring) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_skips_when_no_enabled_group(monkeypatch): + disabled = ReviewerGroup(slug="ip-protection-reviewers", enabled=False) + scorer = _patch_common(monkeypatch, risk=0, complexity=0, group=disabled) # noqa: F841 + + with pytest.raises(ReviewSkipped) as exc: + await _score_and_gate(_fake_patch(), SimpleNamespace(revision_id=123)) + assert exc.value.reason == "not_enabled" + + +@pytest.mark.asyncio +async def test_skips_when_author_not_a_member(monkeypatch): + group = ReviewerGroup( + slug="ip-protection-reviewers", enabled=True, restrict_to_member_authors=True + ) + _patch_common( + monkeypatch, + risk=0, + complexity=0, + group=group, + members=frozenset({"PHID-USER-other"}), + ) + + with pytest.raises(ReviewSkipped) as exc: + await _score_and_gate(_fake_patch(), SimpleNamespace(revision_id=123)) + assert exc.value.reason == "author_not_in_group" + + +@pytest.mark.asyncio +async def test_member_author_passes_membership_gate(monkeypatch): + group = ReviewerGroup( + slug="ip-protection-reviewers", enabled=True, restrict_to_member_authors=True + ) + _patch_common( + monkeypatch, risk=0, complexity=0, group=group, members=frozenset({AUTHOR}) + ) + + block, _details = await _score_and_gate( + _fake_patch(), SimpleNamespace(revision_id=123) + ) + assert "" in block + + +@pytest.mark.asyncio +async def test_empty_membership_lookup_does_not_skip(monkeypatch): + group = ReviewerGroup( + slug="ip-protection-reviewers", enabled=True, restrict_to_member_authors=True + ) + # Empty membership set (transient failure) — over-include rather than drop. + _patch_common(monkeypatch, risk=0, complexity=0, group=group, members=frozenset()) + + block, _details = await _score_and_gate( + _fake_patch(), SimpleNamespace(revision_id=123) + ) + assert "" in block + + +@pytest.mark.asyncio +async def test_opted_out_author_is_skipped(monkeypatch): + group = ReviewerGroup( + slug="ip-protection-reviewers", + enabled=True, + restrict_to_member_authors=False, + opt_out=[AUTHOR], + ) + _patch_common(monkeypatch, risk=0, complexity=0, group=group) + + with pytest.raises(ReviewSkipped) as exc: + await _score_and_gate(_fake_patch(), SimpleNamespace(revision_id=123)) + assert exc.value.reason == "author_opted_out" From 2dcfe1adf6ee3920186e22e08fb6d27ad16e898e Mon Sep 17 00:00:00 2001 From: Jared Wein Date: Mon, 22 Jun 2026 15:43:08 -0400 Subject: [PATCH 4/4] Match review-context rules on reviewer groups Implements the previously-stubbed `review` predicate in the code-review external-content engine so a review-context.toml rule can target a reviewer group (the capability upstream left unimplemented): - review_context.py: implement _review_matches for `review.reviewer`, `review.blocking_reviewer`, and `review.author`. A predicate matches when every facet it specifies matches; reviewer groups are matched by Phabricator project slug (resolved via resolve_project_phid) against the revision's requested reviewers. Thread a ReviewInfo (author + reviewer/blocking group PHIDs) through rule_matches/predicate_matches/collect_actions, built from the patch in load_external_content_for_diff. Fails closed when no review info is available (e.g. a GitHub diff). - phabricator.py: expose blocking_reviewer_project_phids and author_username, refactoring the reviewers fetch into a shared _reviewers cached_property. - docs/code-review-skills.md: document the now-working `review` predicate. This replaces the earlier bespoke per-group skill mechanism: teams attach area guidance by reviewer group through the existing review-context.toml, with no second skill system. Depends on the reviewer-group plumbing from the PhabricatorPatch membership change. --- bugbug/tools/code_review/review_context.py | 101 +++++++++++++++++++-- bugbug/tools/core/platforms/phabricator.py | 32 +++++-- docs/code-review-skills.md | 13 ++- tests/test_code_review.py | 84 +++++++++++++++++ 4 files changed, 209 insertions(+), 21 deletions(-) diff --git a/bugbug/tools/code_review/review_context.py b/bugbug/tools/code_review/review_context.py index fa34beb409..586f6f1aea 100644 --- a/bugbug/tools/code_review/review_context.py +++ b/bugbug/tools/code_review/review_context.py @@ -150,10 +150,42 @@ def parse_diff_files(diff: str) -> set[str]: return files +@dataclass(frozen=True) +class ReviewInfo: + """Reviewer-related facts about a revision, used by the ``review`` predicate.""" + + author: str | None = None + # Reviewer-group (Phabricator project) PHIDs requested on the revision. + reviewer_groups: frozenset[str] = frozenset() + # The subset of reviewer groups that are *blocking* reviewers. + blocking_reviewer_groups: frozenset[str] = frozenset() + + +async def build_review_info(patch) -> ReviewInfo: + """Build a ReviewInfo from a patch, tolerating patches that lack the fields.""" + try: + reviewer_groups = frozenset(getattr(patch, "reviewer_project_phids", []) or []) + blocking = frozenset( + getattr(patch, "blocking_reviewer_project_phids", []) or [] + ) + author = getattr(patch, "author_username", None) + except Exception: + logger.exception("Could not build review info for the patch") + return ReviewInfo() + return ReviewInfo( + author=author, + reviewer_groups=reviewer_groups, + blocking_reviewer_groups=blocking, + ) + + def rule_matches( - rule: Rule, changed_files: set[str], bug_component: str | None = None + rule: Rule, + changed_files: set[str], + bug_component: str | None = None, + review: ReviewInfo | None = None, ) -> bool: - return predicate_matches(rule.when, changed_files, bug_component) + return predicate_matches(rule.when, changed_files, bug_component, review) def _normalise_path(path: str) -> str: @@ -188,8 +220,51 @@ def _bugzilla_matches( return False -def _review_matches(predicate: ReviewPredicate) -> bool: - return False +def _resolve_group_slugs(slugs: list[str]) -> set[str]: + """Resolve reviewer-group slugs to project PHIDs (cached, best-effort).""" + from bugbug.tools.core.platforms import phabricator as phab + + resolved = set() + for slug in slugs: + phid = phab.resolve_project_phid(slug) + if phid: + resolved.add(phid) + return resolved + + +def _review_matches( + predicate: ReviewPredicate, review: ReviewInfo | None = None +) -> bool: + """Match a revision's reviewers/author against a ReviewPredicate. + + Each specified facet (``author``, ``reviewer``, ``blocking_reviewer``) must + match — mirroring FilePredicate's include/exclude/ext conjunction. A + predicate with no facets, or a revision with no review info, never matches. + """ + if review is None: + return False + + matched_any_facet = False + + if predicate.author: + if review.author not in predicate.author: + return False + matched_any_facet = True + + if predicate.reviewer: + if not (review.reviewer_groups & _resolve_group_slugs(predicate.reviewer)): + return False + matched_any_facet = True + + if predicate.blocking_reviewer: + if not ( + review.blocking_reviewer_groups + & _resolve_group_slugs(predicate.blocking_reviewer) + ): + return False + matched_any_facet = True + + return matched_any_facet def _patch_matches(predicate: PatchPredicate) -> bool: @@ -200,20 +275,21 @@ def predicate_matches( predicate: Predicate, changed_files: set[str], bug_component: str | None = None, + review: ReviewInfo | None = None, ) -> bool: match predicate: case AllPredicate(children=children): return all( - predicate_matches(child, changed_files, bug_component) + predicate_matches(child, changed_files, bug_component, review) for child in children ) case AnyPredicate(children=children): return any( - predicate_matches(child, changed_files, bug_component) + predicate_matches(child, changed_files, bug_component, review) for child in children ) case NotPredicate(child=child): - return not predicate_matches(child, changed_files, bug_component) + return not predicate_matches(child, changed_files, bug_component, review) case AnyFilePredicate(predicate=file_predicate): return any(_file_matches(file_predicate, f) for f in changed_files) case AllFilesPredicate(predicate=file_predicate): @@ -223,7 +299,7 @@ def predicate_matches( case BugzillaPredicate(): return _bugzilla_matches(predicate, bug_component) case ReviewPredicate(): - return _review_matches(predicate) + return _review_matches(predicate, review) case PatchPredicate(): return _patch_matches(predicate) raise TypeError(f"Unknown predicate type: {type(predicate).__name__}") @@ -356,6 +432,7 @@ def collect_actions( config: ReviewContextConfig, bug_component: str | None = None, extra_context_toml: str | None = None, + review: ReviewInfo | None = None, ) -> list[MatchedAction]: """Return deduplicated actions matched from config against the diff. @@ -386,7 +463,7 @@ def collect_actions( enumerate(config.rules), key=lambda item: (-item[1].priority, item[0]) ) for _, rule in ordered_rules: - if not rule_matches(rule, changed_files, bug_component): + if not rule_matches(rule, changed_files, bug_component, review): continue rule_name = rule.name new_actions = [] @@ -530,11 +607,15 @@ async def load_external_content_for_diff( return [] bug_component: str | None = None + review: ReviewInfo | None = None if patch is not None: bug_component = await patch.bug_component() + review = await build_review_info(patch) try: - actions = collect_actions(diff, config, bug_component, extra_context_toml) + actions = collect_actions( + diff, config, bug_component, extra_context_toml, review + ) except (tomllib.TOMLDecodeError, ReviewContextValidationError): logger.exception("Could not parse extra review context") return [] diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index 39baabd645..8a89d47c35 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -561,11 +561,12 @@ def author_phid(self) -> str: return self._revision_metadata["fields"]["authorPHID"] @cached_property - def reviewer_phids(self) -> list[str]: - """PHIDs of the revision's reviewers (a mix of users and projects). + def _reviewers(self) -> list[dict]: + """Raw reviewer entries from the revision's reviewers attachment. The reviewers attachment is not part of the default revision metadata, so we fetch it with a dedicated ``differential.revision.search`` call. + Each entry carries ``reviewerPHID`` and ``isBlocking``. """ phabricator = get_phabricator_client() response = phabricator.request( @@ -576,12 +577,12 @@ def reviewer_phids(self) -> list[str]: data = response.get("data") or [] if not data: return [] - reviewers = data[0].get("attachments", {}).get("reviewers", {}) - return [ - r["reviewerPHID"] - for r in reviewers.get("reviewers", []) - if r.get("reviewerPHID") - ] + return data[0].get("attachments", {}).get("reviewers", {}).get("reviewers", []) + + @property + def reviewer_phids(self) -> list[str]: + """PHIDs of the revision's reviewers (a mix of users and projects).""" + return [r["reviewerPHID"] for r in self._reviewers if r.get("reviewerPHID")] @property def reviewer_project_phids(self) -> list[str]: @@ -633,6 +634,21 @@ def _add(phid: str) -> None: return phids + @property + def blocking_reviewer_project_phids(self) -> list[str]: + """PHIDs of the revision's *blocking* reviewer groups.""" + return [ + r["reviewerPHID"] + for r in self._reviewers + if r.get("isBlocking") + and r.get("reviewerPHID", "").startswith("PHID-PROJ-") + ] + + @property + def author_username(self) -> str | None: + """The revision author's Phabricator username (email), if resolvable.""" + return self._users_info.get(self.author_phid, {}).get("email") + @property def diff_author_phid(self) -> str: return self._diff_metadata["authorPHID"] diff --git a/docs/code-review-skills.md b/docs/code-review-skills.md index ab0cbfd9ae..ad5a611b31 100644 --- a/docs/code-review-skills.md +++ b/docs/code-review-skills.md @@ -83,9 +83,16 @@ The patch's associated bug ID is read from `patch.bug_id` (available on component string compared against the rule is `"Product::Component"`. The schema accepts `bugzilla`, `review`, and `patch` predicates. Runtime -matching currently implements `bugzilla.component`; `bugzilla.product`, -`bugzilla.keywords`, `bugzilla.severity`, `review.*`, and `patch.*` are parsed -and validated but fail closed until trusted metadata is wired into the matcher. +matching implements `bugzilla.component` and the `review` predicate +(`review.reviewer`, `review.blocking_reviewer`, `review.author`); a `review` +predicate matches when every facet it specifies matches, so a rule can target a +reviewer group (e.g. `review = { reviewer = ["ip-protection-reviewers"] }`). +Reviewer groups are matched by Phabricator project slug, resolved to the +revision's requested reviewers. `bugzilla.product`, `bugzilla.keywords`, +`bugzilla.severity`, and `patch.*` are parsed and validated but fail closed +until the corresponding metadata is wired into the matcher. When the platform +provides no reviewer information (e.g. a plain GitHub diff), `review` predicates +fail closed. ### `fetch_revision` diff --git a/tests/test_code_review.py b/tests/test_code_review.py index 8a533766da..7b4475fb06 100644 --- a/tests/test_code_review.py +++ b/tests/test_code_review.py @@ -25,7 +25,9 @@ search_text, ) from bugbug.tools.code_review.review_context import ( + ReviewInfo, _merge_rules, + build_review_info, external_content_manifest, format_external_content, github_repo_allowed, @@ -39,6 +41,7 @@ BugzillaPredicate, FilePredicate, ReviewContextValidationError, + ReviewPredicate, Rule, ) from bugbug.tools.code_review.review_context_schema import ( @@ -788,6 +791,87 @@ def test_rule_bugzilla_component_matches(): ) +def _patch_resolver(monkeypatch, slug_to_phid): + monkeypatch.setattr( + "bugbug.tools.core.platforms.phabricator.resolve_project_phid", + lambda slug: slug_to_phid.get(slug), + ) + + +def test_rule_review_reviewer_group_matches(monkeypatch): + _patch_resolver(monkeypatch, {"ip-protection-reviewers": "PHID-PROJ-ip"}) + rule = Rule( + name="ip", + when=ReviewPredicate(reviewer=["ip-protection-reviewers"]), + load=[], + ) + review = ReviewInfo(reviewer_groups=frozenset({"PHID-PROJ-ip"})) + assert rule_matches(rule, {"any/file.cpp"}, review=review) + + +def test_rule_review_reviewer_group_no_match(monkeypatch): + _patch_resolver(monkeypatch, {"ip-protection-reviewers": "PHID-PROJ-ip"}) + rule = Rule( + name="ip", + when=ReviewPredicate(reviewer=["ip-protection-reviewers"]), + load=[], + ) + review = ReviewInfo(reviewer_groups=frozenset({"PHID-PROJ-other"})) + assert not rule_matches(rule, {"any/file.cpp"}, review=review) + + +def test_rule_review_fails_closed_without_review_info(monkeypatch): + _patch_resolver(monkeypatch, {"ip-protection-reviewers": "PHID-PROJ-ip"}) + rule = Rule( + name="ip", + when=ReviewPredicate(reviewer=["ip-protection-reviewers"]), + load=[], + ) + # No review info (e.g. a GitHub diff) → never matches. + assert not rule_matches(rule, {"any/file.cpp"}) + + +def test_rule_review_blocking_reviewer(monkeypatch): + _patch_resolver(monkeypatch, {"security-reviewers": "PHID-PROJ-sec"}) + rule = Rule( + name="sec", + when=ReviewPredicate(blocking_reviewer=["security-reviewers"]), + load=[], + ) + # Present as a reviewer but not blocking → no match. + non_blocking = ReviewInfo(reviewer_groups=frozenset({"PHID-PROJ-sec"})) + assert not rule_matches(rule, {"f.cpp"}, review=non_blocking) + # Blocking → match. + blocking = ReviewInfo( + reviewer_groups=frozenset({"PHID-PROJ-sec"}), + blocking_reviewer_groups=frozenset({"PHID-PROJ-sec"}), + ) + assert rule_matches(rule, {"f.cpp"}, review=blocking) + + +def test_rule_review_author(): + rule = Rule(name="a", when=ReviewPredicate(author=["alice@mozilla.com"]), load=[]) + assert rule_matches(rule, {"f.cpp"}, review=ReviewInfo(author="alice@mozilla.com")) + assert not rule_matches( + rule, {"f.cpp"}, review=ReviewInfo(author="bob@mozilla.com") + ) + + +@pytest.mark.asyncio +async def test_build_review_info_from_patch(): + from types import SimpleNamespace + + patch = SimpleNamespace( + reviewer_project_phids=["PHID-PROJ-ip", "PHID-PROJ-newtab"], + blocking_reviewer_project_phids=["PHID-PROJ-ip"], + author_username="alice@mozilla.com", + ) + info = await build_review_info(patch) + assert info.author == "alice@mozilla.com" + assert info.reviewer_groups == frozenset({"PHID-PROJ-ip", "PHID-PROJ-newtab"}) + assert info.blocking_reviewer_groups == frozenset({"PHID-PROJ-ip"}) + + @pytest.mark.asyncio async def test_patch_bug_component(): from bugbug.tools.core.platforms.phabricator import PhabricatorPatch