Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions judgearena/battles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Persistable data model for arena battles and ELO ratings.

A :class:`Battle` is the atomic unit of an arena evaluation: two models, an
outcome, and where that outcome came from (an LLM judge or human votes). ELO
ratings are a pure function of a list of battles, so persisting the battles
(plus the bootstrap ratings) is enough to reconstruct or re-analyse a run --
completions and judge transcripts stay in the cache for now.
"""

from __future__ import annotations

import json
from dataclasses import asdict, dataclass, fields
from pathlib import Path

import numpy as np
import pandas as pd

# Outcome labels a battle's ``winner`` field may take: either side winning, or
# one of the two tie labels. These are the winners accepted by
# compute_bradley_terry.
WINNERS = frozenset({"model_a", "model_b", "tie", "tie (bothbad)"})


@dataclass(frozen=True)
class Battle:
    """A single head-to-head result between two models.

    The ``source`` field says where the verdict came from.
    """

    model_a: str
    model_b: str
    winner: str  # one of WINNERS
    source: str  # "llm-judge" | "human"
    question_id: str | None = None  # join key back to cache / transcripts
    judge_model: str | None = None  # only set for llm-judge battles

    @classmethod
    def from_dict(cls, d: dict) -> Battle:
        """Construct a battle from a mapping, dropping keys that are not fields.

        Unknown keys are skipped rather than rejected, so records carrying
        extra keys still load.
        """
        field_names = {spec.name for spec in fields(cls)}
        kwargs = {}
        for key, value in d.items():
            if key in field_names:
                kwargs[key] = value
        return cls(**kwargs)


def write_battles(path: str | Path, battles: list[Battle]) -> None:
    """Write battles to ``path`` as JSON Lines, one JSON object per line.

    Any existing file at ``path`` is overwritten. The file is opened as UTF-8
    explicitly instead of relying on the platform's locale encoding, so the
    artifact is produced the same way on every machine.

    Args:
        path: Destination file.
        battles: Battles to serialise; they are written in iteration order.
    """
    with Path(path).open("w", encoding="utf-8") as f:
        # One buffered call instead of a write per battle.
        f.writelines(json.dumps(asdict(b)) + "\n" for b in battles)


def read_battles(path: str | Path) -> list[Battle]:
    """Read battles from a JSON Lines file.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so files containing non-ASCII text load the same way on
    every machine. Blank or whitespace-only lines are skipped.

    Args:
        path: JSON Lines file holding one battle object per line.

    Returns:
        The battles in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with Path(path).open(encoding="utf-8") as f:
        return [Battle.from_dict(json.loads(line)) for line in f if line.strip()]


def battles_to_frame(battles: list[Battle]) -> pd.DataFrame:
    """Return the battles as a table with one row per battle.

    Columns are the battle fields, so the frame is suitable for
    ``compute_bradley_terry(..., winner_col='winner')``.
    """
    rows = [asdict(battle) for battle in battles]
    return pd.DataFrame(rows)


@dataclass(frozen=True)
class RatingEntry:
    """A single leaderboard row: one model and its rating summary."""

    model: str
    rating: float  # mean rating across the bootstrap samples
    ci_low: float  # lower bound of the rating interval
    ci_high: float  # upper bound of the rating interval
    n_battles: int  # number of battles counted for this model
    source: str  # "evaluated" for the model under test, otherwise "human"


@dataclass
class EloReport:
    """The leaderboard plus the run metadata that produced it."""

    arena: str
    model: str
    judge_model: str
    n_bootstraps: int
    seed: int
    ratings: list[RatingEntry]  # leaderboard rows, kept in the order given

    def write(self, path: str | Path) -> None:
        """Serialise the report to ``path`` as indented JSON.

        The JSON text is built in full before the file is touched, so a
        serialisation failure leaves any existing file at ``path`` intact
        instead of truncated or half-written. The file is written as UTF-8
        explicitly rather than in the platform's locale encoding.

        Args:
            path: Destination file; overwritten if it already exists.

        Raises:
            TypeError: if a field holds a value ``json`` cannot serialise.
        """
        payload = json.dumps(asdict(self), indent=2)
        Path(path).write_text(payload, encoding="utf-8")


def summarize_bootstrap(
    bootstrap_ratings: list[dict[str, float]],
    battle_counts: dict[str, int],
    model_under_test: str,
    ci: tuple[float, float] = (2.5, 97.5),
) -> list[RatingEntry]:
    """Reduce the bootstrap samples to a leaderboard.

    Every model that appears in at least one bootstrap sample gets a single
    :class:`RatingEntry` holding the mean of its sampled ratings and the
    ``ci`` percentile bounds; samples that lack the model are left out of its
    statistics. Models missing from ``battle_counts`` get a count of 0. The
    entry for ``model_under_test`` is tagged ``"evaluated"``, all others
    ``"human"``. Entries are returned ordered from highest rating to lowest.
    """
    model_names = sorted(set().union(*bootstrap_ratings))

    def build_entry(name: str) -> RatingEntry:
        # Only the samples in which this model actually received a rating.
        samples = np.array(
            [sample[name] for sample in bootstrap_ratings if name in sample],
            dtype=float,
        )
        lower, upper = np.percentile(samples, ci)
        if name == model_under_test:
            origin = "evaluated"
        else:
            origin = "human"
        return RatingEntry(
            model=name,
            rating=float(samples.mean()),
            ci_low=float(lower),
            ci_high=float(upper),
            n_battles=int(battle_counts.get(name, 0)),
            source=origin,
        )

    leaderboard = [build_entry(name) for name in model_names]
    # Stable sort on the negated rating: ties keep their alphabetical order.
    return sorted(leaderboard, key=lambda entry: -entry.rating)
85 changes: 75 additions & 10 deletions judgearena/estimate_elo_ratings.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,25 @@
import hashlib
from dataclasses import dataclass
from dataclasses import asdict, dataclass
from datetime import UTC, datetime
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
from judgearena.battles import (
Battle,
EloReport,
summarize_bootstrap,
write_battles,
)
from judgearena.cli_common import BaseCliArgs
from judgearena.evaluate import judge_and_parse_prefs
from judgearena.generate import generate_instructions
from judgearena.log import get_logger
from judgearena.repro import write_run_metadata
from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model

logger = get_logger(__name__)
Expand Down Expand Up @@ -148,6 +157,7 @@ def compute_bradley_terry(


def main(args: CliEloArgs) -> dict:
run_started_at = datetime.now(UTC)
rng = np.random.default_rng(args.seed)

# Step 1: Load arena battles
Expand Down Expand Up @@ -322,9 +332,14 @@ def run_judge() -> pd.DataFrame:

# Map preferences back to model-name-level battle results
model_name = args.model
question_ids = (
df_battles["question_id"].tolist()
if "question_id" in df_battles.columns
else [None] * n
)
battle_results = []
for pref, is_pos_a, opp_model in zip(
prefs, our_model_is_position_a, opponent_models, strict=True
for pref, is_pos_a, opp_model, qid in zip(
prefs, our_model_is_position_a, opponent_models, question_ids, strict=True
):
if pref is None or pref == 0.5:
winner = "tie"
Expand All @@ -333,14 +348,16 @@ def run_judge() -> pd.DataFrame:
else:
winner = "model_b"

common = {
"winner": winner,
"source": "llm-judge",
"judge_model": args.judge_model,
"question_id": qid,
}
if is_pos_a:
battle_results.append(
{"model_a": model_name, "model_b": opp_model, "winner": winner}
)
battle_results.append({"model_a": model_name, "model_b": opp_model, **common})
else:
battle_results.append(
{"model_a": opp_model, "model_b": model_name, "winner": winner}
)
battle_results.append({"model_a": opp_model, "model_b": model_name, **common})

# LLM-judge battle results for our model
df_llm_judge = pd.DataFrame(battle_results)
Expand All @@ -364,7 +381,10 @@ def run_judge() -> pd.DataFrame:

# Combine LLM-judge battles with human-annotated arena battles,
# keeping only arena models with at least 500 human battles
df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]]
arena_cols = ["model_a", "model_b", "winner"]
if "question_id" in df_arena_all.columns:
arena_cols.append("question_id")
df_arena = df_arena_all.loc[:, arena_cols]
human_battle_counts = pd.concat(
[df_arena["model_a"], df_arena["model_b"]]
).value_counts()
Expand All @@ -373,6 +393,7 @@ def run_judge() -> pd.DataFrame:
df_arena["model_a"].isin(well_represented)
& df_arena["model_b"].isin(well_represented)
]
df_arena = df_arena.assign(source="human", judge_model=None)
df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)

# Bootstrap Bradley-Terry ELO ratings
Expand Down Expand Up @@ -419,8 +440,52 @@ def run_judge() -> pd.DataFrame:
else:
print(" Not enough data to compute ELO ratings.")

# Persist artifacts: battles.jsonl is the source of truth (ELO is a pure
# function of it); elo_ratings.json + bootstrap_ratings.csv keep the leaderboard.
name = (
f"{args.arena}-{replace_slash(args.model)}-{replace_slash(args.judge_model)}"
)
res_folder = Path(args.result_folder) / f"{name}-{datetime.now():%Y%m%d_%H%M%S}"
res_folder.mkdir(parents=True, exist_ok=True)

records = df_results.astype(object).where(pd.notna(df_results), None).to_dict("records")
write_battles(res_folder / "battles.jsonl", [Battle.from_dict(r) for r in records])

if bootstrap_ratings:
pd.DataFrame(bootstrap_ratings).to_csv(
res_folder / "bootstrap_ratings.csv", index=False
)
EloReport(
arena=args.arena,
model=model_name,
judge_model=args.judge_model,
n_bootstraps=n_bootstraps,
seed=args.seed,
ratings=summarize_bootstrap(bootstrap_ratings, battle_counts, model_name),
).write(res_folder / "elo_ratings.json")

# Reproducibility metadata: git hash, dependency versions, timings, and an
# artifacts manifest of the files we just wrote. The run args live under
# "run", so no separate config file is needed.
write_run_metadata(
output_dir=res_folder,
entrypoint="judgearena.estimate_elo_ratings.main",
run=asdict(args),
results={
**summary,
"n_llm_battles": n_llm,
"n_human_battles": n_human,
"result_folder": str(res_folder),
},
input_payloads={"instruction_index": question_ids},
started_at_utc=run_started_at,
)

print(f"\n📁 Results: {res_folder}")

return {
**summary,
"bootstrap_ratings": bootstrap_ratings,
"model_name": model_name,
"result_folder": str(res_folder),
}
Loading