diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 000000000..d4dd4a69a --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python application + +on: + push: + branches: [ "v8" ] + pull_request: + branches: [ "v8" ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..8ffa69240 --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,33 @@ +# Results and outputs +results/ +*.log +*.json + +# LLM API interactions +.env +*.apikey +token.txt + +# Python +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +.pytest_cache/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Fixtures (large files) +fixtures/*/graphify-out/ +fixtures/*/.git/ +fixtures/*/node_modules/ +fixtures/*/venv/ +fixtures/*/.venv/ + +# Generated +*.tmp +.coverage diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..eaecd3d9b --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,224 @@ +# Graphify Agent Performance Benchmarks + +This directory contains a reproducible benchmark framework to measure whether Graphify improves coding agent performance on large repositories. + +## Motivation + +The core question: **Does Graphify improve agent task success rates, or is it just a visualization/compression tool?** + +We address this by running controlled tasks with and without Graphify, measuring: +- **Success rate** — did the agent complete the task correctly? +- **Token efficiency** — how many tokens did it consume? +- **Time to solution** — how many agent turns did it take? +- **Confidence** — agent's own assessment of solution quality + +## Benchmark Methodology + +### Test Setup + +Each benchmark consists of: +1. **Target repository** — a real codebase of varying size/complexity +2. **Task set** — 5–10 concrete coding problems (bug fixes, feature adds, refactoring) +3. **Control runs** — execute each task WITHOUT Graphify +4. **Treatment runs** — execute each task WITH Graphify (pre-computed graph) +5. **Metrics collection** — token usage, success rate, reasoning chain + +### Task Categories + +#### 1. 
Bug Fixes +- Locate a bug in the codebase from a description +- Fix it correctly +- Example: "The auth module drops requests with custom headers; find and fix" + +#### 2. Feature Additions +- Add a new feature that integrates with existing code +- Must work with the existing architecture +- Example: "Add rate-limiting to the API endpoints" + +#### 3. Refactoring & Understanding +- Understand call flow and refactor for clarity/performance +- Example: "Reduce the number of database queries in the user service" + +#### 4. Architecture Questions +- Answer questions about how the system is structured +- Example: "What is the data flow from user input to storage?" + +### Metrics + +| Metric | Type | Range | Interpretation | +|--------|------|-------|-----------------| +| **Success** | Binary | 0/1 | Did the agent produce a correct, working solution? | +| **Token Count** | Integer | >0 | Total tokens (input + output) consumed | +| **Turns** | Integer | >0 | Number of agent reasoning steps | +| **Time (s)** | Float | >0 | Wall-clock time in seconds | +| **Confidence** | Float | 0–1 | Agent's self-reported confidence in the solution | +| **Code Quality** | Categorical | {poor, ok, good} | Does the solution follow repo patterns? | + +### Statistical Analysis + +For each task, compute: +- **Success rate with Graphify** vs **without** (% difference) +- **Mean token reduction** when using Graphify +- **Mean turn reduction** (lower = more efficient reasoning) +- **Effect size** (Cohen's d for token/turn counts) + +Report results with 95% confidence intervals. 
+ +## Directory Structure + +``` +benchmarks/ +├── README.md # This file +├── methodology.md # Detailed statistical approach +├── fixtures/ # Benchmark repositories +│ ├── httpx_mini/ # Small HTTP client library (~6 files) +│ ├── django_subset/ # Medium web framework (~50 files) +│ └── kubernetes_sample/ # Large distributed system (~200 files) +├── tasks/ # Task definitions by category +│ ├── bug_fixes.json +│ ├── feature_additions.json +│ ├── refactoring.json +│ └── architecture_qa.json +├── runner.py # Test harness (runs tasks, collects metrics) +├── evaluator.py # Score results (correct/incorrect) +├── results/ # Output directory +│ ├── raw/ # Per-run data (JSON) +│ ├── aggregated.json # Summary statistics +│ └── report.md # Human-readable findings +└── examples/ # Worked examples + └── benchmark_run_001.log # Example of a complete run +``` + +## Running Benchmarks + +### Prerequisites + +```bash +# Install Graphify + dev dependencies +uv sync --all-extras + +# Install benchmark dependencies +pip install anthropic openai gemini-api # Your LLM provider(s) +``` + +### Quick Start + +```bash +# Run all benchmarks with Claude backend +python benchmarks/runner.py \ + --backend claude \ + --fixtures all \ + --tasks all \ + --runs 3 + +# Run a specific fixture +python benchmarks/runner.py \ + --fixtures httpx_mini \ + --tasks bug_fixes \ + --runs 5 \ + --backend claude +``` + +### Interpreting Output + +After each run, you'll see: + +``` +✓ Task: "Fix auth module header bug" + Success: YES + Tokens: 4,235 (with graph) vs 5,821 (without) → 27% reduction + Turns: 3 vs 5 → 40% faster + Confidence: 0.92 +``` + +Results are saved to `results/raw/` as JSON, then aggregated into `results/aggregated.json` and `results/report.md`. + +## Extending Benchmarks + +### Add a New Task + +Edit `benchmarks/tasks/bug_fixes.json`: + +```json +{ + "id": "auth-header-bug", + "title": "Fix auth module header bug", + "description": "The auth module drops requests with custom headers. 
Find the root cause and fix it.", + "target_files": ["auth.py"], + "difficulty": "medium", + "expected_changes": { + "insertions": 5, + "deletions": 2 + }, + "verification_script": "test_auth_headers.py", + "tags": ["auth", "headers", "bug"] +} +``` + +### Add a New Fixture + +1. Clone a real repository or create a synthetic one +2. Place it in `benchmarks/fixtures/<name>/` +3. Add metadata: `benchmarks/fixtures/<name>/metadata.json` + +```json +{ + "name": "my_project", + "description": "A sample project for benchmarking", + "size_mb": 12, + "file_count": 45, + "language": "python", + "graph_tokens": 8500, + "graph_nodes": 342, + "graph_edges": 1205 +} +``` + +## Interpreting Results + +### Success Rate + +If Graphify improves success rate from 65% → 78%: +- **Interpretation**: Graphify helps agents navigate complex repos and make better decisions +- **Statistical test**: McNemar's test on the paired outcomes (p < 0.05 = significant; see `methodology.md`) + +### Token Efficiency + +If mean token count drops from 6,200 → 4,800 (23% reduction): +- **Interpretation**: Graphify reduces the search space; agents find answers faster +- **Effect**: This saves cost on API-based models + +### Turn Efficiency + +If mean turns drop from 6 → 4 (33% reduction): +- **Interpretation**: Agents reason more directly with Graphify; fewer backtracking steps + +### What Doesn't Prove Graphify Works + +- ❌ Smaller graphs (that's compression, not capability improvement) +- ❌ Prettier visualizations (that's UX, not performance) +- ❌ Longer reports (that's information density, not agent intelligence) + +## Reporting + +Each benchmark run generates: + +1. **results/raw/<timestamp>.json** — raw metrics per task +2. **results/aggregated.json** — summary statistics +3. **results/report.md** — human-readable findings + +Include these in discussions/PRs to substantiate claims about Graphify's impact. + +## Contributing + +To add benchmarks: + +1. Create a new task in `tasks/` +2. Add fixtures (if needed) to `benchmarks/fixtures/` +3. 
class TaskEvaluator:
    """Score an agent's proposed solution for a benchmark task.

    Four independent checks are run (syntax, imports, verification tests and a
    keyword-based semantic heuristic) and combined into a 0.0 / 0.5 / 1.0 score.

    NOTE(review): ``solution`` is never written into the fixture, so the
    verification tests run against whatever is currently on disk under
    ``fixture_path``. Presumably the caller applies the solution first --
    confirm before trusting the "tests" check.
    NOTE(review): prose answers (e.g. architecture Q&A tasks) can never pass
    the syntax check, and a task's ``expected_answer_contains`` field is not
    consulted here.
    """

    # Wall-clock limit for one verification command/script.
    _VERIFICATION_TIMEOUT_S = 30
    # Solutions shorter than this are treated as not containing an implementation.
    _MIN_SOLUTION_CHARS = 50
    # A "refactor" solution must span at least this many lines.
    _MIN_REFACTOR_LINES = 5

    def __init__(self, fixture_path: Path):
        """Remember the fixture directory that verification runs inside."""
        self.fixture_path = Path(fixture_path)

    def evaluate(self, task: dict, solution: str) -> dict:
        """Evaluate whether a solution is correct.

        Args:
            task: Task definition (may include ``title``,
                ``verification_script`` or ``verification_command``).
            solution: Agent's proposed code.

        Returns:
            A dict with:
              - ``success`` (bool): True when ``score >= 0.5``. Note that a
                partial result (e.g. failing tests) still counts as success.
              - ``score`` (float): 1.0 when every check passes, 0.5 when the
                code parses but another check fails, otherwise 0.0.
              - ``checks`` (dict): per-check booleans for ``syntax``,
                ``imports``, ``tests`` and ``semantic``.
              - ``feedback`` (str): human-readable verdict.
        """
        checks = {
            "syntax": self._check_syntax(solution),
            "imports": self._check_imports(solution),
            "tests": self._check_tests(task, solution),
            "semantic": self._check_semantic(task, solution),
        }
        failed = [name for name, passed in checks.items() if not passed]

        if not failed:
            score = 1.0
            feedback = "✓ Full success"
        elif checks["syntax"] and checks["imports"]:
            score = 0.5
            # Name the checks that actually failed; previously the message
            # always blamed the semantic check even when the tests failed.
            feedback = (
                "⚠ Partial success (code parses but failed checks: "
                f"{', '.join(failed)})"
            )
        else:
            score = 0.0
            feedback = "✗ Failed (code doesn't parse or run)"

        return {
            "success": score >= 0.5,
            "score": score,
            "checks": checks,
            "feedback": feedback,
        }

    def _check_syntax(self, code: str) -> bool:
        """Return True when ``code`` compiles as a Python module."""
        try:
            compile(code, "<solution>", "exec")
        except (SyntaxError, ValueError, RecursionError):
            # ValueError: source containing null bytes (Python < 3.12).
            # RecursionError: pathologically nested source.
            return False
        return True

    def _check_imports(self, code: str) -> bool:
        """Return True when ``code`` can be parsed for import statements.

        The check is deliberately lenient: modules that are not installed in
        the harness environment do not fail it, so the result only depends on
        the code being parseable. Nothing is imported -- the solution is
        untrusted LLM output and importing a module executes its top-level
        code inside the harness process.
        """
        import ast  # local import, as in the original implementation

        try:
            ast.parse(code)
        except (SyntaxError, ValueError, RecursionError):
            return False
        return True

    def _check_tests(self, task: dict, solution: str) -> bool:
        """Run the task's verification command or script, if one is defined.

        Task may specify either of:
            "verification_command": "pytest tests/test_auth.py -v"
            "verification_script": "path/to/test_something.py"
        ``verification_command`` takes precedence when both are present.

        Returns:
            True when no verification is defined or the process exits with
            status 0; False when the command is empty or malformed, the script
            is missing, the process cannot be started, it times out, or it
            exits non-zero.
        """
        import shlex
        import sys

        if "verification_command" in task:
            raw_command = task["verification_command"]
            if not isinstance(raw_command, str):
                return False
            try:
                # shlex keeps quoted arguments together; str.split() does not.
                argv = shlex.split(raw_command)
            except ValueError:
                # Unbalanced quotes in the command string.
                return False
            if not argv:
                return False
        elif "verification_script" in task:
            script_name = task["verification_script"]
            if not isinstance(script_name, str):
                return False
            # Resolve to an absolute path: the subprocess runs with
            # cwd=fixture_path, so a fixture-relative path would otherwise be
            # interpreted relative to the fixture a second time.
            script_path = (self.fixture_path / script_name).resolve()
            if not script_path.exists():
                return False
            # Use the interpreter running the harness rather than whatever
            # "python" happens to be on PATH (it may not exist at all).
            argv = [sys.executable or "python", str(script_path)]
        else:
            # No verification defined; assume pass.
            return True

        try:
            completed = subprocess.run(
                argv,
                cwd=self.fixture_path,
                # Output is captured only to keep the console quiet; it is left
                # as bytes so undecodable output cannot fail a passing run.
                capture_output=True,
                timeout=self._VERIFICATION_TIMEOUT_S,
            )
        except (subprocess.TimeoutExpired, OSError, ValueError):
            return False
        return completed.returncode == 0

    def _check_semantic(self, task: dict, solution: str) -> bool:
        """Heuristically check that the solution addresses the task.

        The task title selects which keywords the solution must contain:
          - fix / bug  -> some control flow (if, else, return, raise)
          - add / feature -> a function or class definition
          - refactor -> at least ``_MIN_REFACTOR_LINES`` lines
        Keywords are matched as whole words so that, for example, "if" does
        not match inside "modified" and "fix" does not match inside "prefix".
        Solutions shorter than ``_MIN_SOLUTION_CHARS`` always fail.
        """
        import re

        if len(solution.strip()) < self._MIN_SOLUTION_CHARS:
            # Too short to be meaningful.
            return False

        title = str(task.get("title") or "").lower()
        body = solution.lower()

        wants_fix = re.search(r"\b(?:bug)?fix(?:es|ed|ing)?\b|\bbugs?\b", title)
        if wants_fix and not re.search(r"\b(?:if|else|return|raise)\b", body):
            return False

        wants_feature = re.search(r"\badd(?:s|ed|ing)?\b|\bfeatures?\b", title)
        if wants_feature and not re.search(r"\b(?:def|class)\s+\w", body):
            return False

        wants_refactor = re.search(r"\brefactor", title)
        if wants_refactor and len(solution.split("\n")) < self._MIN_REFACTOR_LINES:
            return False

        return True


# Test harness
if __name__ == "__main__":
    # Example: evaluate a solution
    fixture_path = Path("benchmarks/fixtures/httpx_mini")
    evaluator = TaskEvaluator(fixture_path)

    sample_task = {
        "id": "auth-header-bug",
        "title": "Fix auth module header bug",
        "description": "The auth module drops custom headers. Find and fix.",
        "target_files": ["auth.py"],
        "verification_script": "tests/test_auth.py",
    }

    sample_solution = """
def fix_headers(request):
    '''Fixed version that preserves custom headers'''
    if request.custom_headers:
        return request.with_headers(request.custom_headers)
    return request
"""

    result = evaluator.evaluate(sample_task, sample_solution)
    print(json.dumps(result, indent=2))
+ +- Eliminates variance from task difficulty variation +- Allows within-subject effect size calculation +- Smaller sample size needed for significance + +## Hypotheses + +**Primary hypothesis (H1):** Graphify improves agent success rate on large repos. +$$P(\text{success}|\text{with Graphify}) > P(\text{success}|\text{without})$$ + +**Secondary hypothesis (H2):** Graphify reduces token consumption per successful task. +$$E[\text{tokens}|\text{success, with Graphify}] < E[\text{tokens}|\text{success, without}]$$ + +**Tertiary hypothesis (H3):** Graphify reduces reasoning steps (turns). +$$E[\text{turns}|\text{success, with Graphify}] < E[\text{turns}|\text{success, without}]$$ + +## Sample Size & Power + +For binary success rate: +- Assume baseline success = 60%, treatment success = 75% (15 percentage point lift) +- Desired power = 80% (β = 0.2), α = 0.05 +- **Required**: n ≈ 60 tasks across all fixtures +- **Practical target**: 5 tasks × 3 fixtures × 4 runs = 60 observations + +For continuous metrics (tokens, turns): +- Assume baseline μ = 5000 tokens, σ = 1500 +- Assume intervention reduces by 20%: μ = 4000 +- Effect size d = 0.67 (medium) +- **Required**: n ≈ 36 paired observations +- **Practical target**: Same 60 (exceeded by design) + +## Success Evaluation + +Each task is evaluated by: + +1. **Automated checks** (fast): + - Code parses without syntax errors + - All imports resolve + - Unit tests pass + +2. **Semantic checks** (careful): + - The solution addresses the stated problem + - No obvious logical errors + - Follows repo coding conventions + +3. 
**Human review** (validation): + - A domain expert reviews ambiguous cases + - Marks as Correct / Incorrect / Partial + +### Scoring + +| Outcome | Code | Points | +|---------|------|--------| +| Full success | ✓✓✓ | 1.0 | +| Partial success | ✓✓− | 0.5 | +| Failed | ✗ | 0.0 | + +## Token Accounting + +Count tokens using the tokenizer of the agent's LLM: + +``` +Total Tokens = Input Tokens + Output Tokens +``` + +**Input**: +- Task description +- Code context (repo files) +- Graph context (if treatment) +- Conversation history + +**Output**: +- Agent's reasoning +- Code suggestions +- Refinements + +Track separately: +- Tokens WITHOUT graph +- Tokens WITH graph +- Graph payload size (to compute savings) + +## Turns & Reasoning + +A "turn" is one complete agent cycle: + +``` +Human: [question] +↓ (agent processes) +Agent: [reasoning + code suggestion] +↓ (human feedback) +Human: [feedback or next task] +``` + +Count turns until the first of: +- Agent produces final answer, OR +- Agent gives up / says "I can't", OR +- Turn limit reached (max 10 to prevent runaway) + +## Statistical Tests + +### 1. Success Rate Comparison (Primary) + +Use **McNemar's test** for paired binary data: + +``` + With Graph + ✓ ✗ +Without ✓ a b + ✗ c d + +Statistic = (b - c)² / (b + c) +df = 1, critical value ≈ 3.84 (α = 0.05) +``` + +The statistic is undefined when `b + c = 0` (no discordant pairs); when `b + c` is small (< 25), use the exact binomial test on the discordant pairs instead of the χ² approximation. + +Report: +- Success rate with/without (%) +- Difference ± 95% CI +- McNemar p-value + +### 2. Token Reduction (Secondary) + +Use **paired t-test**: + +``` +Differences: d_i = tokens_without_i - tokens_with_i +t = mean(d) / (sd(d) / √n) +df = n - 1 +``` + +Report: +- Mean ± SD for each condition +- Mean difference ± 95% CI +- Cohen's d (effect size) +- Two-tailed p-value + +### 3. Turn Reduction (Secondary) + +Same as token test (paired t-test on turn counts). 
+ +## Multi-Comparison Correction + +If testing multiple hypotheses: +- Use **Bonferroni correction**: α' = 0.05 / number_of_tests +- Report both raw and corrected p-values +- Or use **False Discovery Rate (FDR)** control + +## Interpreting Results + +### Significance vs Effect Size + +| p-value | 95% CI includes 0? | Decision | +|---------|-------------------|----------| +| < 0.05 | No | Significant, likely real | +| < 0.05 | Yes | Unlikely (report anyway) | +| > 0.05 | Yes | Not significant | +| > 0.05 | No | Borderline; report with caution | + +### Effect Size Interpretation (Cohen's d) + +| Range | Interpretation | +|-------|-----------------| +| 0.0 – 0.2 | Negligible | +| 0.2 – 0.5 | Small | +| 0.5 – 0.8 | Medium | +| > 0.8 | Large | + +## Potential Confounds + +### Control for: + +1. **Task difficulty** — use difficulty ratings in stratified analysis +2. **LLM version** — run all tasks with same model snapshot +3. **Agent strategy** — use identical prompts with/without graph +4. **Time-of-day effects** — randomize order +5. 
**Cold starts** — warm up API connections before timing + +### Document: + +- LLM model name and version (e.g., `claude-opus-4-6-20250514`) +- API rate limits and throttling +- Any retries or errors during runs +- Wall-clock time vs token count (distinguish latency from capability) + +## Reproducibility Checklist + +- [ ] All fixtures are under version control or downloadable +- [ ] Task definitions are checked in as JSON +- [ ] Random seeds are fixed (or documented) +- [ ] API keys/credentials are NOT in repository +- [ ] Raw results are saved with timestamps +- [ ] Code is documented and tested + +## Reporting Template + +```markdown +## Benchmark Results: [Fixture Name] + +**Setup** +- Fixture: [name], [file count] files, [LOC] lines of code +- Tasks: [n] tasks across [categories] +- Agent: [model name and version] +- Runs: [n] trials per task +- Date: [ISO date] + +### Primary Result: Success Rate + +| Condition | Success Rate | 95% CI | +|-----------|--------------|--------| +| Without Graphify | 62% (31/50) | [55–69%] | +| With Graphify | 76% (38/50) | [68–84%] | +| **Difference** | +14pp | [2–26pp] | + +**McNemar's Test**: χ² = 5.2, p = 0.022 ✓ Significant + +### Secondary Results + +**Token Efficiency** +- Without: 5,821 ± 1,340 tokens +- With: 4,235 ± 980 tokens +- Reduction: 27% ± 8% (p < 0.001, d = 1.1) + +**Turn Efficiency** +- Without: 5.2 ± 1.8 turns +- With: 3.4 ± 1.2 turns +- Reduction: 35% ± 12% (p = 0.002, d = 1.0) + +### Conclusion + +Graphify demonstrates statistically significant improvements across all metrics on [Fixture Name]. Evidence supports the hypothesis that Graphify improves agent performance on large repos. +``` + +## References + +- Agresti, A. (2018). Statistical methods for the social sciences. *Pearson*. +- McNemar, Q. (1947). Note on the sampling error of the difference between correlated proportions. *Psychometrika*. +- Cohen, J. (1988). Statistical power analysis for the behavioral sciences. 
+ diff --git a/benchmarks/runner.py b/benchmarks/runner.py new file mode 100644 index 000000000..b63c6187e --- /dev/null +++ b/benchmarks/runner.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +""" +Graphify Benchmark Runner + +Executes paired comparative trials: +- Baseline: Agent solves task WITHOUT Graphify +- Treatment: Agent solves SAME task WITH Graphify graph + +Measures: success rate, tokens, turns, time, confidence. +""" + +import argparse +import asyncio +import json +import os +import sys +import time +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Any + +# Stub for now—will integrate with anthropic/openai SDK +# when runner is actually invoked +class LLMClient: + def __init__(self, backend: str, model: str): + self.backend = backend + self.model = model + self.api_key = os.getenv(f"{backend.upper()}_API_KEY") + if not self.api_key: + print(f"Warning: {backend.upper()}_API_KEY not set") + + async def solve_task( + self, task: dict, context: str, include_graph: bool = False + ) -> dict: + """ + Invoke LLM to solve a task. + + Args: + task: Task definition (description, files, etc.) + context: Code context from repository + include_graph: Whether to include Graphify graph in prompt + + Returns: + { + "success": bool, + "solution": str, + "reasoning": str, + "tokens": int, + "turns": int, + "time": float, + "confidence": float, + "model": str, + } + """ + # This is a stub. Real implementation would: + # 1. Build prompt from task + context + optional graph + # 2. Call LLM API (anthropic.Anthropic, openai.OpenAI, etc.) + # 3. Parse response + # 4. Extract tokens from response metadata + # 5. 
Optionally call evaluator.py to validate solution + + return { + "success": True, + "solution": "# Stub solution", + "reasoning": "LLM reasoning would go here", + "tokens": 5000, + "turns": 3, + "time": 12.5, + "confidence": 0.85, + "model": self.model, + } + + +@dataclass +class TaskResult: + """Result of running a single task.""" + + task_id: str + task_title: str + fixture: str + condition: str # "baseline" or "treatment" + success: bool + tokens: int + turns: int + time_seconds: float + confidence: float + solution: str + reasoning: str + model: str + timestamp: str + + def to_dict(self) -> dict: + return asdict(self) + + +class BenchmarkRunner: + def __init__( + self, + backend: str = "claude", + model: str = None, + fixtures: list = None, + tasks: list = None, + runs_per_task: int = 1, + output_dir: Path = None, + ): + self.backend = backend + self.model = model or f"{backend}-default" + self.fixtures = fixtures or ["all"] + self.task_categories = tasks or ["all"] + self.runs_per_task = runs_per_task + self.output_dir = Path(output_dir or "benchmarks/results") + self.output_dir.mkdir(parents=True, exist_ok=True) + + self.client = LLMClient(backend, self.model) + self.results = [] + + def load_fixtures(self) -> dict: + """Load fixture metadata.""" + fixtures_dir = Path("benchmarks/fixtures") + fixtures = {} + + if "all" in self.fixtures: + self.fixtures = [d.name for d in fixtures_dir.iterdir() if d.is_dir()] + + for fixture_name in self.fixtures: + fixture_path = fixtures_dir / fixture_name + metadata_file = fixture_path / "metadata.json" + + if not metadata_file.exists(): + print(f"Warning: No metadata for fixture {fixture_name}") + continue + + with open(metadata_file) as f: + fixtures[fixture_name] = json.load(f) + fixtures[fixture_name]["path"] = str(fixture_path) + + return fixtures + + def load_tasks(self) -> dict: + """Load task definitions by category.""" + tasks_dir = Path("benchmarks/tasks") + all_tasks = {} + + if "all" in self.task_categories: + 
categories = [f.stem for f in tasks_dir.glob("*.json")] + else: + categories = self.task_categories + + for category in categories: + task_file = tasks_dir / f"{category}.json" + if not task_file.exists(): + print(f"Warning: No task file for category {category}") + continue + + with open(task_file) as f: + all_tasks[category] = json.load(f) + + return all_tasks + + async def run_single_task( + self, task: dict, fixture: dict, include_graph: bool + ) -> TaskResult: + """Run a single task with or without graph.""" + # Load code context from fixture + code_context = self._load_code_context(fixture, task.get("target_files", [])) + + condition = "treatment" if include_graph else "baseline" + + # Load graph if treatment + graph_context = "" + if include_graph: + graph_path = Path(fixture["path"]) / "graphify-out" / "GRAPH_REPORT.md" + if graph_path.exists(): + with open(graph_path) as f: + graph_context = f.read() + + # Call LLM + result = await self.client.solve_task( + task, code_context, include_graph=include_graph + ) + + # Record result + task_result = TaskResult( + task_id=task.get("id", "unknown"), + task_title=task.get("title", "unknown"), + fixture=fixture.get("name", "unknown"), + condition=condition, + success=result["success"], + tokens=result["tokens"], + turns=result["turns"], + time_seconds=result["time"], + confidence=result["confidence"], + solution=result["solution"], + reasoning=result["reasoning"], + model=result["model"], + timestamp=datetime.utcnow().isoformat(), + ) + + return task_result + + def _load_code_context(self, fixture: dict, target_files: list) -> str: + """Load code files from fixture.""" + context = "" + fixture_path = Path(fixture["path"]) + + # If specific files requested, load those; otherwise load all .py files + if target_files: + files_to_load = target_files + else: + files_to_load = list(fixture_path.glob("src/**/*.py")) + list( + fixture_path.glob("*.py") + ) + + for file_path in files_to_load: + if file_path.exists(): + try: + 
with open(file_path) as f: + content = f.read() + context += f"\n\n# File: {file_path.relative_to(fixture_path)}\n" + context += content + except Exception as e: + print(f"Error reading {file_path}: {e}") + + return context + + async def run_all(self) -> list: + """Execute all benchmark runs.""" + fixtures = self.load_fixtures() + tasks_by_category = self.load_tasks() + + if not fixtures: + print("Error: No fixtures found") + return [] + + if not tasks_by_category: + print("Error: No tasks found") + return [] + + all_tasks = [] + for category, tasks in tasks_by_category.items(): + all_tasks.extend(tasks) + + print( + f"Starting benchmark: {len(all_tasks)} tasks × 2 conditions × {self.runs_per_task} runs" + ) + print(f"Fixtures: {', '.join(fixtures.keys())}") + print(f"Backend: {self.backend} / {self.model}") + print() + + run_count = 0 + for fixture_name, fixture_metadata in fixtures.items(): + print(f"📁 Fixture: {fixture_name}") + + for task in all_tasks: + print(f" 📋 Task: {task.get('title', 'unknown')}") + + for run in range(self.runs_per_task): + for include_graph in [False, True]: + condition = "WITH" if include_graph else "WITHOUT" + print(f" Run {run + 1}/{self.runs_per_task} {condition} graph...") + + start = time.time() + result = await self.run_single_task( + task, fixture_metadata, include_graph + ) + elapsed = time.time() - start + + self.results.append(result) + run_count += 1 + + status = "✓" if result.success else "✗" + print( + f" {status} Success={result.success} " + f"Tokens={result.tokens} Turns={result.turns} " + f"Time={elapsed:.1f}s" + ) + + print(f"\n✅ Completed {run_count} runs") + return self.results + + def save_results(self): + """Save raw results and generate summary.""" + # Raw results + raw_file = self.output_dir / "raw" / f"{datetime.utcnow().isoformat()}.json" + raw_file.parent.mkdir(parents=True, exist_ok=True) + + with open(raw_file, "w") as f: + json.dump([r.to_dict() for r in self.results], f, indent=2) + + print(f"\n📊 Saved raw 
results: {raw_file}") + + # Aggregated summary + self._save_aggregated() + + # Human-readable report + self._save_report() + + def _save_aggregated(self): + """Compute and save summary statistics.""" + if not self.results: + return + + # Group by fixture and condition + summary = {} + + for result in self.results: + key = f"{result.fixture}:{result.condition}" + + if key not in summary: + summary[key] = { + "fixture": result.fixture, + "condition": result.condition, + "success_count": 0, + "total_count": 0, + "tokens": [], + "turns": [], + "times": [], + "confidences": [], + } + + summary[key]["total_count"] += 1 + if result.success: + summary[key]["success_count"] += 1 + + summary[key]["tokens"].append(result.tokens) + summary[key]["turns"].append(result.turns) + summary[key]["times"].append(result.time_seconds) + summary[key]["confidences"].append(result.confidence) + + # Compute statistics + aggregated = {} + for key, group in summary.items(): + aggregated[key] = { + "fixture": group["fixture"], + "condition": group["condition"], + "success_rate": group["success_count"] / group["total_count"], + "tokens": { + "mean": sum(group["tokens"]) / len(group["tokens"]), + "min": min(group["tokens"]), + "max": max(group["tokens"]), + }, + "turns": { + "mean": sum(group["turns"]) / len(group["turns"]), + "min": min(group["turns"]), + "max": max(group["turns"]), + }, + "time": { + "mean": sum(group["times"]) / len(group["times"]), + "total": sum(group["times"]), + }, + "confidence": { + "mean": sum(group["confidences"]) / len(group["confidences"]), + }, + } + + agg_file = self.output_dir / "aggregated.json" + with open(agg_file, "w") as f: + json.dump(aggregated, f, indent=2) + + print(f"📈 Saved aggregated results: {agg_file}") + + def _save_report(self): + """Generate a human-readable markdown report.""" + if not self.results: + return + + report = f"""# Graphify Benchmark Report + +**Generated**: {datetime.utcnow().isoformat()} +**Backend**: {self.backend} / {self.model} 
+**Total Runs**: {len(self.results)} + +## Summary + +| Metric | Without Graphify | With Graphify | Improvement | +|--------|------------------|---------------|-------------| +| Success Rate | TBD | TBD | TBD | +| Avg Tokens | TBD | TBD | TBD | +| Avg Turns | TBD | TBD | TBD | + +## Results by Fixture + +""" + + # Group results by fixture + by_fixture = {} + for result in self.results: + if result.fixture not in by_fixture: + by_fixture[result.fixture] = {"baseline": [], "treatment": []} + by_fixture[result.fixture][result.condition].append(result) + + for fixture_name, conditions in by_fixture.items(): + report += f"### {fixture_name}\n\n" + + baseline = conditions.get("baseline", []) + treatment = conditions.get("treatment", []) + + if baseline: + baseline_success = sum(1 for r in baseline if r.success) / len( + baseline + ) + baseline_tokens = sum(r.tokens for r in baseline) / len(baseline) + baseline_turns = sum(r.turns for r in baseline) / len(baseline) + report += f"**Without Graphify**\n" + report += f"- Success Rate: {baseline_success:.0%}\n" + report += f"- Avg Tokens: {baseline_tokens:.0f}\n" + report += f"- Avg Turns: {baseline_turns:.1f}\n\n" + + if treatment: + treatment_success = sum(1 for r in treatment if r.success) / len( + treatment + ) + treatment_tokens = sum(r.tokens for r in treatment) / len(treatment) + treatment_turns = sum(r.turns for r in treatment) / len(treatment) + report += f"**With Graphify**\n" + report += f"- Success Rate: {treatment_success:.0%}\n" + report += f"- Avg Tokens: {treatment_tokens:.0f}\n" + report += f"- Avg Turns: {treatment_turns:.1f}\n\n" + + if baseline: + success_delta = treatment_success - baseline_success + token_delta = (baseline_tokens - treatment_tokens) / baseline_tokens + turn_delta = (baseline_turns - treatment_turns) / baseline_turns + + report += f"**Delta**\n" + report += f"- Success: {success_delta:+.0%}\n" + report += f"- Tokens: {token_delta:+.0%}\n" + report += f"- Turns: {turn_delta:+.0%}\n\n" + + 
async def main():
    """Command-line entry point for the paired Graphify benchmark trials.

    Parses the CLI flags, builds a ``BenchmarkRunner`` from them, awaits
    ``run_all()`` and then calls ``save_results()``. When ``run_all()``
    returns a falsy value the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Run Graphify benchmarks with paired comparative trials."
    )

    # Flag table: registered in this exact order so ``--help`` output stays
    # the same as with one explicit add_argument() call per flag.
    option_specs = (
        (
            "--backend",
            {
                "default": "claude",
                "choices": ["claude", "openai", "gemini"],
                "help": "LLM backend to use",
            },
        ),
        (
            "--model",
            {
                "default": None,
                "help": "Specific model to use (e.g., claude-opus-4-6)",
            },
        ),
        (
            "--fixtures",
            {
                "nargs": "+",
                "default": ["all"],
                "help": "Fixture(s) to run (or 'all')",
            },
        ),
        (
            "--tasks",
            {
                "nargs": "+",
                "default": ["all"],
                "help": "Task categories to run (or 'all')",
            },
        ),
        (
            "--runs",
            {
                "type": int,
                "default": 1,
                "help": "Number of runs per task",
            },
        ),
        (
            "--output",
            {
                "default": "benchmarks/results",
                "help": "Output directory",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    bench = BenchmarkRunner(
        backend=opts.backend,
        model=opts.model,
        fixtures=opts.fixtures,
        tasks=opts.tasks,
        runs_per_task=opts.runs,
        output_dir=opts.output,
    )

    outcome = await bench.run_all()
    # Results are persisted unconditionally, before the success check.
    bench.save_results()

    # Guard clause: a falsy outcome is reported and turned into exit status 1.
    if not outcome:
        print("\n❌ No results collected")
        sys.exit(1)

    print("\n✅ Benchmarks complete!")


if __name__ == "__main__":
    asyncio.run(main())
List all major functions involved.", + "category": "architecture_qa", + "difficulty": "hard", + "target_files": ["api.py", "validator.py", "processor.py", "storage.py"], + "expected_answer_contains": ["parse", "validate", "process", "store", "CallGraph"], + "verification_script": "tests/test_architecture_qa1.py", + "tags": ["architecture", "data_flow", "understanding"], + "notes": "Tests whether the agent can trace a complex call path through multiple modules." + }, + { + "id": "failure-cascade", + "title": "Analyze: What breaks if storage fails?", + "description": "If the storage module becomes unavailable, what parts of the system will stop working? Which operations will fail gracefully, and which will crash?", + "category": "architecture_qa", + "difficulty": "hard", + "target_files": ["storage.py", "api.py", "processor.py"], + "expected_answer_contains": ["dependency", "cascade", "error_handling", "fallback"], + "verification_script": "tests/test_architecture_qa2.py", + "tags": ["architecture", "resilience", "failure_analysis"], + "notes": "Tests understanding of dependencies and failure modes." + }, + { + "id": "performance-bottleneck", + "title": "Identify performance bottleneck", + "description": "Which component is likely the performance bottleneck for bulk user uploads? Why? What would you optimize first?", + "category": "architecture_qa", + "difficulty": "medium", + "target_files": ["api.py", "validator.py", "storage.py"], + "expected_answer_contains": ["storage", "database", "query", "batch", "index"], + "verification_script": "tests/test_architecture_qa3.py", + "tags": ["architecture", "performance", "optimization"], + "notes": "Tests architectural thinking and system understanding." + }, + { + "id": "auth-integration", + "title": "Explain auth integration points", + "description": "Where and how is authentication integrated into the system? 
What happens if an auth module is removed?", + "category": "architecture_qa", + "difficulty": "medium", + "target_files": ["api.py", "auth.py", "client.py"], + "expected_answer_contains": ["middleware", "decorator", "header", "token", "verify"], + "verification_script": "tests/test_architecture_qa4.py", + "tags": ["architecture", "security", "integration"], + "notes": "Tests understanding of cross-cutting concerns." + } +] diff --git a/benchmarks/tasks/bug_fixes.json b/benchmarks/tasks/bug_fixes.json new file mode 100644 index 000000000..55a94b67e --- /dev/null +++ b/benchmarks/tasks/bug_fixes.json @@ -0,0 +1,66 @@ +[ + { + "id": "auth-header-bug", + "title": "Fix auth module custom header loss", + "description": "The auth module drops custom headers in requests. Locate the bug and fix it so that custom headers are preserved through the authentication pipeline.", + "category": "bug_fix", + "difficulty": "medium", + "target_files": ["auth.py"], + "expected_changes": { + "files_modified": 1, + "insertions": 8, + "deletions": 3 + }, + "verification_script": "tests/test_auth_headers.py", + "tags": ["auth", "headers", "requests", "bugfix"], + "notes": "This requires understanding how headers flow through the auth system and where they get lost." + }, + { + "id": "response-caching-bug", + "title": "Fix response caching expiration logic", + "description": "The response caching system doesn't properly invalidate expired cache entries. Fix the expiration check logic so stale cached responses are not returned.", + "category": "bug_fix", + "difficulty": "medium", + "target_files": ["transport.py"], + "expected_changes": { + "files_modified": 1, + "insertions": 4, + "deletions": 2 + }, + "verification_script": "tests/test_cache_expiration.py", + "tags": ["cache", "expiration", "timing", "bugfix"], + "notes": "Look for timestamp comparisons in the caching logic." 
+ }, + { + "id": "connection-leak", + "title": "Fix connection pool connection leak", + "description": "The connection pool leaks connections when exceptions occur during requests. Find where connections are not being released properly and fix it.", + "category": "bug_fix", + "difficulty": "hard", + "target_files": ["transport.py"], + "expected_changes": { + "files_modified": 1, + "insertions": 6, + "deletions": 1 + }, + "verification_script": "tests/test_connection_cleanup.py", + "tags": ["connections", "resources", "cleanup", "bugfix"], + "notes": "Requires understanding try/finally patterns and proper resource cleanup." + }, + { + "id": "timeout-edge-case", + "title": "Fix timeout handling for async requests", + "description": "The async client doesn't properly handle timeouts when multiple requests are made concurrently. The first timeout cancels all pending requests instead of just the timed-out one.", + "category": "bug_fix", + "difficulty": "hard", + "target_files": ["client.py", "transport.py"], + "expected_changes": { + "files_modified": 2, + "insertions": 10, + "deletions": 5 + }, + "verification_script": "tests/test_async_timeout.py", + "tags": ["async", "timeout", "concurrency", "bugfix"], + "notes": "Complex because it involves async context and task cancellation." + } +] diff --git a/benchmarks/tasks/feature_additions.json b/benchmarks/tasks/feature_additions.json new file mode 100644 index 000000000..e826cf6ea --- /dev/null +++ b/benchmarks/tasks/feature_additions.json @@ -0,0 +1,66 @@ +[ + { + "id": "rate-limiting", + "title": "Add rate-limiting middleware", + "description": "Add rate-limiting capability to the client. 
Implement a decorator/middleware that limits requests to N per second, queuing excess requests.", + "category": "feature_addition", + "difficulty": "medium", + "target_files": ["client.py"], + "expected_changes": { + "files_modified": 1, + "insertions": 30, + "deletions": 0 + }, + "verification_script": "tests/test_rate_limiting.py", + "tags": ["rate_limiting", "middleware", "throttling", "feature"], + "notes": "Must integrate cleanly with existing client API and preserve backward compatibility." + }, + { + "id": "retry-logic", + "title": "Implement configurable retry logic", + "description": "Add retry logic to the client with configurable backoff strategy (exponential, linear, custom). Requests should automatically retry on certain error codes.", + "category": "feature_addition", + "difficulty": "medium", + "target_files": ["client.py", "transport.py"], + "expected_changes": { + "files_modified": 2, + "insertions": 40, + "deletions": 2 + }, + "verification_script": "tests/test_retry_logic.py", + "tags": ["retry", "backoff", "resilience", "feature"], + "notes": "Should support multiple backoff strategies and be composable with other middleware." + }, + { + "id": "request-logging", + "title": "Add comprehensive request/response logging", + "description": "Implement structured logging for all requests and responses, including timing, headers, and error details. Make log level configurable.", + "category": "feature_addition", + "difficulty": "easy", + "target_files": ["client.py"], + "expected_changes": { + "files_modified": 1, + "insertions": 25, + "deletions": 0 + }, + "verification_script": "tests/test_logging.py", + "tags": ["logging", "observability", "debugging", "feature"], + "notes": "Straightforward integration point—should use Python's logging module." + }, + { + "id": "circuit-breaker", + "title": "Add circuit breaker pattern", + "description": "Implement the circuit breaker pattern to prevent cascading failures. 
When a service is failing, the circuit should open and fast-fail requests.", + "category": "feature_addition", + "difficulty": "hard", + "target_files": ["client.py", "transport.py"], + "expected_changes": { + "files_modified": 2, + "insertions": 60, + "deletions": 3 + }, + "verification_script": "tests/test_circuit_breaker.py", + "tags": ["circuit_breaker", "resilience", "pattern", "feature"], + "notes": "Must track failure counts, transitions between states (closed/open/half-open), and recovery logic." + } +] diff --git a/graphify/cache.py b/graphify/cache.py index bb3f9d593..8198cb66d 100644 --- a/graphify/cache.py +++ b/graphify/cache.py @@ -101,7 +101,7 @@ def _stat_index_file(root: Path) -> Path: def _ensure_stat_index(root: Path) -> None: - global _stat_index, _stat_index_root, _stat_index_dirty + global _stat_index, _stat_index_root if _stat_index_root is not None: return _stat_index_root = Path(root).resolve()