From 1c4e8bf793de8439647218fbfdaec5b6a4881ecc Mon Sep 17 00:00:00 2001 From: Khurdhula-Harshavardhan Date: Mon, 11 May 2026 20:48:40 -0700 Subject: [PATCH] feat(olmocr): add chandra-ocr-2 runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the existing per-provider olmOCR runners (openai_mini, grok, etc.) but dispatches each rendered PDF page to a self-hosted `datalab-to/chandra-ocr-2` vLLM endpoint over HTTPS. Auth via the `x-api-admin-key` header; runner reads `CHANDRA_MODAL_URL` and `CHANDRA_MODAL_ADMIN_KEY` from env. Request payload uses `task="ocr_layout"` and `temperature=0.0` to stay apples-to-apples with the other candidates (no reasoning/thinking mode — Chandra OCR 2 is a fine-tuned OCR VLM with none). Harness supports `--sample`, `--skip-generation`, `--generate-only`, and `--limit N` (smoke). `RATE_LIMIT=50` matches the throughput of a 10-H100 deployment; turn down if running against a single warm container. README updated under the olmOCR section to list the new runner and the two required env vars. 
def run_chandra_ocr2(
    pdf_path: str,
    page_num: int = 1,
    target_longest_image_dim: int = 2048,
    task: str = "ocr_layout",
    max_tokens: int = 12384,
    timeout: float = 600,
) -> str:
    """Convert a PDF page to markdown via Chandra OCR 2 deployed on Modal.

    Apples-to-apples with other olmOCR runners: temperature=0, no
    reasoning/thinking mode (Chandra OCR 2 is a fine-tuned OCR VLM with no
    such mode), no chain-of-thought.

    The page is rendered to a base64 PNG and POSTed as a data URL to
    ``<CHANDRA_MODAL_URL>/ocr`` with the admin key in the
    ``x-api-admin-key`` header.

    Args:
        pdf_path: Path of the PDF to render.
        page_num: Page number forwarded to the renderer.
        target_longest_image_dim: Longest-side size forwarded to the renderer.
        task: Value of the ``task`` field in the request payload.
        max_tokens: Value of the ``max_tokens`` field in the request payload.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``markdown`` string from the JSON response, or ``""`` when the
        response is not a JSON object, has no markdown, or the markdown is
        only a null-like placeholder ("null", "none", "n/a", blank).

    Raises:
        SystemExit: If the endpoint URL or admin key is missing from the
            environment (configuration error, not worth retrying).
        ValueError: If ``markdown`` is present but is not a string.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    url = os.getenv("CHANDRA_MODAL_URL")
    key = os.getenv("CHANDRA_MODAL_ADMIN_KEY") or os.getenv("ADMIN_KEY")
    if not url:
        raise SystemExit(
            "CHANDRA_MODAL_URL not set — point this at the Modal endpoint base URL "
            "(no trailing /ocr)."
        )
    if not key:
        raise SystemExit(
            "CHANDRA_MODAL_ADMIN_KEY (or ADMIN_KEY) not set — required by the Modal app."
        )

    image_base64 = render_pdf_to_base64png(
        pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim
    )

    # Tolerate a base URL that was pasted with the route already attached;
    # otherwise the request would go to ".../ocr/ocr".
    base_url = url.strip().rstrip("/").removesuffix("/ocr")

    resp = requests.post(
        f"{base_url}/ocr",
        headers={
            "x-api-admin-key": key,
            "Content-Type": "application/json",
        },
        json={
            "inputs": [f"data:image/png;base64,{image_base64}"],
            "task": task,
            "max_tokens": max_tokens,
            "temperature": 0.0,
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    payload = resp.json()

    if not isinstance(payload, dict):
        return ""
    md = payload.get("markdown", "")
    if not md:
        return ""
    if not isinstance(md, str):
        # Fail with a readable message instead of an AttributeError on .strip().
        raise ValueError(
            f"Unexpected 'markdown' type in Chandra OCR response: {type(md).__name__}"
        )
    if md.strip().lower() in ("null", "none", "n/a", ""):
        return ""
    return md
+ +Usage: + uv run -m benchmarks.olmocr.olmocr_bench_chandra_ocr2 + uv run -m benchmarks.olmocr.olmocr_bench_chandra_ocr2 --sample + uv run -m benchmarks.olmocr.olmocr_bench_chandra_ocr2 --skip-generation + uv run -m benchmarks.olmocr.olmocr_bench_chandra_ocr2 --generate-only +""" + +import argparse +import asyncio +import json +import os +import sys +from pathlib import Path + +from dotenv import load_dotenv +from huggingface_hub import hf_hub_download +from tqdm.asyncio import tqdm_asyncio + +load_dotenv() + +PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent +SAMPLE_DATA_DIR = Path(__file__).resolve().parent / "bench" / "sample_data" +FULL_DATA_DIR = Path(__file__).resolve().parent / "bench" / "full_data" +CANDIDATE_NAME = "chandra_ocr2" +RATE_LIMIT = 50 # requests admitted per second (token-bucket pacing) +MAX_RETRIES = 3 + +HF_REPO = "allenai/olmOCR-bench" +SPLITS = [ + "arxiv_math", + "headers_footers", + "long_tiny_text", + "multi_column", + "old_scans", + "old_scans_math", + "table_tests", +] + +sys.path.insert(0, str(PROJECT_ROOT)) +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + + +class RateLimiter: + def __init__(self, rate: int): + self.rate = rate + self.tokens = rate + self.last_refill = 0.0 + self._lock = asyncio.Lock() + + async def acquire(self): + while True: + async with self._lock: + now = asyncio.get_running_loop().time() + elapsed = now - self.last_refill + self.tokens = min(self.rate, self.tokens + elapsed * self.rate) + self.last_refill = now + if self.tokens >= 1: + self.tokens -= 1 + return + await asyncio.sleep(1 / self.rate) + + +def download_full_dataset(): + data_dir = FULL_DATA_DIR + pdf_dir = data_dir / "pdfs" + all_pdfs = set() + for split in SPLITS: + jsonl_dest = data_dir / f"{split}.jsonl" + if jsonl_dest.exists(): + with open(jsonl_dest) as f: + tests = [json.loads(l) for l in f if l.strip()] + else: + print(f" Downloading {split}.jsonl...") + src = hf_hub_download(HF_REPO, 
f"bench_data/{split}.jsonl", repo_type="dataset") + with open(src) as f: + tests = [json.loads(l) for l in f if l.strip()] + data_dir.mkdir(parents=True, exist_ok=True) + with open(jsonl_dest, "w") as f: + for t in tests: + f.write(json.dumps(t) + "\n") + print(f" {split}: {len(tests)} tests") + for t in tests: + all_pdfs.add(t["pdf"]) + + print(f"\n Total unique PDFs to download: {len(all_pdfs)}") + downloaded = 0 + skipped = 0 + for pdf_rel in sorted(all_pdfs): + local_path = pdf_dir / pdf_rel + if local_path.exists(): + skipped += 1 + continue + local_path.parent.mkdir(parents=True, exist_ok=True) + try: + src = hf_hub_download(HF_REPO, f"bench_data/pdfs/{pdf_rel}", repo_type="dataset") + os.symlink(src, str(local_path)) + downloaded += 1 + except Exception as e: + print(f" Failed to download {pdf_rel}: {e}") + print(f" PDFs: {downloaded} downloaded, {skipped} already existed") + return data_dir + + +async def process_page(pdf_path, page_num, output_path, rate_limiter): + from olmocr.bench.runners.run_chandra_ocr2 import run_chandra_ocr2 + + for attempt in range(MAX_RETRIES): + await rate_limiter.acquire() + try: + result = await asyncio.to_thread(run_chandra_ocr2, pdf_path, page_num) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + f.write(result) + return True + except Exception as e: + if attempt < MAX_RETRIES - 1: + await asyncio.sleep(2**attempt) + else: + print(f"Failed after {MAX_RETRIES} attempts: {pdf_path} page {page_num}: {e}") + return False + + +async def generate_outputs(data_dir: Path, limit: int | None = None): + pdf_folder = data_dir / "pdfs" + output_folder = data_dir / CANDIDATE_NAME + + pdf_pages = set() + for jsonl_file in data_dir.glob("*.jsonl"): + with open(jsonl_file) as f: + for line in f: + line = line.strip() + if not line: + continue + t = json.loads(line) + pdf_pages.add((t["pdf"], t["page"])) + + if limit is not None and limit > 0: + pdf_pages = 
set(sorted(pdf_pages)[:limit]) + print(f"--limit {limit}: capping to first {len(pdf_pages)} (pdf, page) pairs") + + print(f"Found {len(pdf_pages)} unique (pdf, page) pairs to process") + + rate_limiter = RateLimiter(RATE_LIMIT) + tasks = [] + for pdf_rel, page in sorted(pdf_pages): + pdf_path = str(pdf_folder / pdf_rel) + if not os.path.exists(pdf_path): + continue + base_name = os.path.splitext(os.path.basename(pdf_rel))[0] + parent_dir = os.path.dirname(pdf_rel) + md_filename = f"{base_name}_pg{page}_repeat1.md" + if parent_dir: + out_path = str(output_folder / parent_dir / md_filename) + else: + out_path = str(output_folder / md_filename) + if os.path.exists(out_path): + continue + tasks.append(process_page(pdf_path, page, out_path, rate_limiter)) + + if not tasks: + print("All outputs already exist, skipping generation.") + return True + print(f"Processing {len(tasks)} pages...") + results = await tqdm_asyncio.gather(*tasks, desc=f"Generating {CANDIDATE_NAME} outputs") + num_success = sum(1 for r in results if r) + num_failed = len(results) - num_success + print(f"Done: {num_success} succeeded, {num_failed} failed") + return num_failed == 0 + + +def run_evaluation(data_dir: Path): + from olmocr.bench.benchmark import main as bench_main + sys.argv = ["benchmark", "--dir", str(data_dir), "--candidate", CANDIDATE_NAME, "--force"] + bench_main() + + +async def main(): + parser = argparse.ArgumentParser(description=f"Run OlmOCR benchmark with {CANDIDATE_NAME}") + parser.add_argument("--sample", action="store_true") + parser.add_argument("--skip-generation", action="store_true") + parser.add_argument("--generate-only", action="store_true") + parser.add_argument( + "--limit", + type=int, + default=None, + help="Smoke-test mode: only generate outputs for the first N (pdf, page) pairs.", + ) + args = parser.parse_args() + + if args.sample: + data_dir = SAMPLE_DATA_DIR + print("=== Using sample data ===") + else: + print("=== Downloading full olmOCR-bench dataset from 
HuggingFace ===") + data_dir = download_full_dataset() + + if not args.skip_generation: + print(f"\n=== Generating {CANDIDATE_NAME} outputs ===") + await generate_outputs(data_dir, limit=args.limit) + + if not args.generate_only: + print("\n=== Running OlmOCR Benchmark Evaluation ===") + run_evaluation(data_dir) + + +if __name__ == "__main__": + asyncio.run(main())