Skip to content

Commit 0cb85b4

Browse files
rdhyee and claude authored
Mirror Eric's OC PQG to R2 with immutable cache + drift-check script (#132)
Eric Kansa's OpenContext PQG files (the ones with 48K populated thumbnails, see #131) were only served from his GCS bucket. Mirrored to R2 under oc_pqg/ with date-versioned filenames + per-file manifests + a latest.json pointer so we:

1. Have a stable source-of-truth input for the PQG pipeline rebuild.
2. Can detect drift when Eric re-uploads.
3. Get free Cloudflare edge caching via the existing Worker.

Worker change: expand the immutable Cache-Control regex from a single isamples_YYYYMM_* pattern to an array that also covers oc_pqg/oc_isamples_pqg*_YYYYMMDD.parquet. Non-versioned files under oc_pqg/ (manifests, latest.json) fall through to the 5-minute default.

scripts/check_oc_pqg_drift.py fetches latest.json + per-file manifests from R2, HEADs GCS, and compares etags. Exit 0 = in sync, 1 = drift, 2 = probe failure. Run manually for now; later wire to GitHub Actions cron.

Mirror contents (2026-04-17):
- oc_pqg/oc_isamples_pqg_20251107.parquet (727 MB, narrow)
- oc_pqg/oc_isamples_pqg_wide_20251116.parquet (289 MB, wide)
- oc_pqg/*.manifest.json (per-file provenance)
- oc_pqg/latest.json (flavor -> current version)

Verified live: cache-control on the parquets is public, max-age=31536000, immutable. Drift check passes.

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 3d20a80 commit 0cb85b4

2 files changed

Lines changed: 95 additions & 2 deletions

File tree

scripts/check_oc_pqg_drift.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/usr/bin/env python3
2+
"""Check whether Eric Kansa's OC PQG files on GCS have drifted from our R2 mirror.
3+
4+
Reads our latest.json + the per-file manifests from data.isamples.org/oc_pqg/,
5+
HEADs the GCS source, and reports whether upstream has a newer version.
6+
7+
Exit codes:
8+
0 — in sync, no drift
9+
1 — drift detected (GCS has a different etag from what we've mirrored)
10+
2 — probe failure (network error, malformed response, etc.)
11+
12+
Run manually for now:
13+
python scripts/check_oc_pqg_drift.py
14+
15+
Later: wire to GitHub Actions cron.
16+
"""
17+
import json
18+
import sys
19+
import urllib.request
20+
21+
# Pointer file on our mirror; main() looks up one entry per flavor in it.
LATEST_URL = "https://data.isamples.org/oc_pqg/latest.json"
# Base URL of the upstream GCS bucket that is probed with HEAD requests.
GCS_BASE = "https://storage.googleapis.com/opencontext-parquet/"
# Flavor name -> object name under GCS_BASE.
GCS_FILES = {
    "narrow": "oc_isamples_pqg.parquet",
    "wide": "oc_isamples_pqg_wide.parquet",
}
27+
28+
29+
def fetch_json(url, timeout=20):
    """GET ``url`` and return its body parsed as JSON.

    Args:
        url: Absolute URL to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value (whatever ``json.loads`` yields for the body).

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or ``json.loads``;
        nothing is caught here.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "isamples-oc-drift-check/1.0"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    return json.loads(body)
33+
34+
35+
def head(url, timeout=20):
    """Send an HTTP HEAD request to ``url`` and return the response headers.

    Args:
        url: Absolute URL to probe.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A plain ``dict`` built from the response headers. Because it is a
        plain dict, key lookups on it are case-sensitive.

    Raises:
        Any exception raised by ``urllib.request.urlopen``; nothing is caught
        here.
    """
    request = urllib.request.Request(
        url,
        method="HEAD",
        headers={"User-Agent": "isamples-oc-drift-check/1.0"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return dict(response.headers)
40+
41+
42+
def main() -> int:
    """Compare each mirrored file's recorded source etag with the live GCS etag.

    For every flavor in ``GCS_FILES``: look up the flavor's entry in
    latest.json, fetch the manifest that entry points to, HEAD the GCS object,
    and compare the manifest's ``source_etag`` with the GCS ``ETag`` header.
    Stops at the first probe failure.

    Returns:
        0 if every flavor is in sync, 1 if at least one flavor has drifted,
        2 on a probe failure (network error, malformed response, missing etag).
    """
    try:
        latest = fetch_json(LATEST_URL)
    except Exception as e:
        print(f"ERROR: could not fetch {LATEST_URL}: {e}", file=sys.stderr)
        return 2
    # A non-object payload would raise AttributeError on .get() below; an
    # uncaught exception exits with status 1, which callers read as "drift".
    if not isinstance(latest, dict):
        print(f"ERROR: {LATEST_URL} is not a JSON object", file=sys.stderr)
        return 2

    drift_any = False
    for flavor, gcs_name in GCS_FILES.items():
        flavor_ptr = latest.get(flavor)
        if not flavor_ptr:
            print(f"ERROR: latest.json has no entry for {flavor!r}", file=sys.stderr)
            return 2

        try:
            manifest = fetch_json(f"https://data.isamples.org/{flavor_ptr['manifest']}")
        except Exception as e:
            print(f"ERROR: could not fetch manifest for {flavor}: {e}", file=sys.stderr)
            return 2
        if not isinstance(manifest, dict):
            print(f"ERROR: manifest for {flavor} is not a JSON object", file=sys.stderr)
            return 2

        try:
            gcs_headers = head(f"{GCS_BASE}{gcs_name}")
        except Exception as e:
            print(f"ERROR: HEAD {GCS_BASE}{gcs_name}: {e}", file=sys.stderr)
            return 2

        # head() returns a plain dict, so look headers up case-insensitively.
        headers_ci = {str(k).lower(): v for k, v in gcs_headers.items()}
        gcs_etag = headers_ci.get("etag", "").strip('"')
        gcs_last_modified = headers_ci.get("last-modified", "")
        our_etag = manifest.get("source_etag", "")
        our_updated = manifest.get("source_updated", "")

        # Two missing etags would compare equal and be reported as "in sync";
        # a missing etag on either side is a malformed response, not a verdict.
        if not gcs_etag or not our_etag:
            print(
                f"ERROR: missing etag for {flavor} "
                f"(mirrored={our_etag!r}, gcs={gcs_etag!r})",
                file=sys.stderr,
            )
            return 2

        in_sync = gcs_etag == our_etag
        state = "in sync" if in_sync else "DRIFT"
        print(f"[{flavor}] {state}")
        print(f" mirrored: etag={our_etag} updated={our_updated}")
        print(f" gcs: etag={gcs_etag} last-modified={gcs_last_modified}")
        if not in_sync:
            drift_any = True

    return 1 if drift_any else 0
82+
83+
84+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())

workers/data-isamples-org/src/index.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,15 @@
1818
* working.
1919
*/
2020

21-
const IMMUTABLE_PATTERN = /^isamples_\d{6}_.*\.parquet$/;
21+
// Immutable-by-filename patterns. Match files whose path fully determines
// their contents (filename includes a version / date stamp).
//   - isamples_YYYYMM_*.parquet (monthly iSamples snapshots)
//   - oc_pqg/oc_isamples_pqg*_YYYYMMDD.parquet (mirror of Eric Kansa's
//     OpenContext PQG files, versioned by the upstream GCS updated-date)
// Keys that match none of these get the short fallback max-age instead.
// The array is frozen so the shared module-level list cannot be mutated.
const IMMUTABLE_PATTERNS = Object.freeze([
  /^isamples_\d{6}_.*\.parquet$/,
  /^oc_pqg\/oc_isamples_pqg.*_\d{8}\.parquet$/,
]);
const IMMUTABLE_MAX_AGE = 60 * 60 * 24 * 365; // 1 year
const FALLBACK_MAX_AGE = 300; // 5 minutes
2432

@@ -72,7 +80,7 @@ export default {
7280
for (const [k, v] of Object.entries(CORS_HEADERS)) headers.set(k, v);
7381

7482
// Cache-Control: this is the optimization.
75-
if (IMMUTABLE_PATTERN.test(key)) {
83+
if (IMMUTABLE_PATTERNS.some(p => p.test(key))) {
7684
headers.set('Cache-Control', `public, max-age=${IMMUTABLE_MAX_AGE}, immutable`);
7785
} else {
7886
headers.set('Cache-Control', `public, max-age=${FALLBACK_MAX_AGE}`);

0 commit comments

Comments
 (0)