Skip to content

Commit d382a20

Browse files
rdhyee and claude
authored
Add /current/ alias layer + enrichment script (#133)
Two additions for stable-URL access to rotating versioned parquets:

1. Worker alias route: GET /current/<flavor>.parquet reads current/manifest.json from R2 and 302-redirects to the dated file it points to. The redirect response carries a short 5-min Cache-Control so rotation propagates quickly; the target (versioned file) keeps its immutable 1-year cache. DuckDB-WASM / curl / browsers all follow 302s transparently, so range requests hit the target directly.

2. scripts/enrich_wide_with_oc_thumbnails.py: DuckDB LEFT-JOIN script that takes the unified Zenodo wide parquet (thumbnail_url all NULL, see #131) and Eric Kansa's oc_isamples_pqg.parquet (48K thumbnails) and produces an enriched wide file with ~47.7K thumbnails populated for MaterialSampleRecord pids that overlap both.

Used today to build and ship isamples_202604_wide.parquet via https://data.isamples.org/current/wide.parquet. The older isamples_202601_wide.parquet stays in place, untouched, still immutable.

Verified via DuckDB query through the /current/ URL: 47,717 rows with thumbnail_url populated.

Closes the "soft-link" piece of #131.

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 0cb85b4 commit d382a20

2 files changed

Lines changed: 120 additions & 0 deletions

File tree

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env python3
2+
"""Build an enriched unified-wide parquet by left-joining OC thumbnails.
3+
4+
Takes the unified Zenodo wide parquet (which has thumbnail_url = NULL for all
5+
6.7M samples because the upstream iSamples export doesn't carry thumbnails —
6+
see issue #131) and fills in thumbnail_url for the ~47K OpenContext samples
7+
that appear in Eric Kansa's oc_isamples_pqg.parquet.
8+
9+
Input:
10+
--src local path to source unified wide parquet
11+
(e.g. ~/Data/iSample/pqg_refining/zenodo_wide_*.parquet)
12+
--oc local path to Eric's oc_isamples_pqg.parquet (the narrow
13+
one — thumbnails live on MaterialSampleRecord rows)
14+
--out path to write the enriched output
15+
16+
Usage:
17+
python scripts/enrich_wide_with_oc_thumbnails.py \\
18+
--src ~/Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet \\
19+
--oc /tmp/oc_isamples_pqg_20251107.parquet \\
20+
--out /tmp/isamples_202604_wide.parquet
21+
22+
Then upload to R2 under a date-stamped filename (e.g. isamples_202604_wide.parquet)
23+
and update current/manifest.json to point at it.
24+
"""
25+
import argparse
26+
import os
27+
import sys
28+
import time
29+
import duckdb
30+
31+
32+
def _sql_string(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a file path such as
    ``/tmp/eric's.parquet`` cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


def main():
    """Build the enriched wide parquet and print a short verification summary.

    Reads ``--src``, ``--oc`` and ``--out`` from the command line, left-joins
    the OC thumbnail lookup onto the source rows by ``pid``, writes the result
    to ``--out`` as ZSTD-compressed parquet, then re-reads the output to report
    the total row count and the number of rows with a non-empty thumbnail_url.

    Returns:
        0 on success, 2 if either input file does not exist.
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('--src', required=True, help='source unified wide parquet')
    p.add_argument('--oc', required=True, help="Eric's OC narrow parquet (for thumbnails)")
    p.add_argument('--out', required=True, help='output path for enriched parquet')
    args = p.parse_args()

    for f in (args.src, args.oc):
        if not os.path.exists(f):
            print(f'ERROR: missing {f}', file=sys.stderr)
            return 2

    # Paths are embedded in SQL text below, so quote them once up front
    # instead of interpolating the raw strings.
    src_sql = _sql_string(args.src)
    oc_sql = _sql_string(args.oc)
    out_sql = _sql_string(args.out)

    con = duckdb.connect()
    try:
        print(f'source: {args.src}')
        print(f'oc: {args.oc}')
        print(f'out: {args.out}')

        t0 = time.time()
        # Exactly one thumbnail per pid: with DISTINCT (pid, thumbnail_url) a
        # pid carrying two different URLs would yield two lookup rows and the
        # LEFT JOIN below would duplicate the source row. MIN() makes the
        # choice deterministic.
        con.execute(f"""
            CREATE TEMP TABLE oc_thumbs AS
            SELECT pid, MIN(thumbnail_url) AS thumbnail_url
            FROM read_parquet({oc_sql})
            WHERE pid IS NOT NULL AND thumbnail_url IS NOT NULL AND thumbnail_url <> ''
            GROUP BY pid
        """)
        n = con.sql('SELECT COUNT(*) FROM oc_thumbs').fetchone()[0]
        print(f'[{time.time()-t0:.1f}s] oc_thumbs lookup: {n:,} (pid, thumbnail) pairs')

        t0 = time.time()
        # An OC thumbnail wins when present; otherwise keep whatever the
        # source row already had.
        con.execute(f"""
            COPY (
                SELECT p.* REPLACE (COALESCE(oc.thumbnail_url, p.thumbnail_url) AS thumbnail_url)
                FROM read_parquet({src_sql}) p
                LEFT JOIN oc_thumbs oc ON p.pid = oc.pid
            )
            TO {out_sql} (FORMAT PARQUET, COMPRESSION ZSTD)
        """)
        print(f'[{time.time()-t0:.1f}s] wrote enriched parquet')

        # Verify
        r = con.sql(f"""
            SELECT COUNT(*) AS rows,
                   COUNT(*) FILTER (WHERE thumbnail_url IS NOT NULL AND thumbnail_url <> '') AS with_thumb
            FROM read_parquet({out_sql})
        """).df()
        print(r.to_string(index=False))
        print(f'output size: {os.path.getsize(args.out)/1024/1024:.1f} MB')
    finally:
        # Release the in-memory database even if a query fails.
        con.close()
    return 0
80+
81+
82+
if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())

workers/data-isamples-org/src/index.js

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,43 @@ export default {
5858
});
5959
}
6060

61+
// === /current/ alias layer ===
62+
// `/current/<flavor>.parquet` reads `current/manifest.json` from R2 and
63+
// 302-redirects to the dated file it points to. Lets consumers pin to a
64+
// stable URL while the underlying immutable file rotates out-of-band.
65+
const currentAliasMatch = key.match(/^current\/([a-z0-9_-]+)\.parquet$/i);
66+
if (currentAliasMatch) {
67+
const flavor = currentAliasMatch[1];
68+
const manifestObj = await env.BUCKET.get('current/manifest.json');
69+
if (!manifestObj) {
70+
return new Response('current/manifest.json not found', { status: 503, headers: CORS_HEADERS });
71+
}
72+
let manifest;
73+
try {
74+
manifest = JSON.parse(await manifestObj.text());
75+
} catch (e) {
76+
return new Response('current/manifest.json is invalid JSON', { status: 503, headers: CORS_HEADERS });
77+
}
78+
const entry = manifest[flavor];
79+
if (!entry || !entry.public_url) {
80+
return new Response(
81+
`current/manifest.json has no entry for flavor '${flavor}'`,
82+
{ status: 404, headers: CORS_HEADERS }
83+
);
84+
}
85+
// 302 Found: GET/HEAD clients follow it transparently and then
86+
// re-issue range requests against the target URL directly.
87+
return new Response(null, {
88+
status: 302,
89+
headers: {
90+
'Location': entry.public_url,
91+
// Short TTL so rotation propagates quickly without stale fanout.
92+
'Cache-Control': `public, max-age=${FALLBACK_MAX_AGE}`,
93+
...CORS_HEADERS,
94+
},
95+
});
96+
}
97+
6198
// Parse Range header if present. R2's get() accepts { offset, length } or
6299
// { suffix }, mirroring HTTP Range semantics.
63100
const rangeHeader = request.headers.get('range');

0 commit comments

Comments
 (0)