Skip to content

Commit 09230e1

Browse files
indexzero and claude authored
feat(cli) add view enrich command for NDJSON enrichment (#25)
Add new view enrich command that reads NDJSON input and enriches each record with fields from cached packuments:

- Use --add expressions with .field syntax for record references, e.g., time[.version] uses record.version as the lookup key
- Support multiple --add expressions for enriching with several fields
- Handle missing packuments with --on-missing modes (skip, null, error)
- Cache packuments in memory to avoid repeated disk reads

Enables build manifest generation, audit trail creation, and metadata enrichment for package version lists.

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 582e357 commit 09230e1

4 files changed

Lines changed: 539 additions & 0 deletions

File tree

cli/cli/src/cmd/view/enrich.js

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
import { createReadStream } from 'node:fs';
2+
import { createInterface } from 'node:readline';
3+
import { parseArgs } from 'node:util';
4+
import { Cache, createStorageDriver, createPackumentKey } from '@_all_docs/cache';
5+
6+
/**
 * Help text for `_all_docs view enrich`.
 *
 * Printed by `command` when --help is given and when --input is missing.
 * `\\` inside the template literal yields a single backslash (shell line
 * continuation) in the printed examples.
 */
export const usage = `Usage: _all_docs view enrich [options]

Enrich package specs with fields from cached packuments.

Options:
  -i, --input <file>     Input NDJSON file ('-' for stdin)
  --add <expr>           Add field from packument (repeatable)
  --origin <origin>      Packument origin (default: npm)
  --name-field <f>       Input field for name (default: name)
  --version-field <f>    Input field for version (default: version)
  --on-missing <mode>    skip, null, or error (default: null)
  --progress             Show progress

Add Expression Syntax:
  <selector> as <alias>

  Use .field to reference input record fields:
    time[.version] as addedAt
    versions[.version].dist.integrity as integrity

Examples:
  # Add publish dates
  _all_docs view enrich -i specs.ndjson --add 'time[.version] as addedAt'

  # Add multiple fields
  _all_docs view enrich -i specs.ndjson \\
    --add 'time[.version] as publishedAt' \\
    --add 'versions[.version].dist.integrity as integrity'
`;
35+
36+
/**
 * Run `view enrich`: read NDJSON records, look up each record's packument in
 * the local cache, and print the record with one extra property per --add
 * expression.
 *
 * Output records go to stdout (one JSON document per line); progress and
 * per-line errors go to stderr.
 *
 * @param {object} cli - CLI context. This function reads `cli.values` (globally
 *   parsed flags), `cli._` (remaining args, re-parsed here) and calls
 *   `cli.dir('packuments')` to locate the cache directory.
 * @returns {Promise<void>}
 * @throws {Error} In `--on-missing error` mode, on the first record that cannot
 *   be parsed or enriched. Also when an --add expression is malformed.
 */
export const command = async (cli) => {
  if (cli.values.help) {
    console.log(usage);
    return;
  }

  // Deliberately no `default` entries: parseArgs always fills declared defaults,
  // which would make the cli.values fallbacks below unreachable.
  const { values } = parseArgs({
    args: cli._,
    options: {
      input: { type: 'string', short: 'i' },
      add: { type: 'string', multiple: true },
      origin: { type: 'string' },
      'name-field': { type: 'string' },
      'version-field': { type: 'string' },
      'on-missing': { type: 'string' },
      progress: { type: 'boolean' }
    },
    allowPositionals: true
  });

  // Command-level flags win, then globally parsed flags, then built-in defaults.
  const input = values.input || cli.values.input;
  // .flat() tolerates a single string as well as an array of expressions.
  const addExprs = [values.add || cli.values.add || []].flat();
  const origin = values.origin || cli.values.origin || 'npm';
  const nameField = values['name-field'] || cli.values['name-field'] || 'name';
  const versionField = values['version-field'] || cli.values['version-field'] || 'version';
  const onMissing = values['on-missing'] || cli.values['on-missing'] || 'null';
  const showProgress = values.progress || cli.values.progress;

  if (!input) {
    console.error('Error: --input required');
    console.log(usage);
    process.exit(1);
  }

  if (addExprs.length === 0) {
    console.error('Error: at least one --add expression required');
    process.exit(1);
  }

  // Reject typos up front; otherwise an unknown mode silently behaves like "null".
  if (!['skip', 'null', 'error'].includes(onMissing)) {
    console.error(`Error: invalid --on-missing mode "${onMissing}" (expected skip, null, or error)`);
    process.exit(1);
  }

  // Parse add expressions
  const enrichments = addExprs.map(parseAddExpression);

  // Setup cache
  const driver = await createStorageDriver({ CACHE_DIR: cli.dir('packuments') });
  const cache = new Cache({ path: cli.dir('packuments'), driver });

  // In-memory packument cache keyed by package name (avoids re-fetching).
  // Misses are stored as null so they are not retried either.
  const packumentCache = new Map();
  const registryOrigin = origin === 'npm' ? 'https://registry.npmjs.org' : origin;

  // Return the packument for `name`, or null when it cannot be read from the cache.
  const loadPackument = async (name) => {
    if (packumentCache.has(name)) {
      return packumentCache.get(name);
    }
    const key = createPackumentKey(name, registryOrigin);
    let packument;
    try {
      const entry = await cache.fetch(key);
      packument = entry?.body || entry || null;
    } catch {
      // Any fetch failure is treated as "packument missing"; --on-missing decides what happens.
      packument = null;
    }
    packumentCache.set(name, packument);
    return packument;
  };

  // Setup input stream
  const inputStream = input === '-'
    ? process.stdin
    : createReadStream(input);

  const rl = createInterface({ input: inputStream, crlfDelay: Infinity });

  let processed = 0;
  let enriched = 0;
  let skipped = 0;

  for await (const line of rl) {
    if (!line.trim()) continue;

    processed++;
    if (showProgress && processed % 1000 === 0) {
      process.stderr.write(`\rProcessed ${processed}, enriched ${enriched}, skipped ${skipped}...`);
    }

    try {
      const record = JSON.parse(line);
      const name = record[nameField];

      if (!name) {
        if (onMissing === 'skip') { skipped++; continue; }
        if (onMissing === 'error') throw new Error('Missing name field');
        console.log(line); // Pass through unchanged
        continue;
      }

      const packument = await loadPackument(name);

      if (!packument) {
        if (onMissing === 'skip') { skipped++; continue; }
        if (onMissing === 'error') {
          throw new Error(`Packument not found: ${name}`);
        }
        // null mode: output with null values
        for (const e of enrichments) {
          record[e.alias] = null;
        }
        console.log(JSON.stringify(record));
        continue;
      }

      // Apply enrichments
      for (const enrichment of enrichments) {
        record[enrichment.alias] = extractValue(packument, enrichment.selector, record);
      }

      console.log(JSON.stringify(record));
      enriched++;
    } catch (err) {
      // error mode aborts the whole run; other modes report and keep going.
      if (onMissing === 'error') {
        throw err;
      }
      console.error(`Error processing line ${processed}: ${err.message}`);
    }
  }

  if (showProgress) {
    process.stderr.write(`\rCompleted: ${processed} processed, ${enriched} enriched, ${skipped} skipped\n`);
  }
};
164+
165+
/**
 * Parse a "<selector> as <alias>" expression.
 *
 * Surrounding whitespace is ignored. The alias is the final word (`\w+`), so a
 * selector may itself contain the text " as ".
 *
 * @param {string} expr - Expression like "time[.version] as addedAt"
 * @returns {{ selector: string, alias: string }}
 * @throws {Error} When expr is not a string or does not match the expected form
 */
export function parseAddExpression(expr) {
  // Trim first: the pattern is anchored, so trailing whitespace would otherwise
  // make a well-formed expression fail to match.
  const match = typeof expr === 'string'
    ? expr.trim().match(/^(.+?)\s+as\s+(\w+)$/)
    : null;
  if (!match) {
    throw new Error(`Invalid --add expression: ${expr}\nExpected: <selector> as <alias>`);
  }
  const [, selector, alias] = match;
  return {
    selector: selector.trim(),
    alias
  };
}
180+
181+
/**
 * Extract a value from a packument using a selector that may contain record
 * field references of the form `[.field]`.
 *
 * Each `[.field]` is resolved to `record[field]` and used directly as a property
 * key. The value is never spliced back into the selector text, so keys that
 * contain `]`, quotes or dots resolve correctly. The static parts of the
 * selector between references are delegated to `evaluateSelector`.
 *
 * @param {object} packument - The packument data
 * @param {string} selector - Selector with optional [.field] references
 * @param {object} record - Input record for [.field] resolution
 * @returns {*} The extracted value, or null when any step of the path is
 *   missing, the referenced record field is absent, or the key is not an own
 *   property of the object being indexed
 */
export function extractValue(packument, selector, record) {
  let current = packument;
  let cursor = 0;

  for (const reference of selector.matchAll(/\[\.(\w+)\]/g)) {
    // Walk the literal path that precedes this reference (may be empty).
    const staticPath = selector.slice(cursor, reference.index);
    if (staticPath) {
      current = evaluateSelector(current, staticPath);
    }
    cursor = reference.index + reference[0].length;

    if (current === null || current === undefined) return null;

    const key = record?.[reference[1]];
    if (key === null || key === undefined) return null;

    // Record values are untrusted input: only follow own properties so keys
    // such as "__proto__" or "constructor" cannot reach the prototype chain.
    const property = String(key);
    if (!Object.hasOwn(current, property)) return null;
    current = current[property];
  }

  // Walk whatever literal path follows the last reference.
  const remainder = selector.slice(cursor);
  if (remainder) {
    current = evaluateSelector(current, remainder);
  }

  return current ?? null;
}
201+
202+
/**
 * Simple selector evaluation.
 * Handles: field.nested, field["key"], field['key'], field[0]
 *
 * Parsing rules:
 * - `.` separates field names; empty names (leading or doubled dots) are ignored.
 * - `[...]` holds one key. If the content starts with a quote, the bracket only
 *   closes at a `]` that directly follows the matching quote, so a quoted key
 *   may contain `]` and `.`. Inside a quoted key, a backslash followed by the
 *   quote character stands for that quote character.
 * - An unterminated `[` at the end of the selector is ignored.
 *
 * Only own properties are followed, so names such as `constructor` or
 * `__proto__` never resolve to inherited members.
 *
 * @param {object} obj - Object to evaluate against
 * @param {string} selector - Selector path
 * @returns {*} The value at the path, or null when any step is missing
 */
export function evaluateSelector(obj, selector) {
  const keys = [];
  let field = '';
  // null while outside brackets; otherwise the bracket content collected so far.
  let bracket = null;

  const flushField = () => {
    if (field) {
      keys.push(field);
      field = '';
    }
  };

  const isQuote = (char) => char === '"' || char === "'";

  // A quoted key is complete only once its closing quote has been seen.
  const isComplete = (content) =>
    !isQuote(content[0]) || (content.length >= 2 && content.endsWith(content[0]));

  // Strip the surrounding quotes and undo backslash-escaped quote characters.
  const toKey = (content) => {
    const quote = content[0];
    if (!isQuote(quote) || content.length < 2 || !content.endsWith(quote)) {
      return content;
    }
    return content.slice(1, -1).replaceAll(`\\${quote}`, quote);
  };

  for (const char of selector) {
    if (bracket === null) {
      if (char === '[') {
        flushField();
        bracket = '';
      } else if (char === '.') {
        flushField();
      } else {
        field += char;
      }
    } else if (char === ']' && isComplete(bracket)) {
      keys.push(toKey(bracket));
      bracket = null;
    } else {
      bracket += char;
    }
  }
  flushField();

  // Traverse the object
  let result = obj;
  for (const key of keys) {
    if (result === null || result === undefined) return null;
    if (!Object.hasOwn(result, key)) return null;
    result = result[key];
  }

  return result ?? null;
}

0 commit comments

Comments
 (0)