diff --git a/.zed/rules b/.zed/rules index 885a1a42..c6b86cec 100644 --- a/.zed/rules +++ b/.zed/rules @@ -139,13 +139,13 @@ Examples: src/orcapod/ types.py — Schema, ColumnConfig, ContentHash system_constants.py — Column prefixes and separators - errors.py — InputValidationError, DuplicateTagError, FieldNotResolvableError + errors.py — InputValidationError, DuplicateKeyError, FieldNotResolvableError config.py — Config dataclass contexts/ — DataContext (semantic_hasher, arrow_hasher, type_converter) protocols/ hashing_protocols.py — PipelineElementProtocol, ContentIdentifiableProtocol core_protocols/ — StreamProtocol, PodProtocol, SourceProtocol, - DataFunctionProtocol, DatagramProtocol, TagProtocol, + DataFunctionProtocol, DatagramProtocol, KeyProtocol, DataProtocol, TrackerProtocol core/ base.py — ContentIdentifiableBase, PipelineElementBase, TraceableBase @@ -156,7 +156,7 @@ src/orcapod/ tracker.py — Invocation tracking datagrams/ datagram.py — Datagram (unified dict/Arrow backing, lazy conversion) - tag_data.py — Tag (+ system tags), Data (+ source info) + key_data.py — Key (+ system keys), Data (+ source info) sources/ base.py — RootSource (abstract, no upstream) arrow_table_source.py — Core source — all other sources delegate to it @@ -173,15 +173,15 @@ src/orcapod/ merge_join.py — MergeJoin (binary, colliding cols → sorted list[T]) semijoin.py — SemiJoin (binary, non-commutative) batch.py — Batch (group rows, types become list[T]) - column_selection.py — Select/Drop Tag/Data columns - mappers.py — MapTags, MapData (rename columns) + column_selection.py — Select/Drop Key/Data columns + mappers.py — MapKeys, MapData (rename columns) filters.py — PolarsFilter hashing/ semantic_hashing/ — BaseSemanticHasher, type handlers semantic_types/ — Type conversion (Python ↔ Arrow) databases/ — ArrowDatabaseProtocol implementations (Delta Lake, in-memory) utils/ - arrow_data_utils.py — System tag manipulation, source info, column helpers + arrow_data_utils.py — System key 
manipulation, source info, column helpers arrow_utils.py — Arrow table utilities schema_utils.py — Schema extraction, union, intersection, compatibility lazy_module.py — LazyModule for deferred heavy imports @@ -208,26 +208,26 @@ See orcapod-design.md at the project root for the full design specification. RootSource → ArrowTableStream → [Operator / FunctionPod] → ArrowTableStream → ... -Every stream is an immutable sequence of (Tag, Data) pairs backed by a PyArrow Table. -Tag columns are join keys and metadata; data columns are the data payload. +Every stream is an immutable sequence of (Key, Data) pairs backed by a PyArrow Table. +Key columns are join keys and metadata; data columns are the data payload. ### Core abstractions Datagram (core/datagrams/datagram.py) — immutable data container with lazy dict ↔ Arrow conversion. Two specializations: -- Tag — metadata columns + hidden system tag columns for provenance tracking +- Key — metadata columns + hidden system key columns for provenance tracking - Data — data columns + per-column source info provenance tokens -Stream (core/streams/arrow_table_stream.py) — immutable (Tag, Data) sequence. +Stream (core/streams/arrow_table_stream.py) — immutable (Key, Data) sequence. Key methods: output_schema(), keys(), iter_data(), as_table(). Source (core/sources/) — produces a stream from external data. ArrowTableSource is the core implementation; CSV/Delta/DataFrame/Dict/List sources all delegate to it internally. Each -source adds source-info columns and a system tag column. DerivedSource wraps a +source adds source-info columns and a system key column. DerivedSource wraps a FunctionNode/OperatorNode's DB records as a new source. Function Pod (core/function_pod.py) — wraps a DataFunction that transforms individual -data. Never inspects tags. Two execution models: +data. Never inspects keys. 
Two execution models: - FunctionPod → FunctionPodStream: lazy, in-memory - FunctionNode: DB-backed, two-phase (yield cached results first, then compute missing) @@ -242,8 +242,8 @@ FunctionNode. ### Strict operator / function pod boundary -Operators: inspect tags (never data content), can rename columns, cannot synthesize values. -Function Pods: inspect data content (never tags), synthesize new values, cached by content. +Operators: inspect keys (never data content), can rename columns, cannot synthesize values. +Function Pods: inspect data content (never keys), synthesize new values, cached by content. ### Two identity chains @@ -253,7 +253,7 @@ Every pipeline element has two parallel hashes: 2. pipeline_hash() — schema + topology only. Ignores data content. Used for DB path scoping so that different sources with identical schemas share database tables. -Base case: RootSource.pipeline_identity_structure() returns (tag_schema, data_schema). +Base case: RootSource.pipeline_identity_structure() returns (key_schema, data_schema). Each downstream node's pipeline hash commits to its own identity plus upstream pipeline hashes, forming a Merkle chain. @@ -261,17 +261,17 @@ hashes, forming a Merkle chain. __ prefix — System metadata (ColumnConfig meta) _source_ prefix — Source info provenance (ColumnConfig source) - _tag:: prefix — System tag (ColumnConfig system_tags) + _key:: prefix — System key (ColumnConfig system_keys) _context_key — Data context (ColumnConfig context) Prefixes are computed from SystemConstant in system_constants.py. -### System tag evolution rules +### System key evolution rules 1. Name-preserving — single-stream ops. Column name/value pass through unchanged. -2. Name-extending — multi-input ops. System tag column name gets +2. Name-extending — multi-input ops. System key column name gets ::{pipeline_hash}:{canonical_position} appended. Commutative operators sort by - pipeline_hash and sort system tag values per row. 
+ pipeline_hash and sort system key values per row. 3. Type-evolving — aggregation ops. Column type changes from str to list[str]. ### Key patterns @@ -285,7 +285,7 @@ Prefixes are computed from SystemConstant in system_constants.py. ### Important implementation details -- ArrowTableSource raises ValueError if any tag_columns are not in the table. +- ArrowTableSource raises ValueError if any key_columns are not in the table. - ArrowTableStream requires at least one data column; raises ValueError otherwise. - FunctionNode Phase 1 returns ALL records in the shared pipeline_path DB table. Phase 2 skips inputs whose hash is already in the DB. @@ -293,4 +293,4 @@ Prefixes are computed from SystemConstant in system_constants.py. - DerivedSource before run() → raises ValueError (no computed records). - Join requires non-overlapping data columns; raises InputValidationError on collision. - MergeJoin requires colliding columns to have identical types; merges into sorted list[T]. -- Operators predict output schema (including system tag names) without computation. +- Operators predict output schema (including system key names) without computation. diff --git a/CHANGELOG.md b/CHANGELOG.md index e0d699e8..f8eece80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,37 @@ ### Breaking Changes +#### `tag` → `key` rename (hard break) + +All identifiers containing `tag`/`tags`/`Tag` have been renamed to +`key`/`keys`/`Key`. No deprecation aliases. Pre-v0.1 artifacts will not load. 
+ +| Old name | New name | +|---|---| +| `Tag` | `Key` | +| `TagProtocol` | `KeyProtocol` | +| `TagValue` | `KeyValue` | +| `DuplicateTagError` | `DuplicateKeyError` | +| `SelectTagColumns` | `SelectKeyColumns` | +| `DropTagColumns` | `DropKeyColumns` | +| `MapTags` | `MapKeys` | +| `system_tags()` | `system_keys()` | +| `map_tags()` | `map_keys()` | +| `select_tag_columns()` | `select_key_columns()` | +| `drop_tag_columns()` | `drop_key_columns()` | +| `sort_by_tags` | `sort_by_keys` | +| `SYSTEM_TAG_PREFIX` | `SYSTEM_KEY_PREFIX` | +| `SYSTEM_TAG_PREFIX_NAME` (`"tag"`) | `SYSTEM_KEY_PREFIX_NAME` (`"key"`) | +| `SYSTEM_TAG_SOURCE_ID_PREFIX` | `SYSTEM_KEY_SOURCE_ID_PREFIX` | +| `SYSTEM_TAG_RECORD_ID_PREFIX` | `SYSTEM_KEY_RECORD_ID_PREFIX` | +| `SYSTEM_TAG_SOURCE_ID_FIELD` | `SYSTEM_KEY_SOURCE_ID_FIELD` | +| `SYSTEM_TAG_RECORD_ID_FIELD` | `SYSTEM_KEY_RECORD_ID_FIELD` | +| `ColumnConfig(system_tags=...)` | `ColumnConfig(system_keys=...)` | +| Column prefix `_tag_` | `_key_` (e.g. `_tag_source_id` → `_key_source_id`) | +| Column prefix `_tag::` | `_key::` (e.g. 
`_tag::source:abc` → `_key::source:abc`) | +| `src/orcapod/core/datagrams/tag_data.py` | `key_data.py` | +| `test-objective/unit/test_tag.py` | `test_key.py` | + #### `packets` → `data` rename (hard break) All identifiers containing `packet`/`packets`/`Packet` have been renamed to diff --git a/CLAUDE.md b/CLAUDE.md index bcfdb6b8..ba3711a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -149,13 +149,13 @@ Examples: src/orcapod/ ├── types.py # Schema, ColumnConfig, ContentHash ├── system_constants.py # Column prefixes and separators -├── errors.py # InputValidationError, DuplicateTagError, FieldNotResolvableError +├── errors.py # InputValidationError, DuplicateKeyError, FieldNotResolvableError ├── config.py # Config dataclass ├── contexts/ # DataContext (semantic_hasher, arrow_hasher, type_converter) ├── protocols/ │ ├── hashing_protocols.py # PipelineElementProtocol, ContentIdentifiableProtocol │ └── core_protocols/ # StreamProtocol, PodProtocol, SourceProtocol, -│ # DataFunctionProtocol, DatagramProtocol, TagProtocol, +│ # DataFunctionProtocol, DatagramProtocol, KeyProtocol, │ # DataProtocol, TrackerProtocol ├── core/ │ ├── base.py # ContentIdentifiableBase, PipelineElementBase, TraceableBase @@ -166,7 +166,7 @@ src/orcapod/ │ ├── tracker.py # Invocation tracking │ ├── datagrams/ │ │ ├── datagram.py # Datagram (unified dict/Arrow backing, lazy conversion) -│ │ └── tag_data.py # Tag (+ system tags), Data (+ source info) +│ │ └── key_data.py # Key (+ system keys), Data (+ source info) │ ├── sources/ │ │ ├── base.py # RootSource (abstract, no upstream) │ │ ├── arrow_table_source.py # Core source — all other sources delegate to it @@ -183,15 +183,15 @@ src/orcapod/ │ ├── merge_join.py # MergeJoin (binary, colliding cols → sorted list[T]) │ ├── semijoin.py # SemiJoin (binary, non-commutative) │ ├── batch.py # Batch (group rows, types become list[T]) -│ ├── column_selection.py # Select/Drop Tag/Data columns -│ ├── mappers.py # MapTags, MapData (rename columns) +│ ├── 
column_selection.py # Select/Drop Key/Data columns +│ ├── mappers.py # MapKeys, MapData (rename columns) │ └── filters.py # PolarsFilter ├── hashing/ │ └── semantic_hashing/ # BaseSemanticHasher, type handlers ├── semantic_types/ # Type conversion (Python ↔ Arrow) ├── databases/ # ArrowDatabaseProtocol implementations (Delta Lake, in-memory) └── utils/ - ├── arrow_data_utils.py # System tag manipulation, source info, column helpers + ├── arrow_data_utils.py # System key manipulation, source info, column helpers ├── arrow_utils.py # Arrow table utilities ├── schema_utils.py # Schema extraction, union, intersection, compatibility └── lazy_module.py # LazyModule for deferred heavy imports @@ -221,26 +221,26 @@ See `orcapod-design.md` at the project root for the full design specification. RootSource → ArrowTableStream → [Operator / FunctionPod] → ArrowTableStream → ... ``` -Every stream is an immutable sequence of (Tag, Data) pairs backed by a PyArrow Table. -Tag columns are join keys and metadata; data columns are the data payload. +Every stream is an immutable sequence of (Key, Data) pairs backed by a PyArrow Table. +Key columns are join keys and metadata; data columns are the data payload. ### Core abstractions **Datagram** (`core/datagrams/datagram.py`) — immutable data container with lazy dict ↔ Arrow conversion. Two specializations: -- **Tag** — metadata columns + hidden system tag columns for provenance tracking +- **Key** — metadata columns + hidden system key columns for provenance tracking - **Data** — data columns + per-column source info provenance tokens -**Stream** (`core/streams/arrow_table_stream.py`) — immutable (Tag, Data) sequence. +**Stream** (`core/streams/arrow_table_stream.py`) — immutable (Key, Data) sequence. Key methods: `output_schema()`, `keys()`, `iter_data()`, `as_table()`. **Source** (`core/sources/`) — produces a stream from external data. 
`ArrowTableSource` is the core implementation; CSV/Delta/DataFrame/Dict/List sources all delegate to it internally. Each -source adds source-info columns and a system tag column. `DerivedSource` wraps a +source adds source-info columns and a system key column. `DerivedSource` wraps a FunctionNode/OperatorNode's DB records as a new source. **Function Pod** (`core/function_pod.py`) — wraps a `DataFunction` that transforms individual -data. Never inspects tags. Two execution models: +data. Never inspects keys. Two execution models: - `FunctionPod` → `FunctionPodStream`: lazy, in-memory - `FunctionNode`: DB-backed, two-phase (yield cached results first, then compute missing) @@ -258,7 +258,7 @@ FunctionNode. | | Operator | Function Pod | |---|---|---| | Inspects data content | Never | Yes | -| Inspects / uses tags | Yes | No | +| Inspects / uses keys | Yes | No | | Can rename columns | Yes | No | | Synthesizes new values | No | Yes | | Stream arity | Configurable | Single in, single out | @@ -272,7 +272,7 @@ Every pipeline element has two parallel hashes: 2. **`pipeline_hash()`** — schema + topology only. Ignores data content. Used for DB path scoping so that different sources with identical schemas share database tables. -Base case: `RootSource.pipeline_identity_structure()` returns `(tag_schema, data_schema)`. +Base case: `RootSource.pipeline_identity_structure()` returns `(key_schema, data_schema)`. Each downstream node's pipeline hash commits to its own identity plus the pipeline hashes of its upstreams, forming a Merkle chain. 
@@ -285,28 +285,28 @@ The pipeline hash uses a **resolver pattern** — `PipelineElementProtocol` obje |--------|---------|---------|---------------| | `__` | System metadata | `__data_id`, `__pod_version` | `ColumnConfig(meta=True)` | | `_source_` | Source info provenance | `_source_age` | `ColumnConfig(source=True)` | -| `_tag::` | System tag | `_tag::source:abc123` | `ColumnConfig(system_tags=True)` | +| `_key::` | System key | `_key::source:abc123` | `ColumnConfig(system_keys=True)` | | `_context_key` | Data context | `_context_key` | `ColumnConfig(context=True)` | Prefixes are computed from `SystemConstant` in `system_constants.py`. The `constants` singleton (with no global prefix) is used throughout. -### System tag evolution rules +### System key evolution rules 1. **Name-preserving** — single-stream ops (filter, select, map). Column name and value pass through unchanged. -2. **Name-extending** — multi-input ops (join, merge join). Each input's system tag column +2. **Name-extending** — multi-input ops (join, merge join). Each input's system key column name gets `::{pipeline_hash}:{canonical_position}` appended. Commutative operators - canonically order inputs by `pipeline_hash` and sort system tag values per row. + canonically order inputs by `pipeline_hash` and sort system key values per row. 3. **Type-evolving** — aggregation ops (batch). Column type changes from `str` to `list[str]`. ### Schema types and ColumnConfig `Schema` (`types.py`) — immutable `Mapping[str, DataType]` with `optional_fields` support. -`output_schema()` always returns `(tag_schema, data_schema)` as a tuple of Schemas. +`output_schema()` always returns `(key_schema, data_schema)` as a tuple of Schemas. `ColumnConfig` (`types.py`) — frozen dataclass controlling which column groups are included. -Fields: `meta`, `context`, `source`, `system_tags`, `content_hash`, `sort_by_tags`. +Fields: `meta`, `context`, `source`, `system_keys`, `content_hash`, `sort_by_keys`. 
Normalize via `ColumnConfig.handle_config(columns, all_info)` at the top of `output_schema()` and `as_table()` methods. `all_info=True` sets everything to True. @@ -323,7 +323,7 @@ and `as_table()` methods. `all_info=True` sets everything to True. ### Important implementation details -- `ArrowTableSource.__init__` raises `ValueError` if any `tag_columns` are not in the table. +- `ArrowTableSource.__init__` raises `ValueError` if any `key_columns` are not in the table. - `ArrowTableStream` requires at least one data column; raises `ValueError` otherwise. - `FunctionNode.iter_data()` Phase 1 returns ALL records in the shared `pipeline_path` DB table (not filtered to current inputs). Phase 2 skips inputs whose hash is already @@ -333,5 +333,5 @@ and `as_table()` methods. `all_info=True` sets everything to True. - Join requires non-overlapping data columns; raises `InputValidationError` on collision. - MergeJoin requires colliding data columns to have identical types; merges into sorted `list[T]` with source columns reordered to match. -- Operators predict their output schema (including system tag column names) without +- Operators predict their output schema (including system key column names) without performing the actual computation. diff --git a/DESIGN_ISSUES.md b/DESIGN_ISSUES.md index 1fc0ae20..b1c4f839 100644 --- a/DESIGN_ISSUES.md +++ b/DESIGN_ISSUES.md @@ -30,7 +30,7 @@ gaps rather than intentional choices: Note: merging into `TraceableBase` is correct at the *computation-node* level. `ContentIdentifiableBase` (which `TraceableBase` builds on) should **not** absorb -`PipelineElementBase` — data datagrams (`Tag`, `Data`) are legitimately content-identifiable +`PipelineElementBase` — data datagrams (`Key`, `Data`) are legitimately content-identifiable without being pipeline elements. **Fix:** Added `PipelineElementBase` to `TraceableBase`'s bases. Added @@ -193,7 +193,7 @@ duplication that diverges silently over time. 
### F5 — `FunctionPodStream` and `FunctionPodNodeStream` are near-identical copy-pastes **Status:** open **Severity:** medium -`iter_data`, `as_table` (including content_hash and sort_by_tags logic), `keys`, +`iter_data`, `as_table` (including content_hash and sort_by_keys logic), `keys`, `output_schema`, `source`, and `upstreams` are duplicated almost line-for-line. The only behavioural differences are: - `FunctionPodNodeStream` has `refresh_cache()` @@ -219,7 +219,7 @@ is `self`. **Severity:** medium The method checks for an existing record with `get_record_by_id` and skips insertion if found. But it then calls `add_record(..., skip_duplicates=False)`, which will raise on a duplicate. A -race between the lookup and the insert (e.g. two concurrent processes handling the same tag+data) +race between the lookup and the insert (e.g. two concurrent processes handling the same key+data) would cause a crash instead of a graceful skip. Should use `skip_duplicates=True` for consistency with the intent. @@ -251,22 +251,22 @@ Fix: change `ValueError` to `InputValidationError`. --- -### F12 — System tag columns excluded from cache entry ID +### F12 — System key columns excluded from cache entry ID **Status:** open **Severity:** high -`FunctionPodNode.record_data_for_cache()` (line ~1077) builds a tag table for entry-ID -computation but excludes system tag columns: +`FunctionPodNode.record_data_for_cache()` (line ~1077) builds a key table for entry-ID +computation but excludes system key columns: ```python -# TODO: add system tag columns +# TODO: add system key columns ``` -Two data with identical user tags but different provenance (arriving from different -pipeline branches, thus having different system tags) produce the same cache key. This can +Two data with identical user keys but different provenance (arriving from different +pipeline branches, thus having different system keys) produce the same cache key. 
This can cause cache collisions where a result computed for one pipeline branch is returned for another. -Fix: include system tag columns in the `tag_with_hash` table before computing the entry ID hash. +Fix: include system key columns in the `key_with_hash` table before computing the entry ID hash. --- @@ -289,7 +289,7 @@ incomplete schema, inconsistent with `as_table()` which does include source colu **Status:** open **Severity:** medium -`as_table()` (line ~568) converts Arrow → Polars → sort → Arrow when sorting by tags: +`as_table()` (line ~568) converts Arrow → Polars → sort → Arrow when sorting by keys: ```python # TODO: reimplement using polars natively ``` @@ -307,11 +307,11 @@ even when results are already stored in the result/pipeline databases. This def of the two-database design (result DB + pipeline DB) used to cache computed outputs. **Fix:** Refactored `iter_data` to first call `FunctionPodNode.get_all_records(columns={"meta": True})` -to load already-computed (tag, output-data) pairs from the databases (mirroring the legacy +to load already-computed (key, output-data) pairs from the databases (mirroring the legacy `PodNodeStream` design), yield those via `TableStream`, then collect the set of already-processed `INPUT_PACKET_HASH` values and only call `process_data` for input data not yet in the DB. Also added `FunctionPodNode.get_all_records(columns, all_info)` using `ColumnConfig` to control -which column groups (meta, source, system_tags) are returned. +which column groups (meta, source, system_keys) are returned. --- @@ -375,7 +375,7 @@ Delegating sources make this worse: - `DeltaTableSource` sets `source_name = resolved.name` but never sets `source_id` → same issue Additionally, delegating sources all return `self._arrow_source.identity_structure()` which is -`("ArrowTableSource", tag_columns, table_hash)`. This means the outer source type (CSV, Delta, +`("ArrowTableSource", key_columns, table_hash)`. 
This means the outer source type (CSV, Delta, etc.) is invisible to the content hash, and `source_id` (defaulting to content hash) will be identical for a CSVSource and an ArrowTableSource with the same data. @@ -391,9 +391,9 @@ Added `computed_label()` to `RootSource` returning `_explicit_source_id`. **Status:** resolved **Severity:** high Both `FunctionPodStream.as_table()` and `FunctionPodNodeStream.as_table()` unconditionally call -`.drop([constants.CONTEXT_KEY])` on the tags table built from the accumulated data. When the +`.drop([constants.CONTEXT_KEY])` on the keys table built from the accumulated data. When the stream is empty (e.g. because the data function is inactive), `iter_data()` yields nothing, -`tag_schema` stays `None`, and `pa.Table.from_pylist([], schema=None)` produces a zero-column +`key_schema` stays `None`, and `pa.Table.from_pylist([], schema=None)` produces a zero-column table. The subsequent `.drop([constants.CONTEXT_KEY])` then raises `KeyError` because the column does not exist. @@ -439,7 +439,7 @@ Relevant for future streaming/chunked processing of large datasets. **Status:** open **Severity:** medium -`SelectTagColumns`, `SelectDataColumns`, `DropTagColumns`, `DropDataColumns` (in +`SelectKeyColumns`, `SelectDataColumns`, `DropKeyColumns`, `DropDataColumns` (in `column_selection.py:58`, `137`, `214`, `292`) and `PolarsFilterByDataColumns` (`filters.py:135`) each have near-identical `validate_unary_input()` implementations. All are marked: @@ -447,7 +447,7 @@ marked: # TODO: remove redundant logic ``` -The only difference between them is which key set (tag vs. data) is checked and the error +The only difference between them is which set of column names (key vs. data) is checked and the error message text. A shared parameterized validation helper would eliminate the duplication. 
--- @@ -469,14 +469,14 @@ Three categories of improvement are planned: independently: - ~~`PolarsFilter` — evaluate predicate per row, emit or drop immediately~~ (kept barrier: Polars expressions require DataFrame context for evaluation) - - `MapTags` / `MapData` — rename columns per row, emit immediately ✅ - - `SelectTagColumns` / `SelectDataColumns` — project columns per row, emit immediately ✅ - - `DropTagColumns` / `DropDataColumns` — drop columns per row, emit immediately ✅ + - `MapKeys` / `MapData` — rename columns per row, emit immediately ✅ + - `SelectKeyColumns` / `SelectDataColumns` — project columns per row, emit immediately ✅ + - `DropKeyColumns` / `DropDataColumns` — drop columns per row, emit immediately ✅ 2. **Incremental overrides (stateful, eager emit)** — for multi-input operators that can produce partial results before all inputs are consumed: - `Join` — symmetric hash join for 2 inputs (streaming, with correct - system-tag name-extending via `input_pipeline_hashes` passed directly + system-key name-extending via `input_pipeline_hashes` passed directly to `async_execute`); barrier fallback for N>2 inputs via `static_process`. ✅ - `MergeJoin` — kept barrier: complex column-merging logic - `SemiJoin` — build right, stream left through hash lookup ✅ @@ -487,7 +487,7 @@ Three categories of improvement are planned: **Remaining:** `PolarsFilter` (barrier), `MergeJoin` (barrier) could receive incremental overrides in the future but require careful handling of Polars expression evaluation and -system-tag evolution respectively. +system-key evolution respectively. --- @@ -509,10 +509,10 @@ A naïve decomposition into `FunctionPod + Join` works but has unnecessary overh 1. **Materialization waste** — FunctionPod produces an intermediate stream that is only created to be immediately joined back. AddResult can compute new columns and merge them into the original data in a single pass, with no intermediate stream. -2. 
**Redundant tag matching** — Join must re-match tags that trivially correspond (they came - from the same input row). AddResult already holds the (tag, data) pair and can skip the +2. **Redundant key matching** — Join must re-match keys that trivially correspond (they came + from the same input row). AddResult already holds the (key, data) pair and can skip the matching entirely. -3. **Simpler async path** — streams row-by-row like FunctionPod: read (tag, data), call +3. **Simpler async path** — streams row-by-row like FunctionPod: read (key, data), call the data function, merge original data columns + new columns, emit. No broadcast, passthrough channel, or rejoin wiring needed. @@ -561,7 +561,7 @@ await AddResult(grade_pf).async_execute([input_ch], output_ch) #### Implementation notes -- `output_schema()` returns `(input_tag_schema, input_data_schema | function_output_schema)` +- `output_schema()` returns `(input_key_schema, input_data_schema | function_output_schema)` — the union of original data columns and new computed columns. - Must raise `InputValidationError` if function output keys collide with existing data column names (same constraint as Join on overlapping data columns). diff --git a/README.md b/README.md index 39df8081..8bb5bd19 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Orcapod's Python library for developing reproducbile scientific pipelines. ## Releasing -To cut a release, tag a commit on `main` — `hatch-vcs` derives the version +To cut a release, tag a commit on `main` — `hatch-vcs` derives the version automatically and CI publishes to PyPI. See [RELEASING.md](RELEASING.md) for the full workflow. 
@@ -112,24 +112,24 @@ While the following is subject to change based on future development, it represe │ ctx_observer = obs.contextualize(node_hash, node_label) │ ctx_observer.on_node_start(node_label, node_hash) │ - │ for each non-cached (tag, data): + │ for each non-cached (key, data): │ - │ ctx_observer.on_data_start(node_label, tag, data) + │ ctx_observer.on_data_start(node_label, key, data) │ - ├─► pkt_logger = ctx_observer.create_data_logger(tag, data, pipeline_path=...) + ├─► pkt_logger = ctx_observer.create_data_logger(key, data, pipeline_path=...) │ │ │ └─► _ContextualizedLoggingObserver creates a DataLogger bound to - │ (run_id, node_label, node_hash, tag_data, log_path) + │ (run_id, node_label, node_hash, key_data, log_path) │ - ├─► FunctionNode._process_data_internal(tag, data, logger=pkt_logger) + ├─► FunctionNode._process_data_internal(key, data, logger=pkt_logger) │ │ - │ ├─► CachedFunctionPod.process_data(tag, data, logger=pkt_logger) + │ ├─► CachedFunctionPod.process_data(key, data, logger=pkt_logger) │ │ │ │ │ │ checks pod-level cache (ResultCache.lookup) - │ │ │ cache hit? → return (tag, cached_data) + │ │ │ cache hit? 
→ return (key, cached_data) │ │ │ cache miss ↓ │ │ │ - │ │ ├─► _FunctionPodBase.process_data(tag, data, logger=pkt_logger) + │ │ ├─► _FunctionPodBase.process_data(key, data, logger=pkt_logger) │ │ │ │ │ │ │ ├─► PythonDataFunction.call(data, logger=pkt_logger) │ │ │ │ │ @@ -168,27 +168,27 @@ While the following is subject to change based on future development, it represe │ │ │ │ │ │ │ │ │ └─► returns Data | None (or raises) │ │ │ │ - │ │ │ └─► returns (tag, Data | None) + │ │ │ └─► returns (key, Data | None) │ │ │ │ │ │ stores result in pod-level cache (on success) │ │ │ - │ │ └─► returns (tag, Data | None) + │ │ └─► returns (key, Data | None) │ │ │ │ writes pipeline provenance record (on success) │ │ caches result internally │ │ - │ └─► returns (tag, Data | None) + │ └─► returns (key, Data | None) │ - │ ← back in FunctionNode.execute() with (tag_out, result) + │ ← back in FunctionNode.execute() with (key_out, result) │ │ (logger.record already called inside the executor — nothing to do here) │ ├─► try/except around _process_data_internal: │ on success: - │ ctx_observer.on_data_end(node_label, tag, data, result, cached=False) - │ emit (tag_out, result) downstream + │ ctx_observer.on_data_end(node_label, key, data, result, cached=False) + │ emit (key_out, result) downstream │ on exception: - │ ctx_observer.on_data_crash(node_label, tag, data, exc) + │ ctx_observer.on_data_crash(node_label, key, data, exc) │ if error_policy == "fail_fast": raise │ otherwise: skip this data, continue │ @@ -204,7 +204,7 @@ While the following is subject to change based on future development, it represe Execution output columns (from **kwargs, prefixed with "`__`"): __stdout, __stderr, __python_logs, __traceback, __success (or any other fields the executor passes — protocol is generic) - Tag columns (unprefixed, from tag_data baked in at creation): + Key columns (unprefixed, from key_data baked in at creation): e.g.
"idx" → "0", "key" → "a" Writes the row to the database at the mirrored log path. @@ -222,4 +222,4 @@ Writes the row to the database at the mirrored log path. identify the node). - No auto-executor in the class — DataFunctionBase.__init__ does not assign a default executor. Pipeline.compile() assigns LocalExecutor to function nodes that have none. Users can override per-node (pipeline.node.executor = ...) or globally via pipeline.run(execution_engine=...). - - Log columns use __ prefix — fixed columns (__log_id, __stdout, __success, etc.) are prefixed to avoid collision with user-defined tag column names. \ No newline at end of file + - Log columns use __ prefix — fixed columns (__log_id, __stdout, __success, etc.) are prefixed to avoid collision with user-defined key column names. \ No newline at end of file diff --git a/RELEASING.md b/RELEASING.md index 2f6606eb..a6f4c6c0 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -12,18 +12,18 @@ This document describes how to cut a release of `orcapod` to PyPI. 1. **Merge your branch into `main`** — open a PR, get it reviewed, merge it. -2. **Tag the commit on `main`** — the version is derived automatically from the git - tag by `hatch-vcs` (`dynamic = ["version"]` in `pyproject.toml`). No manual +2. **Tag the commit on `main`** — the version is derived automatically from the git + tag by `hatch-vcs` (`dynamic = ["version"]` in `pyproject.toml`). No manual version bump is needed. ```bash git checkout main git pull origin main - git tag v0.1.0 # or v0.1.0rc1 for a pre-release + git tag v0.1.0 # or v0.1.0rc1 for a pre-release git push origin v0.1.0 ``` -3. **CI takes over** — pushing the tag triggers the publish workflow +3.
**CI takes over** — pushing the tag triggers the publish workflow (`.github/workflows/publish.yml`): ``` @@ -40,9 +40,9 @@ the stable vs pre-release distinction natively: - `pip install orcapod` — installs the latest **stable** release only - `pip install --pre orcapod` — installs the latest release including pre-releases -## Tag Format +## Tag Format -| Release type | Tag format | Example | +| Release type | Tag format | Example | |-------------|------------|---------| | Stable | `vMAJOR.MINOR.PATCH` | `v0.1.0` | | Release candidate | `vMAJOR.MINOR.PATCHrcN` | `v0.1.0rc1` | diff --git a/TESTING_PLAN.md b/TESTING_PLAN.md index e43b6d90..6d250247 100644 --- a/TESTING_PLAN.md +++ b/TESTING_PLAN.md @@ -29,7 +29,7 @@ test-objective/ │ ├── __init__.py │ ├── test_types.py # Schema, ColumnConfig, ContentHash │ ├── test_datagram.py # Datagram core behavior -│ ├── test_tag.py # Tag (system tags, ColumnConfig filtering) +│ ├── test_key.py # Key (system keys, ColumnConfig filtering) │ ├── test_data.py # Data (source info, provenance) │ ├── test_stream.py # ArrowTableStream construction & iteration │ ├── test_sources.py # All source types + error conditions @@ -42,7 +42,7 @@ test-objective/ │ ├── test_databases.py # InMemory, DeltaLake, NoOp databases │ ├── test_schema_utils.py # Schema extraction, union, intersection │ ├── test_arrow_utils.py # Arrow table/schema utilities -│ ├── test_arrow_data_utils.py # System tags, source info, column helpers +│ ├── test_arrow_data_utils.py # System keys, source info, column helpers │ ├── test_semantic_types.py # UniversalTypeConverter, SemanticTypeRegistry │ ├── test_contexts.py # DataContext resolution, validation │ ├── test_tracker.py # BasicTrackerManager, GraphTracker @@ -52,7 +52,7 @@ test-objective/ │ ├── test_pipeline_flows.py # End-to-end pipeline scenarios │ ├── test_caching_flows.py # DB-backed caching (FunctionNode, OperatorNode) │ ├── test_hash_invariants.py # Hash stability & Merkle chain properties -│ ├── test_provenance.py
# System tag lineage through pipelines +│ ├── test_provenance.py # System key lineage through pipelines │ └── test_column_config_filtering.py # ColumnConfig behavior across all components └── property/ ├── __init__.py @@ -156,19 +156,19 @@ test-objective/ - `test_datagram_content_hash_changes_with_data` — different data → different hash - `test_datagram_equality_by_content` — equal content → equal datagrams -### 3. `test_tag.py` — Tag +### 3. `test_key.py` — Key -- `test_tag_construction_with_system_tags` — system tags stored separately from data -- `test_tag_system_tags_excluded_from_default_keys` — keys() doesn't show system tags -- `test_tag_system_tags_included_with_column_config` — keys(columns={"system_tags": True}) shows them -- `test_tag_as_dict_excludes_system_tags_by_default` — as_dict() only has data -- `test_tag_as_dict_all_info_includes_system_tags` — as_dict(all_info=True) has everything -- `test_tag_as_table_excludes_system_tags_by_default` -- `test_tag_as_table_all_info_includes_system_tags` -- `test_tag_schema_excludes_system_tags_by_default` -- `test_tag_copy_preserves_system_tags` — copy() includes system tags -- `test_tag_as_datagram_conversion` — as_datagram() returns Datagram (not Tag) -- `test_tag_system_tags_method_returns_copy` — system_tags() returns dict copy, not reference +- `test_key_construction_with_system_keys` — system keys stored separately from data +- `test_key_system_keys_excluded_from_default_keys` — keys() doesn't show system keys +- `test_key_system_keys_included_with_column_config` — keys(columns={"system_keys": True}) shows them +- `test_key_as_dict_excludes_system_keys_by_default` — as_dict() only has data +- `test_key_as_dict_all_info_includes_system_keys` — as_dict(all_info=True) has everything +- `test_key_as_table_excludes_system_keys_by_default` +- `test_key_as_table_all_info_includes_system_keys` +- `test_key_schema_excludes_system_keys_by_default` +- `test_key_copy_preserves_system_keys` — copy() includes system 
keys +- `test_key_as_datagram_conversion` — as_datagram() returns Datagram (not Key) +- `test_key_system_keys_method_returns_copy` — system_keys() returns dict copy, not reference ### 4. `test_data.py` — Data @@ -186,23 +186,23 @@ test-objective/ ### 5. `test_stream.py` — ArrowTableStream **Construction:** -- `test_stream_from_table_with_tag_columns` — tag/data column separation +- `test_stream_from_table_with_key_columns` — key/data column separation - `test_stream_requires_at_least_one_data_column` — ValueError if no data columns -- `test_stream_with_system_tag_columns` — system tag columns tracked +- `test_stream_with_system_key_columns` — system key columns tracked - `test_stream_with_source_info` — source info attached to data columns - `test_stream_with_producer` — producer property set - `test_stream_with_upstreams` — upstreams tuple set **Schema & Keys:** -- `test_stream_keys_returns_tag_and_data_keys` — tuple of (tag_keys, data_keys) -- `test_stream_output_schema_returns_two_schemas` — (tag_schema, data_schema) +- `test_stream_keys_returns_key_and_data_keys` — tuple of (key_keys, data_keys) +- `test_stream_output_schema_returns_two_schemas` — (key_schema, data_schema) - `test_stream_schema_matches_actual_data` — output_schema() types match as_table() types - `test_stream_keys_with_column_config` — ColumnConfig filtering works **Iteration:** -- `test_stream_iter_data_yields_tag_data_pairs` — each yield is (Tag, Data) +- `test_stream_iter_data_yields_key_data_pairs` — each yield is (Key, Data) - `test_stream_iter_data_count_matches_rows` — number of yields = number of rows -- `test_stream_iter_data_tag_keys_correct` — tag column names match +- `test_stream_iter_data_key_keys_correct` — key column names match - `test_stream_iter_data_data_keys_correct` — data column names match - `test_stream_as_table_matches_iter_data` — table materialization consistent with iteration @@ -219,22 +219,22 @@ test-objective/ **ArrowTableSource:** - 
`test_arrow_source_from_valid_table` — normal construction succeeds - `test_arrow_source_empty_table_raises` — ValueError("Table is empty") -- `test_arrow_source_missing_tag_column_raises` — ValueError if tag_columns not in table -- `test_arrow_source_adds_system_tag_column` — system tag column added automatically +- `test_arrow_source_missing_key_column_raises` — ValueError if key_columns not in table +- `test_arrow_source_adds_system_key_column` — system key column added automatically - `test_arrow_source_adds_source_info_columns` — _source_ columns added - `test_arrow_source_source_id_set` — source_id property populated - `test_arrow_source_producer_is_none` — root sources have no producer - `test_arrow_source_upstreams_empty` — root sources have no upstreams - `test_arrow_source_resolve_field_by_record_id` — resolves field value - `test_arrow_source_resolve_field_missing_raises` — FieldNotResolvableError -- `test_arrow_source_pipeline_identity_structure` — returns (tag_schema, data_schema) +- `test_arrow_source_pipeline_identity_structure` — returns (key_schema, data_schema) - `test_arrow_source_iter_data_yields_correct_pairs` - `test_arrow_source_as_table_has_all_columns` **DictSource:** - `test_dict_source_from_dict_of_lists` — constructs correctly - `test_dict_source_delegates_to_arrow_table_source` — same behavior as ArrowTableSource -- `test_dict_source_with_tag_columns` +- `test_dict_source_with_key_columns` **ListSource:** - `test_list_source_from_list_of_dicts` — constructs correctly @@ -242,7 +242,7 @@ test-objective/ **CSVSource:** - `test_csv_source_from_file` — reads CSV correctly -- `test_csv_source_with_tag_columns` +- `test_csv_source_with_key_columns` **DataFrameSource:** - `test_dataframe_source_from_polars` — constructs from Polars DataFrame @@ -304,7 +304,7 @@ test-objective/ - `test_function_pod_validate_inputs_multiple_raises` — rejects multiple streams - `test_function_pod_output_schema_prediction` — output_schema() matches actual output - 
`test_function_pod_callable_alias` — __call__ same as process() -- `test_function_pod_never_modifies_tags` — tags pass through unchanged +- `test_function_pod_never_modifies_keys` — keys pass through unchanged - `test_function_pod_transforms_data` — data are transformed by function **FunctionPodStream:** @@ -322,55 +322,55 @@ test-objective/ ### 10. `test_operators.py` — All Operators **Join (N-ary, commutative):** -- `test_join_two_streams_on_common_tags` — inner join on shared tag columns +- `test_join_two_streams_on_common_keys` — inner join on shared key columns - `test_join_non_overlapping_data_columns_required` — InputValidationError on collision - `test_join_commutative` — join(A, B) == join(B, A) (same rows regardless of order) - `test_join_three_or_more_streams` — N-ary join works -- `test_join_empty_result_when_no_matches` — disjoint tags → empty stream -- `test_join_system_tag_name_extending` — system tag columns get ::pipeline_hash:position suffix -- `test_join_system_tag_values_sorted_for_commutativity` — canonical ordering of tag values +- `test_join_empty_result_when_no_matches` — disjoint keys → empty stream +- `test_join_system_key_name_extending` — system key columns get ::pipeline_hash:position suffix +- `test_join_system_key_values_sorted_for_commutativity` — canonical ordering of key values - `test_join_output_schema_prediction` — output_schema() matches actual output **MergeJoin (binary):** - `test_merge_join_colliding_columns_become_sorted_lists` — same-name data cols → list[T] - `test_merge_join_requires_identical_types` — different types raise error - `test_merge_join_non_colliding_columns_pass_through` — unmatched columns kept as-is -- `test_merge_join_system_tag_name_extending` +- `test_merge_join_system_key_name_extending` - `test_merge_join_output_schema_prediction` — predicts list[T] types correctly **SemiJoin (binary, non-commutative):** -- `test_semijoin_filters_left_by_right_tags` — keeps left rows matching right tags +- 
`test_semijoin_filters_left_by_right_keys` — keeps left rows matching right keys - `test_semijoin_non_commutative` — semijoin(A, B) != semijoin(B, A) in general - `test_semijoin_preserves_left_data_columns` — right data columns dropped -- `test_semijoin_system_tag_name_extending` +- `test_semijoin_system_key_name_extending` **Batch:** -- `test_batch_groups_rows` — groups rows by tag, aggregates data +- `test_batch_groups_rows` — groups rows by key, aggregates data - `test_batch_types_become_lists` — data column types become list[T] -- `test_batch_system_tag_type_evolving` — system tag type becomes list[str] +- `test_batch_system_key_type_evolving` — system key type becomes list[str] - `test_batch_with_batch_size` — batch_size limits group size - `test_batch_drop_partial_batch` — drop_partial_batch=True drops incomplete groups - `test_batch_output_schema_prediction` — predicts list[T] types -**Column Selection (Select/Drop Tag/Data):** -- `test_select_tag_columns` — keeps only specified tag columns -- `test_select_tag_columns_strict_missing_raises` — strict=True raises on missing column +**Column Selection (Select/Drop Key/Data):** +- `test_select_key_columns` — keeps only specified key columns +- `test_select_key_columns_strict_missing_raises` — strict=True raises on missing column - `test_select_data_columns` — keeps only specified data columns -- `test_drop_tag_columns` — removes specified tag columns +- `test_drop_key_columns` — removes specified key columns - `test_drop_data_columns` — removes specified data columns -- `test_column_selection_system_tag_name_preserving` — system tags unchanged +- `test_column_selection_system_key_name_preserving` — system keys unchanged -**MapTags/MapData:** -- `test_map_tags_renames_tag_columns` — renames specified tag columns -- `test_map_tags_drop_unmapped` — drop_unmapped=True removes unrenamed columns +**MapKeys/MapData:** +- `test_map_keys_renames_key_columns` — renames specified key columns +- 
`test_map_keys_drop_unmapped` — drop_unmapped=True removes unrenamed columns - `test_map_data_renames_data_columns` -- `test_map_preserves_system_tags` — system tag columns unchanged (name-preserving) +- `test_map_preserves_system_keys` — system key columns unchanged (name-preserving) **PolarsFilter:** - `test_polars_filter_with_predicate` — filters rows matching predicate - `test_polars_filter_with_constraints` — filters by column=value constraints - `test_polars_filter_preserves_schema` — output schema same as input -- `test_polars_filter_system_tag_name_preserving` +- `test_polars_filter_system_key_name_preserving` **Operator Base Classes:** - `test_unary_operator_rejects_multiple_inputs` — validate_inputs raises for >1 stream @@ -381,7 +381,7 @@ test-objective/ **FunctionNode:** - `test_function_node_iter_data` — iterates and transforms all data -- `test_function_node_process_data` — transforms single (tag, data) pair +- `test_function_node_process_data` — transforms single (key, data) pair - `test_function_node_producer_is_function_pod` - `test_function_node_upstreams` - `test_function_node_clear_cache` @@ -486,13 +486,13 @@ test-objective/ - `test_check_arrow_schema_compatibility` — compatible schemas pass - `test_split_by_column_groups` — splits table into multiple tables -### 16. `test_arrow_data_utils.py` — System Tags & Source Info +### 16. 
`test_arrow_data_utils.py` — System Keys & Source Info -- `test_add_system_tag_columns` — adds _tag:: prefixed columns -- `test_add_system_tag_columns_empty_table_raises` — ValueError -- `test_add_system_tag_columns_length_mismatch_raises` — ValueError -- `test_append_to_system_tags` — extends existing system tag values -- `test_sort_system_tag_values` — canonical sorting for commutativity +- `test_add_system_key_columns` — adds _key:: prefixed columns +- `test_add_system_key_columns_empty_table_raises` — ValueError +- `test_add_system_key_columns_length_mismatch_raises` — ValueError +- `test_append_to_system_keys` — extends existing system key values +- `test_sort_system_key_values` — canonical sorting for commutativity - `test_add_source_info` — adds _source_ prefixed columns - `test_drop_columns_with_prefix` — removes columns matching prefix - `test_drop_system_columns` — removes __ and __ prefixed columns @@ -542,7 +542,7 @@ test-objective/ - `test_source_to_stream_to_single_operator` — Source → Filter → Stream - `test_source_to_function_pod` — Source → FunctionPod → Stream with transformed data - `test_multi_source_join` — Two sources → Join → Stream with combined data -- `test_chained_operators` — Source → Filter → Select → MapTags → Stream +- `test_chained_operators` — Source → Filter → Select → MapKeys → Stream - `test_function_pod_then_operator` — Source → FunctionPod → Filter → Stream - `test_join_then_batch` — Two sources → Join → Batch → Stream - `test_semijoin_filters_correctly` — Source A semi-joined with Source B @@ -569,20 +569,20 @@ test-objective/ - `test_commutative_join_pipeline_hash_order_independent` — join(A,B) pipeline_hash == join(B,A) - `test_non_commutative_semijoin_pipeline_hash_order_dependent` — semijoin(A,B) != semijoin(B,A) -### `test_provenance.py` — System Tag Lineage Tracking +### `test_provenance.py` — System Key Lineage Tracking -- `test_source_creates_system_tag_column` — source adds _tag::source:hash column -- 
`test_unary_operator_preserves_system_tags` — filter/select/map: name+value unchanged -- `test_join_extends_system_tag_names` — multi-input: column names get ::hash:pos suffix -- `test_join_sorts_system_tag_values` — commutative ops sort tag values -- `test_batch_evolves_system_tag_type` — batch: str → list[str] +- `test_source_creates_system_key_column` — source adds _key::source:hash column +- `test_unary_operator_preserves_system_keys` — filter/select/map: name+value unchanged +- `test_join_extends_system_key_names` — multi-input: column names get ::hash:pos suffix +- `test_join_sorts_system_key_values` — commutative ops sort key values +- `test_batch_evolves_system_key_type` — batch: str → list[str] - `test_full_pipeline_provenance_chain` — source → join → filter → batch: all rules applied ### `test_column_config_filtering.py` — ColumnConfig Across All Components - `test_datagram_column_config_meta` — meta=True includes __ columns - `test_datagram_column_config_data_only` — all False = data columns only -- `test_tag_column_config_system_tags` — system_tags=True includes _tag:: columns +- `test_key_column_config_system_keys` — system_keys=True includes _key:: columns - `test_data_column_config_source` — source=True includes _source_ columns - `test_stream_column_config_all_info` — all_info=True on keys/output_schema/as_table - `test_stream_column_config_consistency` — keys(), output_schema(), as_table() all respect same config @@ -619,7 +619,7 @@ test-objective/ ### Recommended additions (not implemented in this PR, but suggested): 3. **Mutation testing** with `mutmut` — run `uv run mutmut run --paths-to-mutate=src/orcapod/ --tests-dir=test-objective/` to verify tests catch code mutations. A surviving mutant indicates a test gap -4. **Metamorphic testing** — "if I add a row to source A that matches source B's tags, the join output should have one more row" — tests relationships between inputs/outputs without knowing exact expected values +4. 
**Metamorphic testing** — "if I add a row to source A that matches source B's keys, the join output should have one more row" — tests relationships between inputs/outputs without knowing exact expected values 5. **Protocol conformance automation** — use `runtime_checkable` protocols and `isinstance` checks to verify every concrete class satisfies its protocol at import time 6. **Specification oracle** — for each documented behavior in `orcapod-design.md`, create a test that constructs the exact scenario described and verifies the documented outcome 7. **Fuzz testing** — feed malformed inputs (wrong types, extreme sizes, Unicode edge cases) to constructors and verify graceful error handling @@ -630,7 +630,7 @@ test-objective/ 1. **`conftest.py`** — shared fixtures (reusable sources, streams, data functions, databases) 2. **`unit/test_types.py`** — foundational types (Schema, ContentHash, ColumnConfig) -3. **`unit/test_datagram.py`**, **`test_tag.py`**, **`test_data.py`** — data containers +3. **`unit/test_datagram.py`**, **`test_key.py`**, **`test_data.py`** — data containers 4. **`unit/test_stream.py`** — stream construction and iteration 5. **`unit/test_sources.py`** + **`test_source_registry.py`** — all source types 6. **`unit/test_hashing.py`** — semantic hasher and handlers diff --git a/design/async-execution-implementation-plan.md b/design/async-execution-implementation-plan.md index 6b6dba23..390baec0 100644 --- a/design/async-execution-implementation-plan.md +++ b/design/async-execution-implementation-plan.md @@ -67,9 +67,9 @@ sync execution — this just makes every node async-capable. 
**New file:** `src/orcapod/core/execution/materialization.py` -- `materialize_to_stream(rows: list[tuple[TagProtocol, DataProtocol]]) -> ArrowTableStream` - — converts a list of (tag, data) pairs back into an ArrowTableStream -- `stream_to_rows(stream: StreamProtocol) -> list[tuple[TagProtocol, DataProtocol]]` +- `materialize_to_stream(rows: list[tuple[KeyProtocol, DataProtocol]]) -> ArrowTableStream` + — converts a list of (key, data) pairs back into an ArrowTableStream +- `stream_to_rows(stream: StreamProtocol) -> list[tuple[KeyProtocol, DataProtocol]]` — the inverse (thin wrapper around `iter_data`) **Tests:** `tests/test_core/test_execution/test_materialization.py` @@ -166,8 +166,8 @@ Each step is independent — can be done in any order or in parallel. **Modify:** `src/orcapod/core/operators/column_selection.py` -- Override `async_execute` on `SelectTagColumns`, `SelectDataColumns`, - `DropTagColumns`, `DropDataColumns` +- Override `async_execute` on `SelectKeyColumns`, `SelectDataColumns`, + `DropKeyColumns`, `DropDataColumns` - Each: iterate input, project/drop columns per row, emit **Tests:** `tests/test_core/test_execution/test_streaming_operators.py` @@ -178,7 +178,7 @@ Each step is independent — can be done in any order or in parallel. **Modify:** `src/orcapod/core/operators/mappers.py` -- Override `async_execute` on `MapTags`, `MapData` +- Override `async_execute` on `MapKeys`, `MapData` - Each: iterate input, rename columns per row, emit **Tests:** added to `test_streaming_operators.py` @@ -199,7 +199,7 @@ Each step is independent — can be done in any order or in parallel. 
- Override `async_execute` with symmetric hash join - Concurrent consumption of all inputs via TaskGroup - Per-row index probing and immediate emission -- System tag extension logic (reuse existing `_extend_system_tag_columns` logic) +- System key extension logic (reuse existing `_extend_system_key_columns` logic) **Tests:** `tests/test_core/test_execution/test_incremental_join.py` - Same result set as sync join (order may differ, compare as sets) @@ -312,7 +312,7 @@ Phase 5 depends on everything above. | Risk | Mitigation | |---|---| -| Row ordering differs between sync/async | Document clearly; `sort_by_tags` provides determinism | +| Row ordering differs between sync/async | Document clearly; `sort_by_keys` provides determinism | | Incremental Join correctness | Extensive property-based tests comparing to sync | | Deadlocks from channel misuse | Strict rule: every node MUST close output channel | | Per-row Datagram operations are slow | Benchmark; fall back to barrier if perf regresses | diff --git a/design/async-execution-system.md b/design/async-execution-system.md index e5e47470..0f2a3220 100644 --- a/design/async-execution-system.md +++ b/design/async-execution-system.md @@ -41,11 +41,11 @@ Every pipeline node — source, operator, or function pod — implements a singl class AsyncExecutableProtocol(Protocol): async def async_execute( self, - inputs: Sequence[ReadableChannel[tuple[TagProtocol, DataProtocol]]], - output: WritableChannel[tuple[TagProtocol, DataProtocol]], + inputs: Sequence[ReadableChannel[tuple[KeyProtocol, DataProtocol]]], + output: WritableChannel[tuple[KeyProtocol, DataProtocol]], ) -> None: """ - Consume (tag, data) pairs from input channels, produce to output channel. + Consume (key, data) pairs from input channels, produce to output channel. MUST close output channel when done (signals completion to downstream). """ ... @@ -128,31 +128,31 @@ in **when** the node reads, **how much** it buffers, and **when** it emits. ### 1. 
Streaming (Row-by-Row) -**Applies to:** Filter, MapTags, MapData, Select/Drop columns, FunctionPod +**Applies to:** Filter, MapKeys, MapData, Select/Drop columns, FunctionPod Zero buffering. Each input row is independently transformed and emitted immediately. ```python # Example: PolarsFilter async def async_execute(self, inputs, output): - async for tag, data in inputs[0]: - if self._evaluate_predicate(tag, data): - await output.send((tag, data)) + async for key, data in inputs[0]: + if self._evaluate_predicate(key, data): + await output.send((key, data)) await output.close() # Example: FunctionPod with concurrency control async def async_execute(self, inputs, output): sem = asyncio.Semaphore(self.node_config.max_concurrency or _INF) - async def process_one(tag, data): + async def process_one(key, data): async with sem: result = await self.data_function.async_call(data) if result is not None: - await output.send((tag, result)) + await output.send((key, result)) async with asyncio.TaskGroup() as tg: - async for tag, data in inputs[0]: - tg.create_task(process_one(tag, data)) + async for key, data in inputs[0]: + tg.create_task(process_one(key, data)) await output.close() ``` @@ -169,14 +169,14 @@ async def async_execute(self, inputs, output): indexes: list[dict[JoinKey, list[Row]]] = [{} for _ in inputs] async def consume(i: int, channel): - async for tag, data in channel: - key = self._extract_join_key(tag) - indexes[i].setdefault(key, []).append((tag, data)) + async for row_key, data in channel: + key = self._extract_join_key(row_key) + indexes[i].setdefault(key, []).append((row_key, data)) # Probe all OTHER indexes for matches other_lists = [indexes[j].get(key, []) for j in range(len(inputs)) if j != i] for combo in itertools.product(*other_lists): - joined = self._merge_rows((tag, data), *combo) + joined = self._merge_rows((row_key, data), *combo) await output.send(joined) async with asyncio.TaskGroup() as tg: @@ -194,15 +194,15 @@ async def async_execute(self, inputs,
output): # Phase 1: Build right-side index right_keys = set() - async for tag, data in right: - key = self._extract_join_key(tag) + async for row_key, data in right: + key = self._extract_join_key(row_key) right_keys.add(key) # Phase 2: Stream left, emit matches - async for tag, data in left: - key = self._extract_join_key(tag) + async for row_key, data in left: + key = self._extract_join_key(row_key) if key in right_keys: - await output.send((tag, data)) + await output.send((row_key, data)) await output.close() ``` @@ -224,8 +224,8 @@ async def async_execute(self, inputs, output): result_stream = self.static_process(*streams) # Phase 3: Emit results asynchronously - for tag, data in result_stream.iter_data(): - await output.send((tag, data)) + for key, data in result_stream.iter_data(): + await output.send((key, data)) await output.close() ``` @@ -249,8 +249,8 @@ class UnaryOperator(StaticOutputPod): rows = await inputs[0].collect() stream = self._materialize_to_stream(rows) result = self.static_process(stream) - for tag, data in result.iter_data(): - await output.send((tag, data)) + for key, data in result.iter_data(): + await output.send((key, data)) await output.close() @@ -262,8 +262,8 @@ class BinaryOperator(StaticOutputPod): left_stream = self._materialize_to_stream(left_rows) right_stream = self._materialize_to_stream(right_rows) result = self.static_process(left_stream, right_stream) - for tag, data in result.iter_data(): - await output.send((tag, data)) + for key, data in result.iter_data(): + await output.send((key, data)) await output.close() @@ -272,8 +272,8 @@ class NonZeroInputOperator(StaticOutputPod): all_rows = await asyncio.gather(*(ch.collect() for ch in inputs)) streams = [self._materialize_to_stream(rows) for rows in all_rows] result = self.static_process(*streams) - for tag, data in result.iter_data(): - await output.send((tag, data)) + for key, data in result.iter_data(): + await output.send((key, data)) await output.close() ``` @@ -290,15 +290,15 @@ class
FunctionPod: async def async_execute(self, inputs, output): sem = asyncio.Semaphore(self.node_config.max_concurrency or _INF) - async def process_one(tag, data): + async def process_one(key, data): async with sem: result_data = await self.data_function.async_call(data) if result_data is not None: - await output.send((tag, result_data)) + await output.send((key, result_data)) async with asyncio.TaskGroup() as tg: - async for tag, data in inputs[0]: - tg.create_task(process_one(tag, data)) + async for key, data in inputs[0]: + tg.create_task(process_one(key, data)) await output.close() ``` @@ -311,22 +311,22 @@ class FunctionNode: async def async_execute(self, inputs, output): sem = asyncio.Semaphore(self.node_config.max_concurrency or _INF) - async def process_one(tag, data): + async def process_one(key, data): cache_key = self._compute_cache_key(data) cached = await self._db_lookup(cache_key) if cached is not None: - await output.send((tag, cached)) + await output.send((key, cached)) return async with sem: result = await self.data_function.async_call(data) await self._db_store(cache_key, result) if result is not None: - await output.send((tag, result)) + await output.send((key, result)) async with asyncio.TaskGroup() as tg: - async for tag, data in inputs[0]: - tg.create_task(process_one(tag, data)) + async for key, data in inputs[0]: + tg.create_task(process_one(key, data)) await output.close() ``` @@ -429,8 +429,8 @@ Sources have no input channels — they just push their data onto the output cha class SourceNode: async def async_execute(self, inputs, output): # inputs is empty for sources - for tag, data in self.stream.iter_data(): - await output.send((tag, data)) + for key, data in self.stream.iter_data(): + await output.send((key, data)) await output.close() ``` @@ -447,9 +447,9 @@ while allowing each consumer to read at its own pace. | Operator | Default Strategy | Async Override? 
| |---|---|---| | PolarsFilter | Barrier (inherited) | **Streaming** — evaluate predicate per row | -| MapTags / MapData | Barrier (inherited) | **Streaming** — rename per row | -| SelectTagColumns / SelectDataColumns | Barrier (inherited) | **Streaming** — project per row | -| DropTagColumns / DropDataColumns | Barrier (inherited) | **Streaming** — project per row | +| MapKeys / MapData | Barrier (inherited) | **Streaming** — rename per row | +| SelectKeyColumns / SelectDataColumns | Barrier (inherited) | **Streaming** — project per row | +| DropKeyColumns / DropDataColumns | Barrier (inherited) | **Streaming** — project per row | | FunctionPod | N/A (new) | **Streaming** — transform data per row | | FunctionNode | N/A (new) | **Streaming** — cache check + transform per row | | Join | Barrier (inherited) | **Incremental** — symmetric hash join | @@ -503,7 +503,7 @@ Streaming and incremental strategies may change row ordering compared to synchro from upstream. The result set is identical but row order may differ. - **Barrier**: row order matches synchronous mode exactly. -The `sort_by_tags` option in `ColumnConfig` provides deterministic ordering when needed, +The `sort_by_keys` option in `ColumnConfig` provides deterministic ordering when needed, independent of execution strategy. --- diff --git a/docs/api/index.md b/docs/api/index.md index bb475cf8..06b54b06 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -22,5 +22,5 @@ Everything else lives in subpackages: | [`orcapod.operators`](operators.md) | Structural stream transformations (join, filter, select, batch, etc.) 
| | [`orcapod.databases`](databases.md) | Persistent storage backends for computation results | | [`orcapod.nodes`](nodes.md) | DB-backed pipeline elements that persist their results | -| [`orcapod.streams`](streams.md) | Immutable (Tag, Data) sequences backed by PyArrow tables | +| [`orcapod.streams`](streams.md) | Immutable (Key, Data) sequences backed by PyArrow tables | | [`orcapod.types`](types.md) | Core type definitions: `Schema`, `ColumnConfig`, `ContentHash` | diff --git a/docs/api/operators.md b/docs/api/operators.md index ef1d69dd..d5742e88 100644 --- a/docs/api/operators.md +++ b/docs/api/operators.md @@ -10,15 +10,15 @@ Operators perform structural transformations on streams without inspecting or sy ::: orcapod.core.operators.Batch -::: orcapod.core.operators.SelectTagColumns +::: orcapod.core.operators.SelectKeyColumns ::: orcapod.core.operators.SelectDataColumns -::: orcapod.core.operators.DropTagColumns +::: orcapod.core.operators.DropKeyColumns ::: orcapod.core.operators.DropDataColumns -::: orcapod.core.operators.MapTags +::: orcapod.core.operators.MapKeys ::: orcapod.core.operators.MapData diff --git a/docs/api/sources.md b/docs/api/sources.md index 7895999d..9db232fd 100644 --- a/docs/api/sources.md +++ b/docs/api/sources.md @@ -2,7 +2,7 @@ Source classes provide the entry point for external data into Orcapod pipelines. All sources convert their input to a PyArrow Table and use `SourceStreamBuilder` for -enrichment (provenance columns, system tags, hashing). +enrichment (provenance columns, system keys, hashing). ::: orcapod.core.sources.ArrowTableSource diff --git a/docs/api/streams.md b/docs/api/streams.md index 98a8635e..3ba1a062 100644 --- a/docs/api/streams.md +++ b/docs/api/streams.md @@ -1,6 +1,6 @@ # Streams -Streams are immutable sequences of (Tag, Data) pairs backed by PyArrow tables. +Streams are immutable sequences of (Key, Data) pairs backed by PyArrow tables. 
::: orcapod.core.streams.ArrowTableStream diff --git a/docs/concepts/function-pods.md b/docs/concepts/function-pods.md index 37a8af79..6021901b 100644 --- a/docs/concepts/function-pods.md +++ b/docs/concepts/function-pods.md @@ -3,7 +3,7 @@ Function pods are data-level transforms -- they take each data in a [stream](streams.md), apply a Python function to its values, and produce a new data with the function's outputs. Unlike [operators](operators.md), function pods never inspect or modify -tags. They are the primary mechanism for adding computation to an Orcapod pipeline: data +keys. They are the primary mechanism for adding computation to an Orcapod pipeline: data cleaning, feature extraction, model inference, or any transformation that produces new values from existing ones. @@ -58,27 +58,27 @@ source = DictSource( {"subject_id": "mouse_01", "weight": 25.3, "height": 0.12}, {"subject_id": "mouse_02", "weight": 22.1, "height": 0.10}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) # Apply the function pod to the source stream result = compute_bmi.pod(source) # shorthand for compute_bmi.pod.process(source) -# Inspect the output schema -- tags pass through, data are replaced -tag_schema, data_schema = result.output_schema() -print("Tag schema:", dict(tag_schema)) -# Tag schema: {'subject_id': } +# Inspect the output schema -- keys pass through, data are replaced +key_schema, data_schema = result.output_schema() +print("Key schema:", dict(key_schema)) +# Key schema: {'subject_id': } print("Data schema:", dict(data_schema)) # Data schema: {'bmi': } # Iterate over results -for tag, data in result.iter_data(): - print(f" {tag.as_dict()} -> {data.as_dict()}") +for key, data in result.iter_data(): + print(f" {key.as_dict()} -> {data.as_dict()}") # {'subject_id': 'mouse_01'} -> {'bmi': 1756.9444444444446} # {'subject_id': 'mouse_02'} -> {'bmi': 2209.9999999999995} ``` -The function pod preserves tags and replaces data columns with the function's output. 
If the +The function pod preserves keys and replaces data columns with the function's output. If the input stream has multiple data columns but the function only needs some of them, Orcapod extracts the matching columns by name. @@ -107,7 +107,7 @@ source = DictSource( {"subject_id": "mouse_01", "weight": 25.3, "height": 0.12}, {"subject_id": "mouse_02", "weight": 22.1, "height": 0.10}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) db = InMemoryArrowDatabase() @@ -122,8 +122,8 @@ node = FunctionNode( node.run() # Iterate over cached results -for tag, data in node.iter_data(): - print(f" {tag.as_dict()} -> {data.as_dict()}") +for key, data in node.iter_data(): + print(f" {key.as_dict()} -> {data.as_dict()}") ``` `FunctionNode` also provides: @@ -141,7 +141,7 @@ If you pass multiple streams to a function pod, they are automatically joined (u result = compute_bmi.pod(weight_stream, height_stream) ``` -The join happens on shared tag columns, and the merged data columns are fed to the function. +The join happens on shared key columns, and the merged data columns are fed to the function. ## DataFunction internals diff --git a/docs/concepts/identity.md b/docs/concepts/identity.md index abf3cbd1..b98fa2fd 100644 --- a/docs/concepts/identity.md +++ b/docs/concepts/identity.md @@ -42,8 +42,8 @@ benefits from results already cached for previous data with the same schema. Consider two sources with the same schema but different data: ``` -source_a = DictSource(data=[{"x": 1, "y": 2}], tag_columns=["x"]) -source_b = DictSource(data=[{"x": 10, "y": 20}], tag_columns=["x"]) +source_a = DictSource(data=[{"x": 1, "y": 2}], key_columns=["x"]) +source_b = DictSource(data=[{"x": 10, "y": 20}], key_columns=["x"]) ``` - `source_a.content_hash() != source_b.content_hash()` -- different source identity @@ -59,7 +59,7 @@ identity plus the pipeline hashes of all its upstream elements. 
### Base case: sources -A `RootSource`'s pipeline identity is simply its `(tag_schema, data_schema)`. Sources with +A `RootSource`'s pipeline identity is simply its `(key_schema, data_schema)`. Sources with the same column names and types have the same pipeline hash, regardless of their data. ### Recursive case: downstream elements diff --git a/docs/concepts/operators.md b/docs/concepts/operators.md index 2c45a5fb..5c660a2b 100644 --- a/docs/concepts/operators.md +++ b/docs/concepts/operators.md @@ -5,7 +5,7 @@ synthesizing data values. They join, filter, batch, rename, and select columns - that affect the *structure* of the data (which rows exist, which columns are present, how columns are named) but never compute new values from data content. This is the key distinction from [function pods](function-pods.md), which do the opposite: they transform -data values but never touch tags or stream structure. +data values but never touch keys or stream structure. ## The operator / function pod boundary @@ -14,7 +14,7 @@ This separation is a core Orcapod design principle: | | Operator | Function Pod | |---|---|---| | Inspects data content | Never | Yes | -| Inspects / uses tags | Yes | No | +| Inspects / uses keys | Yes | No | | Can rename columns | Yes | No | | Synthesizes new values | No | Yes | | Stream arity | Configurable (1, 2, or N inputs) | Single in, single out | @@ -42,7 +42,7 @@ Takes one or more streams. Used for `Join`, which performs an N-ary inner join. ### Join -N-ary inner join on shared tag columns. Requires that input streams have non-overlapping +N-ary inner join on shared key columns. Requires that input streams have non-overlapping data columns (raises `InputValidationError` on collision). Join is **commutative** -- the order of input streams does not affect the result. 
@@ -55,7 +55,7 @@ subjects = DictSource( {"subject_id": "mouse_01", "age": 12}, {"subject_id": "mouse_02", "age": 8}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) measurements = DictSource( @@ -63,7 +63,7 @@ measurements = DictSource( {"subject_id": "mouse_01", "weight": 25.3}, {"subject_id": "mouse_02", "weight": 22.1}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) join = Join() @@ -82,10 +82,10 @@ is **commutative** -- the order of the two input streams does not affect the res ### SemiJoin -Binary join that filters the left stream to only include rows whose tags match the right +Binary join that filters the left stream to only include rows whose keys match the right stream. The right stream's data columns are discarded. SemiJoin is **not commutative** -- the order of inputs matters. The first stream is the one being filtered; the second stream -provides the set of matching tags. +provides the set of matching keys. ### Batch @@ -101,14 +101,14 @@ source = DictSource( {"subject_id": "mouse_01", "age": 12}, {"subject_id": "mouse_02", "age": 8}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) batch = Batch() batched = batch.process(source) -for tag, data in batched.iter_data(): - print("Tags:", tag.as_dict()) - # Tags: {'subject_id': ['mouse_01', 'mouse_02']} +for key, data in batched.iter_data(): + print("Keys:", key.as_dict()) + # Keys: {'subject_id': ['mouse_01', 'mouse_02']} print("Data:", data.as_dict()) # Data: {'age': [12, 8]} ``` @@ -123,9 +123,9 @@ batch = Batch(batch_size=10, drop_partial_batch=False) Four operators for including or excluding columns: -- **`SelectTagColumns(columns=["col1", "col2"])`** -- keep only the specified tag columns +- **`SelectKeyColumns(columns=["col1", "col2"])`** -- keep only the specified key columns - **`SelectDataColumns(columns=["col1", "col2"])`** -- keep only the specified data columns -- **`DropTagColumns(columns=["col1"])`** -- remove the specified tag columns +- 
**`DropKeyColumns(columns=["col1"])`** -- remove the specified key columns - **`DropDataColumns(columns=["col1"])`** -- remove the specified data columns ```python @@ -138,7 +138,7 @@ print(result.keys()[1]) # ('weight',) ### Column renaming -- **`MapTags(mapping={"old_name": "new_name"})`** -- rename tag columns +- **`MapKeys(mapping={"old_name": "new_name"})`** -- rename key columns - **`MapData(mapping={"old_name": "new_name"})`** -- rename data columns ### PolarsFilter @@ -151,8 +151,8 @@ from orcapod.operators import PolarsFilter filt = PolarsFilter(predicates=[pl.col("age") > 10]) filtered = filt.process(source) -for tag, pkt in filtered.iter_data(): - print(f"{tag.as_dict()} -> {pkt.as_dict()}") +for key, pkt in filtered.iter_data(): + print(f"{key.as_dict()} -> {pkt.as_dict()}") # {'subject_id': 'mouse_01'} -> {'age': 12, 'weight': 25.3} # {'subject_id': 'mouse_03'} -> {'age': 15, 'weight': 27.8} ``` diff --git a/docs/concepts/sources.md b/docs/concepts/sources.md index 4430d04c..f2e3ceed 100644 --- a/docs/concepts/sources.md +++ b/docs/concepts/sources.md @@ -3,8 +3,8 @@ Sources are the entry points for external data into an Orcapod pipeline. Every pipeline begins with one or more sources that load raw data -- from Python dicts, lists, CSV files, Delta Lake tables, or Pandas DataFrames -- and present it as an immutable -[stream](streams.md) of (Tag, Data) pairs. Sources also attach provenance metadata -(source-info columns and system tag columns) so that every downstream value can be traced back +[stream](streams.md) of (Key, Data) pairs. Sources also attach provenance metadata +(source-info columns and system key columns) so that every downstream value can be traced back to its origin. ## Key classes @@ -21,13 +21,13 @@ dependencies -- it sits at the root of the computational graph. 
Key properties: ### Concrete source types All sources follow the same pattern: convert input data to a PyArrow Table, then pass it -through `SourceStreamBuilder` which handles enrichment (provenance columns, system tags, +through `SourceStreamBuilder` which handles enrichment (provenance columns, system keys, hashing) and produces the final immutable stream. | Source | Input type | Notes | |---|---|---| | `ArrowTableSource` | PyArrow `Table` | Accepts an Arrow table directly | -| `DictSource` | `list[dict]` | Each dict becomes one (Tag, Data) pair | +| `DictSource` | `list[dict]` | Each dict becomes one (Key, Data) pair | | `ListSource` | `list[Any]` | Each element stored under a named data column | | `DataFrameSource` | Pandas `DataFrame` | Converts via Arrow | | `CSVSource` | File path (string) | Reads CSV into Arrow | @@ -48,7 +48,7 @@ data lineage: For example, a data column `weight` gets a companion `_source_weight` column. These tokens identify which source originally produced each value. -**System tag columns** (prefix `_tag::`) track which source contributed each row. These +**System key columns** (prefix `_key::`) track which source contributed each row. These columns are used internally during [joins](operators.md) to maintain provenance through multi-stream operations. @@ -58,8 +58,8 @@ These columns are hidden by default. 
You can reveal them using `ColumnConfig`: # Show source-info columns table = source.as_table(columns={"source": True}) -# Show system tag columns -tag_schema, data_schema = source.output_schema(columns={"system_tags": True}) +# Show system key columns +key_schema, data_schema = source.output_schema(columns={"system_keys": True}) # Show everything table = source.as_table(all_info=True) @@ -78,27 +78,27 @@ source = DictSource( {"subject_id": "mouse_02", "age": 8, "weight": 22.1}, {"subject_id": "mouse_03", "age": 15, "weight": 27.8}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) # Inspect the schema -tag_schema, data_schema = source.output_schema() -print("Tag schema:", dict(tag_schema)) -# Tag schema: {'subject_id': } +key_schema, data_schema = source.output_schema() +print("Key schema:", dict(key_schema)) +# Key schema: {'subject_id': } print("Data schema:", dict(data_schema)) # Data schema: {'age': , 'weight': } # Get column names -tag_keys, data_keys = source.keys() -print("Tag keys:", tag_keys) # ('subject_id',) +key_keys, data_keys = source.keys() +print("Key keys:", key_keys) # ('subject_id',) print("Data keys:", data_keys) # ('age', 'weight') -# Iterate over (Tag, Data) pairs -for tag, data in source.iter_data(): - print(f" Tag: {tag.as_dict()}, Data: {data.as_dict()}") -# Tag: {'subject_id': 'mouse_01'}, Data: {'age': 12, 'weight': 25.3} -# Tag: {'subject_id': 'mouse_02'}, Data: {'age': 8, 'weight': 22.1} -# Tag: {'subject_id': 'mouse_03'}, Data: {'age': 15, 'weight': 27.8} +# Iterate over (Key, Data) pairs +for key, data in source.iter_data(): + print(f" Key: {key.as_dict()}, Data: {data.as_dict()}") +# Key: {'subject_id': 'mouse_01'}, Data: {'age': 12, 'weight': 25.3} +# Key: {'subject_id': 'mouse_02'}, Data: {'age': 8, 'weight': 22.1} +# Key: {'subject_id': 'mouse_03'}, Data: {'age': 15, 'weight': 27.8} # Convert to a PyArrow table table = source.as_table() @@ -111,7 +111,7 @@ print(table.to_pandas()) ## How it connects to other 
concepts -- Sources produce [Streams](streams.md) -- immutable sequences of (Tag, Data) pairs +- Sources produce [Streams](streams.md) -- immutable sequences of (Key, Data) pairs - Streams flow into [Operators](operators.md) for structural transforms (joins, filters, column selection) - Streams flow into [Function Pods](function-pods.md) for value-level transforms diff --git a/docs/concepts/streams.md b/docs/concepts/streams.md index ec26615f..d47c6926 100644 --- a/docs/concepts/streams.md +++ b/docs/concepts/streams.md @@ -1,19 +1,19 @@ # Streams -A stream is an immutable sequence of (Tag, Data) pairs backed by a PyArrow Table. Streams +A stream is an immutable sequence of (Key, Data) pairs backed by a PyArrow Table. Streams are the universal data currency in Orcapod -- every [source](sources.md) produces a stream, every [operator](operators.md) consumes and produces streams, and every [function pod](function-pods.md) transforms data within a stream. Immutability guarantees that once a stream is created, its data cannot change, which is essential for reproducible pipelines. -## Tag columns vs Data columns +## Key columns vs Data columns Every stream divides its columns into two groups: -**Tag columns** are join keys and metadata. They identify *which* record you are looking at +**Key columns** are join keys and metadata. They identify *which* record you are looking at (e.g., `subject_id`, `session_date`). Operators like [Join](operators.md) match rows across -streams using shared tag columns. +streams using shared key columns. **Data columns** are the data payload. They hold the actual values being processed (e.g., `age`, `weight`, `spike_count`). [Function pods](function-pods.md) read data @@ -21,14 +21,14 @@ columns as function inputs and write new data columns as outputs. 
This separation is enforced throughout the framework: -- Operators inspect and restructure tags but never look inside data -- Function pods inspect and transform data but never look at tags +- Operators inspect and restructure keys but never look inside data +- Function pods inspect and transform data but never look at keys ## Key classes ### `ArrowTableStream` -The primary stream implementation. Wraps a PyArrow Table with designated tag and data +The primary stream implementation. Wraps a PyArrow Table with designated key and data columns. Created internally by sources and operators -- you rarely construct one directly. ### `StreamBase` @@ -42,32 +42,32 @@ Every stream exposes four key methods: ### `output_schema()` -Returns the `(tag_schema, data_schema)` tuple describing column names and their Python types: +Returns the `(key_schema, data_schema)` tuple describing column names and their Python types: ```python -tag_schema, data_schema = stream.output_schema() -print(dict(tag_schema)) # {'subject_id': } +key_schema, data_schema = stream.output_schema() +print(dict(key_schema)) # {'subject_id': } print(dict(data_schema)) # {'age': , 'weight': } ``` ### `keys()` -Returns column names as `(tag_keys, data_keys)`: +Returns column names as `(key_keys, data_keys)`: ```python -tag_keys, data_keys = stream.keys() -# tag_keys = ('subject_id',) +key_keys, data_keys = stream.keys() +# key_keys = ('subject_id',) # data_keys = ('age', 'weight') ``` ### `iter_data()` -Iterates over (Tag, Data) pairs. Each Tag and Data is an immutable datagram that you can +Iterates over (Key, Data) pairs. 
Each Key and Data is an immutable datagram that you can inspect with `.as_dict()`: ```python -for tag, data in stream.iter_data(): - print(tag.as_dict()) # {'subject_id': 'mouse_01'} +for key, data in stream.iter_data(): + print(key.as_dict()) # {'subject_id': 'mouse_01'} print(data.as_dict()) # {'age': 12, 'weight': 25.3} ``` @@ -83,17 +83,17 @@ df = table.to_pandas() ## Controlling column visibility with `ColumnConfig` -By default, streams only expose user-facing tag and data columns. Orcapod also maintains +By default, streams only expose user-facing key and data columns. Orcapod also maintains hidden columns for provenance tracking and metadata. Use `ColumnConfig` (or the `all_info` shortcut) to control which column groups are included. | Config field | What it reveals | Column prefix | |---|---|---| -| `system_tags` | System tag columns (provenance tracking) | `_tag::` | +| `system_keys` | System key columns (provenance tracking) | `_key::` | | `source` | Source-info columns (per-data provenance tokens) | `_source_` | | `context` | Data context column | `_context_key` | | `content_hash` | Content hash column | `_content_hash` | -| `sort_by_tags` | Sort rows by tag columns | (ordering only) | +| `sort_by_keys` | Sort rows by key columns | (ordering only) | Pass config as a dict or a `ColumnConfig` object: @@ -105,7 +105,7 @@ source = DictSource( {"subject_id": "mouse_01", "age": 12, "weight": 25.3}, {"subject_id": "mouse_02", "age": 8, "weight": 22.1}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) # Default: user-facing columns only @@ -121,7 +121,7 @@ print(table.column_names) # Include everything table = source.as_table(all_info=True) print(table.column_names) -# ['subject_id', 'age', 'weight', '_tag_source_id::...', '_tag_record_id::...', +# ['subject_id', 'age', 'weight', '_key_source_id::...', '_key_record_id::...', # '_content_hash', '_context_key', '_source_age', '_source_weight'] ``` @@ -138,19 +138,19 @@ source = DictSource( 
{"subject_id": "mouse_02", "age": 8, "weight": 22.1}, {"subject_id": "mouse_03", "age": 15, "weight": 27.8}, ], - tag_columns=["subject_id"], + key_columns=["subject_id"], ) # Schema inspection -tag_schema, data_schema = source.output_schema() -print("Tag schema:", dict(tag_schema)) -# Tag schema: {'subject_id': } +key_schema, data_schema = source.output_schema() +print("Key schema:", dict(key_schema)) +# Key schema: {'subject_id': } print("Data schema:", dict(data_schema)) # Data schema: {'age': , 'weight': } -# Iterate over (Tag, Data) pairs -for tag, data in source.iter_data(): - print(f" {tag.as_dict()} -> {data.as_dict()}") +# Iterate over (Key, Data) pairs +for key, data in source.iter_data(): + print(f" {key.as_dict()} -> {data.as_dict()}") # {'subject_id': 'mouse_01'} -> {'age': 12, 'weight': 25.3} # {'subject_id': 'mouse_02'} -> {'age': 8, 'weight': 22.1} # {'subject_id': 'mouse_03'} -> {'age': 15, 'weight': 27.8} diff --git a/docs/design/2026-03-27-non-active-node-semantics-design.md b/docs/design/2026-03-27-non-active-node-semantics-design.md index 1bf21a05..46ed5111 100644 --- a/docs/design/2026-03-27-non-active-node-semantics-design.md +++ b/docs/design/2026-03-27-non-active-node-semantics-design.md @@ -67,7 +67,7 @@ All of these will become passive after the fix — empty before `run()`, correct **`_make_empty_table() -> "pa.Table"`** -Builds a zero-row PyArrow table whose columns match the node's full output schema (tags + data). Uses `self.output_schema()` and `self.data_context.type_converter`. This is a pure, side-effect-free method. +Builds a zero-row PyArrow table whose columns match the node's full output schema (keys + data). Uses `self.output_schema()` and `self.data_context.type_converter`. This is a pure, side-effect-free method. - The return type annotation must use a string literal (`"pa.Table"`) because `pa` is imported via `LazyModule` at runtime; the real type is only available under `TYPE_CHECKING`. 
- `output_schema()` is safe on live and read-only deserialized nodes (uses `_stored_schema` when `_operator is None`). @@ -85,7 +85,7 @@ Guards (return `None` immediately if any apply): If all guards pass, call `self.pipeline_path` directly (no try/except). This is safe: by the time we reach this point, `_pipeline_database is not None`, and `pipeline_path` only raises `RuntimeError` when `_pipeline_database is None`. For live nodes `_pipeline_node_hash` is always set in `__init__`; for read-only deserialized nodes `_operator is None` causes `pipeline_path` to return `_stored_pipeline_path`. Then call `self._pipeline_database.get_all_records(self.pipeline_path)`: -- If records are non-None (zero or more rows): wrap in `ArrowTableStream(records, tag_columns=self.keys()[0])` and return it. +- If records are non-None (zero or more rows): wrap in `ArrowTableStream(records, key_columns=self.keys()[0])` and return it. Note: the DB stores records with a `_record_hash` column added by `_store_output_stream`. `get_all_records` does not strip this column. `_load_cached_stream_from_db` inherits this behavior — it returns an `ArrowTableStream` that includes `_record_hash`. This matches the existing behavior of `_replay_from_cache`, which also does not strip `_record_hash`. - If records are `None` (no prior LOG run has written to this path): build an empty table via `_make_empty_table()`, wrap in `ArrowTableStream`, and return it. @@ -126,7 +126,7 @@ In `src/orcapod/core/streams/base.py`, update the `flow()` docstring: **Before:** "This will trigger any upstream computation of the stream." -**After:** "Returns the entire collection of (TagProtocol, DataProtocol) as a list. This is a read-only operation — results reflect whatever has been computed by a prior `run()` or `execute()` call. If no computation has been performed, returns an empty list." +**After:** "Returns the entire collection of (KeyProtocol, DataProtocol) as a list. 
This is a read-only operation — results reflect whatever has been computed by a prior `run()` or `execute()` call. If no computation has been performed, returns an empty list." No other changes to `base.py`. @@ -167,7 +167,7 @@ node.run() └─ populates self._cached_output_stream # Read path (this fix): never triggers upstream computation -for tag, data in operator_node: # __iter__ → iter_data() +for key, data in operator_node: # __iter__ → iter_data() node.flow() # flow() → iter_data() node.iter_data() node.as_table() diff --git a/docs/design/plans/2026-03-26-sqlite-table-source.md b/docs/design/plans/2026-03-26-sqlite-table-source.md index 3e53ddb8..4bb81ec7 100644 --- a/docs/design/plans/2026-03-26-sqlite-table-source.md +++ b/docs/design/plans/2026-03-26-sqlite-table-source.md @@ -2,7 +2,7 @@ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Implement `SQLiteTableSource`, a `RootSource` backed by a SQLite table that uses primary-key columns (or `rowid` for ROWID-only tables) as default tag columns, with a working `from_config` round-trip. +**Goal:** Implement `SQLiteTableSource`, a `RootSource` backed by a SQLite table that uses primary-key columns (or `rowid` for ROWID-only tables) as default key columns, with a working `from_config` round-trip. **Architecture:** `SQLiteTableSource` subclasses `DBTableSource`. Two small patches are made to existing files first (`SQLiteConnector.iter_batches` for `rowid` typing; `DBTableSource.__init__` for a `_query` hook), then the new class and its tests are added, followed by wiring into exports and the source registry. 
@@ -178,8 +178,8 @@ In `src/orcapod/core/sources/db_table_source.py`, update the `__init__` signatur self, connector: DBConnectorProtocol, table_name: str, - tag_columns: Collection[str] | None = None, - system_tag_columns: Collection[str] = (), + key_columns: Collection[str] | None = None, + system_key_columns: Collection[str] = (), record_id_column: str | None = None, source_id: str | None = None, label: str | None = None, @@ -195,8 +195,8 @@ to: self, connector: DBConnectorProtocol, table_name: str, - tag_columns: Collection[str] | None = None, - system_tag_columns: Collection[str] = (), + key_columns: Collection[str] | None = None, + system_key_columns: Collection[str] = (), record_id_column: str | None = None, source_id: str | None = None, label: str | None = None, @@ -249,7 +249,7 @@ git commit -m "feat(sources): add keyword-only _query parameter to DBTableSource --- -## Task 3: Implement `SQLiteTableSource` — core class + unit tests (PK, explicit tags, errors, stream, hashing) +## Task 3: Implement `SQLiteTableSource` — core class + unit tests (PK, explicit keys, errors, stream, hashing) **Files:** - Create: `src/orcapod/core/sources/sqlite_table_source.py` @@ -265,8 +265,8 @@ Create `tests/test_core/sources/test_sqlite_table_source.py`: Test sections: 1. Import / export sanity 2. Protocol conformance - 3. PK as default tag columns (single and composite) - 4. Explicit tag column override + 3. PK as default key columns (single and composite) + 4. Explicit key column override 5. ROWID fallback (no explicit PK) 6. Error cases (missing table, empty table) 7. Stream behaviour @@ -426,7 +426,7 @@ Create `src/orcapod/core/sources/sqlite_table_source.py`: """SQLiteTableSource — a read-only RootSource backed by a SQLite table. Wraps a SQLite table as an OrcaPod Source. Primary-key columns are used -as tag columns by default. For tables with no explicit primary key +as key columns by default. 
For tables with no explicit primary key (ROWID-only tables), the implicit ``rowid`` integer column is used automatically. @@ -436,7 +436,7 @@ Example:: source = SQLiteTableSource("/path/to/my.db", "measurements") # In-memory (for tests / throwaway pipelines; cannot round-trip) - source = SQLiteTableSource(":memory:", "events", tag_columns=["session_id"]) + source = SQLiteTableSource(":memory:", "events", key_columns=["session_id"]) Note: ``:memory:`` sources cannot be reconstructed via ``from_config`` because @@ -464,13 +464,13 @@ class SQLiteTableSource(DBTableSource): At construction time the source: 1. Opens a ``SQLiteConnector`` for *db_path*. 2. Validates the table exists. - 3. Resolves tag columns: - - If *tag_columns* is provided, uses them as-is. + 3. Resolves key columns: + - If *key_columns* is provided, uses them as-is. - Otherwise uses the table's primary-key columns. - If the table has no explicit PK (ROWID-only), falls back to the implicit ``rowid`` integer column. 4. Determines the fetch query: injects ``SELECT rowid, *`` when - ``"rowid"`` is a resolved tag column and not a normal table column + ``"rowid"`` is a resolved key column and not a normal table column (handles both auto-detection and ``from_config`` reconstruction). 5. Delegates to ``DBTableSource.__init__`` for fetching and stream building. @@ -478,10 +478,10 @@ class SQLiteTableSource(DBTableSource): db_path: Path to the SQLite database file, or ``":memory:"`` for an in-process in-memory database. table_name: Name of the table to expose as a source. - tag_columns: Columns to use as tag columns. If ``None`` (default), + key_columns: Columns to use as key columns. If ``None`` (default), the table's primary-key columns are used; ROWID-only tables fall back to ``["rowid"]``. - system_tag_columns: Additional system-level tag columns. + system_key_columns: Additional system-level key columns. record_id_column: Column for stable per-row record IDs in provenance. 
source_id: Canonical source name. Defaults to *table_name*. label: Human-readable label for this source node. @@ -497,8 +497,8 @@ class SQLiteTableSource(DBTableSource): self, db_path: str | os.PathLike, table_name: str, - tag_columns: Collection[str] | None = None, - system_tag_columns: Collection[str] = (), + key_columns: Collection[str] | None = None, + system_key_columns: Collection[str] = (), record_id_column: str | None = None, source_id: str | None = None, label: str | None = None, @@ -508,19 +508,19 @@ class SQLiteTableSource(DBTableSource): self._db_path = db_path connector = SQLiteConnector(db_path) - # Step 3: Resolve tag columns. - if tag_columns is None: + # Step 3: Resolve key columns. + if key_columns is None: pk_cols = connector.get_pk_columns(table_name) - resolved_tags: list[str] = pk_cols if pk_cols else ["rowid"] + resolved_keys: list[str] = pk_cols if pk_cols else ["rowid"] else: - resolved_tags = list(tag_columns) + resolved_keys = list(key_columns) # Step 4: Determine the fetch query. - # If "rowid" is in resolved_tags but not a real column, we need + # If "rowid" is in resolved_keys but not a real column, we need # SELECT rowid, * to include it. This also handles from_config - # reconstruction where tag_columns=["rowid"] is passed explicitly. + # reconstruction where key_columns=["rowid"] is passed explicitly. 
normal_cols = {ci.name for ci in connector.get_column_info(table_name)} - if "rowid" in resolved_tags and "rowid" not in normal_cols: + if "rowid" in resolved_keys and "rowid" not in normal_cols: _query: str | None = f'SELECT rowid, * FROM "{table_name}"' else: _query = None @@ -528,8 +528,8 @@ class SQLiteTableSource(DBTableSource): super().__init__( connector, table_name, - tag_columns=resolved_tags, - system_tag_columns=system_tag_columns, + key_columns=resolved_keys, + system_key_columns=system_key_columns, record_id_column=record_id_column, source_id=source_id, label=label, @@ -565,8 +565,8 @@ class SQLiteTableSource(DBTableSource): return cls( db_path=config["db_path"], table_name=config["table_name"], - tag_columns=config.get("tag_columns"), - system_tag_columns=config.get("system_tag_columns", ()), + key_columns=config.get("key_columns"), + system_key_columns=config.get("system_key_columns", ()), record_id_column=config.get("record_id_column"), source_id=config.get("source_id"), ) @@ -588,22 +588,22 @@ uv run pytest tests/test_core/sources/test_sqlite_table_source.py::TestProtocolC Expected: PASS. Revert the temporary import change (back to `from orcapod.core.sources import SQLiteTableSource`) before committing — Task 4 will wire the `__init__.py` export and make the standard import work. -- [ ] **Step 3.5: Add PK, explicit tag, error-case, stream, and hashing test groups to the test file** +- [ ] **Step 3.5: Add PK, explicit key, error-case, stream, and hashing test groups to the test file** Append to `tests/test_core/sources/test_sqlite_table_source.py`: ```python # =========================================================================== -# 3. PK as default tag columns +# 3. 
PK as default key columns # =========================================================================== -class TestPKAsDefaultTags: - def test_single_pk_is_tag_column(self, pk_connector): +class TestPKAsDefaultKeys: + def test_single_pk_is_key_column(self, pk_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(pk_connector._db_path, "measurements") - tag_schema, _ = src.output_schema() - assert "session_id" in tag_schema + key_schema, _ = src.output_schema() + assert "session_id" in key_schema def test_pk_not_in_data_schema(self, pk_connector): from orcapod.core.sources import SQLiteTableSource @@ -618,12 +618,12 @@ class TestPKAsDefaultTags: assert "trial" in data_schema assert "response" in data_schema - def test_composite_pk_all_columns_are_tags(self, composite_pk_connector): + def test_composite_pk_all_columns_are_keys(self, composite_pk_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(composite_pk_connector._db_path, "events") - tag_schema, _ = src.output_schema() - assert "user_id" in tag_schema - assert "event_id" in tag_schema + key_schema, _ = src.output_schema() + assert "user_id" in key_schema + assert "event_id" in key_schema def test_default_source_id_is_table_name(self, pk_connector): from orcapod.core.sources import SQLiteTableSource @@ -637,30 +637,30 @@ class TestPKAsDefaultTags: # =========================================================================== -# 4. Explicit tag column override +# 4. 
Explicit key column override # =========================================================================== -class TestExplicitTagOverride: - def test_explicit_tag_columns_override_pk(self, pk_connector): +class TestExplicitKeyOverride: + def test_explicit_key_columns_override_pk(self, pk_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource( - pk_connector._db_path, "measurements", tag_columns=["trial"] + pk_connector._db_path, "measurements", key_columns=["trial"] ) - tag_schema, _ = src.output_schema() - assert "trial" in tag_schema - assert "session_id" not in tag_schema + key_schema, _ = src.output_schema() + assert "trial" in key_schema + assert "session_id" not in key_schema - def test_multiple_explicit_tag_columns(self, pk_connector): + def test_multiple_explicit_key_columns(self, pk_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource( pk_connector._db_path, "measurements", - tag_columns=["session_id", "trial"], + key_columns=["session_id", "trial"], ) - tag_schema, _ = src.output_schema() - assert "session_id" in tag_schema - assert "trial" in tag_schema + key_schema, _ = src.output_schema() + assert "session_id" in key_schema + assert "trial" in key_schema # =========================================================================== @@ -669,11 +669,11 @@ class TestExplicitTagOverride: class TestRowidFallback: - def test_rowid_only_table_uses_rowid_as_tag(self, rowid_connector): + def test_rowid_only_table_uses_rowid_as_key(self, rowid_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(rowid_connector._db_path, "logs") - tag_schema, _ = src.output_schema() - assert "rowid" in tag_schema + key_schema, _ = src.output_schema() + assert "rowid" in key_schema def test_rowid_is_not_in_data_schema(self, rowid_connector): from orcapod.core.sources import SQLiteTableSource @@ -684,15 +684,15 @@ class TestRowidFallback: def 
test_rowid_values_are_positive_integers(self, rowid_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(rowid_connector._db_path, "logs") - for tags, _ in src.iter_data(): - assert isinstance(tags["rowid"], int) - assert tags["rowid"] > 0 + for keys, _ in src.iter_data(): + assert isinstance(keys["rowid"], int) + assert keys["rowid"] > 0 def test_rowid_type_is_int64(self, rowid_connector): """Verify rowid is actually typed as int64, not large_string.""" from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(rowid_connector._db_path, "logs") - # The raw stream table (before tag/data split) holds all columns. + # The raw stream table (before key/data split) holds all columns. # We can verify the Arrow type via the internal stream table. raw = src._stream._table # ArrowTableStream stores the enriched table assert "rowid" in raw.schema.names @@ -744,11 +744,11 @@ class TestStreamBehaviour: data = list(src.iter_data()) assert len(data) == 3 - def test_iter_data_tags_contain_pk(self, pk_connector): + def test_iter_data_keys_contain_pk(self, pk_connector): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(pk_connector._db_path, "measurements") - for tags, _ in src.iter_data(): - assert "session_id" in tags + for keys, _ in src.iter_data(): + assert "session_id" in keys def test_output_schema_returns_two_schemas(self, pk_connector): from orcapod.core.sources import SQLiteTableSource @@ -786,11 +786,11 @@ class TestDeterministicHashing: src2 = SQLiteTableSource(pk_connector._db_path, "measurements") assert src1.content_hash() == src2.content_hash() - def test_different_tag_columns_yields_different_pipeline_hash(self, pk_connector): + def test_different_key_columns_yields_different_pipeline_hash(self, pk_connector): from orcapod.core.sources import SQLiteTableSource src1 = SQLiteTableSource(pk_connector._db_path, "measurements") src2 = SQLiteTableSource( - pk_connector._db_path, "measurements", 
tag_columns=["trial"] + pk_connector._db_path, "measurements", key_columns=["trial"] ) assert src1.pipeline_hash() != src2.pipeline_hash() ``` @@ -969,10 +969,10 @@ class TestConfigRoundTripPKTable: src = SQLiteTableSource(file_db_path, "measurements") assert src.to_config()["table_name"] == "measurements" - def test_to_config_has_tag_columns(self, file_db_path): + def test_to_config_has_key_columns(self, file_db_path): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(file_db_path, "measurements") - assert "session_id" in src.to_config()["tag_columns"] + assert "session_id" in src.to_config()["key_columns"] def test_to_config_has_identity_fields(self, file_db_path): from orcapod.core.sources import SQLiteTableSource @@ -1019,18 +1019,18 @@ class TestConfigRoundTripRowidTable: conn.close() return db_path - def test_to_config_has_rowid_as_tag_column(self, rowid_file_db_path): + def test_to_config_has_rowid_as_key_column(self, rowid_file_db_path): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(rowid_file_db_path, "logs") - assert src.to_config()["tag_columns"] == ["rowid"] + assert src.to_config()["key_columns"] == ["rowid"] def test_from_config_reconstructs_rowid_table(self, rowid_file_db_path): from orcapod.core.sources import SQLiteTableSource src = SQLiteTableSource(rowid_file_db_path, "logs") config = src.to_config() src2 = SQLiteTableSource.from_config(config) - tag_schema, _ = src2.output_schema() - assert "rowid" in tag_schema + key_schema, _ = src2.output_schema() + assert "rowid" in key_schema def test_from_config_rowid_hashes_match(self, rowid_file_db_path): from orcapod.core.sources import SQLiteTableSource @@ -1060,7 +1060,7 @@ git commit -m "test(sources): add config round-trip tests for SQLiteTableSource" ## Task 6: Integration test — `SQLiteTableSource` in a pipeline -**Background:** `Pipeline` wires sources into a node graph. A `FunctionPod` consumes data from the source. 
We verify end-to-end that tag columns flow through and the pipeline produces the expected results. +**Background:** `Pipeline` wires sources into a node graph. A `FunctionPod` consumes data from the source. We verify end-to-end that key columns flow through and the pipeline produces the expected results. **Files:** - Modify: `tests/test_core/sources/test_sqlite_table_source.py` @@ -1108,15 +1108,15 @@ class TestPipelineIntegration: assert len(fn_outputs) == 1 assert len(fn_outputs[0]) == 3 - # Verify tag column (session_id) flows through and results are correct + # Verify key column (session_id) flows through and results are correct doubled_values = sorted( [pkt.as_dict()["doubled"] for _, pkt in fn_outputs[0]] ) assert doubled_values == pytest.approx([0.2, 0.4, 0.6]) - # Verify tag values are present - tag_values = sorted([tags["session_id"] for tags, _ in fn_outputs[0]]) - assert tag_values == ["s1", "s2", "s3"] + # Verify key values are present + key_values = sorted([keys["session_id"] for keys, _ in fn_outputs[0]]) + assert key_values == ["s1", "s2", "s3"] ``` - [ ] **Step 6.2: Run the integration test** @@ -1165,8 +1165,8 @@ gh-app-token-generator nauticalab | gh auth login --with-token - [ ] **Step 7.3: Create the feature branch and push** ```bash -git checkout -b eywalker/plt-1077-implement-source-based-on-sqlite-tables-with-pk-as-default-tag -git push -u origin eywalker/plt-1077-implement-source-based-on-sqlite-tables-with-pk-as-default-tag +git checkout -b eywalker/plt-1077-implement-source-based-on-sqlite-tables-with-pk-as-default-key +git push -u origin eywalker/plt-1077-implement-source-based-on-sqlite-tables-with-pk-as-default-key ``` - [ ] **Step 7.4: Open the PR against `dev`** @@ -1179,7 +1179,7 @@ gh pr create \ ## Summary - Implements `SQLiteTableSource` — a `RootSource` backed by a SQLite table -- Primary-key columns are used as tag columns by default +- Primary-key columns are used as key columns by default - ROWID-only tables (no explicit PK) 
automatically fall back to the implicit `rowid` integer column - `from_config` round-trip works for file-backed databases (unlike `DBTableSource`) - Registered as `\"sqlite_table\"` in the source registry diff --git a/docs/design/specs/2026-03-26-sqlite-table-source-design.md b/docs/design/specs/2026-03-26-sqlite-table-source-design.md index 6e8868e7..e466115e 100644 --- a/docs/design/specs/2026-03-26-sqlite-table-source-design.md +++ b/docs/design/specs/2026-03-26-sqlite-table-source-design.md @@ -8,7 +8,7 @@ ## Summary -Implement `SQLiteTableSource`, a `RootSource` backed by a SQLite table. Primary-key columns serve as the default tag columns. For tables with no explicit primary key (ROWID-only tables), the implicit SQLite `rowid` is used automatically. Provides a working `from_config` round-trip — the gap that `DBTableSource` cannot fill today. +Implement `SQLiteTableSource`, a `RootSource` backed by a SQLite table. Primary-key columns serve as the default key columns. For tables with no explicit primary key (ROWID-only tables), the implicit SQLite `rowid` is used automatically. Provides a working `from_config` round-trip — the gap that `DBTableSource` cannot fill today. --- @@ -34,8 +34,8 @@ Three alternatives were evaluated: ``` SQLiteTableSource(DBTableSource) - __init__(db_path, table_name, tag_columns=None, - system_tag_columns=(), record_id_column=None, + __init__(db_path, table_name, key_columns=None, + system_key_columns=(), record_id_column=None, source_id=None, label=None, data_context=None, config=None) to_config() → dict # source_type="sqlite_table", db_path, table_name, … from_config(config) → cls # reconstructs via db_path; fully working @@ -57,35 +57,35 @@ Note: `src/orcapod/sources/__init__.py` already does `from orcapod.core.sources ## Data Flow ``` -SQLiteTableSource.__init__(db_path, table_name, tag_columns, ...) +SQLiteTableSource.__init__(db_path, table_name, key_columns, ...) │ ├─ 1. SQLiteConnector(db_path) │ ├─ 2. 
Validate: table_name in connector.get_table_names() │ └─ missing → ValueError("Table 'x' not found in database.") │ -├─ 3. Resolve tags and query -│ ├─ tag_columns provided (non-None) -│ │ → resolved_tags = list(tag_columns) -│ └─ tag_columns is None +├─ 3. Resolve keys and query +│ ├─ key_columns provided (non-None) +│ │ → resolved_keys = list(key_columns) +│ └─ key_columns is None │ ├─ pk_cols = connector.get_pk_columns(table_name) -│ ├─ pk_cols non-empty → resolved_tags = pk_cols -│ └─ pk_cols empty → resolved_tags = ["rowid"] +│ ├─ pk_cols non-empty → resolved_keys = pk_cols +│ └─ pk_cols empty → resolved_keys = ["rowid"] │ ├─ 4. Determine query (handles both auto-detection AND from_config reconstruction) │ normal_cols = {ci.name for ci in connector.get_column_info(table_name)} -│ ├─ "rowid" in resolved_tags AND "rowid" not in normal_cols +│ ├─ "rowid" in resolved_keys AND "rowid" not in normal_cols │ │ → _query = 'SELECT rowid, * FROM "{table_name}"' │ └─ otherwise │ → _query = None (DBTableSource uses default SELECT *) │ └─ 5. super().__init__(connector, table_name, - tag_columns=resolved_tags, ← always non-None + key_columns=resolved_keys, ← always non-None _query=_query, ...) │ - │ NOTE: passing tag_columns as a non-None list bypasses + │ NOTE: passing key_columns as a non-None list bypasses │ DBTableSource's own PK-lookup-and-raise path, which only - │ fires when tag_columns is None. This is intentional. + │ fires when key_columns is None. This is intentional. │ └─ DBTableSource: fetch batches (using _query) → SourceStreamBuilder → stream (rowid column arrives typed as int64 via SQLiteConnector patch) @@ -93,7 +93,7 @@ SQLiteTableSource.__init__(db_path, table_name, tag_columns, ...) store self._db_path ``` -`from_config` calls `cls(db_path=config["db_path"], table_name=config["table_name"], tag_columns=config["tag_columns"], ...)` — the connector is recreated from `db_path`. 
Because `tag_columns` is passed explicitly (non-None), step 3 skips PK detection; step 4 then checks whether `"rowid"` is in the resolved tags but not in the table's normal columns and re-injects the rowid query if so. This means **ROWID-only tables also round-trip correctly** from config as long as the backing file exists. +`from_config` calls `cls(db_path=config["db_path"], table_name=config["table_name"], key_columns=config["key_columns"], ...)` — the connector is recreated from `db_path`. Because `key_columns` is passed explicitly (non-None), step 3 skips PK detection; step 4 then checks whether `"rowid"` is in the resolved keys but not in the table's normal columns and re-injects the rowid query if so. This means **ROWID-only tables also round-trip correctly** from config as long as the backing file exists. **Known limitation:** `:memory:` sources cannot be reconstructed via `from_config`. The new in-memory database is empty and does not contain the original table, causing `ValueError: Table 'x' not found`. File-backed sources (including ROWID-only tables) round-trip correctly. The config round-trip test (test 9) must use a `tmp_path`-backed SQLite file. 
@@ -104,14 +104,14 @@ store self._db_path | Condition | Behaviour | |---|---| | Table not found | `ValueError: Table 'x' not found in database.` — raised in step 2, before ROWID logic | -| Table found, no PK, no explicit tags | ROWID fallback — no error; `"rowid"` used as tag column | -| Table found, no PK, explicit `tag_columns=[...]` | Works normally — ROWID detection skipped when `tag_columns` is provided | -| `tag_columns=[]` provided explicitly | Proceeds with empty tag schema — `SourceStreamBuilder` does not guard against empty tag lists; no `ValueError` is raised | +| Table found, no PK, no explicit keys | ROWID fallback — no error; `"rowid"` used as key column | +| Table found, no PK, explicit `key_columns=[...]` | Works normally — ROWID detection skipped when `key_columns` is provided | +| `key_columns=[]` provided explicitly | Proceeds with empty key schema — `SourceStreamBuilder` does not guard against empty key lists; no `ValueError` is raised | | Table exists but is empty | `ValueError: Table 'x' is empty.` — raised by `DBTableSource` | | `db_path` points to non-existent file | `sqlite3.OperationalError` propagates from `SQLiteConnector.__init__` | | `"` in table name | `ValueError` from `SQLiteConnector._validate_table_name` | -The ROWID fallback is silent (no warning log). The resolved `"rowid"` tag column appears in `to_config()` for auditability. +The ROWID fallback is silent (no warning log). The resolved `"rowid"` key column appears in `to_config()` for auditability. --- @@ -123,18 +123,18 @@ All use in-memory SQLite (`:memory:`), except the config round-trip test which r 1. **Import / export sanity** — importable from `orcapod.core.sources` and `orcapod.sources`; in `__all__` 2. **Protocol conformance** — is `SourceProtocol`, `StreamProtocol`, `PipelineElementProtocol` -3. **PK as default tags** — single-column PK; composite PK; correct tag/data schema split -4. **Explicit tag override** — `tag_columns=[...]` overrides PK detection entirely -5. 
**ROWID fallback** — table with no explicit PK gets `"rowid"` tag; `rowid` column type is `int64`; all rows returned; rowid values are positive integers +3. **PK as default keys** — single-column PK; composite PK; correct key/data schema split +4. **Explicit key override** — `key_columns=[...]` overrides PK detection entirely +5. **ROWID fallback** — table with no explicit PK gets `"rowid"` key; `rowid` column type is `int64`; all rows returned; rowid values are positive integers 6. **Error cases** — missing table raises `ValueError`; empty table raises `ValueError` 7. **Stream behaviour** — `iter_data` count, `as_table`, `output_schema`, `producer is None`, `upstreams == ()` 8. **Deterministic hashing** — `pipeline_hash` and `content_hash` stable across two identical constructions (both in-memory) -9. **Config round-trip (PK table)** — uses file-backed `tmp_path` SQLite db; `to_config()` has `source_type="sqlite_table"`, `db_path`, `table_name`, `tag_columns`; `from_config(to_config())` reconstructs successfully; content/pipeline hashes match before and after -10. **Config round-trip (ROWID-only table)** — same as above but with a ROWID-only table; `tag_columns=["rowid"]` in config; `from_config(to_config())` reconstructs correctly and `rowid` remains the tag column +9. **Config round-trip (PK table)** — uses file-backed `tmp_path` SQLite db; `to_config()` has `source_type="sqlite_table"`, `db_path`, `table_name`, `key_columns`; `from_config(to_config())` reconstructs successfully; content/pipeline hashes match before and after +10. 
**Config round-trip (ROWID-only table)** — same as above but with a ROWID-only table; `key_columns=["rowid"]` in config; `from_config(to_config())` reconstructs correctly and `rowid` remains the key column ### Integration test — same file, marked `@pytest.mark.integration` -Write rows into an in-memory SQLite table via `SQLiteConnector`, wrap with `SQLiteTableSource`, feed through a `FunctionPod`, collect output — verify tag columns flow through and pipeline completes. +Write rows into an in-memory SQLite table via `SQLiteConnector`, wrap with `SQLiteTableSource`, feed through a `FunctionPod`, collect output — verify key columns flow through and pipeline completes. ### Regression tests diff --git a/docs/getting-started.md b/docs/getting-started.md index 8a8bd696..410334f3 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -18,14 +18,14 @@ source = DictSource( {"experiment": "exp_002", "temperature": 22.3, "pressure": 0.98}, {"experiment": "exp_003", "temperature": 19.8, "pressure": 1.05}, ], - tag_columns=["experiment"], + key_columns=["experiment"], source_id="lab_results", ) ``` There are two important concepts here: -- **Tag columns** (`tag_columns`) are the keys that identify each row -- like primary keys +- **Key columns** (`key_columns`) are the keys that identify each row -- like primary keys in a database or independent variables in an experiment. Here, `experiment` uniquely identifies each measurement. - **Data columns** are everything else -- the actual data payload. In this example, @@ -45,11 +45,11 @@ conversion step. 
### Schema -Use `output_schema()` to see the tag and data column types: +Use `output_schema()` to see the key and data column types: ```python -tag_schema, data_schema = source.output_schema() -print(tag_schema) +key_schema, data_schema = source.output_schema() +print(key_schema) # Schema({'experiment': }) print(data_schema) # Schema({'temperature': , 'pressure': }) @@ -60,8 +60,8 @@ print(data_schema) Use `keys()` to get just the column names: ```python -tag_keys, data_keys = source.keys() -print(tag_keys) +key_keys, data_keys = source.keys() +print(key_keys) # ('experiment',) print(data_keys) # ('temperature', 'pressure') @@ -69,14 +69,14 @@ print(data_keys) ### Iterating over rows -Use `iter_data()` to walk through each (Tag, Data) pair: +Use `iter_data()` to walk through each (Key, Data) pair: ```python -for tag, data in source.iter_data(): - print(f"Tag: {tag.as_dict()}, Data: {data.as_dict()}") -# Tag: {'experiment': 'exp_001'}, Data: {'temperature': 20.5, 'pressure': 1.01} -# Tag: {'experiment': 'exp_002'}, Data: {'temperature': 22.3, 'pressure': 0.98} -# Tag: {'experiment': 'exp_003'}, Data: {'temperature': 19.8, 'pressure': 1.05} +for key, data in source.iter_data(): + print(f"Key: {key.as_dict()}, Data: {data.as_dict()}") +# Key: {'experiment': 'exp_001'}, Data: {'temperature': 20.5, 'pressure': 1.01} +# Key: {'experiment': 'exp_002'}, Data: {'temperature': 22.3, 'pressure': 0.98} +# Key: {'experiment': 'exp_003'}, Data: {'temperature': 19.8, 'pressure': 1.05} ``` ### Getting the full table @@ -128,7 +128,7 @@ result = analyze_conditions.pod(source) All standard pods support `__call__` as a shorthand for `.process()`, so `pod(stream)` is equivalent to `pod.process(stream)`. -The `result` is a new stream. Tags are preserved from the input; the data columns +The `result` is a new stream. Keys are preserved from the input; the data columns are replaced with the function's outputs. 
## Inspecting the result @@ -136,24 +136,24 @@ are replaced with the function's outputs. The result stream supports the same inspection methods as the source: ```python -tag_schema, data_schema = result.output_schema() -print(tag_schema) +key_schema, data_schema = result.output_schema() +print(key_schema) # Schema({'experiment': }) print(data_schema) # Schema({'temp_fahrenheit': , 'is_high_pressure': }) ``` -The tag schema is unchanged -- function pods never modify tags. The data schema +The key schema is unchanged -- function pods never modify keys. The data schema now reflects the function's output types. Iterate over the results: ```python -for tag, data in result.iter_data(): - print(f"Tag: {tag.as_dict()}, Data: {data.as_dict()}") -# Tag: {'experiment': 'exp_001'}, Data: {'temp_fahrenheit': 68.9, 'is_high_pressure': True} -# Tag: {'experiment': 'exp_002'}, Data: {'temp_fahrenheit': 72.14, 'is_high_pressure': False} -# Tag: {'experiment': 'exp_003'}, Data: {'temp_fahrenheit': 67.64, 'is_high_pressure': True} +for key, data in result.iter_data(): + print(f"Key: {key.as_dict()}, Data: {data.as_dict()}") +# Key: {'experiment': 'exp_001'}, Data: {'temp_fahrenheit': 68.9, 'is_high_pressure': True} +# Key: {'experiment': 'exp_002'}, Data: {'temp_fahrenheit': 72.14, 'is_high_pressure': False} +# Key: {'experiment': 'exp_003'}, Data: {'temp_fahrenheit': 67.64, 'is_high_pressure': True} ``` Or view it as a table: @@ -181,7 +181,7 @@ source = DictSource( {"experiment": "exp_002", "temperature": 22.3, "pressure": 0.98}, {"experiment": "exp_003", "temperature": 19.8, "pressure": 1.05}, ], - tag_columns=["experiment"], + key_columns=["experiment"], source_id="lab_results", ) @@ -207,10 +207,10 @@ Now that you have the basics, explore these topics: - [Sources](concepts/sources.md) -- learn about the different source types and how provenance tracking works. -- [Streams](concepts/streams.md) -- understand the immutable (Tag, Data) stream model. 
+- [Streams](concepts/streams.md) -- understand the immutable (Key, Data) stream model. - [Function Pods](concepts/function-pods.md) -- advanced function pod usage, including caching with databases. - [Operators](concepts/operators.md) -- structural transforms like Join, Batch, and Filter - that work on tags and stream structure without inspecting data content. + that work on keys and stream structure without inspecting data content. - [Identity & Hashing](concepts/identity.md) -- how Orcapod tracks content identity and pipeline structure for reproducibility. diff --git a/docs/index.md b/docs/index.md index 20204090..6f5bbea9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,7 +34,7 @@ source = DictSource( {"name": "Bob", "age": 25}, {"name": "Charlie", "age": 35}, ], - tag_columns=["name"], + key_columns=["name"], source_id="people", ) @@ -45,8 +45,8 @@ def compute_birth_year(age: int) -> int: # 3. Apply the function pod and inspect the output result = compute_birth_year.pod(source) -for tag, data in result.iter_data(): - print(f"{tag.as_dict()} -> {data.as_dict()}") +for key, data in result.iter_data(): + print(f"{key.as_dict()} -> {data.as_dict()}") # {'name': 'Alice'} -> {'birth_year': 1996} # {'name': 'Bob'} -> {'birth_year': 2001} # {'name': 'Charlie'} -> {'birth_year': 1991} diff --git a/docs/plans/2026-03-26-postgresql-table-source.md b/docs/plans/2026-03-26-postgresql-table-source.md index 5291990b..4e59f9dd 100644 --- a/docs/plans/2026-03-26-postgresql-table-source.md +++ b/docs/plans/2026-03-26-postgresql-table-source.md @@ -2,7 +2,7 @@ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Implement `PostgreSQLTableSource`, a read-only OrcaPod `Source` backed by a PostgreSQL table, using PK columns as default tag columns. 
+**Goal:** Implement `PostgreSQLTableSource`, a read-only OrcaPod `Source` backed by a PostgreSQL table, using PK columns as default key columns. **Architecture:** Thin subclass of `DBTableSource` (which handles all source logic). `PostgreSQLTableSource.__init__` stores the DSN, creates a `PostgreSQLConnector`, delegates entirely to `DBTableSource.__init__` (which eagerly loads all data into memory), then closes the connector. No ROWID fallback needed — PostgreSQL PKs are always `NOT NULL`. @@ -81,7 +81,7 @@ Create `src/orcapod/core/sources/postgresql_table_source.py`: """PostgreSQLTableSource — a read-only RootSource backed by a PostgreSQL table. Wraps a PostgreSQL table as an OrcaPod Source. Primary-key columns are used -as tag columns by default. +as key columns by default. Example:: @@ -109,13 +109,13 @@ class PostgreSQLTableSource(DBTableSource): 1. Stores the DSN for serialisation. 2. Opens a ``PostgreSQLConnector`` for *dsn*. 3. Delegates to ``DBTableSource.__init__``, which validates the table, - resolves tag columns (defaults to PK columns), fetches all rows as + resolves key columns (defaults to PK columns), fetches all rows as Arrow batches, and builds the stream. 4. Closes the connector — all data is eagerly loaded into memory, so the connection is released immediately. - PostgreSQL PK columns are always ``NOT NULL``, so NULL tag values can - only arise when *tag_columns* is overridden to point at a nullable + PostgreSQL PK columns are always ``NOT NULL``, so NULL key values can + only arise when *key_columns* is overridden to point at a nullable column. Such NULLs are passed through as-is (Arrow supports nulls). Args: @@ -123,10 +123,10 @@ class PostgreSQLTableSource(DBTableSource): URI form: ``"postgresql://user:pass@host:5432/dbname"`` Keyword form: ``"host=localhost dbname=mydb user=alice"`` table_name: Name of the table to expose as a source. - tag_columns: Columns to use as tag columns. 
If ``None`` (default), + key_columns: Columns to use as key columns. If ``None`` (default), the table's primary-key columns are used. Raises ``ValueError`` if the table has no primary key and no explicit columns are given. - system_tag_columns: Additional system-level tag columns. + system_key_columns: Additional system-level key columns. record_id_column: Column for stable per-row record IDs in provenance. source_id: Canonical source name. Defaults to *table_name*. label: Human-readable label for this source node. @@ -135,7 +135,7 @@ class PostgreSQLTableSource(DBTableSource): Raises: ValueError: If the table is not found, is empty, or has no PK and - no *tag_columns* are given. + no *key_columns* are given. psycopg.OperationalError: If the DSN is invalid or connection fails. """ @@ -143,8 +143,8 @@ class PostgreSQLTableSource(DBTableSource): self, dsn: str, table_name: str, - tag_columns: Collection[str] | None = None, - system_tag_columns: Collection[str] = (), + key_columns: Collection[str] | None = None, + system_key_columns: Collection[str] = (), record_id_column: str | None = None, source_id: str | None = None, label: str | None = None, @@ -194,7 +194,7 @@ git commit -m "feat(sources): stub PostgreSQLTableSource with import/export wiri --- -## Task 2: Core `__init__` — construction, PK tags, error cases +## Task 2: Core `__init__` — construction, PK keys, error cases Implement the actual `__init__` and verify the main behaviours. @@ -297,19 +297,19 @@ class TestProtocolConformance: # =========================================================================== -# 3. PK as default tag columns +# 3. 
PK as default key columns # =========================================================================== -class TestPKAsDefaultTags: - def test_single_pk_is_tag_column(self): +class TestPKAsDefaultKeys: + def test_single_pk_is_key_column(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) as mock_cls: mock_cls.return_value = _make_mock_connector() src = PostgreSQLTableSource(DSN, "measurements") - tag_schema, _ = src.output_schema() - assert "session_id" in tag_schema + key_schema, _ = src.output_schema() + assert "session_id" in key_schema def test_pk_not_in_data_schema(self): from orcapod.core.sources import PostgreSQLTableSource @@ -330,7 +330,7 @@ class TestPKAsDefaultTags: assert "trial" in data_schema assert "response" in data_schema - def test_composite_pk_all_columns_are_tags(self): + def test_composite_pk_all_columns_are_keys(self): from orcapod.core.sources import PostgreSQLTableSource schema = pa.schema([ @@ -353,9 +353,9 @@ class TestPKAsDefaultTags: batches=[batch], ) src = PostgreSQLTableSource(DSN, "events") - tag_schema, _ = src.output_schema() - assert "user_id" in tag_schema - assert "event_id" in tag_schema + key_schema, _ = src.output_schema() + assert "user_id" in key_schema + assert "event_id" in key_schema def test_default_source_id_is_table_name(self): from orcapod.core.sources import PostgreSQLTableSource @@ -375,32 +375,32 @@ class TestPKAsDefaultTags: # =========================================================================== -# 4. Explicit tag_columns override +# 4. 
Explicit key_columns override # =========================================================================== -class TestExplicitTagOverride: - def test_explicit_tag_columns_override_pk(self): +class TestExplicitKeyOverride: + def test_explicit_key_columns_override_pk(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) as mock_cls: mock_cls.return_value = _make_mock_connector() - src = PostgreSQLTableSource(DSN, "measurements", tag_columns=["trial"]) - tag_schema, _ = src.output_schema() - assert "trial" in tag_schema - assert "session_id" not in tag_schema + src = PostgreSQLTableSource(DSN, "measurements", key_columns=["trial"]) + key_schema, _ = src.output_schema() + assert "trial" in key_schema + assert "session_id" not in key_schema - def test_multiple_explicit_tag_columns(self): + def test_multiple_explicit_key_columns(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) as mock_cls: mock_cls.return_value = _make_mock_connector() src = PostgreSQLTableSource( - DSN, "measurements", tag_columns=["session_id", "trial"] + DSN, "measurements", key_columns=["session_id", "trial"] ) - tag_schema, _ = src.output_schema() - assert "session_id" in tag_schema - assert "trial" in tag_schema + key_schema, _ = src.output_schema() + assert "session_id" in key_schema + assert "trial" in key_schema # =========================================================================== @@ -409,7 +409,7 @@ class TestExplicitTagOverride: class TestNoPKError: - def test_no_pk_and_no_tag_columns_raises(self): + def test_no_pk_and_no_key_columns_raises(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) as mock_cls: @@ -471,14 +471,14 @@ class TestStreamBehaviour: src = PostgreSQLTableSource(DSN, "measurements") assert len(list(src.iter_data())) == 3 - def test_iter_data_tags_contain_pk(self): + def test_iter_data_keys_contain_pk(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) 
as mock_cls: mock_cls.return_value = _make_mock_connector() src = PostgreSQLTableSource(DSN, "measurements") - for tags, _ in src.iter_data(): - assert "session_id" in tags + for keys, _ in src.iter_data(): + assert "session_id" in keys def test_output_schema_returns_two_schemas(self): from orcapod.core.sources import PostgreSQLTableSource @@ -533,7 +533,7 @@ class TestDeterministicHashing: src2 = PostgreSQLTableSource(DSN, "measurements") assert src1.content_hash() == src2.content_hash() - def test_different_tag_columns_yields_different_pipeline_hash(self): + def test_different_key_columns_yields_different_pipeline_hash(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) as mock_cls: @@ -541,7 +541,7 @@ class TestDeterministicHashing: src1 = PostgreSQLTableSource(DSN, "measurements") with patch(_PATCH) as mock_cls: mock_cls.return_value = _make_mock_connector() - src2 = PostgreSQLTableSource(DSN, "measurements", tag_columns=["trial"]) + src2 = PostgreSQLTableSource(DSN, "measurements", key_columns=["trial"]) assert src1.pipeline_hash() != src2.pipeline_hash() ``` @@ -562,8 +562,8 @@ Replace the `__init__` stub in `src/orcapod/core/sources/postgresql_table_source self, dsn: str, table_name: str, - tag_columns: Collection[str] | None = None, - system_tag_columns: Collection[str] = (), + key_columns: Collection[str] | None = None, + system_key_columns: Collection[str] = (), record_id_column: str | None = None, source_id: str | None = None, label: str | None = None, @@ -576,8 +576,8 @@ Replace the `__init__` stub in `src/orcapod/core/sources/postgresql_table_source super().__init__( connector, table_name, - tag_columns=tag_columns, - system_tag_columns=system_tag_columns, + key_columns=key_columns, + system_key_columns=system_key_columns, record_id_column=record_id_column, source_id=source_id, label=label, @@ -627,7 +627,7 @@ Expected: PASS. 
```bash git add src/orcapod/core/sources/postgresql_table_source.py \ tests/test_core/sources/test_postgresql_table_source.py -git commit -m "feat(sources): implement PostgreSQLTableSource.__init__ with PK tag resolution (PLT-1072)" +git commit -m "feat(sources): implement PostgreSQLTableSource.__init__ with PK key resolution (PLT-1072)" ``` --- @@ -667,8 +667,8 @@ class TestToConfig: def test_has_table_name(self): assert self._make_src().to_config()["table_name"] == "measurements" - def test_has_tag_columns(self): - assert "session_id" in self._make_src().to_config()["tag_columns"] + def test_has_key_columns(self): + assert "session_id" in self._make_src().to_config()["key_columns"] def test_has_source_id(self): assert self._make_src().to_config()["source_id"] == "measurements" @@ -721,18 +721,18 @@ class TestFromConfig: assert src2.content_hash() == src.content_hash() assert src2.pipeline_hash() == src.pipeline_hash() - def test_from_config_with_explicit_tag_columns(self): + def test_from_config_with_explicit_key_columns(self): from orcapod.core.sources import PostgreSQLTableSource with patch(_PATCH) as mock_cls: mock_cls.return_value = _make_mock_connector() - src = PostgreSQLTableSource(DSN, "measurements", tag_columns=["trial"]) + src = PostgreSQLTableSource(DSN, "measurements", key_columns=["trial"]) config = src.to_config() with patch(_PATCH) as mock_cls: mock_cls.return_value = _make_mock_connector() src2 = PostgreSQLTableSource.from_config(config) - tag_schema, _ = src2.output_schema() - assert "trial" in tag_schema + key_schema, _ = src2.output_schema() + assert "trial" in key_schema def test_from_config_missing_dsn_raises(self): from orcapod.core.sources import PostgreSQLTableSource @@ -774,8 +774,8 @@ Replace the stubs in `src/orcapod/core/sources/postgresql_table_source.py`: return cls( dsn=config["dsn"], table_name=config["table_name"], - tag_columns=config.get("tag_columns"), - system_tag_columns=config.get("system_tag_columns", ()), + 
key_columns=config.get("key_columns"), + system_key_columns=config.get("system_key_columns", ()), record_id_column=config.get("record_id_column"), source_id=config.get("source_id"), label=config.get("label"), @@ -993,7 +993,7 @@ def schema_dsn(pg_schema: str) -> str: class TestSinglePKTable: """Source backed by a table with a single-column PK.""" - def test_pk_column_is_tag(self, schema_dsn: str) -> None: + def test_pk_column_is_key(self, schema_dsn: str) -> None: from orcapod.core.sources import PostgreSQLTableSource with psycopg.connect(schema_dsn) as conn: @@ -1009,8 +1009,8 @@ class TestSinglePKTable: conn.commit() src = PostgreSQLTableSource(schema_dsn, "measurements") - tag_schema, _ = src.output_schema() - assert "session_id" in tag_schema + key_schema, _ = src.output_schema() + assert "session_id" in key_schema def test_non_pk_columns_in_data_schema(self, schema_dsn: str) -> None: from orcapod.core.sources import PostgreSQLTableSource @@ -1050,7 +1050,7 @@ class TestSinglePKTable: src = PostgreSQLTableSource(schema_dsn, "measurements") assert len(list(src.iter_data())) == 3 - def test_tag_values_are_correct(self, schema_dsn: str) -> None: + def test_key_values_are_correct(self, schema_dsn: str) -> None: from orcapod.core.sources import PostgreSQLTableSource with psycopg.connect(schema_dsn) as conn: @@ -1066,15 +1066,15 @@ class TestSinglePKTable: conn.commit() src = PostgreSQLTableSource(schema_dsn, "measurements") - tag_values = sorted([tags["session_id"] for tags, _ in src.iter_data()]) - assert tag_values == ["s1", "s2", "s3"] + key_values = sorted([keys["session_id"] for keys, _ in src.iter_data()]) + assert key_values == ["s1", "s2", "s3"] @pytest.mark.postgres class TestCompositePKTable: """Source backed by a table with a composite PK.""" - def test_both_pk_columns_are_tags(self, schema_dsn: str) -> None: + def test_both_pk_columns_are_keys(self, schema_dsn: str) -> None: from orcapod.core.sources import PostgreSQLTableSource with 
psycopg.connect(schema_dsn) as conn: @@ -1091,16 +1091,16 @@ class TestCompositePKTable: conn.commit() src = PostgreSQLTableSource(schema_dsn, "events") - tag_schema, _ = src.output_schema() - assert "user_id" in tag_schema - assert "event_id" in tag_schema + key_schema, _ = src.output_schema() + assert "user_id" in key_schema + assert "event_id" in key_schema @pytest.mark.postgres -class TestExplicitTagOverride: - """tag_columns override overrides the PK.""" +class TestExplicitKeyOverride: + """key_columns override overrides the PK.""" - def test_explicit_tag_columns_override_pk(self, schema_dsn: str) -> None: + def test_explicit_key_columns_override_pk(self, schema_dsn: str) -> None: from orcapod.core.sources import PostgreSQLTableSource with psycopg.connect(schema_dsn) as conn: @@ -1116,11 +1116,11 @@ class TestExplicitTagOverride: conn.commit() src = PostgreSQLTableSource( - schema_dsn, "measurements", tag_columns=["trial"] + schema_dsn, "measurements", key_columns=["trial"] ) - tag_schema, _ = src.output_schema() - assert "trial" in tag_schema - assert "session_id" not in tag_schema + key_schema, _ = src.output_schema() + assert "trial" in key_schema + assert "session_id" not in key_schema @pytest.mark.postgres @@ -1173,8 +1173,8 @@ class TestPipelineIntegration: doubled_values = sorted([pkt.as_dict()["doubled"] for _, pkt in fn_outputs[0]]) assert doubled_values == pytest.approx([0.2, 0.4, 0.6]) - tag_values = sorted([tags["session_id"] for tags, _ in fn_outputs[0]]) - assert tag_values == ["s1", "s2", "s3"] + key_values = sorted([keys["session_id"] for keys, _ in fn_outputs[0]]) + assert key_values == ["s1", "s2", "s3"] ``` - [ ] **Step 5.2: Verify the integration test file is syntactically correct (dry run)** @@ -1235,7 +1235,7 @@ gh pr create \ ## Summary - Implements `PostgreSQLTableSource` as a thin subclass of `DBTableSource` -- PK columns used as default tag columns; explicit `tag_columns` override supported +- PK columns used as default key columns; 
explicit `key_columns` override supported - Connector opened and closed eagerly at construction time (all data loaded into memory) - `to_config` / `from_config` round-trip serialisation - Registered in `_build_source_registry()` under `"postgresql_table"` diff --git a/docs/plans/2026-03-27-non-active-node-semantics.md b/docs/plans/2026-03-27-non-active-node-semantics.md index d8105eb7..2e5e0ad9 100644 --- a/docs/plans/2026-03-27-non-active-node-semantics.md +++ b/docs/plans/2026-03-27-non-active-node-semantics.md @@ -72,7 +72,7 @@ from orcapod.types import CacheMode @pytest.fixture def simple_source() -> ArrowTableStream: - """Single-tag stream: id (tag), x (data), 3 rows.""" + """Single-key stream: id (key), x (data), 3 rows.""" return ArrowTableStream( pa.table( { @@ -80,7 +80,7 @@ def simple_source() -> ArrowTableStream: "x": pa.array([10, 20, 30], type=pa.int64()), } ), - tag_columns=["id"], + key_columns=["id"], ) @@ -189,10 +189,10 @@ def _make_empty_table(self) -> "pa.Table": Requires ``self._operator is not None`` (pre-existing limitation shared with ``_replay_from_cache``). 
""" - tag_schema, data_schema = self.output_schema() + key_schema, data_schema = self.output_schema() type_converter = self.data_context.type_converter empty_fields: dict = {} - for name, py_type in {**tag_schema, **data_schema}.items(): + for name, py_type in {**key_schema, **data_schema}.items(): arrow_type = type_converter.python_type_to_arrow_type(py_type) empty_fields[name] = pa.array([], type=arrow_type) return pa.table(empty_fields) @@ -213,8 +213,8 @@ def _replay_from_cache(self) -> None: if records is None: records = self._make_empty_table() - tag_keys = self.keys()[0] - self._cached_output_stream = ArrowTableStream(records, tag_columns=tag_keys) + key_keys = self.keys()[0] + self._cached_output_stream = ArrowTableStream(records, key_columns=key_keys) self._update_modified_time() ``` @@ -271,8 +271,8 @@ def _load_cached_stream_from_db(self) -> "ArrowTableStream | None": records_table = self._make_empty_table() else: records_table = records - tag_keys = self.keys()[0] - return ArrowTableStream(records_table, tag_columns=tag_keys) + key_keys = self.keys()[0] + return ArrowTableStream(records_table, key_columns=key_keys) ``` - [ ] **Step 3.2: Run the existing test suite — no regressions** @@ -306,8 +306,8 @@ This is the core fix. Both methods stop calling `self.run()` and instead do a 3- Replace lines 555–558 (current `iter_data()`): ```python -def iter_data(self) -> Iterator[tuple[TagProtocol, DataProtocol]]: - """Return an iterator over (tag, data) pairs. +def iter_data(self) -> Iterator[tuple[KeyProtocol, DataProtocol]]: + """Return an iterator over (key, data) pairs. Read-only: never triggers computation. Returns empty before ``run()`` or ``execute()`` populates the cache. 
Call ``node.is_stale`` before @@ -368,7 +368,7 @@ def test_iter_data(self, simple_stream, db): node.run() # <-- add this line data = list(node.iter_data()) assert len(data) == 3 - for tag, data in data: + for key, data in data: assert "renamed_x" in data.keys() def test_as_table(self, simple_stream, db): @@ -586,9 +586,9 @@ Replace the body of the `flow()` docstring: ```python def flow( self, -) -> Collection[tuple[TagProtocol, DataProtocol]]: +) -> Collection[tuple[KeyProtocol, DataProtocol]]: """ - Returns the entire collection of (TagProtocol, DataProtocol) as a list. + Returns the entire collection of (KeyProtocol, DataProtocol) as a list. This is a read-only operation — results reflect whatever has been computed by a prior ``run()`` or ``execute()`` call. If no computation has been performed, returns an empty list. @@ -634,7 +634,7 @@ from orcapod.core.streams.arrow_table_stream import ArrowTableStream src = ArrowTableStream( pa.table({"id": [1, 2], "x": [10, 20]}), - tag_columns=["id"], + key_columns=["id"], ) op_a = MapData({"x": "y"}) op_b = MapData({"y": "z"}) diff --git a/docs/specs/2026-03-26-postgresql-table-source-design.md b/docs/specs/2026-03-26-postgresql-table-source-design.md index eb9a58c1..0a2fcc78 100644 --- a/docs/specs/2026-03-26-postgresql-table-source-design.md +++ b/docs/specs/2026-03-26-postgresql-table-source-design.md @@ -1,14 +1,14 @@ # Design: PostgreSQLTableSource (PLT-1072) **Date:** 2026-03-26 -**Issue:** [PLT-1072](https://linear.app/enigma-metamorphic/issue/PLT-1072/implement-source-based-on-postgresql-tables-with-pk-as-default-tag) +**Issue:** [PLT-1072](https://linear.app/enigma-metamorphic/issue/PLT-1072/implement-source-based-on-postgresql-tables-with-pk-as-default-key) **Status:** Approved --- ## Summary -Implement `PostgreSQLTableSource`, a read-only OrcaPod `Source` backed by a PostgreSQL table. The table's primary key columns serve as default tag columns. Follows the same pattern as the already-merged `SQLiteTableSource`. 
+Implement `PostgreSQLTableSource`, a read-only OrcaPod `Source` backed by a PostgreSQL table. The table's primary key columns serve as default key columns. Follows the same pattern as the already-merged `SQLiteTableSource`. --- @@ -24,7 +24,7 @@ PostgreSQL is a primary production database at Metamorphic. Exposing tables as O `PostgreSQLTableSource` is a minimal subclass of `DBTableSource`. All source logic (PK resolution, eager loading, Arrow conversion, stream building) lives in `DBTableSource`. This class only handles PostgreSQL-specific initialization: accept a DSN string, create a `PostgreSQLConnector`, delegate to the base class, then close the connector. -This mirrors `SQLiteTableSource` exactly — the only meaningful difference is that PostgreSQL has no ROWID fallback, so a table with no PK and no explicit `tag_columns` raises `ValueError` (already the default `DBTableSource` behaviour). +This mirrors `SQLiteTableSource` exactly — the only meaningful difference is that PostgreSQL has no ROWID fallback, so a table with no PK and no explicit `key_columns` raises `ValueError` (already the default `DBTableSource` behaviour). --- @@ -42,8 +42,8 @@ class PostgreSQLTableSource(DBTableSource): self, dsn: str, table_name: str, - tag_columns: Collection[str] | None = None, - system_tag_columns: Collection[str] = (), + key_columns: Collection[str] | None = None, + system_key_columns: Collection[str] = (), record_id_column: str | None = None, source_id: str | None = None, label: str | None = None, @@ -75,7 +75,7 @@ The `finally` block wraps the `close()` call in a `try/except Exception: pass` t self._dsn = dsn connector = PostgreSQLConnector(dsn) # outside try — must succeed before entering try try: - super().__init__(connector, table_name, tag_columns=tag_columns, ...) + super().__init__(connector, table_name, key_columns=key_columns, ...) 
finally: try: connector.close() @@ -91,10 +91,10 @@ PostgreSQLTableSource.__init__(dsn, table_name) → DBTableSource.__init__(connector, ...) # full source initialisation [inside DBTableSource]: → connector.get_table_names() # validate table exists - → connector.get_pk_columns(table) # resolve default tag columns (if tag_columns=None) + → connector.get_pk_columns(table) # resolve default key columns (if key_columns=None) → connector.iter_batches(SELECT * FROM "table") → pa.Table.from_batches(...) # assemble Arrow table - → SourceStreamBuilder.build(...) # attach tags, source-info, schema hash + → SourceStreamBuilder.build(...) # attach keys, source-info, schema hash → [finally] try: connector.close() except Exception: pass ``` @@ -111,13 +111,13 @@ After construction the source holds all data in-memory as an `ArrowTableStream`. "source_type": "postgresql_table", "dsn": "", "table_name": "", - "tag_columns": [...], - "system_tag_columns": [...], + "key_columns": [...], + "system_key_columns": [...], "record_id_column": ..., "source_id": ..., "content_hash": ..., "pipeline_hash": ..., - "tag_schema": {...}, + "key_schema": {...}, "data_schema": {...}, } ``` @@ -143,8 +143,8 @@ Note: stripping `"connector"` is both a schema concern (callers should not see t |---|---| | `dsn` | `"dsn"` (required) | | `table_name` | `"table_name"` (required) | -| `tag_columns` | `"tag_columns"` | -| `system_tag_columns` | `"system_tag_columns"` (default `()`) | +| `key_columns` | `"key_columns"` | +| `system_key_columns` | `"system_key_columns"` (default `()`) | | `record_id_column` | `"record_id_column"` | | `source_id` | `"source_id"` | | `label` | `"label"` | @@ -170,8 +170,8 @@ Three places to update, identical to the `SQLiteTableSource` rollout: |---|---| | Table does not exist | `ValueError: Table 'x' not found in database.` (from `DBTableSource`) | | Table is empty | `ValueError: Table 'x' is empty.` (from `DBTableSource`) | -| No PK and no `tag_columns` given | `ValueError: Table 
'x' has no primary key columns. Provide explicit tag_columns.` (from `DBTableSource`) | -| NULL values in tag columns | Passed through as-is — Arrow supports nulls natively; PostgreSQL PK columns are always `NOT NULL` so this can only arise with an explicit `tag_columns` override | +| No PK and no `key_columns` given | `ValueError: Table 'x' has no primary key columns. Provide explicit key_columns.` (from `DBTableSource`) | +| NULL values in key columns | Passed through as-is — Arrow supports nulls natively; PostgreSQL PK columns are always `NOT NULL` so this can only arise with an explicit `key_columns` override | | Connection failure | `psycopg` exception propagates naturally | | Missing `"dsn"` key in `from_config` | `KeyError` from the `from_config` body | @@ -187,13 +187,13 @@ Uses `unittest.mock.patch("psycopg.connect")` throughout, with mock cursors retu 1. Import / export sanity (`from orcapod.core.sources import PostgreSQLTableSource`, present in `__all__`, importable from `orcapod.sources`) 2. Protocol conformance (`SourceProtocol`, `StreamProtocol`, `PipelineElementProtocol`) -3. PK as default tag columns — single PK, composite PK -4. Explicit `tag_columns` override +3. PK as default key columns — single PK, composite PK +4. Explicit `key_columns` override 5. No-PK table raises `ValueError` 6. Missing / empty table raises `ValueError` 7. Stream behaviour (`iter_data`, `output_schema`, `as_table`, `producer`, `upstreams`) 8. Deterministic hashing (`pipeline_hash`, `content_hash`) -9. `to_config` shape — has `source_type`, `dsn`, `table_name`, `tag_columns`, `source_id`, `content_hash`, `pipeline_hash`; does **not** have `connector` key or `label` key +9. `to_config` shape — has `source_type`, `dsn`, `table_name`, `key_columns`, `source_id`, `content_hash`, `pipeline_hash`; does **not** have `connector` key or `label` key 10. `from_config` round-trip (reconstructs with matching hashes) 11. 
`resolve_source_from_config` dispatches to `PostgreSQLTableSource` @@ -203,10 +203,10 @@ Uses `unittest.mock.patch("psycopg.connect")` throughout, with mock cursors retu **Marker:** `@pytest.mark.postgres` **Fixture:** per-test schema isolation (reuse pattern from `test_postgresql_connector_integration.py`) -- Single-PK table: source yields correct data, tag column in tag schema -- Composite-PK table: both PK columns in tag schema -- Explicit `tag_columns` override: overrides PKs correctly -- Pipeline integration: `PostgreSQLTableSource` drives a full pipeline end-to-end, tag values and data values are correct +- Single-PK table: source yields correct data, key column in key schema +- Composite-PK table: both PK columns in key schema +- Explicit `key_columns` override: overrides PKs correctly +- Pipeline integration: `PostgreSQLTableSource` drives a full pipeline end-to-end, key values and data values are correct --- diff --git a/examples/async_vs_sync_pipeline.py b/examples/async_vs_sync_pipeline.py index 28839baa..98bdbc73 100644 --- a/examples/async_vs_sync_pipeline.py +++ b/examples/async_vs_sync_pipeline.py @@ -101,7 +101,7 @@ def build_pipeline(use_async_fn: bool) -> Pipeline: fn = async_slow_double if use_async_fn else sync_slow_double with pipeline: - source = ArrowTableSource(SOURCE_TABLE, tag_columns=["id"]) + source = ArrowTableSource(SOURCE_TABLE, key_columns=["id"]) pf_a = PythonDataFunction(fn, output_keys="result", function_name="branch_a") pf_b = PythonDataFunction(fn, output_keys="result", function_name="branch_b") FunctionPod( diff --git a/examples/save_and_load_pipelines.py b/examples/save_and_load_pipelines.py index 66051d27..0d47e4ef 100644 --- a/examples/save_and_load_pipelines.py +++ b/examples/save_and_load_pipelines.py @@ -4,13 +4,13 @@ database = databases.DeltaTableDatabase("./local_database") source1 = sources.DictSource( [{"id": 0, "x": 5}, {"id": 1, "x": 10}, {"id": 2, "x": 15}], - tag_columns=["id"], + key_columns=["id"], 
label="source1", ) source1 = source1.cached(database) source2 = sources.DictSource( [{"id": 0, "y": 3}, {"id": 2, "y": 6}, {"id": 4, "y": 9}], - tag_columns=["id"], + key_columns=["id"], label="source2", ) source2 = source2.cached(database) diff --git a/function-execution-improvements-plan.md b/function-execution-improvements-plan.md index 9137f357..69a727c4 100644 --- a/function-execution-improvements-plan.md +++ b/function-execution-improvements-plan.md @@ -77,23 +77,23 @@ executor directly on the data function. The node never sees raw option dicts. ### Current state Caching exists only at the data-function level (`CachedDataFunction`), which wraps -`call()` / `async_call()` with DB lookup/insert. This works but cannot leverage tag +`call()` / `async_call()` with DB lookup/insert. This works but cannot leverage key information (which is invisible to data functions). ### Design decision Add a **`CachedFunctionPod`** that wraps a `FunctionPod` and intercepts at the -`process_data(tag, data)` level. This complements `CachedDataFunction`: +`process_data(key, data)` level. This complements `CachedDataFunction`: | Layer | `CachedDataFunction` | `CachedFunctionPod` | |-------|------------------------|---------------------| -| Intercepts at | `call(data)` | `process_data(tag, data)` | -| Has tag access | No | Yes | -| Cache key includes | Data content hash | Tag + data content hash | +| Intercepts at | `call(data)` | `process_data(key, data)` | +| Has key access | No | Yes | +| Cache key includes | Data content hash | Key + data content hash | | Delegates to | Wrapped `DataFunction.call()` | Inner `FunctionPod.process_data()` | Both are useful: `CachedDataFunction` deduplicates purely on data content; -`CachedFunctionPod` can incorporate tag metadata into cache decisions. +`CachedFunctionPod` can incorporate key metadata into cache decisions. 
### Implementation sketch @@ -113,17 +113,17 @@ class CachedFunctionPod(WrappedFunctionPod): self._record_path_prefix = record_path_prefix def process_data( - self, tag: TagProtocol, data: DataProtocol - ) -> tuple[TagProtocol, DataProtocol | None]: - # Cache key incorporates both tag and data content - cache_key = self._compute_cache_key(tag, data) + self, key: KeyProtocol, data: DataProtocol + ) -> tuple[KeyProtocol, DataProtocol | None]: + # Cache key incorporates both key and data content + cache_key = self._compute_cache_key(key, data) cached = self._lookup(cache_key) if cached is not None: - return tag, cached - tag, output = self._function_pod.process_data(tag, data) + return key, cached + key, output = self._function_pod.process_data(key, data) if output is not None: - self._store(cache_key, tag, output) - return tag, output + self._store(cache_key, key, output) + return key, output ``` ### Changes @@ -276,7 +276,7 @@ No code changes needed — this is a documentation/convention clarification. ### Phase 3: `CachedFunctionPod` 1. Create `src/orcapod/core/cached_function_pod.py`. -2. Implement `CachedFunctionPod(WrappedFunctionPod)` with tag-aware cache key computation. +2. Implement `CachedFunctionPod(WrappedFunctionPod)` with key-aware cache key computation. 3. Add `pod_cache_database` parameter to `function_pod` decorator. 4. Add tests for pod-level vs data-level caching interaction. @@ -296,5 +296,5 @@ No code changes needed — this is a documentation/convention clarification. rather than `execute(pf, data)`), the `Generic[E]` mechanism already supports this — just parameterize with the narrower protocol. - **`CachedFunctionPod` cache key design**: The exact composition of the cache key (which - tag columns to include, whether to include system tags) needs detailed design during - implementation. A reasonable default is tag content hash + data content hash. 
+ key columns to include, whether to include system keys) needs detailed design during + implementation. A reasonable default is key content hash + data content hash. diff --git a/notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb b/notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb index 01badab7..9cbe8183 100644 --- a/notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb +++ b/notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb @@ -26,10 +26,10 @@ "* `Stream` -- a series of one or more `data` flowing from a `data producer` to a `data consumer`. In a directed acyclic graph represneing an Orcapod `pipeline`, a `stream` corresponds to a *directed* edge connecting from a data source into a `data consumer` (e.g., `pod`)\n", "* `Data producer` and `data consumer` -- in the Orcapod data pipeline, data (in form of `data` of data flowing inside a `stream`) flows from a `data producer` to a `data consumer`. Consequentially, a `data consumer` may in turn act as a `data producer` downstream\n", "* `Data source` -- Root level `data producer` (that is, the data originates from this `data producer` and it is not a `data consumer` of any stream). Typically `data source` is tied to a data storage, although you could have *procedural* `data source` where data data are produced programatically.\n", - "* `Tag` -- each `Data` in a stream *may be* associated with a `tag` that helps to assign semantic identity to the particular `data`. For example, a data `data` for an experimental data may be associated with a `tag` of session ID. Note that while `tag` provides a convenient and often meaningful ways of identifying and referring to specific data within a stream, it should **not** be considered to be the defining identity of the `data`. Identity of the `data` is strictly determined by the exact data content of the `data`, and not by how you refer to it. 
Consequently, it may be that two data with an identical content (and thus shared identity) are associated with distinct `tags` in a `stream`. Conversely, an identical `tag` may be associated with two distinct `data` in a stream. Typically, one would associate a unique `tag` for each data in the stream.\n", + "* `Key` -- each `Data` in a stream *may be* associated with a `key` that helps to assign semantic identity to the particular `data`. For example, a data `data` for an experimental data may be associated with a `key` of session ID. Note that while `key` provides a convenient and often meaningful ways of identifying and referring to specific data within a stream, it should **not** be considered to be the defining identity of the `data`. Identity of the `data` is strictly determined by the exact data content of the `data`, and not by how you refer to it. Consequently, it may be that two data with an identical content (and thus shared identity) are associated with distinct `keys` in a `stream`. Conversely, an identical `key` may be associated with two distinct `data` in a stream. Typically, one would associate a unique `key` for each data in the stream.\n", "* `Operation` -- A *node* in the directed acyclic graph representing an Orcapod `pipeline`, corresponding to a step of data processing/transformation/computation. An `Operation` receives can be classified into either a `mapper` or a `pod` based on their role in `data provenance`.\n", - "* `Mapper` -- A class of `operation` that does **not** result in creation/alteration of a new data -- that is, `operation` does **not** every create or modify a file *content*. More specifically, `Mapper` operation can not produce a path that was not already present in the input streams to the `mapper`. This feature ensures that a `mapper` is fundamentally not involved in the reproducibility of computation. 
Consequently, `mapper` information is not necessary for the maintenance of proper `data provenance` in a tree of computation. However, `mapper` plays critical role in the actual execution of a data pipeline, determining which data `data` will be fed into operations in the pipeline directed acyclic graph (DAG). Note that as long as it doesn't modify the content of any file, a `mapper` may inspect the content of any file in a `data` it receives and alter its behavior based on the content of the file. In other words, `mapper` may alter what data file(s) gets passed around without changing/creating any file based on a rule that depends on `tag`, `data` key (`argument` name) and/or file content.\n", - "* `Pod` (e.g. FunctionPod) -- fundamental unit of computation in Orcapod. `Pod` is the only class of `operation` that may create a new file. Critically, when operating within an Orcapod `pipeline`, a `pod` will **not** receive the `tag` information. Rather, `pod` must strictly operate on a single `data`. An ideal `pod` will have completely deterministic behavior that only depends on the `data` identity (that is, data keys and `pathset` contents)." + "* `Mapper` -- A class of `operation` that does **not** result in creation/alteration of a new data -- that is, `operation` does **not** ever create or modify a file *content*. More specifically, `Mapper` operation can not produce a path that was not already present in the input streams to the `mapper`. This feature ensures that a `mapper` is fundamentally not involved in the reproducibility of computation. Consequently, `mapper` information is not necessary for the maintenance of proper `data provenance` in a tree of computation. However, `mapper` plays critical role in the actual execution of a data pipeline, determining which data `data` will be fed into operations in the pipeline directed acyclic graph (DAG). 
Note that as long as it doesn't modify the content of any file, a `mapper` may inspect the content of any file in a `data` it receives and alter its behavior based on the content of the file. In other words, `mapper` may alter what data file(s) gets passed around without changing/creating any file based on a rule that depends on `key`, `data` key (`argument` name) and/or file content.\n", + "* `Pod` (e.g. FunctionPod) -- fundamental unit of computation in Orcapod. `Pod` is the only class of `operation` that may create a new file. Critically, when operating within an Orcapod `pipeline`, a `pod` will **not** receive the `key` information. Rather, `pod` must strictly operate on a single `data`. An ideal `pod` will have completely deterministic behavior that only depends on the `data` identity (that is, data keys and `pathset` contents)." ] }, { @@ -43,7 +43,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`Orcabridge` provide prototypal implementation of the above-defined key concepts in `orcapod`, with particular focus given to `stream`, `data`, `tag`, `operation` (`pod` and `mapper`). This package provides the reference implementation of both synchronous and asynchronous `streams` as a sequence of `data` associated with a `tag`. " + "`Orcabridge` provide prototypal implementation of the above-defined key concepts in `orcapod`, with particular focus given to `stream`, `data`, `key`, `operation` (`pod` and `mapper`). This package provides the reference implementation of both synchronous and asynchronous `streams` as a sequence of `data` associated with a `key`. 
" ] }, { diff --git a/notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb b/notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb index 97af83dc..f3d30f1b 100644 --- a/notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb +++ b/notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb @@ -680,15 +680,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'id': 0}, Data: {'result': 12.0}\n", - "Tag: {'id': 1}, Data: {'result': 30.0}\n", - "Tag: {'id': 2}, Data: {'result': 56.0}\n" + "Key: {'id': 0}, Data: {'result': 12.0}\n", + "Key: {'id': 1}, Data: {'result': 30.0}\n", + "Key: {'id': 2}, Data: {'result': 56.0}\n" ] } ], "source": [ - "for tag, data in pod(stream):\n", - " print((f\"Tag: {tag}, Data: {data}\"))" + "for key, data in pod(stream):\n", + " print((f\"Key: {key}, Data: {data}\"))" ] }, { @@ -722,7 +722,7 @@ " object_hasher=object_hasher,\n", " arrow_hasher=ArrowDataHasher(),\n", " result_store=MyArrowDataStore(),\n", - " tag_store=MyArrowDataStore(),\n", + " key_store=MyArrowDataStore(),\n", ")" ] }, @@ -803,7 +803,7 @@ "result: double\n", "----\n", "result: [[12]]\n", - "Tag: {'id': 0}, Data: {'result': 12.0}\n", + "Key: {'id': 0}, Data: {'result': 12.0}\n", "Requested to hash arrow data pyarrow.Table\n", "x: double\n", "y: int64\n", @@ -832,7 +832,7 @@ "result: double\n", "----\n", "result: [[30]]\n", - "Tag: {'id': 1}, Data: {'result': 30.0}\n", + "Key: {'id': 1}, Data: {'result': 30.0}\n", "Requested to hash arrow data pyarrow.Table\n", "x: double\n", "y: int64\n", @@ -861,13 +861,13 @@ "result: double\n", "----\n", "result: [[56]]\n", - "Tag: {'id': 2}, Data: {'result': 56.0}\n" + "Key: {'id': 2}, Data: {'result': 56.0}\n" ] } ], "source": [ - "for tag, data in cached_pod(stream):\n", - " print((f\"Tag: {tag}, Data: {data}\"))" + "for key, data in cached_pod(stream):\n", + " print((f\"Key: {key}, Data: {data}\"))" ] }, { @@ -981,7 +981,7 @@ "metadata": {}, "outputs": [], "source": [ - "tag = {\"name\": [\"Edgar\", 
\"Names\"], \"age\": 37}" + "key = {\"name\": [\"Edgar\", \"Names\"], \"age\": 37}" ] }, { @@ -990,7 +990,7 @@ "metadata": {}, "outputs": [], "source": [ - "tag[\"__data_key\"] = \"some_unique_key\"\n" + "key[\"__data_key\"] = \"some_unique_key\"\n" ] }, { @@ -1012,7 +1012,7 @@ "source": [ "from orcabridge.hashing.defaults import LegacyObjectHasher\n", "\n", - "LegacyObjectHasher().hash_to_hex(tag)\n" + "LegacyObjectHasher().hash_to_hex(key)\n" ] }, { @@ -1047,7 +1047,7 @@ } ], "source": [ - "pa.Table.from_pylist([tag])" + "pa.Table.from_pylist([key])" ] }, { @@ -1748,7 +1748,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can then obtain `stream` from a `source` by invoking the source with `Source()`. The return `stream` acts as an iterator over the `data` and its `tag`.\n", + "We can then obtain `stream` from a `source` by invoking the source with `Source()`. The return `stream` acts as an iterator over the `data` and its `key`.\n", "For convenience, `source` can be treated synonymously with a `stream`, allowing you to directly iterate over the content." 
] }, @@ -1761,17 +1761,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n" ] } ], "source": [ - "for tag, data in dataset1():\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset1():\n", + " print(f\"Data {data} with key {key}\")" ] }, { @@ -1783,25 +1783,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Data {'txt_file': 
PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n" ] } ], "source": [ "# equivalent to above but more natural without the need to call `dataset1()`\n", - "for tag, data in dataset1:\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset1:\n", + " print(f\"Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A few things to note. When creating the `GlobSource` we pass in the argument name to be associated with the `pathset` matching our glob pattern (`*.txt` in this case). By default, the `GlobSource` tags each data with a key of `file_name` and value of the name of the file that was matched (minus the file extension). This behavior can be easily changed by passing in a custom function for tag generation at the time of `GlobSource` creation." + "A few things to note. When creating the `GlobSource` we pass in the argument name to be associated with the `pathset` matching our glob pattern (`*.txt` in this case). By default, the `GlobSource` keys each data with a key of `file_name` and value of the name of the file that was matched (minus the file extension). This behavior can be easily changed by passing in a custom function for key generation at the time of `GlobSource` creation." 
] }, { @@ -1816,7 +1816,7 @@ " \"data\",\n", " \"../examples/dataset1\",\n", " \"*.txt\",\n", - " tag_function=lambda x: {\"date\": Path(x).stem},\n", + " key_function=lambda x: {\"date\": Path(x).stem},\n", ")" ] }, @@ -1829,24 +1829,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'data': PosixPath('../examples/dataset1/day1.txt')} with tag {'date': 'day1'}\n", - "Data {'data': PosixPath('../examples/dataset1/day2.txt')} with tag {'date': 'day2'}\n", - "Data {'data': PosixPath('../examples/dataset1/day3.txt')} with tag {'date': 'day3'}\n", - "Data {'data': PosixPath('../examples/dataset1/day4.txt')} with tag {'date': 'day4'}\n", - "Data {'data': PosixPath('../examples/dataset1/day6.txt')} with tag {'date': 'day6'}\n" + "Data {'data': PosixPath('../examples/dataset1/day1.txt')} with key {'date': 'day1'}\n", + "Data {'data': PosixPath('../examples/dataset1/day2.txt')} with key {'date': 'day2'}\n", + "Data {'data': PosixPath('../examples/dataset1/day3.txt')} with key {'date': 'day3'}\n", + "Data {'data': PosixPath('../examples/dataset1/day4.txt')} with key {'date': 'day4'}\n", + "Data {'data': PosixPath('../examples/dataset1/day6.txt')} with key {'date': 'day6'}\n" ] } ], "source": [ - "for tag, data in dataset1_custom:\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset1_custom:\n", + " print(f\"Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Custom tag function would allow one to extract information useful in controlling the flow of the data pipeline from the file path or even the file content. We will return to this a bit later." + "Custom key function would allow one to extract information useful in controlling the flow of the data pipeline from the file path or even the file content. We will return to this a bit later." 
] }, { @@ -1872,18 +1872,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')} with tag {'file_name': 'session_day1'}\n", - "Data {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')} with tag {'file_name': 'session_day3'}\n", - "Data {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')} with tag {'file_name': 'session_day4'}\n", - "Data {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')} with tag {'file_name': 'session_day5'}\n" + "Data {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')} with key {'file_name': 'session_day1'}\n", + "Data {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')} with key {'file_name': 'session_day3'}\n", + "Data {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')} with key {'file_name': 'session_day4'}\n", + "Data {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')} with key {'file_name': 'session_day5'}\n" ] } ], "source": [ "dataset2 = ob.GlobSource(\"bin_data\", \"../examples/dataset2\", \"*.bin\")\n", "\n", - "for tag, data in dataset2:\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset2:\n", + " print(f\"Data {data} with key {key}\")" ] }, { @@ -1919,7 +1919,7 @@ "metadata": {}, "source": [ "\n", - "`Mappers` are `operations` that controls and alter the streams but *without generating or modifying new data files*. As we will see shortly, `mappers` work to alter the stream by alterning data tags and/or data content, but critically will never create or modify new files that were not already present somewhere in the stream feeding into the `mapper` node. While this might sound like an unnecessary restriction on what `mappers` can do, we will see that this property guarantees that *mappers can not ever alter the reproducibility of computational chains*." 
+ "`Mappers` are `operations` that control and alter the streams but *without generating or modifying new data files*. As we will see shortly, `mappers` work to alter the stream by altering data keys and/or data content, but critically will never create or modify new files that were not already present somewhere in the stream feeding into the `mapper` node. While this might sound like an unnecessary restriction on what `mappers` can do, we will see that this property guarantees that *mappers can not ever alter the reproducibility of computational chains*." ] }, { @@ -1942,7 +1942,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Once you have created a `source` from which streams can be formed, you can alter the stream by applying various `mappers`. More precisely, a `mapper` can work on tags and/or data." + "Once you have created a `source` from which streams can be formed, you can alter the stream by applying various `mappers`. More precisely, a `mapper` can work on keys and/or data." 
] }, { @@ -1963,47 +1963,47 @@ "output_type": "stream", "text": [ "Before mapping:\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n", "After mapping:\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Mapped Data {'content': 
PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n" ] } ], "source": [ "print(\"Before mapping:\")\n", - "for tag, data in dataset1:\n", - " print(f\"Data {data} with tag {tag}\")\n", + "for key, data in dataset1:\n", + " print(f\"Data {data} with key {key}\")\n", "\n", "\n", "# create a new stream mapping data keys 'txt_file' to 'content'\n", "data_mapper = ob.MapData(key_map={\"txt_file\": \"content\"})\n", "\n", "print(\"After mapping:\")\n", - "for tag, data in data_mapper(dataset1):\n", - " print(f\"Mapped Data {data} with tag {tag}\")" + "for key, data in data_mapper(dataset1):\n", + " print(f\"Mapped Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You'd notice that for each data, the key `txt_file` was replaced with `content` without altering the pointed `path` or the associated tag. As the keys of the data will be used as the name of arguments when invoking pods on a stream, we will see that `MapData` are commonly used to *map* the correct path to the argument." + "You'd notice that for each data, the key `txt_file` was replaced with `content` without altering the pointed `path` or the associated key. As the keys of the data will be used as the name of arguments when invoking pods on a stream, we will see that `MapData` are commonly used to *map* the correct path to the argument." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Map tags\n", - "As we have already seen, each data in the stream is associated with a tag, often derived from the data source. In the case of `GlobFileSource`, the tags are by default the name of the file that formed the data. These tags are used to *transiently* identify the data and will be used when matching data across multiple streams (as we will see shortly in `Join` operation). 
You can manipulate the tags using `MapTags` operation, much like `MapKeys` but operating on the tags for each packaet under a uniform renaming rule." + "### Map keys\n", + "As we have already seen, each data in the stream is associated with a key, often derived from the data source. In the case of `GlobFileSource`, the keys are by default the name of the file that formed the data. These keys are used to *transiently* identify the data and will be used when matching data across multiple streams (as we will see shortly in `Join` operation). You can manipulate the keys using `MapKeys` operation, much like `MapData` but operating on the keys for each data under a uniform renaming rule." ] }, { @@ -2024,10 +2024,10 @@ } ], "source": [ - "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", + "key_mapper = ob.MapKeys(key_map={\"file_name\": \"day\"})\n", "\n", - "for tag, data in tag_mapper(dataset1):\n", - " print(tag, data)" + "for key, data in key_mapper(dataset1):\n", + " print(key, data)" ] }, { @@ -2041,7 +2041,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you might expect, you can chain multiple operations one after another to construct a more complex stream. Below, we first apply the key mapping and then map tags." + "As you might expect, you can chain multiple operations one after another to construct a more complex stream. Below, we first apply the key mapping and then map keys." 
] }, { @@ -2053,11 +2053,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'day': 'day1'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'day': 'day2'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'day': 'day3'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'day': 'day4'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'day': 'day6'}\n" + "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with key {'day': 'day1'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with key {'day': 'day2'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with key {'day': 'day3'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with key {'day': 'day4'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with key {'day': 'day6'}\n" ] } ], @@ -2065,18 +2065,18 @@ "data_mapper = ob.MapData(key_map={\"txt_file\": \"content\"})\n", "key_mapped_stream = data_mapper(dataset1)\n", "\n", - "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", - "tag_and_data_mapped = tag_mapper(key_mapped_stream)\n", + "key_mapper = ob.MapKeys(key_map={\"file_name\": \"day\"})\n", + "key_and_data_mapped = key_mapper(key_mapped_stream)\n", "\n", - "for tag, data in tag_and_data_mapped:\n", - " print(f\"Mapped Data {data} with tag {tag}\")" + "for key, data in key_and_data_mapped:\n", + " print(f\"Mapped Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "It's worth emphasizing again that all computations are triggered only when you iterate through the final stream `tag_and_key_mapped`" + "It's worth emphasizing again that all computations are triggered only when you iterate through the final stream 
`key_and_data_mapped`" ] }, { @@ -2095,20 +2095,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'day': 'day1'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'day': 'day2'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'day': 'day3'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'day': 'day4'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'day': 'day6'}\n" + "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with key {'day': 'day1'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with key {'day': 'day2'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with key {'day': 'day3'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with key {'day': 'day4'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with key {'day': 'day6'}\n" ] } ], "source": [ "# totally valid, but difficult to read and thus not recommended\n", - "for tag, data in ob.MapTags(key_map={\"file_name\": \"day\"})(\n", + "for key, data in ob.MapKeys(key_map={\"file_name\": \"day\"})(\n", " ob.MapData(key_map={\"txt_file\": \"content\"})(dataset1)\n", "):\n", - " print(f\"Mapped Data {data} with tag {tag}\")" + " print(f\"Mapped Data {data} with key {key}\")" ] }, { @@ -2119,7 +2119,7 @@ "Now that we have looked at how you can manipulate a single stream, let's turn our eyes to how you can work with more than one streams together.\n", "\n", "By the far the most common multi-stream operations will be to join two (or more) streams into a single, bigger stream. \n", - "You can combine multiple streams into one by using `Join` operation, matching data from each stream based on the matching tags. 
If tags from two streams have shared key, the value must be identical for all shared keys for the two data to be matched. The matched data are then merged into a one (typically larger) data and shipped to the output stream." + "You can combine multiple streams into one by using `Join` operation, matching data from each stream based on the matching keys. If keys from two streams have shared key, the value must be identical for all shared keys for the two data to be matched. The matched data are then merged into a one (typically larger) data and shipped to the output stream." ] }, { @@ -2139,30 +2139,30 @@ "output_type": "stream", "text": [ "Dataset 1:\n", - "Tag: {'file_name': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt')}\n", - "Tag: {'file_name': 'day2'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt')}\n", - "Tag: {'file_name': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt')}\n", - "Tag: {'file_name': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt')}\n", - "Tag: {'file_name': 'day6'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt')}\n", + "Key: {'file_name': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt')}\n", + "Key: {'file_name': 'day2'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt')}\n", + "Key: {'file_name': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt')}\n", + "Key: {'file_name': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt')}\n", + "Key: {'file_name': 'day6'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt')}\n", "\n", "Dataset 2:\n", - "Tag: {'file_name': 'session_day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'bin_data': 
PosixPath('../examples/dataset2/session_day4.bin')}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + "Key: {'file_name': 'session_day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "Key: {'file_name': 'session_day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" ] } ], "source": [ "# dataset 1\n", "print(\"Dataset 1:\")\n", - "for tag, data in dataset1:\n", - " print(f\"Tag: {tag}, Data: {data}\")\n", + "for key, data in dataset1:\n", + " print(f\"Key: {key}, Data: {data}\")\n", "\n", "# dataset 2\n", "print(\"\\nDataset 2:\")\n", - "for tag, data in dataset2:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in dataset2:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2180,8 +2180,8 @@ "source": [ "join_op = ob.Join()\n", "\n", - "for tag, data in join_op(dataset1, dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in join_op(dataset1, dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2202,7 +2202,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's completely rename the tag key for one of the streams and see what would happen." + "First, let's completely rename the key's field name for one of the streams and see what would happen." 
] }, { @@ -2214,34 +2214,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "01 Tag: {'day': 'day1', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "02 Tag: {'day': 'day1', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "03 Tag: {'day': 'day1', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "04 Tag: {'day': 'day1', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "05 Tag: {'day': 'day2', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "06 Tag: {'day': 'day2', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "07 Tag: {'day': 'day2', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "08 Tag: {'day': 'day2', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "09 Tag: {'day': 'day3', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "10 Tag: {'day': 'day3', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "11 Tag: {'day': 'day3', 
'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "12 Tag: {'day': 'day3', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "13 Tag: {'day': 'day4', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "14 Tag: {'day': 'day4', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "15 Tag: {'day': 'day4', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "16 Tag: {'day': 'day4', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "17 Tag: {'day': 'day6', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "18 Tag: {'day': 'day6', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "19 Tag: {'day': 'day6', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "20 Tag: {'day': 'day6', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + "01 Key: {'day': 'day1', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': 
PosixPath('../examples/dataset2/session_day1.bin')}\n", + "02 Key: {'day': 'day1', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "03 Key: {'day': 'day1', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "04 Key: {'day': 'day1', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "05 Key: {'day': 'day2', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "06 Key: {'day': 'day2', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "07 Key: {'day': 'day2', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "08 Key: {'day': 'day2', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "09 Key: {'day': 'day3', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "10 Key: {'day': 'day3', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "11 Key: {'day': 'day3', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "12 Key: {'day': 'day3', 'file_name': 'session_day5'}, Data: 
{'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "13 Key: {'day': 'day4', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "14 Key: {'day': 'day4', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "15 Key: {'day': 'day4', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "16 Key: {'day': 'day4', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "17 Key: {'day': 'day6', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "18 Key: {'day': 'day6', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "19 Key: {'day': 'day6', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "20 Key: {'day': 'day6', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" ] } ], "source": [ - "dataset1_retagged = ob.MapTags(key_map={\"file_name\": \"day\"})(dataset1)\n", + "dataset1_retagged = ob.MapKeys(key_map={\"file_name\": \"day\"})(dataset1)\n", "\n", - "for i, (tag, data) in enumerate(join_op(dataset1_retagged, dataset2)):\n", - " print(f\"{i + 1:02d} Tag: {tag}, Data: {data}\")" + "for i, (key, data) in 
enumerate(join_op(dataset1_retagged, dataset2)):\n", + " print(f\"{i + 1:02d} Key: {key}, Data: {data}\")" ] }, { @@ -2250,9 +2250,9 @@ "source": [ "We are now getting something -- in fact, quite a few things. If you look carefully at the `data`, you'll notice that it now contains two keys/arguments -- `txt_file` and `bin_data`, combining the data from the two datasets. \n", "\n", - "The `tags` also now contain two keys `day` from the re-tagged dataset1 stream and `file_name` from unchanged dataset2 stream.\n", + "The `keys` also now contain two keys `day` from the re-tagged dataset1 stream and `file_name` from unchanged dataset2 stream.\n", "\n", - "Since the two streams share no common tags, the `Join` operation results in *full-multiplexing* of two streams. With the streams from dataset1 and dataset2 containing 5 data and 4 data, respectively, you get $5 \\times 4 = 20$ data" + "Since the two streams share no common keys, the `Join` operation results in *full-multiplexing* of two streams. With the streams from dataset1 and dataset2 containing 5 data and 4 data, respectively, you get $5 \\times 4 = 20$ data" ] }, { @@ -2268,7 +2268,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Although we could achieve the desired effect by changing how we load the source, passing in custom `tag_function` into `GlobSource`, let's achieve the same by using another `mapper` called `Transform`. `Transform` effectively combines `MapKey` and `MapTag` but further allows you to provide a function that will receive the tag and data, one at a time, and return a (potentially modified) tag and/or data, achieving the desired transformation." + "Although we could achieve the desired effect by changing how we load the source, passing in custom `key_function` into `GlobSource`, let's achieve the same by using another `mapper` called `Transform`. 
`Transform` effectively combines `MapKeys` and `MapData` but further allows you to provide a function that will receive the key and data, one at a time, and return a (potentially modified) key and/or data, achieving the desired transformation." ] }, { @@ -2280,18 +2280,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'day': 'day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "Tag: {'day': 'day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "Tag: {'day': 'day4'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "Tag: {'day': 'day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + "Key: {'day': 'day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Key: {'day': 'day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Key: {'day': 'day4'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "Key: {'day': 'day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" ] } ], "source": [ - "def transform_dataset2(tag, data):\n", + "def transform_dataset2(key, data):\n", " # Extract the second half of the filename containing day\n", - " new_tag = {\"day\": tag[\"file_name\"].split(\"_\")[1]}\n", - " return new_tag, data\n", + " new_key = {\"day\": key[\"file_name\"].split(\"_\")[1]}\n", + " return new_key, data\n", "\n", "\n", "# Speical mappers like transform can be found in the orcabridge.mapper module\n", @@ -2299,8 +2299,8 @@ "\n", "retagged_dataset2 = dataset2_transformer(dataset2)\n", "\n", - "for tag, data in retagged_dataset2:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in retagged_dataset2:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2319,22 +2319,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'day': 'day1'}, Data: {'txt_file': 
PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "Tag: {'day': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "Tag: {'day': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n" + "Key: {'day': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Key: {'day': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Key: {'day': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n" ] } ], "source": [ "# change filename to day for dataset1\n", - "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", - "retagged_dataset1 = tag_mapper(dataset1)\n", + "key_mapper = ob.MapKeys(key_map={\"file_name\": \"day\"})\n", + "retagged_dataset1 = key_mapper(dataset1)\n", "\n", "join_op = ob.Join()\n", "joined_stream = join_op(retagged_dataset1, retagged_dataset2)\n", "\n", - "for tag, data in joined_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in joined_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2362,7 +2362,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While `mapper` operations are useful in altering tags, data, and in combining multiple streams, a data pipeline is not really useful if it cannot produce new resultsin the form of new data -- that is, introduce new files into the stream. 
This is precisely where `Pod` operations come in!\n", + "While `mapper` operations are useful in altering keys, data, and in combining multiple streams, a data pipeline is not really useful if it cannot produce new results in the form of new data -- that is, introduce new files into the stream. This is precisely where `Pod` operations come in!\n", "\n", "In fact, we have already been working with a `pod` all along -- `sources`. If you think about it, `sources` also introduce files into the stream. It is just special in that it takes no input streams (hence the name, `source`).\n", "\n", @@ -2434,15 +2434,15 @@ "output_type": "stream", "text": [ "File ../examples/dataset1/day1.txt has 24 lines.\n", - "Tag: {'file_name': 'day1'}, Data: {}\n", + "Key: {'file_name': 'day1'}, Data: {}\n", "File ../examples/dataset1/day2.txt has 15 lines.\n", - "Tag: {'file_name': 'day2'}, Data: {}\n", + "Key: {'file_name': 'day2'}, Data: {}\n", "File ../examples/dataset1/day3.txt has 27 lines.\n", - "Tag: {'file_name': 'day3'}, Data: {}\n", + "Key: {'file_name': 'day3'}, Data: {}\n", "File ../examples/dataset1/day4.txt has 22 lines.\n", - "Tag: {'file_name': 'day4'}, Data: {}\n", + "Key: {'file_name': 'day4'}, Data: {}\n", "File ../examples/dataset1/day6.txt has 22 lines.\n", - "Tag: {'file_name': 'day6'}, Data: {}\n" + "Key: {'file_name': 'day6'}, Data: {}\n" ] } ], @@ -2450,8 +2450,8 @@ "# apply the function pod on a stream\n", "processed_stream = function_pod(dataset1)\n", "\n", - "for tag, data in processed_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in processed_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2515,7 +2515,7 @@ " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 
PosixPath('/tmp/tmpm2wka6il/statistics.json')}\n", + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpm2wka6il/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day3.bin\n", "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", @@ -2523,21 +2523,21 @@ " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", " 1.39972807 -0.13940519]\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmplkmx65ll/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmplkmx65ll/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day4.bin\n", "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpxajrzctd/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpxajrzctd/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day5.bin\n", "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp67rthfe1/statistics.json')}\n" + "Key: {'file_name': 
'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp67rthfe1/statistics.json')}\n" ] } ], @@ -2547,8 +2547,8 @@ "# change the key from 'bin_data' to 'bin_file', matching the function's input\n", "mapped_dataset2 = ob.MapData(key_map={\"bin_data\": \"bin_file\"})(dataset2)\n", "\n", - "for tag, data in fp_stats(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2575,7 +2575,7 @@ " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", - "Tag: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpciwa2xl_/statistics.json')}\n", + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpciwa2xl_/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day3.bin\n", "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", @@ -2583,29 +2583,29 @@ " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", " 1.39972807 -0.13940519]\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpkq824j5b/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpkq824j5b/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day4.bin\n", "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", - "Tag: 
{'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmp7ii2nd6e/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmp7ii2nd6e/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day5.bin\n", "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmpz23q61gg/statistics.json')}\n" + "Key: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmpz23q61gg/statistics.json')}\n" ] } ], "source": [ "# everytime you run the following loop, new computations are performed and\n", "# saved in a different set of temporary files\n", - "for tag, data in fp_stats(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2653,7 +2653,7 @@ " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", - "Tag: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day3.bin\n", "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", @@ -2661,21 +2661,21 @@ " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", " 0.91515395 0.8064348 0.81470895 
-1.04466683 -0.25893558 -1.46253167\n", " 1.39972807 -0.13940519]\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day4.bin\n", "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day5.bin\n", "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" + "Key: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" ] } ], @@ -2690,8 +2690,8 @@ "cached_stream = cache_stream(stats_stream)\n", "\n", "# iterate over the cached stream\n", - "for tag, data in cached_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in cached_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2710,16 +2710,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 
PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", + "Key: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" ] } ], "source": [ - "for tag, data in cached_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in cached_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2784,16 +2784,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" + "Key: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': 
'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", + "Key: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" ] } ], "source": [ - "for tag, data in fp_stats_stored(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats_stored(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -2812,16 +2812,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" + "Key: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", + "Key: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" ] } ], "source": 
[ - "for tag, data in fp_stats_stored(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats_stored(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { diff --git a/notebooks/old_tutorials/02_orcapod_basic_usage.ipynb b/notebooks/old_tutorials/02_orcapod_basic_usage.ipynb index bb8b302b..767ee36a 100644 --- a/notebooks/old_tutorials/02_orcapod_basic_usage.ipynb +++ b/notebooks/old_tutorials/02_orcapod_basic_usage.ipynb @@ -82,7 +82,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can then obtain `stream` from a `source` by invoking the source with `Source()`. The return `stream` acts as an iterator over the `data` and its `tag`.\n", + "We can then obtain `stream` from a `source` by invoking the source with `Source()`. The return `stream` acts as an iterator over the `data` and its `key`.\n", "For convenience, `source` can be treated synonymously with a `stream`, allowing you to directly iterate over the content." ] }, @@ -95,17 +95,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Data {'txt_file': 
PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n" ] } ], "source": [ - "for tag, data in dataset1():\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset1():\n", + " print(f\"Data {data} with key {key}\")" ] }, { @@ -117,25 +117,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n" ] } ], "source": [ "# equivalent to above but more natural without the need to call `dataset1()`\n", - "for tag, data in dataset1:\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset1:\n", + " print(f\"Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A few things to note. When creating the `GlobSource` we pass in the argument name to be associated with the `pathset` matching our glob pattern (`*.txt` in this case). By default, the `GlobSource` tags each data with a key of `file_name` and value of the name of the file that was matched (minus the file extension). 
This behavior can be easily changed by passing in a custom function for tag generation at the time of `GlobSource` creation." + "A few things to note. When creating the `GlobSource` we pass in the argument name to be associated with the `pathset` matching our glob pattern (`*.txt` in this case). By default, the `GlobSource` keys each data with a key of `file_name` and value of the name of the file that was matched (minus the file extension). This behavior can be easily changed by passing in a custom function for key generation at the time of `GlobSource` creation." ] }, { @@ -150,7 +150,7 @@ " \"data\",\n", " \"../examples/dataset1\",\n", " \"*.txt\",\n", - " tag_function=lambda x: {\"date\": Path(x).stem},\n", + " key_function=lambda x: {\"date\": Path(x).stem},\n", ")" ] }, @@ -163,24 +163,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'data': PosixPath('../examples/dataset1/day1.txt')} with tag {'date': 'day1'}\n", - "Data {'data': PosixPath('../examples/dataset1/day2.txt')} with tag {'date': 'day2'}\n", - "Data {'data': PosixPath('../examples/dataset1/day3.txt')} with tag {'date': 'day3'}\n", - "Data {'data': PosixPath('../examples/dataset1/day4.txt')} with tag {'date': 'day4'}\n", - "Data {'data': PosixPath('../examples/dataset1/day6.txt')} with tag {'date': 'day6'}\n" + "Data {'data': PosixPath('../examples/dataset1/day1.txt')} with key {'date': 'day1'}\n", + "Data {'data': PosixPath('../examples/dataset1/day2.txt')} with key {'date': 'day2'}\n", + "Data {'data': PosixPath('../examples/dataset1/day3.txt')} with key {'date': 'day3'}\n", + "Data {'data': PosixPath('../examples/dataset1/day4.txt')} with key {'date': 'day4'}\n", + "Data {'data': PosixPath('../examples/dataset1/day6.txt')} with key {'date': 'day6'}\n" ] } ], "source": [ - "for tag, data in dataset1_custom:\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset1_custom:\n", + " print(f\"Data {data} with key {key}\")" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ - "Custom tag function would allow one to extract information useful in controlling the flow of the data pipeline from the file path or even the file content. We will return to this a bit later." + "Custom key function would allow one to extract information useful in controlling the flow of the data pipeline from the file path or even the file content. We will return to this a bit later." ] }, { @@ -206,18 +206,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')} with tag {'file_name': 'session_day1'}\n", - "Data {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')} with tag {'file_name': 'session_day3'}\n", - "Data {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')} with tag {'file_name': 'session_day4'}\n", - "Data {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')} with tag {'file_name': 'session_day5'}\n" + "Data {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')} with key {'file_name': 'session_day1'}\n", + "Data {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')} with key {'file_name': 'session_day3'}\n", + "Data {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')} with key {'file_name': 'session_day4'}\n", + "Data {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')} with key {'file_name': 'session_day5'}\n" ] } ], "source": [ "dataset2 = ob.GlobSource(\"bin_data\", \"../examples/dataset2\", \"*.bin\")\n", "\n", - "for tag, data in dataset2:\n", - " print(f\"Data {data} with tag {tag}\")" + "for key, data in dataset2:\n", + " print(f\"Data {data} with key {key}\")" ] }, { @@ -253,7 +253,7 @@ "metadata": {}, "source": [ "\n", - "`Mappers` are `operations` that controls and alter the streams but *without generating or modifying new data files*. 
As we will see shortly, `mappers` work to alter the stream by alterning data tags and/or data content, but critically will never create or modify new files that were not already present somewhere in the stream feeding into the `mapper` node. While this might sound like an unnecessary restriction on what `mappers` can do, we will see that this property guarantees that *mappers can not ever alter the reproducibility of computational chains*." + "`Mappers` are `operations` that control and alter the streams but *without generating or modifying new data files*. As we will see shortly, `mappers` work to alter the stream by altering data keys and/or data content, but critically will never create or modify new files that were not already present somewhere in the stream feeding into the `mapper` node. While this might sound like an unnecessary restriction on what `mappers` can do, we will see that this property guarantees that *mappers can not ever alter the reproducibility of computational chains*." ] }, { @@ -276,7 +276,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Once you have created a `source` from which streams can be formed, you can alter the stream by applying various `mappers`. More precisely, a `mapper` can work on tags and/or data." + "Once you have created a `source` from which streams can be formed, you can alter the stream by applying various `mappers`. More precisely, a `mapper` can work on keys and/or data." 
] }, { @@ -297,47 +297,47 @@ "output_type": "stream", "text": [ "Before mapping:\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Data {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n", "After mapping:\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with key {'file_name': 'day1'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with key {'file_name': 'day2'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with key {'file_name': 'day3'}\n", + "Mapped Data {'content': 
PosixPath('../examples/dataset1/day4.txt')} with key {'file_name': 'day4'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with key {'file_name': 'day6'}\n" ] } ], "source": [ "print(\"Before mapping:\")\n", - "for tag, data in dataset1:\n", - " print(f\"Data {data} with tag {tag}\")\n", + "for key, data in dataset1:\n", + " print(f\"Data {data} with key {key}\")\n", "\n", "\n", "# create a new stream mapping data keys 'txt_file' to 'content'\n", "data_mapper = ob.MapData(key_map={\"txt_file\": \"content\"})\n", "\n", "print(\"After mapping:\")\n", - "for tag, data in data_mapper(dataset1):\n", - " print(f\"Mapped Data {data} with tag {tag}\")" + "for key, data in data_mapper(dataset1):\n", + " print(f\"Mapped Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You'd notice that for each data, the key `txt_file` was replaced with `content` without altering the pointed `path` or the associated tag. As the keys of the data will be used as the name of arguments when invoking pods on a stream, we will see that `MapData` are commonly used to *map* the correct path to the argument." + "You'd notice that for each data, the key `txt_file` was replaced with `content` without altering the pointed `path` or the associated key. As the keys of the data will be used as the name of arguments when invoking pods on a stream, we will see that `MapData` are commonly used to *map* the correct path to the argument." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Map tags\n", - "As we have already seen, each data in the stream is associated with a tag, often derived from the data source. In the case of `GlobFileSource`, the tags are by default the name of the file that formed the data. These tags are used to *transiently* identify the data and will be used when matching data across multiple streams (as we will see shortly in `Join` operation). 
You can manipulate the tags using `MapTags` operation, much like `MapKeys` but operating on the tags for each packaet under a uniform renaming rule." + "### Map keys\n", + "As we have already seen, each data in the stream is associated with a key, often derived from the data source. In the case of `GlobSource`, the keys are by default the name of the file that formed the data. These keys are used to *transiently* identify the data and will be used when matching data across multiple streams (as we will see shortly in `Join` operation). You can manipulate the keys using the `MapKeys` operation, much like `MapData` but operating on the keys for each packet under a uniform renaming rule." ] }, { @@ -358,10 +358,10 @@ } ], "source": [ - "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", + "key_mapper = ob.MapKeys(key_map={\"file_name\": \"day\"})\n", "\n", - "for tag, data in tag_mapper(dataset1):\n", - " print(tag, data)" + "for key, data in key_mapper(dataset1):\n", + " print(key, data)" ] }, { @@ -375,7 +375,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you might expect, you can chain multiple operations one after another to construct a more complex stream. Below, we first apply the key mapping and then map tags." + "As you might expect, you can chain multiple operations one after another to construct a more complex stream. Below, we first apply the data mapping and then map keys." 
] }, { @@ -387,11 +387,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'day': 'day1'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'day': 'day2'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'day': 'day3'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'day': 'day4'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'day': 'day6'}\n" + "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with key {'day': 'day1'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with key {'day': 'day2'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with key {'day': 'day3'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with key {'day': 'day4'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with key {'day': 'day6'}\n" ] } ], @@ -399,18 +399,18 @@ "data_mapper = ob.MapData(key_map={\"txt_file\": \"content\"})\n", "key_mapped_stream = data_mapper(dataset1)\n", "\n", - "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", - "tag_and_data_mapped = tag_mapper(key_mapped_stream)\n", + "key_mapper = ob.MapKeys(key_map={\"file_name\": \"day\"})\n", + "key_and_data_mapped = key_mapper(key_mapped_stream)\n", "\n", - "for tag, data in tag_and_data_mapped:\n", - " print(f\"Mapped Data {data} with tag {tag}\")" + "for key, data in key_and_data_mapped:\n", + " print(f\"Mapped Data {data} with key {key}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "It's worth emphasizing again that all computations are triggered only when you iterate through the final stream `tag_and_key_mapped`" + "It's worth emphasizing again that all computations are triggered only when you iterate through the final stream 
`key_and_data_mapped`" ] }, { @@ -429,20 +429,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'day': 'day1'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'day': 'day2'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'day': 'day3'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'day': 'day4'}\n", - "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'day': 'day6'}\n" + "Mapped Data {'content': PosixPath('../examples/dataset1/day1.txt')} with key {'day': 'day1'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day2.txt')} with key {'day': 'day2'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day3.txt')} with key {'day': 'day3'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day4.txt')} with key {'day': 'day4'}\n", + "Mapped Data {'content': PosixPath('../examples/dataset1/day6.txt')} with key {'day': 'day6'}\n" ] } ], "source": [ "# totally valid, but difficult to read and thus not recommended\n", - "for tag, data in ob.MapTags(key_map={\"file_name\": \"day\"})(\n", + "for key, data in ob.MapKeys(key_map={\"file_name\": \"day\"})(\n", " ob.MapData(key_map={\"txt_file\": \"content\"})(dataset1)\n", "):\n", - " print(f\"Mapped Data {data} with tag {tag}\")" + " print(f\"Mapped Data {data} with key {key}\")" ] }, { @@ -453,7 +453,7 @@ "Now that we have looked at how you can manipulate a single stream, let's turn our eyes to how you can work with more than one streams together.\n", "\n", "By the far the most common multi-stream operations will be to join two (or more) streams into a single, bigger stream. \n", - "You can combine multiple streams into one by using `Join` operation, matching data from each stream based on the matching tags. 
If tags from two streams have shared key, the value must be identical for all shared keys for the two data to be matched. The matched data are then merged into a one (typically larger) data and shipped to the output stream." + "You can combine multiple streams into one by using `Join` operation, matching data from each stream based on the matching keys. If keys from two streams have shared key, the value must be identical for all shared keys for the two data to be matched. The matched data are then merged into a one (typically larger) data and shipped to the output stream." ] }, { @@ -473,30 +473,30 @@ "output_type": "stream", "text": [ "Dataset 1:\n", - "Tag: {'file_name': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt')}\n", - "Tag: {'file_name': 'day2'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt')}\n", - "Tag: {'file_name': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt')}\n", - "Tag: {'file_name': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt')}\n", - "Tag: {'file_name': 'day6'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt')}\n", + "Key: {'file_name': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt')}\n", + "Key: {'file_name': 'day2'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt')}\n", + "Key: {'file_name': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt')}\n", + "Key: {'file_name': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt')}\n", + "Key: {'file_name': 'day6'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt')}\n", "\n", "Dataset 2:\n", - "Tag: {'file_name': 'session_day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'bin_data': 
PosixPath('../examples/dataset2/session_day4.bin')}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + "Key: {'file_name': 'session_day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "Key: {'file_name': 'session_day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" ] } ], "source": [ "# dataset 1\n", "print(\"Dataset 1:\")\n", - "for tag, data in dataset1:\n", - " print(f\"Tag: {tag}, Data: {data}\")\n", + "for key, data in dataset1:\n", + " print(f\"Key: {key}, Data: {data}\")\n", "\n", "# dataset 2\n", "print(\"\\nDataset 2:\")\n", - "for tag, data in dataset2:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in dataset2:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -514,8 +514,8 @@ "source": [ "join_op = ob.Join()\n", "\n", - "for tag, data in join_op(dataset1, dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in join_op(dataset1, dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -536,7 +536,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's completely rename the tag key for one of the streams and see what would happen." + "First, let's completely rename the key column for one of the streams and see what would happen." 
] }, { @@ -548,34 +548,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "01 Tag: {'day': 'day1', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "02 Tag: {'day': 'day1', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "03 Tag: {'day': 'day1', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "04 Tag: {'day': 'day1', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "05 Tag: {'day': 'day2', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "06 Tag: {'day': 'day2', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "07 Tag: {'day': 'day2', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "08 Tag: {'day': 'day2', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "09 Tag: {'day': 'day3', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "10 Tag: {'day': 'day3', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "11 Tag: {'day': 'day3', 
'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "12 Tag: {'day': 'day3', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "13 Tag: {'day': 'day4', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "14 Tag: {'day': 'day4', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "15 Tag: {'day': 'day4', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "16 Tag: {'day': 'day4', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", - "17 Tag: {'day': 'day6', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "18 Tag: {'day': 'day6', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "19 Tag: {'day': 'day6', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "20 Tag: {'day': 'day6', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + "01 Key: {'day': 'day1', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': 
PosixPath('../examples/dataset2/session_day1.bin')}\n", + "02 Key: {'day': 'day1', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "03 Key: {'day': 'day1', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "04 Key: {'day': 'day1', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "05 Key: {'day': 'day2', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "06 Key: {'day': 'day2', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "07 Key: {'day': 'day2', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "08 Key: {'day': 'day2', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "09 Key: {'day': 'day3', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "10 Key: {'day': 'day3', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "11 Key: {'day': 'day3', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "12 Key: {'day': 'day3', 'file_name': 'session_day5'}, Data: 
{'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "13 Key: {'day': 'day4', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "14 Key: {'day': 'day4', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "15 Key: {'day': 'day4', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "16 Key: {'day': 'day4', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "17 Key: {'day': 'day6', 'file_name': 'session_day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "18 Key: {'day': 'day6', 'file_name': 'session_day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "19 Key: {'day': 'day6', 'file_name': 'session_day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "20 Key: {'day': 'day6', 'file_name': 'session_day5'}, Data: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" ] } ], "source": [ - "dataset1_retagged = ob.MapTags(key_map={\"file_name\": \"day\"})(dataset1)\n", + "dataset1_retagged = ob.MapKeys(key_map={\"file_name\": \"day\"})(dataset1)\n", "\n", - "for i, (tag, data) in enumerate(join_op(dataset1_retagged, dataset2)):\n", - " print(f\"{i + 1:02d} Tag: {tag}, Data: {data}\")" + "for i, (key, data) in 
enumerate(join_op(dataset1_retagged, dataset2)):\n", + " print(f\"{i + 1:02d} Key: {key}, Data: {data}\")" ] }, { @@ -584,9 +584,9 @@ "source": [ "We are now getting something -- in fact, quite a few things. If you look carefully at the `data`, you'll notice that it now contains two keys/arguments -- `txt_file` and `bin_data`, combining the data from the two datasets. \n", "\n", - "The `tags` also now contain two keys `day` from the re-tagged dataset1 stream and `file_name` from unchanged dataset2 stream.\n", + "The `keys` also now contain two keys `day` from the re-tagged dataset1 stream and `file_name` from unchanged dataset2 stream.\n", "\n", - "Since the two streams share no common tags, the `Join` operation results in *full-multiplexing* of two streams. With the streams from dataset1 and dataset2 containing 5 data and 4 data, respectively, you get $5 \\times 4 = 20$ data" + "Since the two streams share no common keys, the `Join` operation results in *full-multiplexing* of two streams. With the streams from dataset1 and dataset2 containing 5 data and 4 data, respectively, you get $5 \\times 4 = 20$ data" ] }, { @@ -602,7 +602,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Although we could achieve the desired effect by changing how we load the source, passing in custom `tag_function` into `GlobSource`, let's achieve the same by using another `mapper` called `Transform`. `Transform` effectively combines `MapKey` and `MapTag` but further allows you to provide a function that will receive the tag and data, one at a time, and return a (potentially modified) tag and/or data, achieving the desired transformation." + "Although we could achieve the desired effect by changing how we load the source, passing in custom `key_function` into `GlobSource`, let's achieve the same by using another `mapper` called `Transform`. 
`Transform` effectively combines `MapData` and `MapKeys` but further allows you to provide a function that will receive the key and data, one at a time, and return a (potentially modified) key and/or data, achieving the desired transformation." ] }, { @@ -614,18 +614,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'day': 'day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "Tag: {'day': 'day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "Tag: {'day': 'day4'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", - "Tag: {'day': 'day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + "Key: {'day': 'day1'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Key: {'day': 'day3'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Key: {'day': 'day4'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "Key: {'day': 'day5'}, Data: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" ] } ], "source": [ - "def transform_dataset2(tag, data):\n", + "def transform_dataset2(key, data):\n", " # Extract the second half of the filename containing day\n", - " new_tag = {\"day\": tag[\"file_name\"].split(\"_\")[1]}\n", - " return new_tag, data\n", + " new_key = {\"day\": key[\"file_name\"].split(\"_\")[1]}\n", + " return new_key, data\n", "\n", "\n", "# Speical mappers like transform can be found in the orcabridge.mapper module\n", @@ -633,8 +633,8 @@ "\n", "retagged_dataset2 = dataset2_transformer(dataset2)\n", "\n", - "for tag, data in retagged_dataset2:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in retagged_dataset2:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -653,22 +653,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'day': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 
'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", - "Tag: {'day': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", - "Tag: {'day': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n" + "Key: {'day': 'day1'}, Data: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Key: {'day': 'day3'}, Data: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Key: {'day': 'day4'}, Data: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n" ] } ], "source": [ "# change filename to day for dataset1\n", - "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", - "retagged_dataset1 = tag_mapper(dataset1)\n", + "key_mapper = ob.MapKeys(key_map={\"file_name\": \"day\"})\n", + "retagged_dataset1 = key_mapper(dataset1)\n", "\n", "join_op = ob.Join()\n", "joined_stream = join_op(retagged_dataset1, retagged_dataset2)\n", "\n", - "for tag, data in joined_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in joined_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -696,7 +696,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While `mapper` operations are useful in altering tags, data, and in combining multiple streams, a data pipeline is not really useful if it cannot produce new resultsin the form of new data -- that is, introduce new files into the stream. 
This is precisely where `Pod` operations come in!\n", + "While `mapper` operations are useful in altering keys, data, and in combining multiple streams, a data pipeline is not really useful if it cannot produce new resultsin the form of new data -- that is, introduce new files into the stream. This is precisely where `Pod` operations come in!\n", "\n", "In fact, we have already been working with a `pod` all along -- `sources`. If you think about it, `sources` also introduce files into the stream. It is just special in that it takes no input streams (hence the name, `source`).\n", "\n", @@ -768,15 +768,15 @@ "output_type": "stream", "text": [ "File ../examples/dataset1/day1.txt has 24 lines.\n", - "Tag: {'file_name': 'day1'}, Data: {}\n", + "Key: {'file_name': 'day1'}, Data: {}\n", "File ../examples/dataset1/day2.txt has 15 lines.\n", - "Tag: {'file_name': 'day2'}, Data: {}\n", + "Key: {'file_name': 'day2'}, Data: {}\n", "File ../examples/dataset1/day3.txt has 27 lines.\n", - "Tag: {'file_name': 'day3'}, Data: {}\n", + "Key: {'file_name': 'day3'}, Data: {}\n", "File ../examples/dataset1/day4.txt has 22 lines.\n", - "Tag: {'file_name': 'day4'}, Data: {}\n", + "Key: {'file_name': 'day4'}, Data: {}\n", "File ../examples/dataset1/day6.txt has 22 lines.\n", - "Tag: {'file_name': 'day6'}, Data: {}\n" + "Key: {'file_name': 'day6'}, Data: {}\n" ] } ], @@ -784,8 +784,8 @@ "# apply the function pod on a stream\n", "processed_stream = function_pod(dataset1)\n", "\n", - "for tag, data in processed_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in processed_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -849,7 +849,7 @@ " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 
PosixPath('/tmp/tmpm2wka6il/statistics.json')}\n", + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpm2wka6il/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day3.bin\n", "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", @@ -857,21 +857,21 @@ " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", " 1.39972807 -0.13940519]\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmplkmx65ll/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmplkmx65ll/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day4.bin\n", "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpxajrzctd/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpxajrzctd/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day5.bin\n", "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp67rthfe1/statistics.json')}\n" + "Key: {'file_name': 
'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp67rthfe1/statistics.json')}\n" ] } ], @@ -881,8 +881,8 @@ "# change the key from 'bin_data' to 'bin_file', matching the function's input\n", "mapped_dataset2 = ob.MapData(key_map={\"bin_data\": \"bin_file\"})(dataset2)\n", "\n", - "for tag, data in fp_stats(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -909,7 +909,7 @@ " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", - "Tag: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpciwa2xl_/statistics.json')}\n", + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpciwa2xl_/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day3.bin\n", "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", @@ -917,29 +917,29 @@ " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", " 1.39972807 -0.13940519]\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpkq824j5b/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpkq824j5b/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day4.bin\n", "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", - "Tag: 
{'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmp7ii2nd6e/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmp7ii2nd6e/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day5.bin\n", "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmpz23q61gg/statistics.json')}\n" + "Key: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmpz23q61gg/statistics.json')}\n" ] } ], "source": [ "# everytime you run the following loop, new computations are performed and\n", "# saved in a different set of temporary files\n", - "for tag, data in fp_stats(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -987,7 +987,7 @@ " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", - "Tag: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day3.bin\n", "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", @@ -995,21 +995,21 @@ " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", " 0.91515395 0.8064348 0.81470895 
-1.04466683 -0.25893558 -1.46253167\n", " 1.39972807 -0.13940519]\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day4.bin\n", "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", "Computing stats for file: ../examples/dataset2/session_day5.bin\n", "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" + "Key: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" ] } ], @@ -1024,8 +1024,8 @@ "cached_stream = cache_stream(stats_stream)\n", "\n", "# iterate over the cached stream\n", - "for tag, data in cached_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in cached_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -1044,16 +1044,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 
PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" + "Key: {'file_name': 'session_day1'}, Data: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", + "Key: {'file_name': 'session_day5'}, Data: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" ] } ], "source": [ - "for tag, data in cached_stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in cached_stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -1118,16 +1118,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" + "Key: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': 
'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", + "Key: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" ] } ], "source": [ - "for tag, data in fp_stats_stored(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats_stored(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -1146,16 +1146,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", - "Tag: {'file_name': 'session_day3'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", - "Tag: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", - "Tag: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" + "Key: {'file_name': 'session_day1'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", + "Key: {'file_name': 'session_day3'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", + "Key: {'file_name': 'session_day4'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", + "Key: {'file_name': 'session_day5'}, Data: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" ] } ], "source": 
[ - "for tag, data in fp_stats_stored(mapped_dataset2):\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in fp_stats_stored(mapped_dataset2):\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { diff --git a/notebooks/old_tutorials/03_orcacapod_qol_features.ipynb b/notebooks/old_tutorials/03_orcacapod_qol_features.ipynb index d83b8da9..4406232c 100644 --- a/notebooks/old_tutorials/03_orcacapod_qol_features.ipynb +++ b/notebooks/old_tutorials/03_orcacapod_qol_features.ipynb @@ -34,11 +34,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'day1'}, Data: {'output_data': 'path/to/result/file'}\n", - "Tag: {'file_name': 'day2'}, Data: {'output_data': 'path/to/result/file'}\n", - "Tag: {'file_name': 'day3'}, Data: {'output_data': 'path/to/result/file'}\n", - "Tag: {'file_name': 'day4'}, Data: {'output_data': 'path/to/result/file'}\n", - "Tag: {'file_name': 'day6'}, Data: {'output_data': 'path/to/result/file'}\n" + "Key: {'file_name': 'day1'}, Data: {'output_data': 'path/to/result/file'}\n", + "Key: {'file_name': 'day2'}, Data: {'output_data': 'path/to/result/file'}\n", + "Key: {'file_name': 'day3'}, Data: {'output_data': 'path/to/result/file'}\n", + "Key: {'file_name': 'day4'}, Data: {'output_data': 'path/to/result/file'}\n", + "Key: {'file_name': 'day6'}, Data: {'output_data': 'path/to/result/file'}\n" ] } ], @@ -142,11 +142,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'info_day1'}, Data: {'output_data': PosixPath('/tmp/tmpb0q3mj9m/output.json')}\n", - "Tag: {'file_name': 'info_day2'}, Data: {'output_data': PosixPath('/tmp/tmpt79_hpoe/output.json')}\n", - "Tag: {'file_name': 'info_day3'}, Data: {'output_data': PosixPath('/tmp/tmp_rq1b2rq/output.json')}\n", - "Tag: {'file_name': 'info_day4'}, Data: {'output_data': PosixPath('/tmp/tmp4dyoqbix/output.json')}\n", - "Tag: {'file_name': 'info_day5'}, Data: {'output_data': PosixPath('/tmp/tmpc9a1bxx4/output.json')}\n" + "Key: {'file_name': 
'info_day1'}, Data: {'output_data': PosixPath('/tmp/tmpb0q3mj9m/output.json')}\n", + "Key: {'file_name': 'info_day2'}, Data: {'output_data': PosixPath('/tmp/tmpt79_hpoe/output.json')}\n", + "Key: {'file_name': 'info_day3'}, Data: {'output_data': PosixPath('/tmp/tmp_rq1b2rq/output.json')}\n", + "Key: {'file_name': 'info_day4'}, Data: {'output_data': PosixPath('/tmp/tmp4dyoqbix/output.json')}\n", + "Key: {'file_name': 'info_day5'}, Data: {'output_data': PosixPath('/tmp/tmpc9a1bxx4/output.json')}\n" ] } ], @@ -189,7 +189,7 @@ "id": "bea0880a", "metadata": {}, "source": [ - "## Mapping tags and data with `>>` operator" + "## Mapping keys and data with `>>` operator" ] }, { @@ -267,11 +267,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'info_day1'}, Data: {'line_count': PosixPath('/tmp/tmpl9kxw4yn/line_count.json')}\n", - "Tag: {'file_name': 'info_day2'}, Data: {'line_count': PosixPath('/tmp/tmpa0d08oym/line_count.json')}\n", - "Tag: {'file_name': 'info_day3'}, Data: {'line_count': PosixPath('/tmp/tmp_9r0cryr/line_count.json')}\n", - "Tag: {'file_name': 'info_day4'}, Data: {'line_count': PosixPath('/tmp/tmpygwfjha9/line_count.json')}\n", - "Tag: {'file_name': 'info_day5'}, Data: {'line_count': PosixPath('/tmp/tmph_5zgk6j/line_count.json')}\n" + "Key: {'file_name': 'info_day1'}, Data: {'line_count': PosixPath('/tmp/tmpl9kxw4yn/line_count.json')}\n", + "Key: {'file_name': 'info_day2'}, Data: {'line_count': PosixPath('/tmp/tmpa0d08oym/line_count.json')}\n", + "Key: {'file_name': 'info_day3'}, Data: {'line_count': PosixPath('/tmp/tmp_9r0cryr/line_count.json')}\n", + "Key: {'file_name': 'info_day4'}, Data: {'line_count': PosixPath('/tmp/tmpygwfjha9/line_count.json')}\n", + "Key: {'file_name': 'info_day5'}, Data: {'line_count': PosixPath('/tmp/tmph_5zgk6j/line_count.json')}\n" ] } ], @@ -307,16 +307,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'info_day1'}, Data: {'text_file': 
PosixPath('../examples/dataset2/info_day1.json')}\n", - "Tag: {'file_name': 'info_day2'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day2.json')}\n", - "Tag: {'file_name': 'info_day3'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day3.json')}\n", - "Tag: {'file_name': 'info_day4'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day4.json')}\n", - "Tag: {'file_name': 'info_day5'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day5.json')}\n", - "Tag: {'file_name': 'info_day1'}, Data: {'line_count': PosixPath('/tmp/tmpskhgsexk/line_count.json')}\n", - "Tag: {'file_name': 'info_day2'}, Data: {'line_count': PosixPath('/tmp/tmp7oto9nav/line_count.json')}\n", - "Tag: {'file_name': 'info_day3'}, Data: {'line_count': PosixPath('/tmp/tmpushxubr1/line_count.json')}\n", - "Tag: {'file_name': 'info_day4'}, Data: {'line_count': PosixPath('/tmp/tmpb2fhgner/line_count.json')}\n", - "Tag: {'file_name': 'info_day5'}, Data: {'line_count': PosixPath('/tmp/tmpnujrqytb/line_count.json')}\n" + "Key: {'file_name': 'info_day1'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day1.json')}\n", + "Key: {'file_name': 'info_day2'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day2.json')}\n", + "Key: {'file_name': 'info_day3'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day3.json')}\n", + "Key: {'file_name': 'info_day4'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day4.json')}\n", + "Key: {'file_name': 'info_day5'}, Data: {'text_file': PosixPath('../examples/dataset2/info_day5.json')}\n", + "Key: {'file_name': 'info_day1'}, Data: {'line_count': PosixPath('/tmp/tmpskhgsexk/line_count.json')}\n", + "Key: {'file_name': 'info_day2'}, Data: {'line_count': PosixPath('/tmp/tmp7oto9nav/line_count.json')}\n", + "Key: {'file_name': 'info_day3'}, Data: {'line_count': PosixPath('/tmp/tmpushxubr1/line_count.json')}\n", + "Key: {'file_name': 'info_day4'}, Data: {'line_count': 
PosixPath('/tmp/tmpb2fhgner/line_count.json')}\n", + "Key: {'file_name': 'info_day5'}, Data: {'line_count': PosixPath('/tmp/tmpnujrqytb/line_count.json')}\n" ] } ], @@ -346,11 +346,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'info_day1'}, Data: {'line_count': PosixPath('/tmp/tmp4ny0gm34/line_count.json')}\n", - "Tag: {'file_name': 'info_day2'}, Data: {'line_count': PosixPath('/tmp/tmpegxyuceg/line_count.json')}\n", - "Tag: {'file_name': 'info_day3'}, Data: {'line_count': PosixPath('/tmp/tmpwzvjhte9/line_count.json')}\n", - "Tag: {'file_name': 'info_day4'}, Data: {'line_count': PosixPath('/tmp/tmpf0loiyqs/line_count.json')}\n", - "Tag: {'file_name': 'info_day5'}, Data: {'line_count': PosixPath('/tmp/tmp9vatjy_m/line_count.json')}\n" + "Key: {'file_name': 'info_day1'}, Data: {'line_count': PosixPath('/tmp/tmp4ny0gm34/line_count.json')}\n", + "Key: {'file_name': 'info_day2'}, Data: {'line_count': PosixPath('/tmp/tmpegxyuceg/line_count.json')}\n", + "Key: {'file_name': 'info_day3'}, Data: {'line_count': PosixPath('/tmp/tmpwzvjhte9/line_count.json')}\n", + "Key: {'file_name': 'info_day4'}, Data: {'line_count': PosixPath('/tmp/tmpf0loiyqs/line_count.json')}\n", + "Key: {'file_name': 'info_day5'}, Data: {'line_count': PosixPath('/tmp/tmp9vatjy_m/line_count.json')}\n" ] } ], @@ -372,7 +372,7 @@ "id": "18ec64f3", "metadata": {}, "source": [ - "### Mapping tags and advanced mapping" + "### Mapping keys and advanced mapping" ] }, { @@ -380,7 +380,7 @@ "id": "05e8ff25", "metadata": {}, "source": [ - "We just saw how the rightshift operator can be used to simplify the `MapData` operation creation. How about `MapTags`? We can get `MapTags` equivalent operation also by using the rightshift (`>>`) operator, but with the help of an additional function: `tag()`." + "We just saw how the rightshift operator can be used to simplify the `MapData` operation creation. How about `MapKeys`? 
We can get `MapKeys` equivalent operation also by using the rightshift (`>>`) operator, but with the help of an additional function: `key()`." ] }, { @@ -393,17 +393,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'experiment_day': 'info_day1'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", - "Tag: {'experiment_day': 'info_day2'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", - "Tag: {'experiment_day': 'info_day3'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", - "Tag: {'experiment_day': 'info_day4'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", - "Tag: {'experiment_day': 'info_day5'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" + "Key: {'experiment_day': 'info_day1'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", + "Key: {'experiment_day': 'info_day2'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", + "Key: {'experiment_day': 'info_day3'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", + "Key: {'experiment_day': 'info_day4'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", + "Key: {'experiment_day': 'info_day5'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" ] } ], "source": [ - "# use ob.tag to specifically map the tag key\n", - "(json_files >> ob.tag({\"file_name\": \"experiment_day\"})).head()" + "# use ob.key to specifically map the key key\n", + "(json_files >> ob.key({\"file_name\": \"experiment_day\"})).head()" ] }, { @@ -411,7 +411,7 @@ "id": "ac34eed4", "metadata": {}, "source": [ - "Now if you were to closely inspect `MapData` and `MapData`, you would know that it is capable of taking in some additional arguments such as `drop_unmapped`. Using `tag()` and `data()` helper functions would let you specify those arguments as well while using the `>>` operator." 
+ "Now if you were to closely inspect `MapData` and `MapData`, you would know that it is capable of taking in some additional arguments such as `drop_unmapped`. Using `key()` and `data()` helper functions would let you specify those arguments as well while using the `>>` operator." ] }, { @@ -424,11 +424,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'info_day1'}, Data: {}\n", - "Tag: {'file_name': 'info_day2'}, Data: {}\n", - "Tag: {'file_name': 'info_day3'}, Data: {}\n", - "Tag: {'file_name': 'info_day4'}, Data: {}\n", - "Tag: {'file_name': 'info_day5'}, Data: {}\n" + "Key: {'file_name': 'info_day1'}, Data: {}\n", + "Key: {'file_name': 'info_day2'}, Data: {}\n", + "Key: {'file_name': 'info_day3'}, Data: {}\n", + "Key: {'file_name': 'info_day4'}, Data: {}\n", + "Key: {'file_name': 'info_day5'}, Data: {}\n" ] } ], @@ -447,11 +447,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'info_day1'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", - "Tag: {'file_name': 'info_day2'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", - "Tag: {'file_name': 'info_day3'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", - "Tag: {'file_name': 'info_day4'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", - "Tag: {'file_name': 'info_day5'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" + "Key: {'file_name': 'info_day1'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", + "Key: {'file_name': 'info_day2'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", + "Key: {'file_name': 'info_day3'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", + "Key: {'file_name': 'info_day4'}, Data: {'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", + "Key: {'file_name': 'info_day5'}, Data: {'json_file': 
PosixPath('../examples/dataset2/info_day5.json')}\n" ] } ], @@ -494,8 +494,8 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_dataset1 = dataset1 >> ob.tag({\"file_name\": \"txt_file\"})\n", - "mapped_dataset2 = dataset2 >> ob.tag({\"file_name\": \"json_file\"})" + "mapped_dataset1 = dataset1 >> ob.key({\"file_name\": \"txt_file\"})\n", + "mapped_dataset2 = dataset2 >> ob.key({\"file_name\": \"json_file\"})" ] }, { @@ -508,11 +508,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'txt_file': 'day1', 'json_file': 'info_day1'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day2'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day3'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day4'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day5'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" + "Key: {'txt_file': 'day1', 'json_file': 'info_day1'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 'info_day2'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 'info_day3'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", + 
"Key: {'txt_file': 'day1', 'json_file': 'info_day4'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 'info_day5'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" ] } ], @@ -541,11 +541,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'txt_file': 'day1', 'json_file': 'info_day1'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day2'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day3'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day4'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", - "Tag: {'txt_file': 'day1', 'json_file': 'info_day5'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" + "Key: {'txt_file': 'day1', 'json_file': 'info_day1'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day1.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 'info_day2'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day2.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 'info_day3'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day3.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 
'info_day4'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day4.json')}\n", + "Key: {'txt_file': 'day1', 'json_file': 'info_day5'}, Data: {'text_file': PosixPath('../examples/dataset1/day1.txt'), 'json_file': PosixPath('../examples/dataset2/info_day5.json')}\n" ] } ], diff --git a/notebooks/old_tutorials/04_orcapod_tracker.ipynb b/notebooks/old_tutorials/04_orcapod_tracker.ipynb index fd96408c..d58f55cf 100644 --- a/notebooks/old_tutorials/04_orcapod_tracker.ipynb +++ b/notebooks/old_tutorials/04_orcapod_tracker.ipynb @@ -139,10 +139,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'day1'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/c4c3939c-aced-18ac-8a7c-330f07780bbd/keys.json'}\n", - "Tag: {'file_name': 'day2'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/61e78af0-346a-00f7-df3d-f2ec1693a84e/keys.json'}\n", - "Tag: {'file_name': 'day3'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/92fbcba4-a642-4105-8be2-c01ce9c3e12e/keys.json'}\n", - "Tag: {'file_name': 'day4'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/9e7da977-1ed3-03a2-ffd5-a3d626c286d8/keys.json'}\n" + "Key: {'file_name': 'day1'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/c4c3939c-aced-18ac-8a7c-330f07780bbd/keys.json'}\n", + "Key: {'file_name': 'day2'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/61e78af0-346a-00f7-df3d-f2ec1693a84e/keys.json'}\n", + "Key: {'file_name': 'day3'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/92fbcba4-a642-4105-8be2-c01ce9c3e12e/keys.json'}\n", + "Key: {'file_name': 'day4'}, Data: {'key_info': 'pod_data/extract_keys/cf978f9c23318c91/9e7da977-1ed3-03a2-ffd5-a3d626c286d8/keys.json'}\n" ] } ], @@ -160,10 +160,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'day1'}, Data: {'yaml_file': 
'pod_data/json_to_yaml/0a2282eda2b641e6/c4c3939c-aced-18ac-8a7c-330f07780bbd/data.yaml'}\n", - "Tag: {'file_name': 'day2'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/61e78af0-346a-00f7-df3d-f2ec1693a84e/data.yaml'}\n", - "Tag: {'file_name': 'day3'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/92fbcba4-a642-4105-8be2-c01ce9c3e12e/data.yaml'}\n", - "Tag: {'file_name': 'day4'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/9e7da977-1ed3-03a2-ffd5-a3d626c286d8/data.yaml'}\n" + "Key: {'file_name': 'day1'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/c4c3939c-aced-18ac-8a7c-330f07780bbd/data.yaml'}\n", + "Key: {'file_name': 'day2'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/61e78af0-346a-00f7-df3d-f2ec1693a84e/data.yaml'}\n", + "Key: {'file_name': 'day3'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/92fbcba4-a642-4105-8be2-c01ce9c3e12e/data.yaml'}\n", + "Key: {'file_name': 'day4'}, Data: {'yaml_file': 'pod_data/json_to_yaml/0a2282eda2b641e6/9e7da977-1ed3-03a2-ffd5-a3d626c286d8/data.yaml'}\n" ] } ], @@ -249,7 +249,7 @@ "id": "3b8fd7d0", "metadata": {}, "source": [ - "Does that match what you thought of the pipeline thus far? You might notice that while we used the convenience operator `>>` to map data keys, the corresponding `MapData` mapper actually shows up in the graph. Remember that `>>` is just for convenience, making the creation of `MapData` and `MapTags` more accesible." + "Does that match what you thought of the pipeline thus far? You might notice that while we used the convenience operator `>>` to map data keys, the corresponding `MapData` mapper actually shows up in the graph. Remember that `>>` is just for convenience, making the creation of `MapData` and `MapKeys` more accesible." 
] }, { diff --git a/notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb b/notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb index 4f038d7e..ee911150 100644 --- a/notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb +++ b/notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb @@ -502,11 +502,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'file_name': 'day1'}, Data: {'data_file': PosixPath('../examples/dataset1/day1.txt')}\n", - "Tag: {'file_name': 'day2'}, Data: {'data_file': PosixPath('../examples/dataset1/day2.txt')}\n", - "Tag: {'file_name': 'day3'}, Data: {'data_file': PosixPath('../examples/dataset1/day3.txt')}\n", - "Tag: {'file_name': 'day4'}, Data: {'data_file': PosixPath('../examples/dataset1/day4.txt')}\n", - "Tag: {'file_name': 'day6'}, Data: {'data_file': PosixPath('../examples/dataset1/day6.txt')}\n" + "Key: {'file_name': 'day1'}, Data: {'data_file': PosixPath('../examples/dataset1/day1.txt')}\n", + "Key: {'file_name': 'day2'}, Data: {'data_file': PosixPath('../examples/dataset1/day2.txt')}\n", + "Key: {'file_name': 'day3'}, Data: {'data_file': PosixPath('../examples/dataset1/day3.txt')}\n", + "Key: {'file_name': 'day4'}, Data: {'data_file': PosixPath('../examples/dataset1/day4.txt')}\n", + "Key: {'file_name': 'day6'}, Data: {'data_file': PosixPath('../examples/dataset1/day6.txt')}\n" ] } ], diff --git a/notebooks/tutorials/01_introduction_to_orcapod.ipynb b/notebooks/tutorials/01_introduction_to_orcapod.ipynb index 8a28472e..5929297e 100644 --- a/notebooks/tutorials/01_introduction_to_orcapod.ipynb +++ b/notebooks/tutorials/01_introduction_to_orcapod.ipynb @@ -93,7 +93,7 @@ "id": "c2ac8f32", "metadata": {}, "source": [ - "Use `op.streams.TableStream` to turn table into a stream. You will also have to specify which columns are the tags." + "Use `op.streams.TableStream` to turn table into a stream. You will also have to specify which columns are the keys." 
] }, { @@ -103,7 +103,7 @@ "metadata": {}, "outputs": [], "source": [ - "stream = op.sources.DataFrameSource(table, tag_columns=[\"a\", \"b\"])" + "stream = op.sources.DataFrameSource(table, key_columns=[\"a\", \"b\"])" ] }, { @@ -119,7 +119,7 @@ "id": "08a854e7", "metadata": {}, "source": [ - "Once you have a stream, you can iterate through tag, data pair:" + "Once you have a stream, you can iterate through key, data pair:" ] }, { @@ -132,15 +132,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tag: {'a': 1, 'b': 'x'}, Data: {'c': True, 'd': 1.1}\n", - "Tag: {'a': 2, 'b': 'y'}, Data: {'c': False, 'd': 2.2}\n", - "Tag: {'a': 3, 'b': 'z'}, Data: {'c': True, 'd': 3.3}\n" + "Key: {'a': 1, 'b': 'x'}, Data: {'c': True, 'd': 1.1}\n", + "Key: {'a': 2, 'b': 'y'}, Data: {'c': False, 'd': 2.2}\n", + "Key: {'a': 3, 'b': 'z'}, Data: {'c': True, 'd': 3.3}\n" ] } ], "source": [ - "for tag, data in stream:\n", - " print(f\"Tag: {tag}, Data: {data}\")" + "for key, data in stream:\n", + " print(f\"Key: {key}, Data: {data}\")" ] }, { @@ -148,7 +148,7 @@ "id": "41c7876b", "metadata": {}, "source": [ - "You can also get all tag data pairs as a list of tuples by calling `.flow()`" + "You can also get all key data pairs as a list of tuples by calling `.flow()`" ] }, { @@ -259,7 +259,7 @@ "id": "49b297f6", "metadata": {}, "source": [ - "`include_source` adds `source` column for each data (non-tag) column patterned like `_source_{column}` and will contain information about where that particular value orginated from." + "`include_source` adds `source` column for each data (non-key) column patterned like `_source_{column}` and will contain information about where that particular value orginated from." ] }, { @@ -459,7 +459,7 @@ "id": "7ce05b68", "metadata": {}, "source": [ - "### Tags and Datas" + "### Keys and Datas" ] }, { @@ -467,7 +467,7 @@ "id": "20783626", "metadata": {}, "source": [ - "The tags and data returned by the streams can be thought of as special dictionary." 
+ "The keys and data returned by the streams can be thought of as special dictionary." ] }, { @@ -517,7 +517,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_tags_and_data = stream.flow()" + "all_keys_and_data = stream.flow()" ] }, { @@ -527,7 +527,7 @@ "metadata": {}, "outputs": [], "source": [ - "tag, data = all_tags_and_data[0]" + "key, data = all_keys_and_data[0]" ] }, { @@ -548,7 +548,7 @@ } ], "source": [ - "tag" + "key" ] }, { @@ -577,7 +577,7 @@ "id": "17be117a", "metadata": {}, "source": [ - "The element of tag/data can be accessed just like dictionary:" + "The element of key/data can be accessed just like dictionary:" ] }, { @@ -598,7 +598,7 @@ } ], "source": [ - "tag[\"a\"]" + "key[\"a\"]" ] }, { @@ -619,7 +619,7 @@ } ], "source": [ - "tag[\"b\"]" + "key[\"b\"]" ] }, { @@ -806,7 +806,7 @@ "id": "37ad91d0", "metadata": {}, "source": [ - "You can also get a plain dictionary from tag/data with `as_dict`" + "You can also get a plain dictionary from key/data with `as_dict`" ] }, { @@ -827,7 +827,7 @@ } ], "source": [ - "tag.as_dict()" + "key.as_dict()" ] }, { @@ -927,7 +927,7 @@ "id": "98ab6fc7", "metadata": {}, "source": [ - "The hash of tag/data can be computed with `content_hash()` method. The result will be cached so that it won't be computed again unnecessarily." + "The hash of key/data can be computed with `content_hash()` method. The result will be cached so that it won't be computed again unnecessarily." ] }, { @@ -948,7 +948,7 @@ } ], "source": [ - "tag.content_hash()" + "key.content_hash()" ] }, { @@ -998,8 +998,8 @@ " }\n", ")\n", "\n", - "stream1 = op.streams.ArrowTableStream(table1, tag_columns=[\"id\"])\n", - "stream2 = op.streams.ArrowTableStream(table2, tag_columns=[\"id\"])" + "stream1 = op.streams.ArrowTableStream(table1, key_columns=[\"id\"])\n", + "stream2 = op.streams.ArrowTableStream(table2, key_columns=[\"id\"])" ] }, { @@ -1022,7 +1022,7 @@ "shape: (3, 3)
*idab
i64i64str
01"x"
12"y"
43"z"
" ], "text/plain": [ - "ArrowTableStream(table=['id', 'a', 'b'], tag_columns=('id',))" + "ArrowTableStream(table=['id', 'a', 'b'], key_columns=('id',))" ] }, "execution_count": 35, @@ -1054,7 +1054,7 @@ "shape: (3, 3)
*idcd
i64boolf64
0true1.1
1false2.2
2true3.3
" ], "text/plain": [ - "ArrowTableStream(table=['id', 'c', 'd'], tag_columns=('id',))" + "ArrowTableStream(table=['id', 'c', 'd'], key_columns=('id',))" ] }, "execution_count": 36, @@ -1130,7 +1130,7 @@ "shape: (2, 5)
*idabcd
i64i64strboolf64
01"x"true1.1
12"y"false2.2
" ], "text/plain": [ - "DynamicPodStream(kernel=Join, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], tag_columns=('id',)), ArrowTableStream(table=['id', 'c', 'd'], tag_columns=('id',))))" + "DynamicPodStream(kernel=Join, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], key_columns=('id',)), ArrowTableStream(table=['id', 'c', 'd'], key_columns=('id',))))" ] }, "execution_count": 39, @@ -1178,7 +1178,7 @@ "shape: (2, 5)
*idabcd
i64i64strboolf64
01"x"true1.1
12"y"false2.2
" ], "text/plain": [ - "DynamicPodStream(kernel=Join, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], tag_columns=('id',)), ArrowTableStream(table=['id', 'c', 'd'], tag_columns=('id',))))" + "DynamicPodStream(kernel=Join, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], key_columns=('id',)), ArrowTableStream(table=['id', 'c', 'd'], key_columns=('id',))))" ] }, "execution_count": 40, @@ -1210,7 +1210,7 @@ "shape: (2, 3)
*idab
i64i64str
01"x"
12"y"
" ], "text/plain": [ - "DynamicPodStream(kernel=SemiJoin, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], tag_columns=('id',)), ArrowTableStream(table=['id', 'c', 'd'], tag_columns=('id',))))" + "DynamicPodStream(kernel=SemiJoin, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], key_columns=('id',)), ArrowTableStream(table=['id', 'c', 'd'], key_columns=('id',))))" ] }, "execution_count": 41, @@ -1242,7 +1242,7 @@ "shape: (3, 2)
*ida_mapped
i64i64
01
12
43
" ], "text/plain": [ - "DynamicPodStream(kernel=MapData, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" + "DynamicPodStream(kernel=MapData, upstreams=(ArrowTableStream(table=['id', 'a', 'b'], key_columns=('id',)),))" ] }, "execution_count": 42, @@ -1263,7 +1263,7 @@ { "data": { "text/html": [ - "DynamicPodStream[MapTags]\n", + "DynamicPodStream[MapKeys]\n", "