diff --git a/embeddings/CHUNKING.md b/embeddings/CHUNKING.md new file mode 100644 index 00000000..9e48858a --- /dev/null +++ b/embeddings/CHUNKING.md @@ -0,0 +1,91 @@ +# Embedding strategy FFI (embeddings lib → Manticore daemon) + +Embeddings lib **v8**. The one embedding call, `make_vect_embeddings`, takes an +optional `ChunkSettings*` that selects how a document becomes one or many +vectors. Cardinality is carried as **data** in the return (a per-document +offsets sidecar), so a single method covers both 1-vector and N-vector +strategies — no second method. + +## FFI + +```c +constexpr uint32_t STRATEGY_TRUNCATE = 0; // 1 vector/doc +constexpr uint32_t STRATEGY_MEAN = 1; // 1 vector/doc +constexpr uint32_t STRATEGY_FIXED = 2; // N vectors/doc +constexpr uint32_t STRATEGY_RECURSIVE = 3; // N vectors/doc +constexpr uint32_t STRATEGY_SENTENCE = 4; // N vectors/doc + +struct ChunkSettings { + uint32_t strategy; // one of STRATEGY_* + uint32_t max_tokens; // chunk size in tokens; 0 = model max + uint32_t overlap_tokens; // token overlap between chunks; 0 = none + uint32_t max_chunks; // cap on chunks/doc; 0 = unlimited (overflow merges into last) +}; + +// settings == nullptr ⇒ truncate (today's behavior) +FloatVecResult make_vect_embeddings( + const TextModelWrapper*, const StringItem* texts, uintptr_t count, + const ChunkSettings*, int32_t threads); +``` + +### Return: flat vectors + per-row offsets (cardinality is data) + +```c +struct FloatVecResult { + char *m_szError; + const FloatVec *m_tEmbedding; // FLAT: every document's vectors concatenated, `len` total + uintptr_t len; + uintptr_t cap; + const uintptr_t *m_pRowOffsets; // length rows+1; doc i = m_tEmbedding[off[i] .. off[i+1]] + uintptr_t rows; // number of input documents + uintptr_t offsets_cap; +}; +``` + +Read document `i`'s vectors as `m_tEmbedding[m_pRowOffsets[i] .. m_pRowOffsets[i+1]]`. +- `truncate`/`mean` → one vector/doc, so `len == rows == count` and offsets are + `[0, 1, …, rows]` (you may just index `m_tEmbedding[i]`). +- `fixed`/`recursive`/`sentence` → N vectors/doc; `len` = total chunks, and the + offsets group them per document. + +Free with `free_vec_result` (it frees the offsets too). Load-time check: +`EmbedLib.version == 8`. + +## Strategies + +| strategy | val | output | what it does | +|---|---|---|---| +| **truncate** | 0 | 1 vec/doc | embed the first `max_tokens` tokens (rest dropped). `max_tokens`/`overlap`/`max_chunks` ignored. | +| **mean** | 1 | 1 vec/doc | split (recursive, token-aware) → embed every chunk → **average** into one L2-normalized vector. Whole document, no tail loss. | +| **fixed** | 2 | N vecs/doc | split into fixed `max_tokens`-token windows; one vector per chunk. | +| **recursive** | 3 | N vecs/doc | split on a separator hierarchy (paragraph → line → space) ≤ `max_tokens`; one vector per chunk. | +| **sentence** | 4 | N vecs/doc | UAX-29 sentence segmentation (ES-style), grouped to `max_tokens`; one vector per chunk. | + +- `max_tokens = 0` → the model's own max input length. +- `overlap_tokens` → token overlap between chunks (multi-vector + mean). +- `max_chunks` → cap chunks/doc; overflow merges the tail into the last chunk. +- Local models chunk on the model's real subword tokens; remote API models + (OpenAI/Voyage/Jina) chunk by a char/byte heuristic (no local tokenizer). + +## Calling it + +```cpp +// non-chunked field & queries: pass nullptr → truncate +pFuncs->make_vect_embeddings( &model, items.data(), items.size(), nullptr, iThreads ); + +// any strategy, taken from a table option: +ChunkSettings tCfg { STRATEGY_SENTENCE, /*max_tokens*/ 256, /*overlap*/ 0, /*max_chunks*/ 0 }; +FloatVecResult tRes = pFuncs->make_vect_embeddings( &model, items.data(), items.size(), &tCfg, iThreads ); +// doc i owns vectors tRes.m_tEmbedding[ tRes.m_pRowOffsets[i] .. tRes.m_pRowOffsets[i+1] ] +``` + +The daemon owns the SQL/DDL surface (e.g. a per-`float_vector`-field option), +parses it into a `ChunkSettings`, and passes it on every embed call. Queries are +short: pass `nullptr` so they are never chunked. + +## Daemon side (next phase, not this lib) + +The lib already returns N vectors/doc for `fixed`/`recursive`/`sentence` via the +offsets sidecar. To *use* them the daemon needs N-vectors-per-row storage + +max-over-chunks search; `truncate`/`mean` (1 vector/doc) work with today's +storage unchanged. diff --git a/embeddings/Cargo.lock b/embeddings/Cargo.lock index 452ff0f8..118f69d9 100644 --- a/embeddings/Cargo.lock +++ b/embeddings/Cargo.lock @@ -1889,6 +1889,7 @@ dependencies = [ "serde", "serde_json", "tokenizers", + "unicode-segmentation", ] [[package]] diff --git a/embeddings/Cargo.toml b/embeddings/Cargo.toml index b1c0b3cf..ffa9a673 100644 --- a/embeddings/Cargo.toml +++ b/embeddings/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" # For local dev with ../../candle, add a [patch] section to use path deps. [dependencies] tokenizers = "0.15.2" +unicode-segmentation = "1" hf-hub = { git = "https://github.com/huggingface/hf-hub.git", rev = "ac22200ea0b5af4d8c362f699be0340647b19060", default-features = false,features = ["ureq"] } anyhow = "1.0.81" serde_json = "1.0.114" diff --git a/embeddings/INSTRUCTIONS.md b/embeddings/INSTRUCTIONS.md index 069aafd2..b22c797c 100644 --- a/embeddings/INSTRUCTIONS.md +++ b/embeddings/INSTRUCTIONS.md @@ -53,7 +53,7 @@ src/ 1. **`src/model/text_model_wrapper.rs`** - The heart of FFI - `load_model()` - Creates model instances - - `make_vect_embeddings()` - Generates embeddings (FIXED: no empty vectors on error) + - `make_vect_embeddings()` - Generates embeddings; takes an optional `ChunkSettings*` selecting the strategy (truncate / mean / fixed / recursive / sentence). Returns a flat `FloatVecResult` grouped per document by `m_pRowOffsets`. **Contract + strategy reference: [`CHUNKING.md`](CHUNKING.md).** - `free_model_result()` / `free_vec_result()` - Memory cleanup 2. **`manticoresearch_text_embeddings.h`** - Auto-generated C header diff --git a/embeddings/manticoresearch_text_embeddings.h b/embeddings/manticoresearch_text_embeddings.h index 3a8d5ef3..ac494995 100644 --- a/embeddings/manticoresearch_text_embeddings.h +++ b/embeddings/manticoresearch_text_embeddings.h @@ -9,6 +9,17 @@ #include #include +/// Strategy, mirrored as a `u32` across the FFI in [`ChunkSettings`]. +constexpr static const uint32_t STRATEGY_TRUNCATE = 0; + +constexpr static const uint32_t STRATEGY_MEAN = 1; + +constexpr static const uint32_t STRATEGY_FIXED = 2; + +constexpr static const uint32_t STRATEGY_RECURSIVE = 3; + +constexpr static const uint32_t STRATEGY_SENTENCE = 4; + struct TextModelResult { void *m_pModel; char *m_szError; @@ -33,11 +44,25 @@ struct FloatVec { uintptr_t cap; }; +/// Embedding result for one `make_vect_embeddings` call. +/// +/// `m_tEmbedding` is a FLAT array of `len` vectors — every input document's +/// vectors concatenated. `m_pRowOffsets` (length `rows + 1`) groups them per +/// input document, Arrow-style: document `i` owns +/// `m_tEmbedding[m_pRowOffsets[i] .. m_pRowOffsets[i + 1]]`. For the v1 +/// strategies (truncate / mean) every document yields exactly one vector, so +/// `len == rows` and the offsets are `[0, 1, ..., rows]`. The sidecar lets a +/// future multi-vector strategy return N vectors per document through this same +/// struct — no second method, cardinality carried as data. +/// struct FloatVecResult { char *m_szError; const FloatVec *m_tEmbedding; uintptr_t len; uintptr_t cap; + const uintptr_t *m_pRowOffsets; + uintptr_t rows; + uintptr_t offsets_cap; }; using TextModelWrapper = void*; @@ -47,9 +72,25 @@ struct StringItem { uintptr_t len; }; +/// Chunking parameters. `#[repr(C)]` — passed straight across the FFI by the +/// daemon, which owns the DDL surface and validates against the model. +struct ChunkSettings { + /// One of the `STRATEGY_*` constants. + uint32_t strategy; + /// Target chunk size in tokens. `0` ⇒ use the model's max. Always clamped to + /// the model's real input limit. + uint32_t max_tokens; + /// Token overlap between consecutive chunks. `0` ⇒ none. + uint32_t overlap_tokens; + /// Hard cap on chunks per document. `0` ⇒ unlimited. Overflow merges the + /// tail into the last chunk (matches OpenSearch's `max_chunk_limit`). + uint32_t max_chunks; +}; + using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*, const StringItem*, uintptr_t, + const ChunkSettings*, int32_t); using FreeVecResultFn = void(*)(FloatVecResult); diff --git a/embeddings/src/chunk.rs b/embeddings/src/chunk.rs new file mode 100644 index 00000000..640021ff --- /dev/null +++ b/embeddings/src/chunk.rs @@ -0,0 +1,500 @@ +//! Document chunking + embedding strategies. +//! +//! One document maps to one or many vectors depending on the strategy: +//! - `truncate` / `mean` collapse to **one** vector per document. +//! - `fixed` / `recursive` / `sentence` keep **N** vectors per document (one per +//! chunk) — the daemon groups them via the row-offsets sidecar. +//! +//! Local models chunk token-accurately via their loaded tokenizer; remote API +//! models (no local tokenizer) fall back to a conservative char/byte heuristic. +//! `ChunkSettings` arrives from the daemon's DDL surface and selects everything. + +use crate::utils::normalize; +use tokenizers::Tokenizer; + +/// Strategy, mirrored as a `u32` across the FFI in [`ChunkSettings`]. +pub const STRATEGY_TRUNCATE: u32 = 0; // 1 vector/doc: first `max_tokens` tokens +pub const STRATEGY_MEAN: u32 = 1; // 1 vector/doc: chunk → embed → average +pub const STRATEGY_FIXED: u32 = 2; // N vectors/doc: fixed token windows +pub const STRATEGY_RECURSIVE: u32 = 3; // N vectors/doc: recursive token-aware split +pub const STRATEGY_SENTENCE: u32 = 4; // N vectors/doc: UAX-29 sentence segmentation + +/// Special tokens (`[CLS]`/`[SEP]`) that `predict()` re-adds when it re-tokenizes +/// a chunk string. We size chunks to leave room so a chunk is never truncated. +const SPECIAL_TOKENS_MARGIN: usize = 2; + +/// Conservative bytes-per-token for the remote heuristic. Deliberately low so a +/// heuristic chunk lands *under* the provider's token cap rather than over it. +const REMOTE_BYTES_PER_TOKEN: usize = 3; + +/// Chunking parameters. `#[repr(C)]` — passed straight across the FFI by the +/// daemon, which owns the DDL surface and validates against the model. +#[repr(C)] +pub struct ChunkSettings { + /// One of the `STRATEGY_*` constants. + pub strategy: u32, + /// Target chunk size in tokens. `0` ⇒ use the model's max. Always clamped to + /// the model's real input limit. + pub max_tokens: u32, + /// Token overlap between consecutive chunks. `0` ⇒ none. + pub overlap_tokens: u32, + /// Hard cap on chunks per document. `0` ⇒ unlimited. Overflow merges the + /// tail into the last chunk (matches OpenSearch's `max_chunk_limit`). + pub max_chunks: u32, +} + +impl ChunkSettings { + /// True when the document must be split (every strategy except `truncate`). + pub fn needs_chunking(&self) -> bool { + self.strategy != STRATEGY_TRUNCATE + } + + /// True when the strategy collapses to one vector per document + /// (`truncate`/`mean`); false for the multi-vector strategies. + pub fn is_single_vector(&self) -> bool { + self.strategy == STRATEGY_TRUNCATE || self.strategy == STRATEGY_MEAN + } +} + +/// Resolve the chunk size in *content* tokens, clamped to what the model can +/// actually take (minus the special-token budget). `0` ⇒ model max. +pub fn effective_max(settings: &ChunkSettings, model_max: usize) -> usize { + let ceiling = model_max.saturating_sub(SPECIAL_TOKENS_MARGIN).max(1); + let target = settings.max_tokens as usize; + if target == 0 { + ceiling + } else { + target.min(ceiling) + } +} + +/// Average a document's chunk embeddings into one L2-normalized vector. Chunk +/// vectors are already normalized by `predict`; the mean of unit vectors is +/// re-normalized so the result is a unit vector too. +pub fn mean_pool(chunks: &[Vec]) -> Vec { + let dim = chunks.first().map(Vec::len).unwrap_or(0); + let mut acc = vec![0.0f32; dim]; + for v in chunks { + for (a, x) in acc.iter_mut().zip(v.iter()) { + *a += *x; + } + } + let n = chunks.len().max(1) as f32; + for a in acc.iter_mut() { + *a /= n; + } + normalize(&mut acc); + acc +} + +/// Arrow-style per-row offsets (length `counts.len() + 1`) from per-document +/// vector counts: document `i` owns vectors `[offsets[i] .. offsets[i + 1]]`. +pub fn row_offsets_from_counts(counts: &[usize]) -> Vec { + let mut offsets = Vec::with_capacity(counts.len() + 1); + let mut acc = 0usize; + offsets.push(0); + for &c in counts { + acc += c; + offsets.push(acc); + } + offsets +} + +/// Split one document into chunk byte spans `(start, end)` according to the +/// strategy's split method. `tokenizer` is `Some` for local models (token-exact) +/// and `None` for remote API models (char heuristic). `STRATEGY_TRUNCATE` never +/// reaches here (it does not chunk); `STRATEGY_MEAN` splits recursively. +pub fn chunk_text( + text: &str, + max_tokens: usize, + overlap: usize, + strategy: u32, + tokenizer: Option<&Tokenizer>, +) -> Vec<(usize, usize)> { + match strategy { + STRATEGY_SENTENCE => chunk_sentence(text, max_tokens, overlap, tokenizer), + STRATEGY_FIXED => window(text, max_tokens, overlap, false, tokenizer), + // recursive (3), mean's internal split (1), and any other → recursive. + _ => window(text, max_tokens, overlap, true, tokenizer), + } +} + +/// Token-window split, picking the token-accurate or heuristic backend. +fn window( + text: &str, + max_tokens: usize, + overlap: usize, + snap: bool, + tokenizer: Option<&Tokenizer>, +) -> Vec<(usize, usize)> { + match tokenizer { + Some(t) => chunk_local(text, t, max_tokens, overlap, snap), + None => chunk_chars(text, max_tokens, overlap), + } +} + +/// Token-accurate chunking for local models. One tokenization pass; windows by +/// token count; `recursive` snaps each cut back to a natural boundary; overlap +/// is realigned to the (possibly snapped) cut so no tokens are dropped. +/// +/// NOTE: assumes `tokenizer.encode(..).get_offsets()` returns byte offsets into +/// the input (true for the WordPiece/BPE paths Manticore loads). +pub fn chunk_local( + text: &str, + tokenizer: &Tokenizer, + max_tokens: usize, + overlap: usize, + recursive: bool, +) -> Vec<(usize, usize)> { + if text.is_empty() { + return vec![(0, 0)]; + } + // `false` = no special tokens, so offsets map cleanly onto the input text. + let encoding = match tokenizer.encode(text, false) { + Ok(e) => e, + // Tokenizer failure (rare): fall back to the whole doc as one chunk. + // predict() truncates it if needed — degrades to truncate, but is safe. + Err(_) => return vec![(0, text.len())], + }; + let offsets = encoding.get_offsets(); + let n = offsets.len(); + if max_tokens == 0 || n <= max_tokens { + return vec![(0, text.len())]; + } + + let overlap = overlap.min(max_tokens / 2); + let mut chunks = Vec::new(); + let mut start = 0usize; // start token index + loop { + let win_end = (start + max_tokens).min(n); // exclusive token index + let byte_start = offsets[start].0; + let mut byte_end = offsets[win_end - 1].1; + + if recursive && win_end < n { + if let Some(cut) = snap_back(text, byte_start, byte_end) { + byte_end = cut; + } + } + chunks.push((byte_start, byte_end)); + + // First token that begins at/after the (possibly snapped) cut. + let cut_tok = offsets.partition_point(|o| o.0 < byte_end); + if cut_tok >= n { + break; + } + start = cut_tok.saturating_sub(overlap).max(start + 1); // step back overlap, always progress + } + chunks +} + +/// Char/byte heuristic for remote API models (no local tokenizer). Windows by an +/// estimated byte budget, snapping to whitespace and never cutting mid-UTF-8. +pub fn chunk_chars(text: &str, max_tokens: usize, overlap_tokens: usize) -> Vec<(usize, usize)> { + if text.is_empty() { + return vec![(0, 0)]; + } + let window = (max_tokens * REMOTE_BYTES_PER_TOKEN).max(1); + if text.len() <= window { + return vec![(0, text.len())]; + } + let overlap = (overlap_tokens * REMOTE_BYTES_PER_TOKEN).min(window / 2); + + let mut chunks = Vec::new(); + let mut start = 0usize; + while start < text.len() { + let mut end = text.floor_char_boundary((start + window).min(text.len())); + if end <= start { + // A single char wider than the window — take at least one char. + end = text.ceil_char_boundary(start + 1).min(text.len()); + } + if end < text.len() { + if let Some(cut) = snap_back(text, start, end) { + end = cut; + } + } + chunks.push((start, end)); + if end >= text.len() { + break; + } + let mut next = end.saturating_sub(overlap); + if next <= start { + next = end; + } + start = text.ceil_char_boundary(next); + } + chunks +} + +/// Count tokens in `text`: exact via the model tokenizer when available, else +/// the conservative byte estimate the remote heuristic uses. +fn count_tokens(text: &str, tokenizer: Option<&Tokenizer>) -> usize { + match tokenizer { + Some(t) => t + .encode(text, false) + .map(|e| e.get_ids().len()) + .unwrap_or(text.len() / REMOTE_BYTES_PER_TOKEN + 1), + None => text.len() / REMOTE_BYTES_PER_TOKEN + 1, + } +} + +/// UAX-29 sentence boundaries — the same standard Elasticsearch's ICU uses — +/// as contiguous byte spans tiling the whole document. +fn sentence_spans(text: &str) -> Vec<(usize, usize)> { + use unicode_segmentation::UnicodeSegmentation; + text.split_sentence_bound_indices() + .map(|(offset, s)| (offset, offset + s.len())) + .collect() +} + +/// Sentence chunking, mirroring Elasticsearch's algorithm: detect sentences +/// (UAX-29), then greedily group whole sentences up to `max_tokens`. A single +/// sentence larger than the budget is split with the token-window splitter (ES +/// splits such a sentence across chunks). `overlap` re-seeds the next chunk with +/// trailing whole sentences summing ≤ `overlap` tokens. +pub fn chunk_sentence( + text: &str, + max_tokens: usize, + overlap: usize, + tokenizer: Option<&Tokenizer>, +) -> Vec<(usize, usize)> { + if text.is_empty() { + return vec![(0, 0)]; + } + let units: Vec<(usize, usize, usize)> = sentence_spans(text) + .into_iter() + .map(|(a, b)| (a, b, count_tokens(&text[a..b], tokenizer))) + .collect(); + // 0/1 sentence (e.g. terminator-free text): nothing to group on — fall back + // to the token-window splitter so an over-long blob still gets divided. + if units.len() <= 1 { + return window(text, max_tokens, overlap, true, tokenizer); + } + + let mut chunks = Vec::new(); + let mut i = 0usize; + while i < units.len() { + // Greedily pack whole sentences up to the budget (always take ≥ 1). + let mut j = i; + let mut sum = 0usize; + while j < units.len() { + let t = units[j].2; + if j > i && sum + t > max_tokens { + break; + } + sum += t; + j += 1; + } + + let start = units[i].0; + let end = units[j - 1].1; + if j == i + 1 && units[i].2 > max_tokens { + // One sentence bigger than the budget — split it like ES does. + for (a, b) in window(&text[start..end], max_tokens, overlap, true, tokenizer) { + chunks.push((start + a, start + b)); + } + } else { + chunks.push((start, end)); + } + + if j >= units.len() { + break; + } + + // Overlap: re-seed with trailing whole sentences summing ≤ overlap, + // never back to i (guarantee forward progress). + let mut next = j; + if overlap > 0 { + let mut acc = 0usize; + let mut k = j; + while k > i + 1 { + let t = units[k - 1].2; + if acc + t > overlap { + break; + } + acc += t; + k -= 1; + } + next = k.max(i + 1); + } + i = next; + } + chunks +} + +/// Enforce `max_chunks`: if exceeded, merge the overflow tail into the last kept +/// chunk so no document text is dropped. `0` ⇒ unlimited. +pub fn cap_chunks(mut chunks: Vec<(usize, usize)>, max_chunks: usize) -> Vec<(usize, usize)> { + if max_chunks == 0 || chunks.len() <= max_chunks { + return chunks; + } + let last_end = chunks.last().map(|c| c.1).unwrap_or(0); + chunks.truncate(max_chunks); + if let Some(last) = chunks.last_mut() { + last.1 = last_end; // extend to cover everything merged in + } + chunks +} + +/// Pull a chunk's end back to the nearest paragraph/line/sentence/space boundary +/// within `[start, end)`, but never past the chunk midpoint (don't over-shrink). +/// Returns the new end byte offset, or `None` if no good boundary was found. +fn snap_back(text: &str, start: usize, end: usize) -> Option { + let floor = start + (end - start) / 2; + for sep in ["\n\n", "\n", ". ", " "] { + if let Some(pos) = text[start..end].rfind(sep) { + let cut = start + pos + sep.len(); + if cut > floor && cut < end { + return Some(cut); + } + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + fn settings(strategy: u32, max: u32, overlap: u32, max_chunks: u32) -> ChunkSettings { + ChunkSettings { + strategy, + max_tokens: max, + overlap_tokens: overlap, + max_chunks, + } + } + + #[test] + fn strategy_classification() { + assert!(!settings(STRATEGY_TRUNCATE, 0, 0, 0).needs_chunking()); + assert!(settings(STRATEGY_MEAN, 0, 0, 0).needs_chunking()); + assert!(settings(STRATEGY_SENTENCE, 0, 0, 0).needs_chunking()); + assert!(settings(STRATEGY_TRUNCATE, 0, 0, 0).is_single_vector()); + assert!(settings(STRATEGY_MEAN, 0, 0, 0).is_single_vector()); + assert!(!settings(STRATEGY_FIXED, 0, 0, 0).is_single_vector()); + assert!(!settings(STRATEGY_SENTENCE, 0, 0, 0).is_single_vector()); + } + + #[test] + fn effective_max_defaults_clamps_and_floors() { + assert_eq!(effective_max(&settings(STRATEGY_MEAN, 0, 0, 0), 512), 510); + assert_eq!( + effective_max(&settings(STRATEGY_MEAN, 100_000, 0, 0), 512), + 510 + ); + assert_eq!( + effective_max(&settings(STRATEGY_MEAN, 256, 0, 0), 8192), + 256 + ); + assert!(effective_max(&settings(STRATEGY_MEAN, 0, 0, 0), 0) >= 1); + } + + #[test] + fn mean_pool_averages_then_normalizes() { + let m = mean_pool(&[vec![1.0, 0.0], vec![0.0, 1.0]]); + let e = std::f32::consts::FRAC_1_SQRT_2; + assert!((m[0] - e).abs() < 1e-6 && (m[1] - e).abs() < 1e-6); + } + + #[test] + fn mean_pool_single_chunk_is_normalized() { + let m = mean_pool(&[vec![3.0, 4.0]]); + assert!((m[0] - 0.6).abs() < 1e-6 && (m[1] - 0.8).abs() < 1e-6); + } + + #[test] + fn row_offsets_prefix_sum() { + assert_eq!(row_offsets_from_counts(&[3, 1, 2]), vec![0, 3, 4, 6]); + assert_eq!(row_offsets_from_counts(&[1, 1, 1]), vec![0, 1, 2, 3]); + assert_eq!(row_offsets_from_counts(&[]), vec![0]); + } + + #[test] + fn short_text_is_one_chunk() { + assert_eq!(chunk_chars("hello world", 100, 0), vec![(0, 11)]); + } + + #[test] + fn chunk_chars_tiles_document_with_no_overlap() { + let text = "word ".repeat(2000); + let chunks = chunk_chars(&text, 100, 0); + assert!(chunks.len() > 1); + assert_eq!(chunks.first().unwrap().0, 0); + assert_eq!(chunks.last().unwrap().1, text.len()); + for w in chunks.windows(2) { + assert_eq!(w[0].1, w[1].0, "no gap/overlap when overlap=0"); + } + for (a, b) in &chunks { + assert!(b > a && text.is_char_boundary(*a) && text.is_char_boundary(*b)); + } + } + + #[test] + fn chunk_chars_overlap_steps_back() { + let text = "word ".repeat(2000); + let chunks = chunk_chars(&text, 100, 20); + assert!(chunks.len() > 1); + for w in chunks.windows(2) { + assert!(w[1].0 < w[0].1, "overlap: next starts before prev end"); + } + assert_eq!(chunks.last().unwrap().1, text.len()); + } + + #[test] + fn chunk_chars_multibyte_never_splits_a_char() { + let text = "🦀 данные ".repeat(500); + let chunks = chunk_chars(&text, 50, 5); + for (a, b) in &chunks { + assert!(text.is_char_boundary(*a) && text.is_char_boundary(*b)); + } + assert_eq!(chunks.last().unwrap().1, text.len()); + } + + #[test] + fn chunk_sentence_splits_at_uax29_boundary() { + let text = "Aaa bbb ccc. Ddd eee fff."; + let chunks = chunk_sentence(text, 5, 0, None); + assert_eq!(chunks.len(), 2); + assert_eq!(chunks[0].0, 0); + assert_eq!(chunks[1].1, text.len()); + assert_eq!(chunks[0].1, chunks[1].0); + assert!((12..=13).contains(&chunks[0].1)); + } + + #[test] + fn chunk_sentence_packs_and_splits_oversized() { + let text = "S one. S two. S three. S four. S five. S six."; + assert_eq!(chunk_sentence(text, 100, 0, None).len(), 1); + assert!(chunk_sentence(text, 3, 0, None).len() > 1); + // terminator-free over-long "sentence" still splits + let blob = "word ".repeat(1000); + assert!(chunk_sentence(&blob, 100, 0, None).len() > 1); + } + + #[test] + fn chunk_text_routes_every_multivector_strategy() { + let text = "word ".repeat(400); + for s in [ + STRATEGY_FIXED, + STRATEGY_RECURSIVE, + STRATEGY_SENTENCE, + STRATEGY_MEAN, + ] { + let chunks = chunk_text(&text, 50, 0, s, None); + assert!(chunks.len() > 1, "strategy {s} should split"); + assert_eq!(chunks.first().unwrap().0, 0); + assert_eq!(chunks.last().unwrap().1, text.len()); + } + } + + #[test] + fn cap_chunks_merges_tail_and_noops() { + assert_eq!( + cap_chunks(vec![(0, 10), (10, 20), (20, 30), (30, 45)], 2), + vec![(0, 10), (10, 45)] + ); + let two = vec![(0, 10), (10, 20)]; + assert_eq!(cap_chunks(two.clone(), 5), two); + assert_eq!(cap_chunks(two.clone(), 0), two); + } +} diff --git a/embeddings/src/error_handling_test.rs b/embeddings/src/error_handling_test.rs index 3b2d3b70..15ac8897 100644 --- a/embeddings/src/error_handling_test.rs +++ b/embeddings/src/error_handling_test.rs @@ -152,6 +152,9 @@ mod tests { ptr: ptr::null(), len: 0, cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }; // Verify error is set and no vectors are present @@ -194,6 +197,9 @@ mod tests { ptr: float_vecs.as_ptr(), len: float_vecs.len(), cap: float_vecs.capacity(), + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }; // Verify successful result structure diff --git a/embeddings/src/ffi.rs b/embeddings/src/ffi.rs index be53847e..eae47276 100644 --- a/embeddings/src/ffi.rs +++ b/embeddings/src/ffi.rs @@ -1,3 +1,4 @@ +use crate::chunk::ChunkSettings; use crate::model::text_model_wrapper::{ FloatVecResult, StringItem, TextModelResult, TextModelWrapper, }; @@ -18,8 +19,13 @@ type LoadModelFn = extern "C" fn( type FreeModelResultFn = extern "C" fn(TextModelResult); -type MakeVectEmbeddingsFn = - extern "C" fn(&TextModelWrapper, *const StringItem, usize, i32) -> FloatVecResult; +type MakeVectEmbeddingsFn = extern "C" fn( + &TextModelWrapper, + *const StringItem, + usize, + *const ChunkSettings, + i32, +) -> FloatVecResult; type FreeVecResultFn = extern "C" fn(FloatVecResult); @@ -62,7 +68,7 @@ pub struct EmbedLib { const VERSION_STR: &[u8] = concat!(env!("EMBEDDINGS_VERSION_STR"), "\0").as_bytes(); const LIB: EmbedLib = EmbedLib { - version: 4usize, + version: 8usize, version_str: VERSION_STR.as_ptr() as *const c_char, load_model: TextModelWrapper::load_model, free_model_result: TextModelWrapper::free_model_result, diff --git a/embeddings/src/lib.rs b/embeddings/src/lib.rs index d7c11f44..dec5fd09 100644 --- a/embeddings/src/lib.rs +++ b/embeddings/src/lib.rs @@ -1,3 +1,4 @@ +mod chunk; mod error; mod ffi; mod model; @@ -19,6 +20,10 @@ mod error_handling_test; #[cfg(test)] mod panic_guard_test; +pub use chunk::{ + ChunkSettings, STRATEGY_FIXED, STRATEGY_MEAN, STRATEGY_RECURSIVE, STRATEGY_SENTENCE, + STRATEGY_TRUNCATE, +}; pub use error::LibError; pub use ffi::{EmbedLib, GetLibFuncs}; pub use model::TextModel; diff --git a/embeddings/src/model/ffi_test.rs b/embeddings/src/model/ffi_test.rs index 322012fa..2ce3a2e0 100644 --- a/embeddings/src/model/ffi_test.rs +++ b/embeddings/src/model/ffi_test.rs @@ -75,8 +75,13 @@ mod tests { model_ptr as *mut std::ffi::c_void, ) }; - let vec_result = - TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, 0); + let vec_result = TextModelWrapper::make_vect_embeddings( + &wrapper, + items.as_ptr(), + 1, + std::ptr::null(), + 0, + ); assert!(vec_result.error.is_null()); assert_eq!(vec_result.len, 1); TextModelWrapper::free_vec_result(vec_result); @@ -131,6 +136,9 @@ mod tests { ptr: ptr::null(), len: 0, cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }; assert!(result.error.is_null()); @@ -339,6 +347,9 @@ mod tests { ptr: ptr::null(), len: 0, cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }; // Verify error is set @@ -365,6 +376,9 @@ mod tests { ptr: ptr::null(), len: 0, cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }; // Should not crash with null pointers @@ -394,12 +408,15 @@ mod tests { mem::size_of::<*const c_char>() + mem::size_of::() ); - // FloatVecResult should be pointer + two usizes + error pointer + // FloatVecResult: error ptr + embeddings ptr + len + cap + // + row_offsets ptr + rows + offsets_cap assert_eq!( mem::size_of::(), mem::size_of::<*mut c_char>() + mem::size_of::<*const FloatVec>() + mem::size_of::() * 2 + + mem::size_of::<*const usize>() + + mem::size_of::() * 2 ); } @@ -501,4 +518,161 @@ mod tests { fn test_concurrent_qwen_embeddings_via_ffi() { run_concurrent_ffi_embeddings("Qwen/Qwen3-Embedding-0.6B"); } + + /// End-to-end on the cached MiniLM model: all five strategies. truncate/mean + /// return one vector/doc; fixed/recursive/sentence return N vectors/doc + /// grouped by row_offsets. Verifies offsets, normalization, and clean frees. + #[test] + fn test_all_strategies_local_model() { + let model_name = to_c_string("sentence-transformers/all-MiniLM-L6-v2"); + let empty = to_c_string(""); + let loaded = TextModelWrapper::load_model( + model_name.as_ptr(), + model_name.as_bytes().len(), + empty.as_ptr(), + empty.as_bytes().len(), + empty.as_ptr(), + empty.as_bytes().len(), + empty.as_ptr(), + empty.as_bytes().len(), + 0, + false, + ); + if loaded.model.is_null() { + TextModelWrapper::free_model_result(loaded); + eprintln!("skipping: all-MiniLM-L6-v2 not available locally"); + return; + } + let wrapper = + unsafe { std::mem::transmute::<*mut std::ffi::c_void, TextModelWrapper>(loaded.model) }; + + // Long enough to exceed the model's input limit, so truncate drops the + // tail while mean covers the whole document via several chunks. + let long = "Vector search turns text into embeddings for retrieval. ".repeat(100); + let items = [create_string_item(&long)]; + + let first_vec = |res: &FloatVecResult| -> Vec { + assert!(res.error.is_null()); + assert_eq!(res.len, 1, "exactly one vector per input document"); + assert_eq!(res.rows, 1); + assert!(!res.row_offsets.is_null()); + unsafe { + let offs = std::slice::from_raw_parts(res.row_offsets, res.rows + 1); + assert_eq!(offs, &[0usize, 1usize], "trivial offsets for 1 vector/doc"); + let fv = &*res.ptr; + std::slice::from_raw_parts(fv.ptr, fv.len).to_vec() + } + }; + + // truncate (null settings) + let trunc = + TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, ptr::null(), 0); + let trunc_vec = first_vec(&trunc); + + // mean: small chunk size forces many chunks → embed each → average + let settings = crate::chunk::ChunkSettings { + strategy: crate::chunk::STRATEGY_MEAN, + max_tokens: 32, + overlap_tokens: 0, + max_chunks: 0, + }; + let mean = + TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, &settings, 0); + let mean_vec = first_vec(&mean); + + assert_eq!(trunc_vec.len(), mean_vec.len(), "same embedding dimension"); + let diff: f32 = trunc_vec + .iter() + .zip(&mean_vec) + .map(|(a, b)| (a - b).abs()) + .sum(); + assert!( + diff > 1e-3, + "mean must differ from truncate on a long document" + ); + let norm: f32 = mean_vec.iter().map(|x| x * x).sum::().sqrt(); + assert!( + (norm - 1.0).abs() < 1e-2, + "mean vector must be L2-normalized" + ); + + // multi-vector strategies: fixed/recursive/sentence keep every chunk + // vector → N vectors/doc, grouped by row_offsets [0, N]. + for strat in [ + crate::chunk::STRATEGY_FIXED, + crate::chunk::STRATEGY_RECURSIVE, + crate::chunk::STRATEGY_SENTENCE, + ] { + let s = crate::chunk::ChunkSettings { + strategy: strat, + max_tokens: 32, + overlap_tokens: 0, + max_chunks: 0, + }; + let res = TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, &s, 0); + assert!(res.error.is_null(), "strategy {strat} errored"); + assert_eq!(res.rows, 1, "one document"); + assert!( + res.len > 1, + "strategy {strat} must emit multiple chunk vectors, got {}", + res.len + ); + unsafe { + let offs = std::slice::from_raw_parts(res.row_offsets, res.rows + 1); + assert_eq!(offs[0], 0); + assert_eq!(offs[1], res.len, "row 0 owns all {} chunk vectors", res.len); + let fvs = std::slice::from_raw_parts(res.ptr, res.len); + for fv in fvs { + assert_eq!(fv.len, trunc_vec.len(), "chunk vector dimension"); + let v = std::slice::from_raw_parts(fv.ptr, fv.len); + let norm: f32 = v.iter().map(|x| x * x).sum::().sqrt(); + assert!((norm - 1.0).abs() < 1e-2, "chunk vector must be normalized"); + } + } + TextModelWrapper::free_vec_result(res); + } + + // multi-document batch: row_offsets must group N-per-doc correctly, and + // overlap_tokens > 0 must still produce valid output. doc A is long + // (many chunks), doc B is short (one chunk). + let short = "One short sentence."; + let batch = [create_string_item(&long), create_string_item(short)]; + let s = crate::chunk::ChunkSettings { + strategy: crate::chunk::STRATEGY_FIXED, + max_tokens: 32, + overlap_tokens: 4, + max_chunks: 0, + }; + let res = TextModelWrapper::make_vect_embeddings(&wrapper, batch.as_ptr(), 2, &s, 0); + assert!(res.error.is_null()); + assert_eq!(res.rows, 2, "two documents"); + unsafe { + let offs = std::slice::from_raw_parts(res.row_offsets, res.rows + 1); + assert_eq!(offs[0], 0); + assert_eq!(offs[2], res.len, "last offset == total vectors"); + assert!(offs[1] > 1, "doc A (long) has multiple chunks"); + assert_eq!(offs[2] - offs[1], 1, "doc B (short) has exactly one chunk"); + } + TextModelWrapper::free_vec_result(res); + + // max_chunks cap: the long doc is capped to ≤ 3 vectors (tail merged). + let capped = crate::chunk::ChunkSettings { + strategy: crate::chunk::STRATEGY_FIXED, + max_tokens: 32, + overlap_tokens: 0, + max_chunks: 3, + }; + let res = TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, &capped, 0); + assert!(res.error.is_null()); + assert!( + (1..=3).contains(&res.len), + "max_chunks=3 caps to ≤3 vectors, got {}", + res.len + ); + TextModelWrapper::free_vec_result(res); + + TextModelWrapper::free_vec_result(trunc); + TextModelWrapper::free_vec_result(mean); + TextModelWrapper::free_model_result(loaded); + } } diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs index 429a6fb4..72eea015 100644 --- a/embeddings/src/model/local.rs +++ b/embeddings/src/model/local.rs @@ -1393,6 +1393,19 @@ impl LocalModel { } } +impl LocalModel { + /// Borrow the loaded tokenizer for token-accurate document chunking. + fn tokenizer(&self) -> &Tokenizer { + match self { + LocalModel::Bert(m) => &m.tokenizer, + LocalModel::T5(m) => &m.tokenizer, + LocalModel::Causal(m) => &m.tokenizer, + LocalModel::Quantized(m) => &m.tokenizer, + LocalModel::Onnx(m) => &m.tokenizer, + } + } +} + impl TextModel for LocalModel { fn predict(&self, texts: &[&str], threads: usize) -> Result>, Box> { // ONNX manages its own worker count internally — no rayon pool involved. @@ -1429,6 +1442,18 @@ impl TextModel for LocalModel { // Local models don't use API keys, so validation is always successful Ok(()) } + + fn chunk( + &self, + text: &str, + max_tokens: usize, + overlap: usize, + strategy: u32, + ) -> Vec<(usize, usize)> { + // Token-accurate split via the loaded tokenizer; chunk_text picks the + // split method (fixed/recursive/sentence) from the strategy. + crate::chunk::chunk_text(text, max_tokens, overlap, strategy, Some(self.tokenizer())) + } } #[cfg(test)] diff --git a/embeddings/src/model/mod.rs b/embeddings/src/model/mod.rs index cfa48d72..6ca2a0d7 100644 --- a/embeddings/src/model/mod.rs +++ b/embeddings/src/model/mod.rs @@ -36,6 +36,21 @@ pub trait TextModel { /// Validates the API key by making a minimal test request to the API. /// For remote models, this makes an actual HTTP request. For local models, this is a no-op. fn validate_api_key(&self) -> Result<(), Box>; + + /// Split one document into chunk byte spans `(start, end)` per `strategy`, + /// sized to `max_tokens` with `overlap` token overlap. Default impl is the + /// char/byte heuristic for remote API models (no local tokenizer); + /// `LocalModel` overrides it with token-accurate chunking via its loaded + /// tokenizer. Used by every strategy except `truncate`. + fn chunk( + &self, + text: &str, + max_tokens: usize, + overlap: usize, + strategy: u32, + ) -> Vec<(usize, usize)> { + crate::chunk::chunk_text(text, max_tokens, overlap, strategy, None) + } } #[repr(C)] @@ -105,6 +120,21 @@ impl TextModel for Model { Model::Local(m) => m.validate_api_key(), } } + + fn chunk( + &self, + text: &str, + max_tokens: usize, + overlap: usize, + strategy: u32, + ) -> Vec<(usize, usize)> { + match self { + Model::OpenAI(m) => m.chunk(text, max_tokens, overlap, strategy), + Model::Voyage(m) => m.chunk(text, max_tokens, overlap, strategy), + Model::Jina(m) => m.chunk(text, max_tokens, overlap, strategy), + Model::Local(m) => m.chunk(text, max_tokens, overlap, strategy), + } + } } /// Refuse local (candle) inference on an emulated x86 CPU. Rosetta 2 and QEMU diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs index 396de4a6..60368728 100644 --- a/embeddings/src/model/text_model_wrapper.rs +++ b/embeddings/src/model/text_model_wrapper.rs @@ -1,8 +1,14 @@ +use crate::chunk::ChunkSettings; use crate::model::{create_model, Model, ModelOptions, TextModel}; use crate::panic_guard; use std::os::raw::c_char; use std::{ffi::c_void, ptr}; +/// Per-document output vectors (flat across all documents) plus per-document +/// vector counts, used to build the row-offsets sidecar. One inner `Vec` +/// per emitted vector. +type EmbedResult = Result<(Vec>, Vec), Box>; + /// Sentinel written at offset 0 of every live model handle. Lets FFI entry /// points detect garbage, null, or freed pointers handed in by the C++ caller /// and return a clean error instead of dereferencing into UB. @@ -60,13 +66,27 @@ pub struct FloatVec { pub cap: usize, } -/// cbindgen:field-names=[m_szError,m_tEmbedding,len,cap] +/// Embedding result for one `make_vect_embeddings` call. +/// +/// `m_tEmbedding` is a FLAT array of `len` vectors — every input document's +/// vectors concatenated. `m_pRowOffsets` (length `rows + 1`) groups them per +/// input document, Arrow-style: document `i` owns +/// `m_tEmbedding[m_pRowOffsets[i] .. m_pRowOffsets[i + 1]]`. For the v1 +/// strategies (truncate / mean) every document yields exactly one vector, so +/// `len == rows` and the offsets are `[0, 1, ..., rows]`. The sidecar lets a +/// future multi-vector strategy return N vectors per document through this same +/// struct — no second method, cardinality carried as data. +/// +/// cbindgen:field-names=[m_szError,m_tEmbedding,len,cap,m_pRowOffsets,rows,offsets_cap] #[repr(C)] pub struct FloatVecResult { pub error: *mut c_char, pub ptr: *const FloatVec, pub len: usize, pub cap: usize, + pub row_offsets: *const usize, + pub rows: usize, + pub offsets_cap: usize, } #[repr(C)] @@ -201,10 +221,20 @@ impl TextModelWrapper { } } + /// Generate embeddings for a batch of documents. `settings` selects the + /// strategy (null ⇒ `truncate`): + /// - `truncate` — embed the first `max_tokens` tokens (one vector/doc). + /// - `mean` — split the doc, embed every chunk, average into one vector/doc. + /// - `fixed`/`recursive`/`sentence` — split the doc and keep every chunk + /// vector (N vectors/doc). + /// + /// Output is a flat `FloatVec[]`; `m_pRowOffsets` groups it per document + /// (1 vector/doc for truncate/mean, N for the rest). `threads`: 0 = all CPUs. pub extern "C" fn make_vect_embeddings( &self, texts: *const StringItem, count: usize, + settings: *const ChunkSettings, threads: i32, // 0 = use all available CPUs, >0 = cap thread count ) -> FloatVecResult { panic_guard::catch_panic(|| { @@ -217,6 +247,9 @@ impl TextModelWrapper { ptr: ptr::null(), len: 0, cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }; } }; @@ -234,43 +267,112 @@ impl TextModelWrapper { .collect(); let threads = if threads > 0 { threads as usize } else { 0 }; + let settings_ref = unsafe { settings.as_ref() }; + + let strategy = settings_ref + .map(|s| s.strategy) + .unwrap_or(crate::chunk::STRATEGY_TRUNCATE); + + // Each strategy yields a flat Vec> (all documents' output + // vectors) plus per-document output counts. truncate/mean emit one + // vector/doc; fixed/recursive/sentence emit one vector per chunk. + let computed: EmbedResult = + if strategy == crate::chunk::STRATEGY_TRUNCATE { + model.predict(&string_refs, threads).map(|vecs| { + let counts = vec![1usize; vecs.len()]; + (vecs, counts) + }) + } else { + let s = settings_ref.unwrap(); + let max = crate::chunk::effective_max(s, model.get_max_input_len()); + let overlap = s.overlap_tokens as usize; + let max_chunks = s.max_chunks as usize; + + // Split every document; remember each document's chunk count. + let mut flat: Vec<&str> = Vec::new(); + let mut chunk_counts: Vec = Vec::with_capacity(string_refs.len()); + for &text in &string_refs { + let spans = crate::chunk::cap_chunks( + model.chunk(text, max, overlap, strategy), + max_chunks, + ); + chunk_counts.push(spans.len()); + for (start, end) in spans { + flat.push(&text[start..end]); + } + } + + model.predict(&flat, threads).map(|chunk_vecs| { + if strategy == crate::chunk::STRATEGY_MEAN { + // pool each document's chunks into one vector + let mut out = Vec::with_capacity(chunk_counts.len()); + let mut idx = 0usize; + for c in &chunk_counts { + let end = idx + c; + out.push(crate::chunk::mean_pool(&chunk_vecs[idx..end])); + idx = end; + } + let counts = vec![1usize; out.len()]; + (out, counts) + } else { + // fixed/recursive/sentence: keep every chunk vector. + (chunk_vecs, chunk_counts) + } + }) + }; - let mut float_vec_list: Vec = Vec::new(); - let embeddings_list = model.predict(&string_refs, threads); - let c_error = match embeddings_list { - Ok(embeddings_list) => { + match computed { + Ok((embeddings_list, counts)) => { + let mut float_vec_list: Vec = + Vec::with_capacity(embeddings_list.len()); for embeddings in embeddings_list.iter() { - let ptr = embeddings.as_ptr(); - let len = embeddings.len(); - let cap = embeddings.capacity(); - let vec = FloatVec { ptr, len, cap }; - float_vec_list.push(vec); + float_vec_list.push(FloatVec { + ptr: embeddings.as_ptr(), + len: embeddings.len(), + cap: embeddings.capacity(), + }); } - std::mem::forget(embeddings_list); - ptr::null_mut() - } - Err(e) => { - // Don't push empty vector on error - return error through szError pattern - let c_error = std::ffi::CString::new(e.to_string()).unwrap(); - c_error.into_raw() - } - }; - let vec_result = FloatVecResult { - ptr: float_vec_list.as_ptr(), - len: float_vec_list.len(), - cap: float_vec_list.capacity(), - error: c_error, - }; - std::mem::forget(float_vec_list); - vec_result + // Group the flat vectors per document: truncate/mean → one + // vector/doc (offsets [0,1,..]); fixed/recursive/sentence → + // N vectors/doc, offsets = prefix sums of per-doc counts. + let row_offsets = crate::chunk::row_offsets_from_counts(&counts); + let rows = counts.len(); + + let result = FloatVecResult { + error: ptr::null_mut(), + ptr: float_vec_list.as_ptr(), + len: float_vec_list.len(), + cap: float_vec_list.capacity(), + row_offsets: row_offsets.as_ptr(), + rows, + offsets_cap: row_offsets.capacity(), + }; + std::mem::forget(float_vec_list); + std::mem::forget(row_offsets); + result + } + // Don't push empty vectors on error — surface it via szError. + Err(e) => FloatVecResult { + error: to_c_error(&e.to_string()), + ptr: ptr::null(), + len: 0, + cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, + }, + } }) .unwrap_or_else(|msg| FloatVecResult { error: to_c_error(&format!("embeddings: internal error (panic): {msg}")), ptr: ptr::null(), len: 0, cap: 0, + row_offsets: ptr::null(), + rows: 0, + offsets_cap: 0, }) } @@ -293,6 +395,15 @@ impl TextModelWrapper { let _ = Vec::from_raw_parts(result.ptr as *mut FloatVec, result.len, result.cap); } + // Free the per-row offsets sidecar (length = rows + 1). + if !result.row_offsets.is_null() && result.offsets_cap > 0 { + let _ = Vec::from_raw_parts( + result.row_offsets as *mut usize, + result.rows + 1, + result.offsets_cap, + ); + } + // Free the error string if it exists if !result.error.is_null() { let _ = std::ffi::CString::from_raw(result.error); diff --git a/knn/embeddings.cpp b/knn/embeddings.cpp index 9b3a1f48..d995c364 100644 --- a/knn/embeddings.cpp +++ b/knn/embeddings.cpp @@ -265,7 +265,9 @@ bool TextToEmbeddings_c::Convert ( const std::vector & dTexts, assert(pFuncs); // iThreads: 0 = use all available CPUs (default), >0 = cap worker count in the embeddings lib - FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size(), iThreads ); + // nullptr ChunkSettings = truncate strategy (today's behavior). Pass a populated + // ChunkSettings (e.g. STRATEGY_MEAN) once the table DDL exposes a chunking option. + FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size(), nullptr, iThreads ); if ( tVecResult.m_szError ) { sError = tVecResult.m_szError; @@ -273,6 +275,9 @@ bool TextToEmbeddings_c::Convert ( const std::vector & dTexts, return false; } + // truncate/mean return one vector per input row, so tVecResult.len == rows + // and this 1:1 copy is correct. A future multi-vector strategy (N vectors + // per row) would instead group via tVecResult.m_pRowOffsets[i..i+1]. dEmbeddings.resize ( tVecResult.len ); for ( size_t i = 0; i < tVecResult.len; i++ ) { @@ -312,7 +317,7 @@ knn::EmbeddingsLib_i * LoadEmbeddingsLib ( const std::string & sLibPath, std::st if ( !pLib->Load(sError) ) return nullptr; - const int SUPPORTED_EMBEDDINGS_LIB_VER = 4; + const int SUPPORTED_EMBEDDINGS_LIB_VER = 8; if ( pLib->GetVersion()!=SUPPORTED_EMBEDDINGS_LIB_VER ) { sError = util::FormatStr ( "Unsupported embeddings library version %d (expected %d)", pLib->GetVersion(), SUPPORTED_EMBEDDINGS_LIB_VER );