diff --git a/embeddings/CHUNKING.md b/embeddings/CHUNKING.md
new file mode 100644
index 00000000..9e48858a
--- /dev/null
+++ b/embeddings/CHUNKING.md
@@ -0,0 +1,91 @@
+# Embedding strategy FFI (embeddings lib → Manticore daemon)
+
+Embeddings lib **v8**. The one embedding call, `make_vect_embeddings`, takes an
+optional `ChunkSettings*` that selects how a document becomes one or many
+vectors. Cardinality is carried as **data** in the return (a per-document
+offsets sidecar), so a single method covers both 1-vector and N-vector
+strategies — no second method.
+
+## FFI
+
+```c
+constexpr uint32_t STRATEGY_TRUNCATE  = 0;  // 1 vector/doc
+constexpr uint32_t STRATEGY_MEAN      = 1;  // 1 vector/doc
+constexpr uint32_t STRATEGY_FIXED     = 2;  // N vectors/doc
+constexpr uint32_t STRATEGY_RECURSIVE = 3;  // N vectors/doc
+constexpr uint32_t STRATEGY_SENTENCE  = 4;  // N vectors/doc
+
+struct ChunkSettings {
+  uint32_t strategy;        // one of STRATEGY_*
+  uint32_t max_tokens;      // chunk size in tokens; 0 = model max
+  uint32_t overlap_tokens;  // token overlap between chunks; 0 = none
+  uint32_t max_chunks;      // cap on chunks/doc; 0 = unlimited (overflow merges into last)
+};
+
+// settings == nullptr  ⇒  truncate (today's behavior)
+FloatVecResult make_vect_embeddings(
+    const TextModelWrapper*, const StringItem* texts, uintptr_t count,
+    const ChunkSettings*, int32_t threads);
+```
+
+### Return: flat vectors + per-row offsets (cardinality is data)
+
+```c
+struct FloatVecResult {
+  char            *m_szError;
+  const FloatVec  *m_tEmbedding;   // FLAT: every document's vectors concatenated, `len` total
+  uintptr_t        len;
+  uintptr_t        cap;
+  const uintptr_t *m_pRowOffsets;  // length rows+1; doc i = m_tEmbedding[off[i] .. off[i+1]]
+  uintptr_t        rows;           // number of input documents
+  uintptr_t        offsets_cap;
+};
+```
+
+Read document `i`'s vectors as `m_tEmbedding[m_pRowOffsets[i] .. m_pRowOffsets[i+1]]`.
+- `truncate`/`mean` → one vector/doc, so `len == rows == count` and offsets are
+  `[0, 1, …, rows]` (you may just index `m_tEmbedding[i]`).
+- `fixed`/`recursive`/`sentence` → N vectors/doc; `len` = total chunks, and the
+  offsets group them per document.
+
+Free with `free_vec_result` (it frees the offsets too). Load-time check:
+`EmbedLib.version == 8`.
+
+## Strategies
+
+| strategy | val | output | what it does |
+|---|---|---|---|
+| **truncate** | 0 | 1 vec/doc | embed the first `max_tokens` tokens (rest dropped). `max_tokens`/`overlap`/`max_chunks` ignored. |
+| **mean** | 1 | 1 vec/doc | split (recursive, token-aware) → embed every chunk → **average** into one L2-normalized vector. Whole document, no tail loss. |
+| **fixed** | 2 | N vecs/doc | split into fixed `max_tokens`-token windows; one vector per chunk. |
+| **recursive** | 3 | N vecs/doc | split on a separator hierarchy (paragraph → line → space) ≤ `max_tokens`; one vector per chunk. |
+| **sentence** | 4 | N vecs/doc | UAX-29 sentence segmentation (ES-style), grouped to `max_tokens`; one vector per chunk. |
+
+- `max_tokens = 0` → the model's own max input length.
+- `overlap_tokens` → token overlap between chunks (multi-vector + mean).
+- `max_chunks` → cap chunks/doc; overflow merges the tail into the last chunk.
+- Local models chunk on the model's real subword tokens; remote API models
+  (OpenAI/Voyage/Jina) chunk by a char/byte heuristic (no local tokenizer).
+
+## Calling it
+
+```cpp
+// non-chunked field & queries: pass nullptr → truncate
+pFuncs->make_vect_embeddings( &model, items.data(), items.size(), nullptr, iThreads );
+
+// any strategy, taken from a table option:
+ChunkSettings tCfg { STRATEGY_SENTENCE, /*max_tokens*/ 256, /*overlap*/ 0, /*max_chunks*/ 0 };
+FloatVecResult tRes = pFuncs->make_vect_embeddings( &model, items.data(), items.size(), &tCfg, iThreads );
+// doc i owns vectors tRes.m_tEmbedding[ tRes.m_pRowOffsets[i] .. tRes.m_pRowOffsets[i+1] ]
+```
+
+The daemon owns the SQL/DDL surface (e.g. a per-`float_vector`-field option),
+parses it into a `ChunkSettings`, and passes it on every embed call. Queries are
+short: pass `nullptr` so they are never chunked.
+
+## Daemon side (next phase, not this lib)
+
+The lib already returns N vectors/doc for `fixed`/`recursive`/`sentence` via the
+offsets sidecar. To *use* them the daemon needs N-vectors-per-row storage +
+max-over-chunks search; `truncate`/`mean` (1 vector/doc) work with today's
+storage unchanged.
diff --git a/embeddings/Cargo.lock b/embeddings/Cargo.lock
index 452ff0f8..118f69d9 100644
--- a/embeddings/Cargo.lock
+++ b/embeddings/Cargo.lock
@@ -1889,6 +1889,7 @@ dependencies = [
  "serde",
  "serde_json",
  "tokenizers",
+ "unicode-segmentation",
 ]
 
 [[package]]
diff --git a/embeddings/Cargo.toml b/embeddings/Cargo.toml
index b1c0b3cf..ffa9a673 100644
--- a/embeddings/Cargo.toml
+++ b/embeddings/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"
 # For local dev with ../../candle, add a [patch] section to use path deps.
 [dependencies]
 tokenizers = "0.15.2"
+unicode-segmentation = "1"
 hf-hub = { git = "https://github.com/huggingface/hf-hub.git", rev = "ac22200ea0b5af4d8c362f699be0340647b19060", default-features = false,features = ["ureq"] }
 anyhow = "1.0.81"
 serde_json = "1.0.114"
diff --git a/embeddings/INSTRUCTIONS.md b/embeddings/INSTRUCTIONS.md
index 069aafd2..b22c797c 100644
--- a/embeddings/INSTRUCTIONS.md
+++ b/embeddings/INSTRUCTIONS.md
@@ -53,7 +53,7 @@ src/
 
 1. **`src/model/text_model_wrapper.rs`** - The heart of FFI
    - `load_model()` - Creates model instances
-   - `make_vect_embeddings()` - Generates embeddings (FIXED: no empty vectors on error)
+   - `make_vect_embeddings()` - Generates embeddings; takes an optional `ChunkSettings*` selecting the strategy (truncate / mean / fixed / recursive / sentence). Returns a flat `FloatVecResult` grouped per document by `m_pRowOffsets`. **Contract + strategy reference: [`CHUNKING.md`](CHUNKING.md).**
    - `free_model_result()` / `free_vec_result()` - Memory cleanup
 
 2. **`manticoresearch_text_embeddings.h`** - Auto-generated C header
diff --git a/embeddings/manticoresearch_text_embeddings.h b/embeddings/manticoresearch_text_embeddings.h
index 3a8d5ef3..ac494995 100644
--- a/embeddings/manticoresearch_text_embeddings.h
+++ b/embeddings/manticoresearch_text_embeddings.h
@@ -9,6 +9,17 @@
 #include <ostream>
 #include <new>
 
+/// Strategy, mirrored as a `u32` across the FFI in [`ChunkSettings`].
+constexpr static const uint32_t STRATEGY_TRUNCATE = 0;
+
+constexpr static const uint32_t STRATEGY_MEAN = 1;
+
+constexpr static const uint32_t STRATEGY_FIXED = 2;
+
+constexpr static const uint32_t STRATEGY_RECURSIVE = 3;
+
+constexpr static const uint32_t STRATEGY_SENTENCE = 4;
+
 struct TextModelResult {
   void *m_pModel;
   char *m_szError;
@@ -33,11 +44,25 @@ struct FloatVec {
   uintptr_t cap;
 };
 
+/// Embedding result for one `make_vect_embeddings` call.
+///
+/// `m_tEmbedding` is a FLAT array of `len` vectors — every input document's
+/// vectors concatenated. `m_pRowOffsets` (length `rows + 1`) groups them per
+/// input document, Arrow-style: document `i` owns
+/// `m_tEmbedding[m_pRowOffsets[i] .. m_pRowOffsets[i + 1]]`. For the v1
+/// strategies (truncate / mean) every document yields exactly one vector, so
+/// `len == rows` and the offsets are `[0, 1, ..., rows]`. The sidecar lets a
+/// future multi-vector strategy return N vectors per document through this same
+/// struct — no second method, cardinality carried as data.
+///
 struct FloatVecResult {
   char *m_szError;
   const FloatVec *m_tEmbedding;
   uintptr_t len;
   uintptr_t cap;
+  const uintptr_t *m_pRowOffsets;
+  uintptr_t rows;
+  uintptr_t offsets_cap;
 };
 
 using TextModelWrapper = void*;
@@ -47,9 +72,25 @@ struct StringItem {
   uintptr_t len;
 };
 
+/// Chunking parameters. `#[repr(C)]` — passed straight across the FFI by the
+/// daemon, which owns the DDL surface and validates against the model.
+struct ChunkSettings {
+  /// One of the `STRATEGY_*` constants.
+  uint32_t strategy;
+  /// Target chunk size in tokens. `0` ⇒ use the model's max. Always clamped to
+  /// the model's real input limit.
+  uint32_t max_tokens;
+  /// Token overlap between consecutive chunks. `0` ⇒ none.
+  uint32_t overlap_tokens;
+  /// Hard cap on chunks per document. `0` ⇒ unlimited. Overflow merges the
+  /// tail into the last chunk (matches OpenSearch's `max_chunk_limit`).
+  uint32_t max_chunks;
+};
+
 using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*,
                                                const StringItem*,
                                                uintptr_t,
+                                               const ChunkSettings*,
                                                int32_t);
 
 using FreeVecResultFn = void(*)(FloatVecResult);
diff --git a/embeddings/src/chunk.rs b/embeddings/src/chunk.rs
new file mode 100644
index 00000000..640021ff
--- /dev/null
+++ b/embeddings/src/chunk.rs
@@ -0,0 +1,500 @@
+//! Document chunking + embedding strategies.
+//!
+//! One document maps to one or many vectors depending on the strategy:
+//! - `truncate` / `mean` collapse to **one** vector per document.
+//! - `fixed` / `recursive` / `sentence` keep **N** vectors per document (one per
+//!   chunk) — the daemon groups them via the row-offsets sidecar.
+//!
+//! Local models chunk token-accurately via their loaded tokenizer; remote API
+//! models (no local tokenizer) fall back to a conservative char/byte heuristic.
+//! `ChunkSettings` arrives from the daemon's DDL surface and selects everything.
+
+use crate::utils::normalize;
+use tokenizers::Tokenizer;
+
+/// Strategy, mirrored as a `u32` across the FFI in [`ChunkSettings`].
+pub const STRATEGY_TRUNCATE: u32 = 0; // 1 vector/doc: first `max_tokens` tokens
+pub const STRATEGY_MEAN: u32 = 1; // 1 vector/doc: chunk → embed → average
+pub const STRATEGY_FIXED: u32 = 2; // N vectors/doc: fixed token windows
+pub const STRATEGY_RECURSIVE: u32 = 3; // N vectors/doc: recursive token-aware split
+pub const STRATEGY_SENTENCE: u32 = 4; // N vectors/doc: UAX-29 sentence segmentation
+
+/// Special tokens (`[CLS]`/`[SEP]`) that `predict()` re-adds when it re-tokenizes
+/// a chunk string. We size chunks to leave room so a chunk is never truncated.
+const SPECIAL_TOKENS_MARGIN: usize = 2;
+
+/// Conservative bytes-per-token for the remote heuristic. Deliberately low so a
+/// heuristic chunk lands *under* the provider's token cap rather than over it.
+const REMOTE_BYTES_PER_TOKEN: usize = 3;
+
+/// Chunking parameters. `#[repr(C)]` — passed straight across the FFI by the
+/// daemon, which owns the DDL surface and validates against the model.
+#[repr(C)]
+pub struct ChunkSettings {
+    /// One of the `STRATEGY_*` constants.
+    pub strategy: u32,
+    /// Target chunk size in tokens. `0` ⇒ use the model's max. Always clamped to
+    /// the model's real input limit.
+    pub max_tokens: u32,
+    /// Token overlap between consecutive chunks. `0` ⇒ none.
+    pub overlap_tokens: u32,
+    /// Hard cap on chunks per document. `0` ⇒ unlimited. Overflow merges the
+    /// tail into the last chunk (matches OpenSearch's `max_chunk_limit`).
+    pub max_chunks: u32,
+}
+
+impl ChunkSettings {
+    /// True when the document must be split (every strategy except `truncate`).
+    pub fn needs_chunking(&self) -> bool {
+        self.strategy != STRATEGY_TRUNCATE
+    }
+
+    /// True when the strategy collapses to one vector per document
+    /// (`truncate`/`mean`); false for the multi-vector strategies.
+    pub fn is_single_vector(&self) -> bool {
+        self.strategy == STRATEGY_TRUNCATE || self.strategy == STRATEGY_MEAN
+    }
+}
+
+/// Resolve the chunk size in *content* tokens, clamped to what the model can
+/// actually take (minus the special-token budget). `0` ⇒ model max.
+pub fn effective_max(settings: &ChunkSettings, model_max: usize) -> usize {
+    let ceiling = model_max.saturating_sub(SPECIAL_TOKENS_MARGIN).max(1);
+    let target = settings.max_tokens as usize;
+    if target == 0 {
+        ceiling
+    } else {
+        target.min(ceiling)
+    }
+}
+
+/// Average a document's chunk embeddings into one L2-normalized vector. Chunk
+/// vectors are already normalized by `predict`; the mean of unit vectors is
+/// re-normalized so the result is a unit vector too.
+pub fn mean_pool(chunks: &[Vec<f32>]) -> Vec<f32> {
+    let dim = chunks.first().map(Vec::len).unwrap_or(0);
+    let mut acc = vec![0.0f32; dim];
+    for v in chunks {
+        for (a, x) in acc.iter_mut().zip(v.iter()) {
+            *a += *x;
+        }
+    }
+    let n = chunks.len().max(1) as f32;
+    for a in acc.iter_mut() {
+        *a /= n;
+    }
+    normalize(&mut acc);
+    acc
+}
+
+/// Arrow-style per-row offsets (length `counts.len() + 1`) from per-document
+/// vector counts: document `i` owns vectors `[offsets[i] .. offsets[i + 1]]`.
+pub fn row_offsets_from_counts(counts: &[usize]) -> Vec<usize> {
+    let mut offsets = Vec::with_capacity(counts.len() + 1);
+    let mut acc = 0usize;
+    offsets.push(0);
+    for &c in counts {
+        acc += c;
+        offsets.push(acc);
+    }
+    offsets
+}
+
+/// Split one document into chunk byte spans `(start, end)` according to the
+/// strategy's split method. `tokenizer` is `Some` for local models (token-exact)
+/// and `None` for remote API models (char heuristic). `STRATEGY_TRUNCATE` never
+/// reaches here (it does not chunk); `STRATEGY_MEAN` splits recursively.
+pub fn chunk_text(
+    text: &str,
+    max_tokens: usize,
+    overlap: usize,
+    strategy: u32,
+    tokenizer: Option<&Tokenizer>,
+) -> Vec<(usize, usize)> {
+    match strategy {
+        STRATEGY_SENTENCE => chunk_sentence(text, max_tokens, overlap, tokenizer),
+        STRATEGY_FIXED => window(text, max_tokens, overlap, false, tokenizer),
+        // recursive (3), mean's internal split (1), and any other → recursive.
+        _ => window(text, max_tokens, overlap, true, tokenizer),
+    }
+}
+
+/// Token-window split, picking the token-accurate or heuristic backend.
+fn window(
+    text: &str,
+    max_tokens: usize,
+    overlap: usize,
+    snap: bool,
+    tokenizer: Option<&Tokenizer>,
+) -> Vec<(usize, usize)> {
+    match tokenizer {
+        Some(t) => chunk_local(text, t, max_tokens, overlap, snap),
+        None => chunk_chars(text, max_tokens, overlap),
+    }
+}
+
+/// Token-accurate chunking for local models. One tokenization pass; windows by
+/// token count; `recursive` snaps each cut back to a natural boundary; overlap
+/// is realigned to the (possibly snapped) cut so no tokens are dropped.
+///
+/// NOTE: assumes `tokenizer.encode(..).get_offsets()` returns byte offsets into
+/// the input (true for the WordPiece/BPE paths Manticore loads).
+pub fn chunk_local(
+    text: &str,
+    tokenizer: &Tokenizer,
+    max_tokens: usize,
+    overlap: usize,
+    recursive: bool,
+) -> Vec<(usize, usize)> {
+    if text.is_empty() {
+        return vec![(0, 0)];
+    }
+    // `false` = no special tokens, so offsets map cleanly onto the input text.
+    let encoding = match tokenizer.encode(text, false) {
+        Ok(e) => e,
+        // Tokenizer failure (rare): fall back to the whole doc as one chunk.
+        // predict() truncates it if needed — degrades to truncate, but is safe.
+        Err(_) => return vec![(0, text.len())],
+    };
+    let offsets = encoding.get_offsets();
+    let n = offsets.len();
+    if max_tokens == 0 || n <= max_tokens {
+        return vec![(0, text.len())];
+    }
+
+    let overlap = overlap.min(max_tokens / 2);
+    let mut chunks = Vec::new();
+    let mut start = 0usize; // start token index
+    loop {
+        let win_end = (start + max_tokens).min(n); // exclusive token index
+        let byte_start = offsets[start].0;
+        let mut byte_end = offsets[win_end - 1].1;
+
+        if recursive && win_end < n {
+            if let Some(cut) = snap_back(text, byte_start, byte_end) {
+                byte_end = cut;
+            }
+        }
+        chunks.push((byte_start, byte_end));
+
+        // First token that begins at/after the (possibly snapped) cut.
+        let cut_tok = offsets.partition_point(|o| o.0 < byte_end);
+        if cut_tok >= n {
+            break;
+        }
+        start = cut_tok.saturating_sub(overlap).max(start + 1); // step back overlap, always progress
+    }
+    chunks
+}
+
+/// Char/byte heuristic for remote API models (no local tokenizer). Windows by an
+/// estimated byte budget, snapping to whitespace and never cutting mid-UTF-8.
+pub fn chunk_chars(text: &str, max_tokens: usize, overlap_tokens: usize) -> Vec<(usize, usize)> {
+    if text.is_empty() {
+        return vec![(0, 0)];
+    }
+    let window = (max_tokens * REMOTE_BYTES_PER_TOKEN).max(1);
+    if text.len() <= window {
+        return vec![(0, text.len())];
+    }
+    let overlap = (overlap_tokens * REMOTE_BYTES_PER_TOKEN).min(window / 2);
+
+    let mut chunks = Vec::new();
+    let mut start = 0usize;
+    while start < text.len() {
+        let mut end = text.floor_char_boundary((start + window).min(text.len()));
+        if end <= start {
+            // A single char wider than the window — take at least one char.
+            end = text.ceil_char_boundary(start + 1).min(text.len());
+        }
+        if end < text.len() {
+            if let Some(cut) = snap_back(text, start, end) {
+                end = cut;
+            }
+        }
+        chunks.push((start, end));
+        if end >= text.len() {
+            break;
+        }
+        let mut next = end.saturating_sub(overlap);
+        if next <= start {
+            next = end;
+        }
+        start = text.ceil_char_boundary(next);
+    }
+    chunks
+}
+
+/// Count tokens in `text`: exact via the model tokenizer when available, else
+/// the conservative byte estimate the remote heuristic uses.
+fn count_tokens(text: &str, tokenizer: Option<&Tokenizer>) -> usize {
+    match tokenizer {
+        Some(t) => t
+            .encode(text, false)
+            .map(|e| e.get_ids().len())
+            .unwrap_or(text.len() / REMOTE_BYTES_PER_TOKEN + 1),
+        None => text.len() / REMOTE_BYTES_PER_TOKEN + 1,
+    }
+}
+
+/// UAX-29 sentence boundaries — the same standard Elasticsearch's ICU uses —
+/// as contiguous byte spans tiling the whole document.
+fn sentence_spans(text: &str) -> Vec<(usize, usize)> {
+    use unicode_segmentation::UnicodeSegmentation;
+    text.split_sentence_bound_indices()
+        .map(|(offset, s)| (offset, offset + s.len()))
+        .collect()
+}
+
+/// Sentence chunking, mirroring Elasticsearch's algorithm: detect sentences
+/// (UAX-29), then greedily group whole sentences up to `max_tokens`. A single
+/// sentence larger than the budget is split with the token-window splitter (ES
+/// splits such a sentence across chunks). `overlap` re-seeds the next chunk with
+/// trailing whole sentences summing ≤ `overlap` tokens.
+pub fn chunk_sentence(
+    text: &str,
+    max_tokens: usize,
+    overlap: usize,
+    tokenizer: Option<&Tokenizer>,
+) -> Vec<(usize, usize)> {
+    if text.is_empty() {
+        return vec![(0, 0)];
+    }
+    let units: Vec<(usize, usize, usize)> = sentence_spans(text)
+        .into_iter()
+        .map(|(a, b)| (a, b, count_tokens(&text[a..b], tokenizer)))
+        .collect();
+    // 0/1 sentence (e.g. terminator-free text): nothing to group on — fall back
+    // to the token-window splitter so an over-long blob still gets divided.
+    if units.len() <= 1 {
+        return window(text, max_tokens, overlap, true, tokenizer);
+    }
+
+    let mut chunks = Vec::new();
+    let mut i = 0usize;
+    while i < units.len() {
+        // Greedily pack whole sentences up to the budget (always take ≥ 1).
+        let mut j = i;
+        let mut sum = 0usize;
+        while j < units.len() {
+            let t = units[j].2;
+            if j > i && sum + t > max_tokens {
+                break;
+            }
+            sum += t;
+            j += 1;
+        }
+
+        let start = units[i].0;
+        let end = units[j - 1].1;
+        if j == i + 1 && units[i].2 > max_tokens {
+            // One sentence bigger than the budget — split it like ES does.
+            for (a, b) in window(&text[start..end], max_tokens, overlap, true, tokenizer) {
+                chunks.push((start + a, start + b));
+            }
+        } else {
+            chunks.push((start, end));
+        }
+
+        if j >= units.len() {
+            break;
+        }
+
+        // Overlap: re-seed with trailing whole sentences summing ≤ overlap,
+        // never back to i (guarantee forward progress).
+        let mut next = j;
+        if overlap > 0 {
+            let mut acc = 0usize;
+            let mut k = j;
+            while k > i + 1 {
+                let t = units[k - 1].2;
+                if acc + t > overlap {
+                    break;
+                }
+                acc += t;
+                k -= 1;
+            }
+            next = k.max(i + 1);
+        }
+        i = next;
+    }
+    chunks
+}
+
+/// Enforce `max_chunks`: if exceeded, merge the overflow tail into the last kept
+/// chunk so no document text is dropped. `0` ⇒ unlimited.
+pub fn cap_chunks(mut chunks: Vec<(usize, usize)>, max_chunks: usize) -> Vec<(usize, usize)> {
+    if max_chunks == 0 || chunks.len() <= max_chunks {
+        return chunks;
+    }
+    let last_end = chunks.last().map(|c| c.1).unwrap_or(0);
+    chunks.truncate(max_chunks);
+    if let Some(last) = chunks.last_mut() {
+        last.1 = last_end; // extend to cover everything merged in
+    }
+    chunks
+}
+
+/// Pull a chunk's end back to the nearest paragraph/line/sentence/space boundary
+/// within `[start, end)`, but never past the chunk midpoint (don't over-shrink).
+/// Returns the new end byte offset, or `None` if no good boundary was found.
+fn snap_back(text: &str, start: usize, end: usize) -> Option<usize> {
+    let floor = start + (end - start) / 2;
+    for sep in ["\n\n", "\n", ". ", " "] {
+        if let Some(pos) = text[start..end].rfind(sep) {
+            let cut = start + pos + sep.len();
+            if cut > floor && cut < end {
+                return Some(cut);
+            }
+        }
+    }
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn settings(strategy: u32, max: u32, overlap: u32, max_chunks: u32) -> ChunkSettings {
+        ChunkSettings {
+            strategy,
+            max_tokens: max,
+            overlap_tokens: overlap,
+            max_chunks,
+        }
+    }
+
+    #[test]
+    fn strategy_classification() {
+        assert!(!settings(STRATEGY_TRUNCATE, 0, 0, 0).needs_chunking());
+        assert!(settings(STRATEGY_MEAN, 0, 0, 0).needs_chunking());
+        assert!(settings(STRATEGY_SENTENCE, 0, 0, 0).needs_chunking());
+        assert!(settings(STRATEGY_TRUNCATE, 0, 0, 0).is_single_vector());
+        assert!(settings(STRATEGY_MEAN, 0, 0, 0).is_single_vector());
+        assert!(!settings(STRATEGY_FIXED, 0, 0, 0).is_single_vector());
+        assert!(!settings(STRATEGY_SENTENCE, 0, 0, 0).is_single_vector());
+    }
+
+    #[test]
+    fn effective_max_defaults_clamps_and_floors() {
+        assert_eq!(effective_max(&settings(STRATEGY_MEAN, 0, 0, 0), 512), 510);
+        assert_eq!(
+            effective_max(&settings(STRATEGY_MEAN, 100_000, 0, 0), 512),
+            510
+        );
+        assert_eq!(
+            effective_max(&settings(STRATEGY_MEAN, 256, 0, 0), 8192),
+            256
+        );
+        assert!(effective_max(&settings(STRATEGY_MEAN, 0, 0, 0), 0) >= 1);
+    }
+
+    #[test]
+    fn mean_pool_averages_then_normalizes() {
+        let m = mean_pool(&[vec![1.0, 0.0], vec![0.0, 1.0]]);
+        let e = std::f32::consts::FRAC_1_SQRT_2;
+        assert!((m[0] - e).abs() < 1e-6 && (m[1] - e).abs() < 1e-6);
+    }
+
+    #[test]
+    fn mean_pool_single_chunk_is_normalized() {
+        let m = mean_pool(&[vec![3.0, 4.0]]);
+        assert!((m[0] - 0.6).abs() < 1e-6 && (m[1] - 0.8).abs() < 1e-6);
+    }
+
+    #[test]
+    fn row_offsets_prefix_sum() {
+        assert_eq!(row_offsets_from_counts(&[3, 1, 2]), vec![0, 3, 4, 6]);
+        assert_eq!(row_offsets_from_counts(&[1, 1, 1]), vec![0, 1, 2, 3]);
+        assert_eq!(row_offsets_from_counts(&[]), vec![0]);
+    }
+
+    #[test]
+    fn short_text_is_one_chunk() {
+        assert_eq!(chunk_chars("hello world", 100, 0), vec![(0, 11)]);
+    }
+
+    #[test]
+    fn chunk_chars_tiles_document_with_no_overlap() {
+        let text = "word ".repeat(2000);
+        let chunks = chunk_chars(&text, 100, 0);
+        assert!(chunks.len() > 1);
+        assert_eq!(chunks.first().unwrap().0, 0);
+        assert_eq!(chunks.last().unwrap().1, text.len());
+        for w in chunks.windows(2) {
+            assert_eq!(w[0].1, w[1].0, "no gap/overlap when overlap=0");
+        }
+        for (a, b) in &chunks {
+            assert!(b > a && text.is_char_boundary(*a) && text.is_char_boundary(*b));
+        }
+    }
+
+    #[test]
+    fn chunk_chars_overlap_steps_back() {
+        let text = "word ".repeat(2000);
+        let chunks = chunk_chars(&text, 100, 20);
+        assert!(chunks.len() > 1);
+        for w in chunks.windows(2) {
+            assert!(w[1].0 < w[0].1, "overlap: next starts before prev end");
+        }
+        assert_eq!(chunks.last().unwrap().1, text.len());
+    }
+
+    #[test]
+    fn chunk_chars_multibyte_never_splits_a_char() {
+        let text = "🦀 данные ".repeat(500);
+        let chunks = chunk_chars(&text, 50, 5);
+        for (a, b) in &chunks {
+            assert!(text.is_char_boundary(*a) && text.is_char_boundary(*b));
+        }
+        assert_eq!(chunks.last().unwrap().1, text.len());
+    }
+
+    #[test]
+    fn chunk_sentence_splits_at_uax29_boundary() {
+        let text = "Aaa bbb ccc. Ddd eee fff.";
+        let chunks = chunk_sentence(text, 5, 0, None);
+        assert_eq!(chunks.len(), 2);
+        assert_eq!(chunks[0].0, 0);
+        assert_eq!(chunks[1].1, text.len());
+        assert_eq!(chunks[0].1, chunks[1].0);
+        assert!((12..=13).contains(&chunks[0].1));
+    }
+
+    #[test]
+    fn chunk_sentence_packs_and_splits_oversized() {
+        let text = "S one. S two. S three. S four. S five. S six.";
+        assert_eq!(chunk_sentence(text, 100, 0, None).len(), 1);
+        assert!(chunk_sentence(text, 3, 0, None).len() > 1);
+        // terminator-free over-long "sentence" still splits
+        let blob = "word ".repeat(1000);
+        assert!(chunk_sentence(&blob, 100, 0, None).len() > 1);
+    }
+
+    #[test]
+    fn chunk_text_routes_every_multivector_strategy() {
+        let text = "word ".repeat(400);
+        for s in [
+            STRATEGY_FIXED,
+            STRATEGY_RECURSIVE,
+            STRATEGY_SENTENCE,
+            STRATEGY_MEAN,
+        ] {
+            let chunks = chunk_text(&text, 50, 0, s, None);
+            assert!(chunks.len() > 1, "strategy {s} should split");
+            assert_eq!(chunks.first().unwrap().0, 0);
+            assert_eq!(chunks.last().unwrap().1, text.len());
+        }
+    }
+
+    #[test]
+    fn cap_chunks_merges_tail_and_noops() {
+        assert_eq!(
+            cap_chunks(vec![(0, 10), (10, 20), (20, 30), (30, 45)], 2),
+            vec![(0, 10), (10, 45)]
+        );
+        let two = vec![(0, 10), (10, 20)];
+        assert_eq!(cap_chunks(two.clone(), 5), two);
+        assert_eq!(cap_chunks(two.clone(), 0), two);
+    }
+}
diff --git a/embeddings/src/error_handling_test.rs b/embeddings/src/error_handling_test.rs
index 3b2d3b70..15ac8897 100644
--- a/embeddings/src/error_handling_test.rs
+++ b/embeddings/src/error_handling_test.rs
@@ -152,6 +152,9 @@ mod tests {
             ptr: ptr::null(),
             len: 0,
             cap: 0,
+            row_offsets: ptr::null(),
+            rows: 0,
+            offsets_cap: 0,
         };
 
         // Verify error is set and no vectors are present
@@ -194,6 +197,9 @@ mod tests {
             ptr: float_vecs.as_ptr(),
             len: float_vecs.len(),
             cap: float_vecs.capacity(),
+            row_offsets: ptr::null(),
+            rows: 0,
+            offsets_cap: 0,
         };
 
         // Verify successful result structure
diff --git a/embeddings/src/ffi.rs b/embeddings/src/ffi.rs
index be53847e..eae47276 100644
--- a/embeddings/src/ffi.rs
+++ b/embeddings/src/ffi.rs
@@ -1,3 +1,4 @@
+use crate::chunk::ChunkSettings;
 use crate::model::text_model_wrapper::{
     FloatVecResult, StringItem, TextModelResult, TextModelWrapper,
 };
@@ -18,8 +19,13 @@ type LoadModelFn = extern "C" fn(
 
 type FreeModelResultFn = extern "C" fn(TextModelResult);
 
-type MakeVectEmbeddingsFn =
-    extern "C" fn(&TextModelWrapper, *const StringItem, usize, i32) -> FloatVecResult;
+type MakeVectEmbeddingsFn = extern "C" fn(
+    &TextModelWrapper,
+    *const StringItem,
+    usize,
+    *const ChunkSettings,
+    i32,
+) -> FloatVecResult;
 
 type FreeVecResultFn = extern "C" fn(FloatVecResult);
 
@@ -62,7 +68,7 @@ pub struct EmbedLib {
 const VERSION_STR: &[u8] = concat!(env!("EMBEDDINGS_VERSION_STR"), "\0").as_bytes();
 
 const LIB: EmbedLib = EmbedLib {
-    version: 4usize,
+    version: 8usize,
     version_str: VERSION_STR.as_ptr() as *const c_char,
     load_model: TextModelWrapper::load_model,
     free_model_result: TextModelWrapper::free_model_result,
diff --git a/embeddings/src/lib.rs b/embeddings/src/lib.rs
index d7c11f44..dec5fd09 100644
--- a/embeddings/src/lib.rs
+++ b/embeddings/src/lib.rs
@@ -1,3 +1,4 @@
+mod chunk;
 mod error;
 mod ffi;
 mod model;
@@ -19,6 +20,10 @@ mod error_handling_test;
 #[cfg(test)]
 mod panic_guard_test;
 
+pub use chunk::{
+    ChunkSettings, STRATEGY_FIXED, STRATEGY_MEAN, STRATEGY_RECURSIVE, STRATEGY_SENTENCE,
+    STRATEGY_TRUNCATE,
+};
 pub use error::LibError;
 pub use ffi::{EmbedLib, GetLibFuncs};
 pub use model::TextModel;
diff --git a/embeddings/src/model/ffi_test.rs b/embeddings/src/model/ffi_test.rs
index 322012fa..2ce3a2e0 100644
--- a/embeddings/src/model/ffi_test.rs
+++ b/embeddings/src/model/ffi_test.rs
@@ -75,8 +75,13 @@ mod tests {
                             model_ptr as *mut std::ffi::c_void,
                         )
                     };
-                    let vec_result =
-                        TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, 0);
+                    let vec_result = TextModelWrapper::make_vect_embeddings(
+                        &wrapper,
+                        items.as_ptr(),
+                        1,
+                        std::ptr::null(),
+                        0,
+                    );
                     assert!(vec_result.error.is_null());
                     assert_eq!(vec_result.len, 1);
                     TextModelWrapper::free_vec_result(vec_result);
@@ -131,6 +136,9 @@ mod tests {
             ptr: ptr::null(),
             len: 0,
             cap: 0,
+            row_offsets: ptr::null(),
+            rows: 0,
+            offsets_cap: 0,
         };
 
         assert!(result.error.is_null());
@@ -339,6 +347,9 @@ mod tests {
             ptr: ptr::null(),
             len: 0,
             cap: 0,
+            row_offsets: ptr::null(),
+            rows: 0,
+            offsets_cap: 0,
         };
 
         // Verify error is set
@@ -365,6 +376,9 @@ mod tests {
             ptr: ptr::null(),
             len: 0,
             cap: 0,
+            row_offsets: ptr::null(),
+            rows: 0,
+            offsets_cap: 0,
         };
 
         // Should not crash with null pointers
@@ -394,12 +408,15 @@ mod tests {
             mem::size_of::<*const c_char>() + mem::size_of::<usize>()
         );
 
-        // FloatVecResult should be pointer + two usizes + error pointer
+        // FloatVecResult: error ptr + embeddings ptr + len + cap
+        //   + row_offsets ptr + rows + offsets_cap
         assert_eq!(
             mem::size_of::<FloatVecResult>(),
             mem::size_of::<*mut c_char>()
                 + mem::size_of::<*const FloatVec>()
                 + mem::size_of::<usize>() * 2
+                + mem::size_of::<*const usize>()
+                + mem::size_of::<usize>() * 2
         );
     }
 
@@ -501,4 +518,161 @@ mod tests {
     fn test_concurrent_qwen_embeddings_via_ffi() {
         run_concurrent_ffi_embeddings("Qwen/Qwen3-Embedding-0.6B");
     }
+
+    /// End-to-end on the cached MiniLM model: all five strategies. truncate/mean
+    /// return one vector/doc; fixed/recursive/sentence return N vectors/doc
+    /// grouped by row_offsets. Verifies offsets, normalization, and clean frees.
+    #[test]
+    fn test_all_strategies_local_model() {
+        let model_name = to_c_string("sentence-transformers/all-MiniLM-L6-v2");
+        let empty = to_c_string("");
+        let loaded = TextModelWrapper::load_model(
+            model_name.as_ptr(),
+            model_name.as_bytes().len(),
+            empty.as_ptr(),
+            empty.as_bytes().len(),
+            empty.as_ptr(),
+            empty.as_bytes().len(),
+            empty.as_ptr(),
+            empty.as_bytes().len(),
+            0,
+            false,
+        );
+        if loaded.model.is_null() {
+            TextModelWrapper::free_model_result(loaded);
+            eprintln!("skipping: all-MiniLM-L6-v2 not available locally");
+            return;
+        }
+        let wrapper =
+            unsafe { std::mem::transmute::<*mut std::ffi::c_void, TextModelWrapper>(loaded.model) };
+
+        // Long enough to exceed the model's input limit, so truncate drops the
+        // tail while mean covers the whole document via several chunks.
+        let long = "Vector search turns text into embeddings for retrieval. ".repeat(100);
+        let items = [create_string_item(&long)];
+
+        let first_vec = |res: &FloatVecResult| -> Vec<f32> {
+            assert!(res.error.is_null());
+            assert_eq!(res.len, 1, "exactly one vector per input document");
+            assert_eq!(res.rows, 1);
+            assert!(!res.row_offsets.is_null());
+            unsafe {
+                let offs = std::slice::from_raw_parts(res.row_offsets, res.rows + 1);
+                assert_eq!(offs, &[0usize, 1usize], "trivial offsets for 1 vector/doc");
+                let fv = &*res.ptr;
+                std::slice::from_raw_parts(fv.ptr, fv.len).to_vec()
+            }
+        };
+
+        // truncate (null settings)
+        let trunc =
+            TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, ptr::null(), 0);
+        let trunc_vec = first_vec(&trunc);
+
+        // mean: small chunk size forces many chunks → embed each → average
+        let settings = crate::chunk::ChunkSettings {
+            strategy: crate::chunk::STRATEGY_MEAN,
+            max_tokens: 32,
+            overlap_tokens: 0,
+            max_chunks: 0,
+        };
+        let mean =
+            TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, &settings, 0);
+        let mean_vec = first_vec(&mean);
+
+        assert_eq!(trunc_vec.len(), mean_vec.len(), "same embedding dimension");
+        let diff: f32 = trunc_vec
+            .iter()
+            .zip(&mean_vec)
+            .map(|(a, b)| (a - b).abs())
+            .sum();
+        assert!(
+            diff > 1e-3,
+            "mean must differ from truncate on a long document"
+        );
+        let norm: f32 = mean_vec.iter().map(|x| x * x).sum::<f32>().sqrt();
+        assert!(
+            (norm - 1.0).abs() < 1e-2,
+            "mean vector must be L2-normalized"
+        );
+
+        // multi-vector strategies: fixed/recursive/sentence keep every chunk
+        // vector → N vectors/doc, grouped by row_offsets [0, N].
+        for strat in [
+            crate::chunk::STRATEGY_FIXED,
+            crate::chunk::STRATEGY_RECURSIVE,
+            crate::chunk::STRATEGY_SENTENCE,
+        ] {
+            let s = crate::chunk::ChunkSettings {
+                strategy: strat,
+                max_tokens: 32,
+                overlap_tokens: 0,
+                max_chunks: 0,
+            };
+            let res = TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, &s, 0);
+            assert!(res.error.is_null(), "strategy {strat} errored");
+            assert_eq!(res.rows, 1, "one document");
+            assert!(
+                res.len > 1,
+                "strategy {strat} must emit multiple chunk vectors, got {}",
+                res.len
+            );
+            unsafe {
+                let offs = std::slice::from_raw_parts(res.row_offsets, res.rows + 1);
+                assert_eq!(offs[0], 0);
+                assert_eq!(offs[1], res.len, "row 0 owns all {} chunk vectors", res.len);
+                let fvs = std::slice::from_raw_parts(res.ptr, res.len);
+                for fv in fvs {
+                    assert_eq!(fv.len, trunc_vec.len(), "chunk vector dimension");
+                    let v = std::slice::from_raw_parts(fv.ptr, fv.len);
+                    let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
+                    assert!((norm - 1.0).abs() < 1e-2, "chunk vector must be normalized");
+                }
+            }
+            TextModelWrapper::free_vec_result(res);
+        }
+
+        // multi-document batch: row_offsets must group N-per-doc correctly, and
+        // overlap_tokens > 0 must still produce valid output. doc A is long
+        // (many chunks), doc B is short (one chunk).
+        let short = "One short sentence.";
+        let batch = [create_string_item(&long), create_string_item(short)];
+        let s = crate::chunk::ChunkSettings {
+            strategy: crate::chunk::STRATEGY_FIXED,
+            max_tokens: 32,
+            overlap_tokens: 4,
+            max_chunks: 0,
+        };
+        let res = TextModelWrapper::make_vect_embeddings(&wrapper, batch.as_ptr(), 2, &s, 0);
+        assert!(res.error.is_null());
+        assert_eq!(res.rows, 2, "two documents");
+        unsafe {
+            let offs = std::slice::from_raw_parts(res.row_offsets, res.rows + 1);
+            assert_eq!(offs[0], 0);
+            assert_eq!(offs[2], res.len, "last offset == total vectors");
+            assert!(offs[1] > 1, "doc A (long) has multiple chunks");
+            assert_eq!(offs[2] - offs[1], 1, "doc B (short) has exactly one chunk");
+        }
+        TextModelWrapper::free_vec_result(res);
+
+        // max_chunks cap: the long doc is capped to ≤ 3 vectors (tail merged).
+        let capped = crate::chunk::ChunkSettings {
+            strategy: crate::chunk::STRATEGY_FIXED,
+            max_tokens: 32,
+            overlap_tokens: 0,
+            max_chunks: 3,
+        };
+        let res = TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, &capped, 0);
+        assert!(res.error.is_null());
+        assert!(
+            (1..=3).contains(&res.len),
+            "max_chunks=3 caps to ≤3 vectors, got {}",
+            res.len
+        );
+        TextModelWrapper::free_vec_result(res);
+
+        TextModelWrapper::free_vec_result(trunc);
+        TextModelWrapper::free_vec_result(mean);
+        TextModelWrapper::free_model_result(loaded);
+    }
 }
diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs
index 429a6fb4..72eea015 100644
--- a/embeddings/src/model/local.rs
+++ b/embeddings/src/model/local.rs
@@ -1393,6 +1393,19 @@ impl LocalModel {
     }
 }
 
+impl LocalModel {
+    /// Borrow the loaded tokenizer for token-accurate document chunking.
+    fn tokenizer(&self) -> &Tokenizer {
+        match self {
+            LocalModel::Bert(m) => &m.tokenizer,
+            LocalModel::T5(m) => &m.tokenizer,
+            LocalModel::Causal(m) => &m.tokenizer,
+            LocalModel::Quantized(m) => &m.tokenizer,
+            LocalModel::Onnx(m) => &m.tokenizer,
+        }
+    }
+}
+
 impl TextModel for LocalModel {
     fn predict(&self, texts: &[&str], threads: usize) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
         // ONNX manages its own worker count internally — no rayon pool involved.
@@ -1429,6 +1442,18 @@ impl TextModel for LocalModel {
         // Local models don't use API keys, so validation is always successful
         Ok(())
     }
+
+    fn chunk(
+        &self,
+        text: &str,
+        max_tokens: usize,
+        overlap: usize,
+        strategy: u32,
+    ) -> Vec<(usize, usize)> {
+        // Token-accurate split via the loaded tokenizer; chunk_text picks the
+        // split method (fixed/recursive/sentence) from the strategy.
+        crate::chunk::chunk_text(text, max_tokens, overlap, strategy, Some(self.tokenizer()))
+    }
 }
 
 #[cfg(test)]
diff --git a/embeddings/src/model/mod.rs b/embeddings/src/model/mod.rs
index cfa48d72..6ca2a0d7 100644
--- a/embeddings/src/model/mod.rs
+++ b/embeddings/src/model/mod.rs
@@ -36,6 +36,21 @@ pub trait TextModel {
     /// Validates the API key by making a minimal test request to the API.
     /// For remote models, this makes an actual HTTP request. For local models, this is a no-op.
     fn validate_api_key(&self) -> Result<(), Box<dyn Error>>;
+
+    /// Split one document into chunk byte spans `(start, end)` per `strategy`,
+    /// sized to `max_tokens` with `overlap` token overlap. Default impl is the
+    /// char/byte heuristic for remote API models (no local tokenizer);
+    /// `LocalModel` overrides it with token-accurate chunking via its loaded
+    /// tokenizer. Used by every strategy except `truncate`.
+    fn chunk(
+        &self,
+        text: &str,
+        max_tokens: usize,
+        overlap: usize,
+        strategy: u32,
+    ) -> Vec<(usize, usize)> {
+        crate::chunk::chunk_text(text, max_tokens, overlap, strategy, None)
+    }
 }
 
 #[repr(C)]
@@ -105,6 +120,21 @@ impl TextModel for Model {
             Model::Local(m) => m.validate_api_key(),
         }
     }
+
+    fn chunk(
+        &self,
+        text: &str,
+        max_tokens: usize,
+        overlap: usize,
+        strategy: u32,
+    ) -> Vec<(usize, usize)> {
+        match self {
+            Model::OpenAI(m) => m.chunk(text, max_tokens, overlap, strategy),
+            Model::Voyage(m) => m.chunk(text, max_tokens, overlap, strategy),
+            Model::Jina(m) => m.chunk(text, max_tokens, overlap, strategy),
+            Model::Local(m) => m.chunk(text, max_tokens, overlap, strategy),
+        }
+    }
 }
 
 /// Refuse local (candle) inference on an emulated x86 CPU. Rosetta 2 and QEMU
diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs
index 396de4a6..60368728 100644
--- a/embeddings/src/model/text_model_wrapper.rs
+++ b/embeddings/src/model/text_model_wrapper.rs
@@ -1,8 +1,14 @@
+use crate::chunk::ChunkSettings;
 use crate::model::{create_model, Model, ModelOptions, TextModel};
 use crate::panic_guard;
 use std::os::raw::c_char;
 use std::{ffi::c_void, ptr};
 
+/// Per-document output vectors (flat across all documents) plus per-document
+/// vector counts, used to build the row-offsets sidecar. One inner `Vec<f32>`
+/// per emitted vector.
+type EmbedResult = Result<(Vec<Vec<f32>>, Vec<usize>), Box<dyn std::error::Error>>;
+
 /// Sentinel written at offset 0 of every live model handle. Lets FFI entry
 /// points detect garbage, null, or freed pointers handed in by the C++ caller
 /// and return a clean error instead of dereferencing into UB.
@@ -60,13 +66,27 @@ pub struct FloatVec {
     pub cap: usize,
 }
 
-/// cbindgen:field-names=[m_szError,m_tEmbedding,len,cap]
+/// Embedding result for one `make_vect_embeddings` call.
+///
+/// `m_tEmbedding` is a FLAT array of `len` vectors — every input document's
+/// vectors concatenated. `m_pRowOffsets` (length `rows + 1`) groups them per
+/// input document, Arrow-style: document `i` owns
+/// `m_tEmbedding[m_pRowOffsets[i] .. m_pRowOffsets[i + 1]]`. For the v1
+/// strategies (truncate / mean) every document yields exactly one vector, so
+/// `len == rows` and the offsets are `[0, 1, ..., rows]`. The sidecar lets a
+/// future multi-vector strategy return N vectors per document through this same
+/// struct — no second method, cardinality carried as data.
+///
+/// cbindgen:field-names=[m_szError,m_tEmbedding,len,cap,m_pRowOffsets,rows,offsets_cap]
 #[repr(C)]
 pub struct FloatVecResult {
     pub error: *mut c_char,
     pub ptr: *const FloatVec,
     pub len: usize,
     pub cap: usize,
+    pub row_offsets: *const usize,
+    pub rows: usize,
+    pub offsets_cap: usize,
 }
 
 #[repr(C)]
@@ -201,10 +221,20 @@ impl TextModelWrapper {
         }
     }
 
+    /// Generate embeddings for a batch of documents. `settings` selects the
+    /// strategy (null ⇒ `truncate`):
+    /// - `truncate` — embed the first `max_tokens` tokens (one vector/doc).
+    /// - `mean` — split the doc, embed every chunk, average into one vector/doc.
+    /// - `fixed`/`recursive`/`sentence` — split the doc and keep every chunk
+    ///   vector (N vectors/doc).
+    ///
+    /// Output is a flat `FloatVec[]`; `m_pRowOffsets` groups it per document
+    /// (1 vector/doc for truncate/mean, N for the rest). `threads`: 0 = all CPUs.
     pub extern "C" fn make_vect_embeddings(
         &self,
         texts: *const StringItem,
         count: usize,
+        settings: *const ChunkSettings,
         threads: i32, // 0 = use all available CPUs, >0 = cap thread count
     ) -> FloatVecResult {
         panic_guard::catch_panic(|| {
@@ -217,6 +247,9 @@ impl TextModelWrapper {
                         ptr: ptr::null(),
                         len: 0,
                         cap: 0,
+                        row_offsets: ptr::null(),
+                        rows: 0,
+                        offsets_cap: 0,
                     };
                 }
             };
@@ -234,43 +267,112 @@ impl TextModelWrapper {
                 .collect();
 
             let threads = if threads > 0 { threads as usize } else { 0 };
+            let settings_ref = unsafe { settings.as_ref() };
+
+            let strategy = settings_ref
+                .map(|s| s.strategy)
+                .unwrap_or(crate::chunk::STRATEGY_TRUNCATE);
+
+            // Each strategy yields a flat Vec<Vec<f32>> (all documents' output
+            // vectors) plus per-document output counts. truncate/mean emit one
+            // vector/doc; fixed/recursive/sentence emit one vector per chunk.
+            let computed: EmbedResult =
+                if strategy == crate::chunk::STRATEGY_TRUNCATE {
+                    model.predict(&string_refs, threads).map(|vecs| {
+                        let counts = vec![1usize; vecs.len()];
+                        (vecs, counts)
+                    })
+                } else {
+                    let s = settings_ref.unwrap();
+                    let max = crate::chunk::effective_max(s, model.get_max_input_len());
+                    let overlap = s.overlap_tokens as usize;
+                    let max_chunks = s.max_chunks as usize;
+
+                    // Split every document; remember each document's chunk count.
+                    let mut flat: Vec<&str> = Vec::new();
+                    let mut chunk_counts: Vec<usize> = Vec::with_capacity(string_refs.len());
+                    for &text in &string_refs {
+                        let spans = crate::chunk::cap_chunks(
+                            model.chunk(text, max, overlap, strategy),
+                            max_chunks,
+                        );
+                        chunk_counts.push(spans.len());
+                        for (start, end) in spans {
+                            flat.push(&text[start..end]);
+                        }
+                    }
+
+                    model.predict(&flat, threads).map(|chunk_vecs| {
+                        if strategy == crate::chunk::STRATEGY_MEAN {
+                            // pool each document's chunks into one vector
+                            let mut out = Vec::with_capacity(chunk_counts.len());
+                            let mut idx = 0usize;
+                            for c in &chunk_counts {
+                                let end = idx + c;
+                                out.push(crate::chunk::mean_pool(&chunk_vecs[idx..end]));
+                                idx = end;
+                            }
+                            let counts = vec![1usize; out.len()];
+                            (out, counts)
+                        } else {
+                            // fixed/recursive/sentence: keep every chunk vector.
+                            (chunk_vecs, chunk_counts)
+                        }
+                    })
+                };
 
-            let mut float_vec_list: Vec<FloatVec> = Vec::new();
-            let embeddings_list = model.predict(&string_refs, threads);
-            let c_error = match embeddings_list {
-                Ok(embeddings_list) => {
+            match computed {
+                Ok((embeddings_list, counts)) => {
+                    let mut float_vec_list: Vec<FloatVec> =
+                        Vec::with_capacity(embeddings_list.len());
                     for embeddings in embeddings_list.iter() {
-                        let ptr = embeddings.as_ptr();
-                        let len = embeddings.len();
-                        let cap = embeddings.capacity();
-                        let vec = FloatVec { ptr, len, cap };
-                        float_vec_list.push(vec);
+                        float_vec_list.push(FloatVec {
+                            ptr: embeddings.as_ptr(),
+                            len: embeddings.len(),
+                            cap: embeddings.capacity(),
+                        });
                     }
-
                     std::mem::forget(embeddings_list);
-                    ptr::null_mut()
-                }
-                Err(e) => {
-                    // Don't push empty vector on error - return error through szError pattern
-                    let c_error = std::ffi::CString::new(e.to_string()).unwrap();
-                    c_error.into_raw()
-                }
-            };
 
-            let vec_result = FloatVecResult {
-                ptr: float_vec_list.as_ptr(),
-                len: float_vec_list.len(),
-                cap: float_vec_list.capacity(),
-                error: c_error,
-            };
-            std::mem::forget(float_vec_list);
-            vec_result
+                    // Group the flat vectors per document: truncate/mean → one
+                    // vector/doc (offsets [0,1,..]); fixed/recursive/sentence →
+                    // N vectors/doc, offsets = prefix sums of per-doc counts.
+                    let row_offsets = crate::chunk::row_offsets_from_counts(&counts);
+                    let rows = counts.len();
+
+                    let result = FloatVecResult {
+                        error: ptr::null_mut(),
+                        ptr: float_vec_list.as_ptr(),
+                        len: float_vec_list.len(),
+                        cap: float_vec_list.capacity(),
+                        row_offsets: row_offsets.as_ptr(),
+                        rows,
+                        offsets_cap: row_offsets.capacity(),
+                    };
+                    std::mem::forget(float_vec_list);
+                    std::mem::forget(row_offsets);
+                    result
+                }
+                // Don't push empty vectors on error — surface it via szError.
+                Err(e) => FloatVecResult {
+                    error: to_c_error(&e.to_string()),
+                    ptr: ptr::null(),
+                    len: 0,
+                    cap: 0,
+                    row_offsets: ptr::null(),
+                    rows: 0,
+                    offsets_cap: 0,
+                },
+            }
         })
         .unwrap_or_else(|msg| FloatVecResult {
             error: to_c_error(&format!("embeddings: internal error (panic): {msg}")),
             ptr: ptr::null(),
             len: 0,
             cap: 0,
+            row_offsets: ptr::null(),
+            rows: 0,
+            offsets_cap: 0,
         })
     }
 
@@ -293,6 +395,15 @@ impl TextModelWrapper {
                 let _ = Vec::from_raw_parts(result.ptr as *mut FloatVec, result.len, result.cap);
             }
 
+            // Free the per-row offsets sidecar (length = rows + 1).
+            if !result.row_offsets.is_null() && result.offsets_cap > 0 {
+                let _ = Vec::from_raw_parts(
+                    result.row_offsets as *mut usize,
+                    result.rows + 1,
+                    result.offsets_cap,
+                );
+            }
+
             // Free the error string if it exists
             if !result.error.is_null() {
                 let _ = std::ffi::CString::from_raw(result.error);
diff --git a/knn/embeddings.cpp b/knn/embeddings.cpp
index 9b3a1f48..d995c364 100644
--- a/knn/embeddings.cpp
+++ b/knn/embeddings.cpp
@@ -265,7 +265,9 @@ bool TextToEmbeddings_c::Convert ( const std::vector<std::string_view> & dTexts,
 	assert(pFuncs);
 
 	// iThreads: 0 = use all available CPUs (default), >0 = cap worker count in the embeddings lib
-	FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size(), iThreads );
+	// nullptr ChunkSettings = truncate strategy (today's behavior). Pass a populated
+	// ChunkSettings (e.g. STRATEGY_MEAN) once the table DDL exposes a chunking option.
+	FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size(), nullptr, iThreads );
 	if ( tVecResult.m_szError )
 	{
 		sError = tVecResult.m_szError;
@@ -273,6 +275,9 @@ bool TextToEmbeddings_c::Convert ( const std::vector<std::string_view> & dTexts,
 		return false;
 	}
 
+	// truncate/mean return one vector per input row, so tVecResult.len == rows
+	// and this 1:1 copy is correct. A future multi-vector strategy (N vectors
+	// per row) would instead group via tVecResult.m_pRowOffsets[i..i+1].
 	dEmbeddings.resize ( tVecResult.len );
 	for ( size_t i = 0; i < tVecResult.len; i++ )
 	{
@@ -312,7 +317,7 @@ knn::EmbeddingsLib_i * LoadEmbeddingsLib ( const std::string & sLibPath, std::st
 	if ( !pLib->Load(sError) )
 		return nullptr;
 
-	const int SUPPORTED_EMBEDDINGS_LIB_VER = 4;
+	const int SUPPORTED_EMBEDDINGS_LIB_VER = 8;
 	if ( pLib->GetVersion()!=SUPPORTED_EMBEDDINGS_LIB_VER )
 	{
 		sError = util::FormatStr ( "Unsupported embeddings library version %d (expected %d)", pLib->GetVersion(), SUPPORTED_EMBEDDINGS_LIB_VER );