Skip to content

Commit 8bbb0ed

Browse files
committed
Add text processing utility for token clipping
- Introduced a new utility function `clip_to_512_tokens` in `text.py` to truncate input text to a maximum of 512 tokens, ensuring compatibility with NV-Embed model requirements.
- Implemented a regex-based tokenizer that treats words and punctuation marks as separate tokens, providing a lightweight approximation of the provider's tokenization.
- Added unit tests in `test_text.py` validating the clipping behavior, including token counts at and around the limit and handling of Unicode punctuation.
1 parent 79590c1 commit 8bbb0ed

2 files changed

Lines changed: 99 additions & 0 deletions

File tree

app/utils/text.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from __future__ import annotations
2+
3+
"""Text processing helpers used across the KillrVideo backend."""
4+
5+
import re
6+
7+
__all__ = ["clip_to_512_tokens"]
8+
9+
# ---------------------------------------------------------------------------
10+
# Basic tokenizer
11+
# ---------------------------------------------------------------------------
12+
# The NV-Embed provider enforces a hard limit of **512 tokens** for both
13+
# `$vectorize` requests and vector-enabled queries. We approximate token
14+
# boundaries using a lightweight regex that splits on standard *word* chunks
15+
# while treating punctuation and symbols as individual tokens. This is **not**
16+
# an exact match of the provider's internal SentencePiece model but is
17+
# sufficiently close for defensive clipping.
18+
#
19+
# • Consecutive whitespace is ignored (no empty tokens).
20+
# • Unicode punctuation characters are captured as standalone tokens.
21+
# • The pattern is Unicode-aware through the `re.UNICODE` flag (default in
22+
# Python 3 but kept explicit).
23+
# ---------------------------------------------------------------------------
24+
TOKEN_RE = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)

# Hard limit enforced by the NV-Embed provider for `$vectorize` requests and
# vector-enabled queries.
MAX_TOKENS_NV_EMBED = 512


def clip_to_512_tokens(text: str, *, max_tokens: int = MAX_TOKENS_NV_EMBED) -> str:  # noqa: D401
    """Return *text* truncated to at most *max_tokens* tokens.

    If the input already fits under the limit it is returned unchanged
    (exact byte equality preserved). For longer inputs the first
    *max_tokens* tokens are kept and re-joined with single spaces, so
    downstream code always receives a valid plain-text string.

    Parameters
    ----------
    text:
        Arbitrary input string (may contain newlines, tabs, or Unicode
        punctuation).
    max_tokens:
        Maximum number of tokens to keep. Defaults to
        :pydata:`MAX_TOKENS_NV_EMBED` (512), the NV-Embed provider limit;
        keyword-only so existing callers are unaffected.

    Returns
    -------
    str
        The (possibly) truncated string, guaranteed to be <= *max_tokens*
        tokens when tokenised via :pydata:`TOKEN_RE`.
    """
    if not text:
        # Early exit for the empty string; also passes falsy inputs through.
        return text

    tokens = TOKEN_RE.findall(text)

    if len(tokens) <= max_tokens:
        # No truncation needed – preserve original spacing to avoid surprising
        # callers that might rely on exact text equality (e.g., hashing).
        return text

    # Re-join the kept tokens with a single space. This canonical form is
    # sufficient for embedding purposes and avoids the complexity of
    # reconstructing the original whitespace layout.
    return " ".join(tokens[:max_tokens])

tests/utils/test_text.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import re
2+
3+
import pytest
4+
5+
from app.utils.text import clip_to_512_tokens
6+
7+
TOKEN_RE = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)
8+
9+
10+
def _count_tokens(s: str) -> int:
11+
return len(TOKEN_RE.findall(s))
12+
13+
14+
@pytest.mark.parametrize("token_count", [600, 513, 512, 100])
def test_clip_to_512_tokens(token_count: int):
    """Text above the 512-token limit is clipped; shorter text is untouched."""
    input_text = " ".join(f"tok{i}" for i in range(token_count))
    result = clip_to_512_tokens(input_text)

    if token_count > 512:
        # Clipped down to exactly the limit, keeping the leading tokens.
        words = result.split()
        assert _count_tokens(result) == 512
        assert words[0] == "tok0"
        assert words[-1] == "tok511"
    else:
        # Under (or at) the limit: returned verbatim.
        assert result == input_text
        assert _count_tokens(result) == token_count
28+
29+
30+
def test_unicode_and_whitespace():
    """Short text containing Unicode punctuation passes through unchanged."""
    text = "你好, 世界! Hello — world…"
    clipped = clip_to_512_tokens(text)

    # Well under the limit, so the input must come back byte-identical.
    assert clipped == text
    assert _count_tokens(clipped) < 512

0 commit comments

Comments
 (0)