Skip to content

Commit 88d7cbb

Browse files
pmcfadin and claude
committed
feat: migrate from Astra Vectorize to IBM Granite embeddings
Replace Astra's server-side NVIDIA NV-Embed vectorization with client-side embedding generation using the IBM Granite-Embedding-30m-English model for improved control and reduced dependency on external services. Key changes: - Add embedding service with Granite model (384-dim vectors vs 4096-dim) - Generate embeddings client-side during video ingestion - Update semantic search to embed queries client-side - Add configurable similarity threshold via VECTOR_SEARCH_SIMILARITY_THRESHOLD - Update backfill script to use Granite embeddings - Remove NVIDIA vectorization dependencies and configurations - Enable vector search by default Dependencies: - sentence-transformers >= 3.0.0 - torch >= 2.0.0 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent f2db647 commit 88d7cbb

10 files changed

Lines changed: 3182 additions & 1651 deletions

File tree

app/core/config.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,19 @@ def parsed_cors_origins(self) -> list[str]: # noqa: D401
104104

105105
INLINE_METADATA_DISABLED: bool = False
106106
ENABLE_BACKGROUND_PROCESSING: bool = True
107-
# Feature flag – enables semantic vector search endpoints
108-
VECTOR_SEARCH_ENABLED: bool = False
107+
# Feature flag – enables semantic vector search using IBM Granite embeddings
108+
VECTOR_SEARCH_ENABLED: bool = True
109+
110+
# Minimum similarity score (0.0-1.0) for semantic search results
111+
# Higher values = more strict matching (fewer but more relevant results)
112+
# Lower values = more lenient matching (more results but potentially less relevant)
113+
# Recommended range: 0.5-0.8 for Granite embeddings
114+
VECTOR_SEARCH_SIMILARITY_THRESHOLD: float = Field(
115+
default=0.65,
116+
ge=0.0,
117+
le=1.0,
118+
description="Minimum cosine similarity score for search results (0.0-1.0)",
119+
)
109120

110121
# ------------------------------------------------------------------
111122
# YouTube integration

app/services/embedding_service.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
"""
2+
Embedding service for generating vector embeddings using IBM Granite model.
3+
4+
This service uses the IBM Granite-Embedding-30m-English model to generate
5+
384-dimensional embeddings for text. The model is loaded once at startup
6+
and cached in memory for fast inference.
7+
"""
8+
9+
import logging
10+
import re
11+
from typing import List, Optional
12+
13+
from sentence_transformers import SentenceTransformer
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
class EmbeddingService:
    """
    Singleton service for generating embeddings using the IBM Granite model.

    The SentenceTransformer model is loaded once on first initialization and
    cached at class level, so every subsequent construction reuses the same
    in-memory model for fast inference.
    """

    _instance: Optional["EmbeddingService"] = None
    # Class-level model cache shared by the (single) instance.
    _model: Optional[SentenceTransformer] = None

    MODEL_NAME = "ibm-granite/granite-embedding-30m-english"
    EMBEDDING_DIMENSION = 384
    MAX_TOKENS = 512

    # Simple word/punctuation tokenizer used to approximate the model's token
    # count (matches the pattern previously used for NVIDIA embeddings).
    # Compiled once at class-definition time instead of on every call.
    _TOKEN_RE = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)

    def __new__(cls):
        """Implement singleton pattern: always return the shared instance."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Load the Granite model on first initialization only."""
        if self._model is None:
            # Lazy %-style logging args avoid formatting when the level is off.
            logger.info("Loading embedding model: %s", self.MODEL_NAME)
            # Store on the class so the cache matches its class-level
            # declaration and survives re-instantiation of the singleton.
            type(self)._model = SentenceTransformer(self.MODEL_NAME)
            logger.info(
                "Embedding model loaded successfully. Dimension: %d",
                self.EMBEDDING_DIMENSION,
            )

    def _clip_to_max_tokens(self, text: str) -> str:
        """
        Clip text to the maximum token limit (512 tokens).

        Uses a simple regex tokenizer as a conservative approximation of the
        model's tokenizer — actual tokenization may differ slightly.
        NOTE(review): over-limit text is rejoined with single spaces, so the
        original whitespace and character adjacency (e.g. "don't") are not
        preserved; acceptable for embedding input, but worth confirming.

        Args:
            text: The input text to clip

        Returns:
            The clipped text if over limit, otherwise the original text
        """
        tokens = self._TOKEN_RE.findall(text)

        if len(tokens) <= self.MAX_TOKENS:
            return text

        # Clip to the token limit and rejoin; approximate but good enough.
        clipped_text = " ".join(tokens[: self.MAX_TOKENS])
        logger.warning(
            "Text clipped from %d to %d tokens", len(tokens), self.MAX_TOKENS
        )
        return clipped_text

    def generate_embedding(self, text: str, clip_tokens: bool = True) -> List[float]:
        """
        Generate a 384-dimensional embedding vector for the given text.

        Args:
            text: The input text to embed
            clip_tokens: Whether to clip text to MAX_TOKENS (default: True)

        Returns:
            A list of 384 float values representing the embedding vector

        Raises:
            ValueError: If text is empty or model is not loaded
        """
        if not text or not text.strip():
            raise ValueError("Cannot generate embedding for empty text")

        if self._model is None:
            raise ValueError("Embedding model not loaded")

        # Clip to the token limit if requested.
        if clip_tokens:
            text = self._clip_to_max_tokens(text)

        # encode() returns a numpy array; convert to a plain list of floats.
        embedding = self._model.encode(text, convert_to_numpy=True)
        return embedding.tolist()

    def generate_embeddings_batch(
        self, texts: List[str], clip_tokens: bool = True
    ) -> List[List[float]]:
        """
        Generate embeddings for multiple texts in a batch.

        This is more efficient than calling generate_embedding() repeatedly,
        as the model can process multiple texts in parallel.

        Args:
            texts: List of input texts to embed
            clip_tokens: Whether to clip texts to MAX_TOKENS (default: True)

        Returns:
            List of embedding vectors, one for each input text

        Raises:
            ValueError: If any text is empty or model is not loaded
        """
        if not texts:
            raise ValueError("Cannot generate embeddings for empty list")

        if self._model is None:
            raise ValueError("Embedding model not loaded")

        # Validate every text before encoding; clip if requested.
        processed_texts = []
        for text in texts:
            if not text or not text.strip():
                raise ValueError("Cannot generate embedding for empty text")

            if clip_tokens:
                text = self._clip_to_max_tokens(text)

            processed_texts.append(text)

        # Encode all texts in a single batch call.
        embeddings = self._model.encode(
            processed_texts, convert_to_numpy=True, show_progress_bar=False
        )
        return [emb.tolist() for emb in embeddings]
149+
150+
151+
# Module-level cached instance (lazily created).
_embedding_service: Optional[EmbeddingService] = None


def get_embedding_service() -> EmbeddingService:
    """
    Return the process-wide :class:`EmbeddingService` instance.

    The service is constructed lazily on the first call (which loads the
    Granite model); every subsequent call returns the cached instance.

    Returns:
        The global EmbeddingService instance
    """
    global _embedding_service

    # Fast path: already initialized.
    if _embedding_service is not None:
        return _embedding_service

    _embedding_service = EmbeddingService()
    return _embedding_service

app/services/vector_search_utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ async def semantic_search_with_threshold(
2929
*,
3030
db_table: AstraDBCollection,
3131
vector_column: str,
32-
query: str,
32+
query_vector: List[float],
3333
page: int,
3434
page_size: int,
3535
similarity_threshold: float = 0.0,
@@ -43,8 +43,8 @@ async def semantic_search_with_threshold(
4343
Table / collection to query (must contain the *vector_column*).
4444
vector_column : str
4545
Name of the vector column to sort on, e.g. ``"content_features"``.
46-
query : str
47-
The natural-language query that will be embedded on-the-fly by Astra.
46+
query_vector : List[float]
47+
Pre-computed embedding vector for the query (384 dimensions for Granite model).
4848
page / page_size : int
4949
Standard pagination parameters expected by the public API.
5050
similarity_threshold : float, optional
@@ -69,11 +69,11 @@ async def semantic_search_with_threshold(
6969
start_time = time.perf_counter()
7070

7171
with tracer.start_as_current_span("vector.search") as span:
72-
span.set_attribute("query", query[:64]) # truncate long queries for span
72+
span.set_attribute("vector_dimensions", len(query_vector))
7373

7474
cursor = db_table.find(
7575
filter={},
76-
sort={vector_column: query},
76+
sort={vector_column: query_vector},
7777
limit=overfetch,
7878
include_similarity=True, # ⭐
7979
)

app/services/video_service.py

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
MetadataFetchError,
3838
)
3939
from app.core.config import settings
40-
from app.utils.text import clip_to_512_tokens
40+
from app.services.embedding_service import get_embedding_service
4141

4242
from astrapy.exceptions.data_api_exceptions import DataAPIResponseException
4343

@@ -198,12 +198,11 @@ async def submit_new_video(
198198
full_doc = new_video.model_dump(by_alias=False, exclude_none=True)
199199

200200
# ------------------------------------------------------------------
201-
# Build semantic embedding input string for NV-Embed auto-vectorisation.
202-
# The Data API embeds *strings* via the `$vectorize` operator when they are
203-
# stored in a ``vector`` column. We therefore concatenate title,
204-
# description, and tags into a single text blob and store it directly in
205-
# the ``content_features`` field. The vector will be generated
206-
# server-side during the insert/update operation.
201+
# Generate semantic embeddings using IBM Granite model.
202+
# We concatenate title, description, and tags into a single text blob,
203+
# then generate a 384-dimensional embedding vector client-side using
204+
# the Granite-Embedding-30m-English model. The embedding service
205+
# handles token limiting (512 tokens max) automatically.
207206
# ------------------------------------------------------------------
208207

209208
components: list[str] = [resolved_name]
@@ -212,8 +211,11 @@ async def submit_new_video(
212211
if new_video.tags:
213212
components.append(" ".join(new_video.tags))
214213

215-
embedding_raw = "\n".join(components)
216-
full_doc["content_features"] = clip_to_512_tokens(embedding_raw)
214+
embedding_text = "\n".join(components)
215+
216+
# Generate embedding using Granite model (returns List[float] with 384 dimensions)
217+
embedding_service = get_embedding_service()
218+
full_doc["content_features"] = embedding_service.generate_embedding(embedding_text)
217219

218220
# Ensure any HttpUrl instances are converted to plain strings so AstraDB
219221
# JSON encoder does not choke. We purposely *do not* strip unknown
@@ -757,29 +759,33 @@ async def search_videos_by_semantic(
757759
page_size: int,
758760
db_table: Optional[AstraDBCollection] = None,
759761
) -> Tuple[List[VideoSummary], int]:
760-
"""Return videos ranked by semantic similarity using Astra `$vectorize`.
762+
"""Return videos ranked by semantic similarity using IBM Granite embeddings.
763+
764+
The query is embedded client-side using the Granite-Embedding-30m-English
765+
model, then compared against stored video embeddings using cosine similarity.
761766
762767
Raises
763768
------
764769
HTTPException
765-
With status ``400`` if the query exceeds the NV-Embed 512-token limit.
770+
With status ``400`` if the query exceeds the 512-token limit.
766771
"""
767772

768773
# ------------------------------------------------------------------
769-
# Validate token length against NVIDIA provider limit (512 tokens).
774+
# Generate query embedding using Granite model.
775+
# The embedding service handles token validation (512 tokens max).
770776
# ------------------------------------------------------------------
771777

772-
import re as _re
778+
embedding_service = get_embedding_service()
773779

774-
token_re = _re.compile(r"\w+|[^\w\s]", flags=_re.UNICODE)
775-
if len(token_re.findall(query)) > 512:
780+
try:
781+
query_vector = embedding_service.generate_embedding(query)
782+
except ValueError as e:
776783
raise HTTPException(
777784
status_code=status.HTTP_400_BAD_REQUEST,
778-
detail="Query exceeds 512-token limit for semantic search.",
785+
detail=f"Failed to generate query embedding: {str(e)}",
779786
)
780787

781-
# Delegate to reusable helper so we can later swap with server-side
782-
# threshold once the Data API supports it natively.
788+
# Delegate to reusable helper with pre-computed embedding vector.
783789

784790
from app.services.vector_search_utils import (
785791
semantic_search_with_threshold,
@@ -791,12 +797,12 @@ async def search_videos_by_semantic(
791797
return await semantic_search_with_threshold(
792798
db_table=db_table,
793799
vector_column="content_features",
794-
query=query,
800+
query_vector=query_vector,
795801
page=page,
796802
page_size=page_size,
797-
# NV-Embed scores rarely exceed ~0.75, so 0.65 keeps the top
798-
# matches while still trimming weak ones.
799-
similarity_threshold=0.65,
803+
# Use configurable similarity threshold from settings
804+
# Can be adjusted via VECTOR_SEARCH_SIMILARITY_THRESHOLD in .env
805+
similarity_threshold=settings.VECTOR_SEARCH_SIMILARITY_THRESHOLD,
800806
)
801807

802808

env.example

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,15 @@ SECRET_KEY=please-change-me-to-a-long-random-string
4444
# --------------------------------------------------------------------
4545
# Feature Flags
4646
# --------------------------------------------------------------------
47-
# Enable semantic vector search mode (true/false). Keep disabled until the
48-
# migrations & backfill jobs have run successfully.
49-
VECTOR_SEARCH_ENABLED=false
47+
# Enable semantic vector search using IBM Granite embeddings (true/false).
48+
# Default is true. Set to false if you haven't loaded vector embeddings yet.
49+
VECTOR_SEARCH_ENABLED=true
50+
51+
# Minimum similarity score (0.0-1.0) for semantic search results
52+
# Higher values = more strict matching (fewer but more relevant results)
53+
# Lower values = more lenient matching (more results but potentially less relevant)
54+
# Recommended range: 0.5-0.8 for Granite embeddings. Default: 0.65
55+
# VECTOR_SEARCH_SIMILARITY_THRESHOLD=0.65
5056

5157
# --------------------------------------------------------------------
5258
# Observability / Telemetry

migrations/2025_08_vector.cql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
-- Increase the vector dimension to 4096 and attach the NVIDIA embedding service
2-
ALTER TABLE killrvideo.videos ALTER content_features TYPE vector<float, 4096>;
1+
-- Set the vector dimension to 384 for IBM Granite embeddings
2+
ALTER TABLE killrvideo.videos ALTER content_features TYPE vector<float, 384>;
33

44
-- Drop the existing vector index if present
55
DROP INDEX IF EXISTS videos_content_features_idx;
66

7-
-- Recreate the SAI index for the enlarged vector column using cosine similarity
7+
-- Recreate the SAI index for the vector column using cosine similarity
88
CREATE CUSTOM INDEX videos_content_features_idx
99
ON killrvideo.videos (content_features)
1010
USING 'StorageAttachedIndex'

migrations/2025_08_vector.json

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,7 @@
66
"alterColumns": {
77
"content_features": {
88
"type": "vector",
9-
"dimension": 4096,
10-
"service": {
11-
"provider": "nvidia",
12-
"modelName": "NV-Embed-QA"
13-
}
9+
"dimension": 384
1410
}
1511
}
1612
}

0 commit comments

Comments
 (0)