Skip to content

Commit c241808

Browse files
committed
feat(vector-search): enhance vector search functionality and integrate NVIDIA index
- Added `vector_search_utils.py` for semantic search with threshold capabilities, improving vector search efficiency. - Updated `video_service.py` to utilize the new semantic search method, maintaining backward compatibility with existing keyword search. - Introduced `attach_nvidia_vector_index.py` script to manage the NVIDIA Vectorize index for the `content_features` column in the database. - Modified `schema-astra.cql` to adjust the `content_features` vector dimension to 1024 for optimized performance. - Updated `pyproject.toml` to include new dependencies and scripts for enhanced functionality. - Improved logging and error handling across the new and modified components.
1 parent 4cd81de commit c241808

6 files changed

Lines changed: 1675 additions & 241 deletions

File tree

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
from typing import Any, Dict, List, Tuple
5+
6+
from app.db.astra_client import AstraDBCollection # noqa: F401
7+
8+
from app.models.video import VideoSummary
9+
10+
import logging
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def _collect_docs_from_cursor(cursor):
16+
"""Return a list of docs from an astrapy cursor or a stub list in unit-tests."""
17+
18+
if asyncio.iscoroutine(cursor):
19+
return cursor # caller should await upstream
20+
21+
if hasattr(cursor, "to_list"):
22+
return cursor.to_list()
23+
24+
# In unit tests we sometimes pass a list instead of a real cursor
25+
return cursor
26+
27+
28+
async def semantic_search_with_threshold(
    *,
    db_table: AstraDBCollection,
    vector_column: str,
    query: str,
    page: int,
    page_size: int,
    similarity_threshold: float = 0.0,
    overfetch_factor: int = 3,
) -> Tuple[List[VideoSummary], int]:
    """Run a vector search and apply a client-side similarity cutoff.

    Parameters
    ----------
    db_table : AstraDBCollection
        Table / collection to query (must contain the *vector_column*).
    vector_column : str
        Name of the vector column to sort on, e.g. ``"content_features"``.
    query : str
        The natural-language query that will be embedded on-the-fly by Astra.
    page / page_size : int
        Standard pagination parameters expected by the public API. ``page``
        is 1-based; a value below 1 for either yields an empty result.
    similarity_threshold : float, optional
        Keep only rows whose ``$similarity`` >= this value.  Default 0 (no
        trim).  Rows with a missing or ``None`` score are treated as 0.
    overfetch_factor : int, optional
        How many extra rows to ask Astra for.  3x the *page_size* works well
        for typical thresholds around 0.7-0.9.  Values below 1 are treated
        as 1 so the request always covers the requested page.

    Returns
    -------
    Tuple[List[VideoSummary], int]
        The validated summaries for the requested page and ``total``, the
        number of fetched rows that passed the threshold.  ``total`` is
        bounded by the over-fetch window; it is not a collection-wide count.
    """
    import inspect  # local import: only needed to detect awaitable results

    if page < 1 or page_size < 1:
        return [], 0

    # Ask Astra for a generous slice so we can trim client-side.  The window
    # grows with the page number, and the factor is clamped so we never ask
    # for fewer rows than ``page * page_size``.
    overfetch = page_size * max(1, overfetch_factor) * page

    result = db_table.find(
        filter={},
        sort={vector_column: query},
        limit=overfetch,
        include_similarity=True,  # the threshold below reads ``$similarity``
    )

    # ``find`` / ``to_list`` may be synchronous or asynchronous depending on
    # the client (and unit tests may hand back a plain list), so only await
    # what is actually awaitable.
    if inspect.isawaitable(result):
        result = await result
    if hasattr(result, "to_list"):
        result = result.to_list()
        if inspect.isawaitable(result):
            result = await result
    docs: List[Dict[str, Any]] = list(result)

    logger.debug(
        "Vector search fetched %s docs (page=%s, overfetch=%s)",
        len(docs),
        page,
        overfetch,
    )

    if similarity_threshold > 0:
        pre_trim = len(docs)
        # ``or 0.0`` covers both a missing key and an explicit ``None`` score.
        docs = [
            d for d in docs if (d.get("$similarity") or 0.0) >= similarity_threshold
        ]
        logger.debug(
            "Trimmed by threshold %.2f: %s → %s docs", similarity_threshold, pre_trim, len(docs)
        )

    if docs:
        top_score = docs[0].get("$similarity")
        logger.debug(
            "Top doc similarity after trim: %.3f",
            -1.0 if top_score is None else top_score,
        )

    total = len(docs)

    # Slice to requested page.
    start = (page - 1) * page_size
    page_docs = docs[start : start + page_size]

    summaries = [VideoSummary.model_validate(d) for d in page_docs]

    return summaries, total

app/services/video_service.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -704,25 +704,18 @@ async def search_videos_by_keyword(
704704
page_size: int,
705705
db_table: Optional[AstraDBCollection] = None,
706706
) -> Tuple[List[VideoSummary], int]:
707-
"""Basic case-insensitive substring search across title, description, tags."""
707+
"""Keyword search fallback using Astra's semantic `$vectorize` sort.
708708
709-
if db_table is None:
710-
db_table = await get_table(VIDEOS_TABLE_NAME)
711-
712-
escaped = re.escape(query)
713-
search_filter: Dict[str, Any] = {
714-
"$or": [
715-
{"name": {"$regex": escaped, "$options": "i"}},
716-
{"description": {"$regex": escaped, "$options": "i"}},
717-
{"tags": {"$regex": escaped, "$options": "i"}},
718-
],
719-
}
709+
The Data API does not support `$regex` filters. Instead we rely on the
710+
built-in vector search to rank results by textual similarity to *query*.
711+
This mirrors what ``search_videos_by_semantic`` does but keeps the public
712+
interface unchanged for callers expecting *keyword* search.
713+
"""
720714

721-
return await list_videos_with_query(
722-
query_filter=search_filter,
715+
return await search_videos_by_semantic(
716+
query=query,
723717
page=page,
724718
page_size=page_size,
725-
sort_options={"added_date": -1},
726719
db_table=db_table,
727720
)
728721

@@ -759,14 +752,25 @@ async def search_videos_by_semantic(
759752
detail="Query exceeds 512-token limit for semantic search.",
760753
)
761754

762-
sort_vector = {"$vectorize": query}
755+
# Delegate to reusable helper so we can later swap with server-side
756+
# threshold once the Data API supports it natively.
763757

764-
return await list_videos_with_query(
765-
query_filter={},
758+
from app.services.vector_search_utils import (
759+
semantic_search_with_threshold,
760+
)
761+
762+
if db_table is None:
763+
db_table = await get_table(VIDEOS_TABLE_NAME)
764+
765+
return await semantic_search_with_threshold(
766+
db_table=db_table,
767+
vector_column="content_features",
768+
query=query,
766769
page=page,
767770
page_size=page_size,
768-
sort_options=sort_vector,
769-
db_table=db_table,
771+
# NV-Embed scores rarely exceed ~0.75, so 0.65 keeps the top
772+
# matches while still trimming weak ones.
773+
similarity_threshold=0.65,
770774
)
771775

772776

docs/schema-astra.cql

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ CREATE TABLE IF NOT EXISTS killrvideo.videos (
8787
name text,
8888
preview_image_location text,
8989
tags set<text>, -- Collection for efficient tag storage
90-
content_features vector<float, 4096>, -- Vector type (4096-dim) for NV-Embed semantic search
90+
content_features vector<float, 1024>, -- Vector type (1024-dim) for NV-Embed semantic search
9191
userid uuid,
9292
content_rating text, -- 'G', 'PG', 'PG-13', 'R', etc.
9393
category text,
@@ -135,10 +135,13 @@ USING 'StorageAttachedIndex';
135135

136136
-- Vector search index with COSINE similarity function
137137
-- Enables ANN searches for content-based recommendations
138-
CREATE CUSTOM INDEX IF NOT EXISTS videos_content_features_idx
139-
ON killrvideo.videos(content_features)
138+
CREATE CUSTOM INDEX videos_content_features_idx
139+
ON killrvideo.videos(content_features)
140140
USING 'StorageAttachedIndex'
141-
WITH OPTIONS = { 'similarity_function': 'COSINE' };
141+
WITH OPTIONS = {
142+
'similarity_function' : 'COSINE',
143+
'source_model' : 'nv-qa-4'
144+
};
142145

143146
-- Denormalized table for latest videos
144147
-- Supports queries: Get latest videos in chronological order

0 commit comments

Comments
 (0)