|
37 | 37 | MetadataFetchError, |
38 | 38 | ) |
39 | 39 | from app.core.config import settings |
| 40 | +from app.utils.text import clip_to_512_tokens |
40 | 41 |
|
41 | 42 | from astrapy.exceptions.data_api_exceptions import DataAPIResponseException |
42 | 43 |
|
@@ -195,6 +196,24 @@ async def submit_new_video( |
195 | 196 |
|
196 | 197 | full_doc = new_video.model_dump(by_alias=False, exclude_none=True) |
197 | 198 |
|
| 199 | + # ------------------------------------------------------------------ |
| 200 | + # Build semantic embedding input string for NV-Embed auto-vectorisation. |
| 201 | + # The Data API embeds *strings* via the `$vectorize` operator when they are |
| 202 | + # stored in a ``vector`` column. We therefore concatenate title, |
| 203 | + # description, and tags into a single text blob and store it directly in |
| 204 | + # the ``content_features`` field. The vector will be generated |
| 205 | + # server-side during the insert/update operation. |
| 206 | + # ------------------------------------------------------------------ |
| 207 | + |
| 208 | + components: list[str] = [resolved_name] |
| 209 | + if new_video.description: |
| 210 | + components.append(new_video.description) |
| 211 | + if new_video.tags: |
| 212 | + components.append(" ".join(new_video.tags)) |
| 213 | + |
| 214 | + embedding_raw = "\n".join(components) |
| 215 | + full_doc["content_features"] = clip_to_512_tokens(embedding_raw) |
| 216 | + |
198 | 217 | # Ensure any HttpUrl instances are converted to plain strings so AstraDB |
199 | 218 | # JSON encoder does not choke. We purposely *do not* strip unknown |
200 | 219 | # columns here because unit-tests rely on seeing them; schema filtering |
@@ -708,6 +727,49 @@ async def search_videos_by_keyword( |
708 | 727 | ) |
709 | 728 |
|
710 | 729 |
|
| 730 | +# --------------------------------------------------------------------------- |
| 731 | +# Semantic (vector) search |
| 732 | +# --------------------------------------------------------------------------- |
| 733 | + |
| 734 | + |
async def search_videos_by_semantic(
    query: str,
    page: int,
    page_size: int,
    db_table: Optional[AstraDBCollection] = None,
) -> Tuple[List[VideoSummary], int]:
    """Return videos ranked by semantic similarity using Astra ``$vectorize``.

    The raw query string is handed to the Data API's ``$vectorize`` sort
    operator, which embeds it server-side (NV-Embed) and orders results by
    vector similarity against the stored embeddings.

    Parameters
    ----------
    query:
        Free-text search string to embed and match against.
    page, page_size:
        Pagination parameters, forwarded unchanged to
        ``list_videos_with_query``.
    db_table:
        Optional collection override (e.g. for tests); forwarded unchanged.

    Raises
    ------
    HTTPException
        With status ``400`` if the query exceeds the NV-Embed 512-token
        limit.
    """
    # Approximate the provider's token count locally so we fail fast with a
    # clear 400 instead of an opaque embedding-service error downstream.
    # NOTE(review): splitting on word runs / single punctuation marks is only
    # a heuristic for the real NV-Embed tokenizer — confirm it is
    # conservative enough in practice.
    import re

    # ``re.findall`` is used directly: the pattern runs once per call and the
    # ``re`` module caches compiled patterns, so a manual ``compile`` step
    # (and the ``_re`` alias the original carried) adds nothing.
    if len(re.findall(r"\w+|[^\w\s]", query, flags=re.UNICODE)) > 512:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Query exceeds 512-token limit for semantic search.",
        )

    # ``$vectorize`` instructs the Data API to embed the string server-side
    # and sort by similarity; the empty filter means all videos compete.
    return await list_videos_with_query(
        query_filter={},
        page=page,
        page_size=page_size,
        sort_options={"$vectorize": query},
        db_table=db_table,
    )
| 771 | + |
| 772 | + |
711 | 773 | # --------------------------------------------------------------------------- |
712 | 774 | # Tag suggestions |
713 | 775 | # --------------------------------------------------------------------------- |
|
0 commit comments