From 146549d10fc7169e83e5e8cc96ce5027702725bf Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sun, 31 May 2026 23:44:33 +0700 Subject: [PATCH 01/17] build(cf): add @cloudflare/workers-types devDependency --- adapters/cf/package.json | 1 + pnpm-lock.yaml | 40 +++++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/adapters/cf/package.json b/adapters/cf/package.json index 60f0f1b..67c9c57 100644 --- a/adapters/cf/package.json +++ b/adapters/cf/package.json @@ -30,6 +30,7 @@ "payloadcms-vectorize": ">=1.0.0" }, "devDependencies": { + "@cloudflare/workers-types": "^4.20240000.0", "payloadcms-vectorize": "workspace:*" }, "engines": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c9e19e1..28b5ddf 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -19,10 +19,10 @@ importers: version: 3.3.3 '@payloadcms/db-postgres': specifier: 3.69.0 - version: 3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3)) + version: 3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3)) '@payloadcms/db-sqlite': specifier: 3.69.0 - version: 3.69.0(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) + version: 3.69.0(@cloudflare/workers-types@4.20260531.1)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) '@payloadcms/eslint-config': specifier: 3.9.0 version: 3.9.0(@typescript-eslint/eslint-plugin@8.51.0(@typescript-eslint/parser@8.51.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.7.3))(eslint@9.39.2(jiti@2.6.1))(typescript@5.7.3))(jest@30.2.0(@types/node@22.19.3)(babel-plugin-macros@3.1.0)(esbuild-register@3.6.0(esbuild@0.25.12)))(jiti@2.6.1) @@ -153,6 +153,9 @@ importers: specifier: '>=3.0.0 <4.0.0' version: 3.69.0(graphql@16.12.0)(typescript@5.7.3) 
devDependencies: + '@cloudflare/workers-types': + specifier: ^4.20240000.0 + version: 4.20260531.1 payloadcms-vectorize: specifier: workspace:* version: link:../.. @@ -180,7 +183,7 @@ importers: dependencies: '@payloadcms/db-postgres': specifier: '>=3.0.0 <4.0.0' - version: 3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3)) + version: 3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3)) payload: specifier: '>=3.0.0 <4.0.0' version: 3.69.0(graphql@16.12.0)(typescript@5.7.3) @@ -457,6 +460,9 @@ packages: '@changesets/write@0.4.0': resolution: {integrity: sha512-CdTLvIOPiCNuH71pyDu3rA+Q0n65cmAbXnwWH84rKGiFumFzkmHNT8KHTMEchcxN+Kl8I54xGUhJ7l3E7X396Q==} + '@cloudflare/workers-types@4.20260531.1': + resolution: {integrity: sha512-7DybhbX12n+mVgJEDvm9W/jjqpaUIczg+RWj1Hua9nGEG+pNJnT+yZj1JKENrbdyuGWx3OFEgUCNFcGJN86Dvg==} + '@date-fns/tz@1.2.0': resolution: {integrity: sha512-LBrd7MiJZ9McsOgxqWX7AaxrDjcFVjWH/tIKJd7pnR7McaslGYOP1QmmiBXdJH/H/yLCT+rcQ7FaPBUxRGUtrg==} @@ -7225,6 +7231,8 @@ snapshots: human-id: 4.1.3 prettier: 2.8.8 + '@cloudflare/workers-types@4.20260531.1': {} + '@date-fns/tz@1.2.0': {} '@dnd-kit/accessibility@3.1.1(react@19.1.0)': @@ -8649,13 +8657,13 @@ snapshots: - socks - supports-color - '@payloadcms/db-postgres@3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))': + '@payloadcms/db-postgres@3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))': dependencies: - '@payloadcms/drizzle': 3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) + '@payloadcms/drizzle': 
3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) '@types/pg': 8.10.2 console-table-printer: 2.12.1 drizzle-kit: 0.31.7 - drizzle-orm: 0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(pg@8.16.3) + drizzle-orm: 0.44.7(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(pg@8.16.3) payload: 3.69.0(graphql@16.12.0)(typescript@5.7.3) pg: 8.16.3 prompts: 2.4.2 @@ -8692,13 +8700,13 @@ snapshots: - sqlite3 - supports-color - '@payloadcms/db-sqlite@3.69.0(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3)': + '@payloadcms/db-sqlite@3.69.0(@cloudflare/workers-types@4.20260531.1)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3)': dependencies: '@libsql/client': 0.14.0 - '@payloadcms/drizzle': 3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) + '@payloadcms/drizzle': 3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) console-table-printer: 2.12.1 drizzle-kit: 0.31.7 - drizzle-orm: 0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(pg@8.16.3) + drizzle-orm: 0.44.7(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(pg@8.16.3) payload: 3.69.0(graphql@16.12.0)(typescript@5.7.3) prompts: 2.4.2 to-snake-case: 1.0.0 @@ -8736,11 +8744,11 @@ snapshots: - supports-color - utf-8-validate - '@payloadcms/drizzle@3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3)': + 
'@payloadcms/drizzle@3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3)': dependencies: console-table-printer: 2.12.1 dequal: 2.0.3 - drizzle-orm: 0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(pg@8.16.3) + drizzle-orm: 0.44.7(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(pg@8.16.3) payload: 3.69.0(graphql@16.12.0)(typescript@5.7.3) prompts: 2.4.2 to-snake-case: 1.0.0 @@ -8776,11 +8784,11 @@ snapshots: - sql.js - sqlite3 - '@payloadcms/drizzle@3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3)': + '@payloadcms/drizzle@3.69.0(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3)': dependencies: console-table-printer: 2.12.1 dequal: 2.0.3 - drizzle-orm: 0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(pg@8.16.3) + drizzle-orm: 0.44.7(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(pg@8.16.3) payload: 3.69.0(graphql@16.12.0)(typescript@5.7.3) prompts: 2.4.2 to-snake-case: 1.0.0 @@ -10671,15 +10679,17 @@ snapshots: transitivePeerDependencies: - supports-color - drizzle-orm@0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(pg@8.16.3): + drizzle-orm@0.44.7(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(pg@8.16.3): optionalDependencies: + '@cloudflare/workers-types': 4.20260531.1 '@libsql/client': 0.14.0 '@opentelemetry/api': 1.9.0 '@types/pg': 8.10.2 pg: 8.16.3 - drizzle-orm@0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(pg@8.16.3): + 
drizzle-orm@0.44.7(@cloudflare/workers-types@4.20260531.1)(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.16.0)(pg@8.16.3): optionalDependencies: + '@cloudflare/workers-types': 4.20260531.1 '@libsql/client': 0.14.0 '@opentelemetry/api': 1.9.0 '@types/pg': 8.16.0 From 0af470cf45219416ad9c417117a44f8bb858ef39 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 08:59:09 +0700 Subject: [PATCH 02/17] refactor(cf): adopt @cloudflare/workers-types Vectorize binding --- adapters/cf/src/types.ts | 54 ++++------------------------------------ 1 file changed, 5 insertions(+), 49 deletions(-) diff --git a/adapters/cf/src/types.ts b/adapters/cf/src/types.ts index 92786d4..e10e441 100644 --- a/adapters/cf/src/types.ts +++ b/adapters/cf/src/types.ts @@ -1,65 +1,21 @@ +/// import type { BasePayload } from 'payload' import { getVectorizedPayload } from 'payloadcms-vectorize' -/** - * Retrieve the Cloudflare Vectorize binding from a Payload instance. - * Throws if the binding is not found. 
- */ -export function getVectorizeBinding(payload: BasePayload): CloudflareVectorizeBinding { +export function getVectorizeBinding(payload: BasePayload): Vectorize { const binding = getVectorizedPayload(payload)?.getDbAdapterCustom() - ?._vectorizeBinding as CloudflareVectorizeBinding | undefined + ?._vectorizeBinding as Vectorize | undefined if (!binding) { throw new Error('[@payloadcms-vectorize/cf] Cloudflare Vectorize binding not found') } return binding } -/** - * Configuration for a knowledge pool in Cloudflare Vectorize - */ export interface CloudflareVectorizePoolConfig { - /** Vector dimensions for this pool (must match embedding model output) */ dims: number } -/** - * All knowledge pools configuration for Cloudflare Vectorize - */ export type KnowledgePoolsConfig = Record -/** A single vector match returned by a Vectorize query */ -export interface VectorizeMatch { - id: string - score?: number - metadata?: Record -} - -/** Result of a Vectorize query */ -export interface VectorizeQueryResult { - matches: VectorizeMatch[] - count: number -} - -/** Vector to upsert into Vectorize */ -export interface VectorizeVector { - id: string - values: number[] - metadata?: Record -} - -/** - * Cloudflare Vectorize binding interface. - * Mirrors the subset of the Vectorize API we use. - * For the full type, install `@cloudflare/workers-types`. - */ -export interface CloudflareVectorizeBinding { - query(vector: number[], options?: { - topK?: number - returnMetadata?: boolean | 'indexed' | 'all' - filter?: Record - /** Vectorize metadata filtering */ - where?: Record - }): Promise - upsert(vectors: VectorizeVector[]): Promise - deleteByIds(ids: string[]): Promise -} +/** @deprecated Use the official `Vectorize` type from `@cloudflare/workers-types`. 
*/ +export type CloudflareVectorizeBinding = Vectorize From 3319f85687e580f7da5ded51e554ed5be1130004 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 09:08:40 +0700 Subject: [PATCH 03/17] feat: add EmbeddingRecord type and findByIds/findEmbeddingsByIds signatures --- src/index.ts | 1 + src/types.ts | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/index.ts b/src/index.ts index bf3ac4e..1fba1ed 100644 --- a/src/index.ts +++ b/src/index.ts @@ -76,6 +76,7 @@ export type { // For adapters VectorSearchResult, + EmbeddingRecord, } from './types.js' export { getVectorizedPayload } from './types.js' diff --git a/src/types.ts b/src/types.ts index e54611d..0701068 100644 --- a/src/types.ts +++ b/src/types.ts @@ -57,6 +57,10 @@ export type VectorizedPayload = { _isBulkEmbedEnabled: (knowledgePool: KnowledgePoolName) => boolean getDbAdapterCustom: () => Record | undefined search: (params: VectorSearchQuery) => Promise> + findEmbeddingsByIds: (params: { + knowledgePool: KnowledgePoolName + ids: string[] + }) => Promise> queueEmbed: ( params: | { @@ -322,6 +326,17 @@ export interface VectorSearchResult { [key: string]: any // Extension fields and other dynamic fields } +export interface EmbeddingRecord { + id: string + sourceCollection: string + docId: string + chunkIndex: number + chunkText: string + embeddingVersion: string + embedding: number[] + [key: string]: any +} + export interface VectorSearchQuery { /** The knowledge pool to search in */ knowledgePool: KnowledgePoolName @@ -430,4 +445,9 @@ export type DbAdapter = { limit?: number, where?: Where, ) => Promise> + findByIds: ( + payload: BasePayload, + poolName: KnowledgePoolName, + ids: string[], + ) => Promise> } From 4357d1d83ab3cd22f19f815cf74b55ae6f62475d Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 10:45:44 +0700 Subject: [PATCH 04/17] feat: wire findEmbeddingsByIds 
public method with mock-adapter coverage --- dev/helpers/mockAdapter.ts | 35 +++++++++++++++++- dev/specs/vectorizedPayload.spec.ts | 57 +++++++++++++++++++++++++++++ src/index.ts | 4 ++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/dev/helpers/mockAdapter.ts b/dev/helpers/mockAdapter.ts index dacad6f..932a1bd 100644 --- a/dev/helpers/mockAdapter.ts +++ b/dev/helpers/mockAdapter.ts @@ -1,4 +1,4 @@ -import type { DbAdapter, KnowledgePoolName, KnowledgePoolDynamicConfig, StoreChunkData, VectorSearchResult } from 'payloadcms-vectorize' +import type { DbAdapter, EmbeddingRecord, KnowledgePoolName, KnowledgePoolDynamicConfig, StoreChunkData, VectorSearchResult } from 'payloadcms-vectorize' import { createEmbeddingsCollection } from 'payloadcms-vectorize' import type { CollectionSlug, Payload, BasePayload, Where, Config } from 'payload' @@ -195,6 +195,39 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = .slice(0, limit) .map(({ _score, ...rest }) => rest) }, + + findByIds: async ( + payload: BasePayload, + poolName: KnowledgePoolName, + ids: string[], + ): Promise => { + const records: EmbeddingRecord[] = [] + for (const id of ids) { + const stored = storage.get(`${poolName}:${id}`) + if (!stored) continue + try { + const doc = await payload.findByID({ + collection: poolName as CollectionSlug, + id: stored.id, + }) + if (!doc) continue + const { + id: _id, + createdAt: _createdAt, + updatedAt: _updatedAt, + embedding: _embedding, + ...docFields + } = doc as any + records.push({ + id: stored.id, + embedding: stored.embedding, + ...docFields, + } as EmbeddingRecord) + } catch (_e) { + } + } + return records + }, } } diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 65c40ba..7895bd5 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -200,6 +200,63 @@ describe('VectorizedPayload', () => { }) }) + describe('findEmbeddingsByIds method', () => 
{ + let embeddingId: string + + beforeAll(async () => { + const post = await payload.create({ + collection: 'posts', + data: { title: 'FindByIds seed', content: markdownContent as unknown as any }, + }) + await waitForVectorizationJobs(payload) + const rows = await payload.find({ + collection: 'default' as any, + where: { docId: { equals: String(post.id) } }, + limit: 1, + }) + embeddingId = String(rows.docs[0].id) + }) + + test('payload has findEmbeddingsByIds method', () => { + const vectorizedPayload = getVectorizedPayload(payload) + expect(typeof vectorizedPayload!.findEmbeddingsByIds).toBe('function') + }) + + test('returns the full EmbeddingRecord including the embedding vector', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! + const records = await vectorizedPayload.findEmbeddingsByIds({ + knowledgePool: 'default', + ids: [embeddingId], + }) + expect(records).toHaveLength(1) + const [record] = records + expect(record.id).toBe(embeddingId) + expect(Array.isArray(record.embedding)).toBe(true) + expect(record.embedding.length).toBe(DIMS) + expect(typeof record.sourceCollection).toBe('string') + expect(typeof record.chunkText).toBe('string') + }) + + test('drops unknown ids (result length < ids length)', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! + const records = await vectorizedPayload.findEmbeddingsByIds({ + knowledgePool: 'default', + ids: [embeddingId, 'definitely-not-an-id-999999'], + }) + expect(records).toHaveLength(1) + expect(records[0].id).toBe(embeddingId) + }) + + test('empty ids returns []', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! 
+ const records = await vectorizedPayload.findEmbeddingsByIds({ + knowledgePool: 'default', + ids: [], + }) + expect(records).toEqual([]) + }) + }) + describe('queueEmbed method', () => { test('payload has queueEmbed method', () => { const vectorizedPayload = getVectorizedPayload(payload) diff --git a/src/index.ts b/src/index.ts index 1fba1ed..6e77714 100644 --- a/src/index.ts +++ b/src/index.ts @@ -357,6 +357,10 @@ export default (pluginOptions: PayloadcmsVectorizeConfig) => params.limit, params.where, ), + findEmbeddingsByIds: (params: { knowledgePool: KnowledgePoolName; ids: string[] }) => { + if (params.ids.length === 0) return Promise.resolve([]) + return pluginOptions.dbAdapter.findByIds(payload, params.knowledgePool, params.ids) + }, queueEmbed: async ( params: | { From a5e95e53b1109fa066e7c7cc0d81425978b157be Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:34:27 +0700 Subject: [PATCH 05/17] feat(pg): implement findByIds read primitive --- adapters/pg/dev/specs/findByIds.spec.ts | 105 +++++++++++++++++++++++ adapters/pg/src/findByIds.ts | 106 ++++++++++++++++++++++++ adapters/pg/src/index.ts | 2 + 3 files changed, 213 insertions(+) create mode 100644 adapters/pg/dev/specs/findByIds.spec.ts create mode 100644 adapters/pg/src/findByIds.ts diff --git a/adapters/pg/dev/specs/findByIds.spec.ts b/adapters/pg/dev/specs/findByIds.spec.ts new file mode 100644 index 0000000..c1c2323 --- /dev/null +++ b/adapters/pg/dev/specs/findByIds.spec.ts @@ -0,0 +1,105 @@ +import type { Payload } from 'payload' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { buildDummyConfig, integration, plugin, DIMS } from './constants.js' +import { createTestDb, destroyPayload, waitForVectorizationJobs } from './utils.js' +import { getPayload } from 'payload' +import { chunkText } from '@shared-test/helpers/chunkers' +import { 
makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' + +describe('pg findByIds', () => { + let payload: Payload + const dbName = 'pg_find_by_ids_test' + let embeddingId: string + + beforeAll(async () => { + await createTestDb({ dbName }) + const config = await buildDummyConfig({ + jobs: { tasks: [], autoRun: [{ cron: '*/5 * * * * *', limit: 10 }] }, + collections: [ + { slug: 'posts', fields: [ + { name: 'title', type: 'text' }, + { name: 'category', type: 'text' }, + ] }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}` }, + }), + plugins: [ + plugin({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => { + const chunks: Array<{ chunk: string; category?: string }> = [] + if (doc.title) { + for (const chunk of chunkText(doc.title)) { + chunks.push({ chunk, category: doc.category || 'general' }) + } + } + return chunks + }, + }, + }, + extensionFields: [{ name: 'category', type: 'text' }], + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + }) + payload = await getPayload({ config, key: `pg-find-by-ids-${Date.now()}`, cron: true }) + + const post = await payload.create({ + collection: 'posts', + data: { title: 'Find me by id', category: 'science' }, + }) + await waitForVectorizationJobs(payload) + const rows = await payload.find({ + collection: 'default' as any, + where: { docId: { equals: String(post.id) } }, + limit: 1, + }) + embeddingId = String(rows.docs[0].id) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + test('returns full EmbeddingRecord including numeric embedding array', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId]) + 
expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(embeddingId) + expect(Array.isArray(r.embedding)).toBe(true) + expect(r.embedding.length).toBe(DIMS) + expect(r.embedding.every((n) => typeof n === 'number')).toBe(true) + expect(r.sourceCollection).toBe('posts') + expect(typeof r.chunkText).toBe('string') + expect(r.embeddingVersion).toBe(testEmbeddingVersion) + }) + + test('includes extension fields when the pool defines them', async () => { + const [r] = await integration.adapter.findByIds(payload, 'default', [embeddingId]) + expect((r as any).category).toBe('science') + }) + + test('drops misses', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId, '999999']) + expect(records).toHaveLength(1) + expect(records[0].id).toBe(embeddingId) + }) + + test('empty ids returns []', async () => { + const records = await integration.adapter.findByIds(payload, 'default', []) + expect(records).toEqual([]) + }) +}) diff --git a/adapters/pg/src/findByIds.ts b/adapters/pg/src/findByIds.ts new file mode 100644 index 0000000..76a9e9d --- /dev/null +++ b/adapters/pg/src/findByIds.ts @@ -0,0 +1,106 @@ +import { inArray } from '@payloadcms/db-postgres/drizzle' +import { BasePayload, SanitizedCollectionConfig } from 'payload' +import { KnowledgePoolName, EmbeddingRecord } from 'payloadcms-vectorize' +import toSnakeCase from 'to-snake-case' +import { getEmbeddingsTable } from './drizzle.js' + +export default async ( + payload: BasePayload, + poolName: KnowledgePoolName, + ids: string[], +): Promise> => { + if (ids.length === 0) return [] + + const isPostgres = payload.db?.pool?.query || payload.db?.drizzle + if (!isPostgres) { + throw new Error('[@payloadcms-vectorize/pg] Only works with Postgres') + } + const drizzle = payload.db?.drizzle + if (!drizzle) { + throw new Error('[@payloadcms-vectorize/pg] Drizzle instance not found in adapter') + } + + const collectionConfig = payload.collections[poolName]?.config + 
if (!collectionConfig) { + throw new Error(`[@payloadcms-vectorize/pg] Collection ${poolName} not found`) + } + + const table = getEmbeddingsTable(poolName) + if (!table) { + throw new Error( + `[@payloadcms-vectorize/pg] Embeddings table for knowledge pool "${poolName}" not registered.`, + ) + } + + const numericIds = ids.filter((id) => /^\d+$/.test(id)).map(Number) + if (numericIds.length === 0) return [] + + const selectObj: Record = { + id: table.id, + embedding: table.embedding, + } + for (const field of collectionConfig.fields ?? []) { + if (typeof field === 'object' && 'name' in field) { + const name = field.name as string + if (name in table) { + selectObj[name] = table[name] + } else if (toSnakeCase(name) in table) { + selectObj[name] = table[toSnakeCase(name)] + } + } + } + + const rows = await drizzle.select(selectObj).from(table).where(inArray(table.id, numericIds)) + return mapRowsToRecords(rows, collectionConfig) +} + +function mapRowsToRecords( + rows: Record[], + collectionConfig: SanitizedCollectionConfig, +): Array { + const numberFields = new Set() + for (const field of collectionConfig.fields) { + if (typeof field === 'object' && 'name' in field && field.type === 'number') { + numberFields.add(field.name) + } + } + + return rows.map((row) => { + const rawDocId = row.docId ?? row.doc_id + const rawChunkIndex = row.chunkIndex ?? row.chunk_index + + const record = { + ...row, + id: String(row.id), + docId: String(rawDocId), + chunkIndex: + typeof rawChunkIndex === 'number' ? 
rawChunkIndex : parseInt(String(rawChunkIndex), 10), + embedding: parseEmbedding(row.embedding), + } as EmbeddingRecord + + for (const fieldName of numberFields) { + const value = record[fieldName] + if (value != null && typeof value !== 'number') { + const parsed = parseFloat(String(value)) + if (!Number.isNaN(parsed)) { + record[fieldName] = parsed + } + } + } + + return record + }) +} + +function parseEmbedding(value: unknown): number[] { + if (Array.isArray(value)) return value as number[] + if (typeof value === 'string') { + return value + .replace(/^\[/, '') + .replace(/\]$/, '') + .split(',') + .filter((s) => s.length > 0) + .map((s) => Number(s)) + } + return [] +} diff --git a/adapters/pg/src/index.ts b/adapters/pg/src/index.ts index ac28c21..22c25c0 100644 --- a/adapters/pg/src/index.ts +++ b/adapters/pg/src/index.ts @@ -9,6 +9,7 @@ import { fileURLToPath } from 'url' import { dirname, resolve } from 'path' import embed from './embed.js' import search from './search.js' +import findByIds from './findByIds.js' export type { KnowledgePoolsConfig as KnowledgePoolConfig } @@ -93,6 +94,7 @@ export const createPostgresVectorIntegration = ( } }, search, + findByIds, storeChunk: async (payload, poolName, data) => { const embeddingArray = Array.isArray(data.embedding) ? 
data.embedding : Array.from(data.embedding) From fd2d9d8a03bfdc4d993dccb8261bab05223be070 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:44:35 +0700 Subject: [PATCH 06/17] feat(mongodb): implement findByIds read primitive --- adapters/mongodb/dev/specs/findByIds.spec.ts | 89 ++++++++++++++++++++ adapters/mongodb/src/findByIds.ts | 59 +++++++++++++ adapters/mongodb/src/index.ts | 3 + 3 files changed, 151 insertions(+) create mode 100644 adapters/mongodb/dev/specs/findByIds.spec.ts create mode 100644 adapters/mongodb/src/findByIds.ts diff --git a/adapters/mongodb/dev/specs/findByIds.spec.ts b/adapters/mongodb/dev/specs/findByIds.spec.ts new file mode 100644 index 0000000..87d04ac --- /dev/null +++ b/adapters/mongodb/dev/specs/findByIds.spec.ts @@ -0,0 +1,89 @@ +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { MongoClient } from 'mongodb' +import type { BasePayload } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { DIMS, MONGO_URI } from './constants.js' +import { buildMongoTestPayload, teardownDbs } from './utils.js' +import { testEmbeddingVersion, makeDummyEmbedDocs, makeDummyEmbedQuery } from '@shared-test/helpers/embed' + +const DB = `mongo_find_by_ids_${Date.now()}` + +describe('mongodb findByIds', () => { + let payload: BasePayload + let adapter: DbAdapter + let embeddingId: string + + beforeAll(async () => { + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: DB, + pools: { default: { dimensions: DIMS, filterableFields: ['category'] } }, + knowledgePools: { + default: { + collections: {}, + extensionFields: [{ name: 'category', type: 'text' }], + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }) + payload = built.payload + adapter = built.adapter + + await adapter.storeChunk(payload, 'default', { + 
sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'find me', + embeddingVersion: testEmbeddingVersion, + embedding: Array(DIMS).fill(0.25), + extensionFields: { category: 'science' }, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const doc = await c.db(`${DB}_vectors`).collection('vectorize_default').findOne({ docId: 'doc-1' }) + embeddingId = String(doc!._id) + await c.close() + }) + + afterAll(async () => { + await teardownDbs(payload, MONGO_URI, DB) + }) + + test('returns full EmbeddingRecord including numeric embedding array', async () => { + const records = await adapter.findByIds(payload, 'default', [embeddingId]) + expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(embeddingId) + expect(Array.isArray(r.embedding)).toBe(true) + expect(r.embedding.length).toBe(DIMS) + expect(r.embedding.every((n) => typeof n === 'number')).toBe(true) + expect(r.sourceCollection).toBe('posts') + expect(r.chunkText).toBe('find me') + expect(r.embeddingVersion).toBe(testEmbeddingVersion) + }) + + test('includes extension fields', async () => { + const [r] = await adapter.findByIds(payload, 'default', [embeddingId]) + expect((r as any).category).toBe('science') + }) + + test('drops misses and invalid ids without throwing', async () => { + const records = await adapter.findByIds(payload, 'default', [ + embeddingId, + '000000000000000000000000', + 'not-an-object-id', + ]) + expect(records).toHaveLength(1) + expect(records[0].id).toBe(embeddingId) + }) + + test('empty ids returns []', async () => { + const records = await adapter.findByIds(payload, 'default', []) + expect(records).toEqual([]) + }) +}) diff --git a/adapters/mongodb/src/findByIds.ts b/adapters/mongodb/src/findByIds.ts new file mode 100644 index 0000000..3704714 --- /dev/null +++ b/adapters/mongodb/src/findByIds.ts @@ -0,0 +1,59 @@ +import type { BasePayload } from 'payload' +import type { EmbeddingRecord } from 'payloadcms-vectorize' +import { 
ObjectId } from 'mongodb' +import { getMongoClient } from './client.js' +import { RESERVED_FIELDS, type ResolvedPoolConfig } from './types.js' + +export interface MongoFindByIdsCtx { + uri: string + dbName: string + pools: Record +} + +const HEX24 = /^[a-f\d]{24}$/i +const RESERVED_AND_META = new Set([...RESERVED_FIELDS, '_id', 'createdAt', 'updatedAt']) + +export async function findByIdsImpl( + ctx: MongoFindByIdsCtx, + _payload: BasePayload, + poolName: string, + ids: string[], +): Promise { + if (ids.length === 0) return [] + + const cfg = ctx.pools[poolName] + if (!cfg) { + throw new Error( + `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}". Configured pools: ${Object.keys(ctx.pools).join(', ')}`, + ) + } + + const objectIds = ids.filter((id) => HEX24.test(id)).map((id) => new ObjectId(id)) + if (objectIds.length === 0) return [] + + const client = await getMongoClient(ctx.uri) + const docs = await client + .db(ctx.dbName) + .collection(cfg.collectionName) + .find({ _id: { $in: objectIds } }) + .toArray() + + return docs.map((doc) => mapDocToRecord(doc as Record)) +} + +function mapDocToRecord(doc: Record): EmbeddingRecord { + const extensionFields = Object.fromEntries( + Object.entries(doc).filter(([k]) => !RESERVED_AND_META.has(k)), + ) + return { + id: String(doc._id), + sourceCollection: String(doc.sourceCollection ?? ''), + docId: String(doc.docId ?? ''), + chunkIndex: + typeof doc.chunkIndex === 'number' ? doc.chunkIndex : Number(doc.chunkIndex ?? 0), + chunkText: String(doc.chunkText ?? ''), + embeddingVersion: String(doc.embeddingVersion ?? ''), + embedding: Array.isArray(doc.embedding) ? 
(doc.embedding as number[]) : [], + ...extensionFields, + } +} diff --git a/adapters/mongodb/src/index.ts b/adapters/mongodb/src/index.ts index 0827547..6681836 100644 --- a/adapters/mongodb/src/index.ts +++ b/adapters/mongodb/src/index.ts @@ -2,6 +2,7 @@ import type { DbAdapter } from 'payloadcms-vectorize' import { getMongoClient } from './client.js' import { storeChunkImpl } from './embed.js' import { searchImpl } from './search.js' +import { findByIdsImpl } from './findByIds.js' import { resolvePoolConfig, type MongoVectorIntegrationConfig, @@ -81,6 +82,8 @@ export const createMongoVectorIntegration = ( search: (payload, queryEmbedding, poolName, limit, where) => searchImpl(ctx, payload, queryEmbedding, poolName, limit, where), + + findByIds: (payload, poolName, ids) => findByIdsImpl(ctx, payload, poolName, ids), } return { adapter } From 1e53fee515e5892ffac737c41bc4aaeb83c8bad1 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 13:29:52 +0700 Subject: [PATCH 07/17] feat(cf): implement findByIds via Vectorize getByIds --- adapters/cf/dev/specs/adapter.spec.ts | 76 +++++++++++++++++++++++++++ adapters/cf/src/findByIds.ts | 44 ++++++++++++++++ adapters/cf/src/index.ts | 3 ++ 3 files changed, 123 insertions(+) create mode 100644 adapters/cf/src/findByIds.ts diff --git a/adapters/cf/dev/specs/adapter.spec.ts b/adapters/cf/dev/specs/adapter.spec.ts index b3ac360..0bdf5cf 100644 --- a/adapters/cf/dev/specs/adapter.spec.ts +++ b/adapters/cf/dev/specs/adapter.spec.ts @@ -61,6 +61,13 @@ function createMockCloudflareBinding() { } }), + getByIds: vi.fn(async (ids: string[]) => { + return ids + .map((id) => storage.get(id)) + .filter((v): v is { id: string; values: number[]; metadata: any } => v !== undefined) + .map((v) => ({ id: v.id, values: v.values, metadata: v.metadata })) + }), + list: vi.fn(async (options: any) => { const vectors = Array.from(storage.values()).map((item) => ({ id: item.id, @@ -419,4 +426,73 @@ 
describe('createCloudflareVectorizeIntegration', () => { }) }) }) + + describe('findByIds', () => { + test('returns full EmbeddingRecord including embedding values', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: DIMS } }, + binding: mockBinding as any, + }) + const mockPayload = createMockPayload(mockBinding) + const embedding = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + + await adapter.storeChunk(mockPayload, 'default', { + sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'find me', + embeddingVersion: 'v1', + embedding, + extensionFields: { category: 'science' }, + }) + + const id = 'default:posts:doc-1:0' + const records = await adapter.findByIds(mockPayload, 'default', [id]) + expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(id) + expect(r.embedding).toEqual(embedding) + expect(r.sourceCollection).toBe('posts') + expect(r.docId).toBe('doc-1') + expect(r.chunkText).toBe('find me') + expect(r.embeddingVersion).toBe('v1') + expect((r as any).category).toBe('science') + }) + + test('drops misses', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: DIMS } }, + binding: mockBinding as any, + }) + const mockPayload = createMockPayload(mockBinding) + await adapter.storeChunk(mockPayload, 'default', { + sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'x', + embeddingVersion: 'v1', + embedding: [0, 0, 0, 0, 0, 0, 0, 0], + extensionFields: {}, + }) + const records = await adapter.findByIds(mockPayload, 'default', [ + 'default:posts:doc-1:0', + 'default:posts:nope:0', + ]) + expect(records).toHaveLength(1) + expect(records[0].id).toBe('default:posts:doc-1:0') + }) + + test('empty ids returns []', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = 
createCloudflareVectorizeIntegration({ + config: { default: { dims: DIMS } }, + binding: mockBinding as any, + }) + const mockPayload = createMockPayload(mockBinding) + const records = await adapter.findByIds(mockPayload, 'default', []) + expect(records).toEqual([]) + }) + }) }) diff --git a/adapters/cf/src/findByIds.ts b/adapters/cf/src/findByIds.ts new file mode 100644 index 0000000..35638f5 --- /dev/null +++ b/adapters/cf/src/findByIds.ts @@ -0,0 +1,44 @@ +import { BasePayload } from 'payload' +import { KnowledgePoolName, EmbeddingRecord } from 'payloadcms-vectorize' +import { getVectorizeBinding } from './types.js' + +const RESERVED_METADATA = ['sourceCollection', 'docId', 'chunkIndex', 'chunkText', 'embeddingVersion'] + +export default async ( + payload: BasePayload, + _poolName: KnowledgePoolName, + ids: string[], +): Promise> => { + if (ids.length === 0) return [] + + const binding = getVectorizeBinding(payload) + + try { + const vectors = await binding.getByIds(ids) + if (!vectors) return [] + + return vectors.map((vector) => { + const metadata = (vector.metadata || {}) as Record + const extensionFields = Object.fromEntries( + Object.entries(metadata).filter(([k]) => !RESERVED_METADATA.includes(k)), + ) + return { + id: vector.id, + sourceCollection: String(metadata.sourceCollection ?? ''), + docId: String(metadata.docId ?? ''), + chunkIndex: + typeof metadata.chunkIndex === 'number' + ? metadata.chunkIndex + : parseInt(String(metadata.chunkIndex ?? '0'), 10), + chunkText: String(metadata.chunkText ?? ''), + embeddingVersion: String(metadata.embeddingVersion ?? ''), + embedding: Array.from(vector.values ?? []), + ...extensionFields, + } + }) + } catch (e) { + const errorMessage = e instanceof Error ? 
e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] findByIds failed: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] findByIds failed: ${errorMessage}`) + } +} diff --git a/adapters/cf/src/index.ts b/adapters/cf/src/index.ts index 8a6f23a..963530f 100644 --- a/adapters/cf/src/index.ts +++ b/adapters/cf/src/index.ts @@ -5,6 +5,7 @@ import type { CloudflareVectorizeBinding, KnowledgePoolsConfig } from './types.j import cfMappingsCollection, { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' import embed from './embed.js' import search from './search.js' +import findByIds from './findByIds.js' /** * Configuration for Cloudflare Vectorize integration @@ -117,6 +118,8 @@ export const createCloudflareVectorizeIntegration = ( } }, + findByIds, + hasEmbeddingVersion: async (payload, poolName, sourceCollection, docId, embeddingVersion) => { const result = await payload.find({ collection: CF_MAPPINGS_SLUG as CollectionSlug, From 2fe55bc91966437feca8ada649d8fbc28e10d9f3 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 13:35:21 +0700 Subject: [PATCH 08/17] docs: document findEmbeddingsByIds and findByIds contract --- README.md | 24 +++++++++++++++++++++++- adapters/README.md | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a5a625..384a33d 100644 --- a/README.md +++ b/README.md @@ -832,7 +832,7 @@ curl -X POST http://localhost:3000/api/vector-retry-failed-batch \ ### Local API -The plugin provides a `getVectorizedPayload(payload)` function which returns a `vectorizedPayload` object exposing `search`, `queueEmbed`, `bulkEmbed`, and `retryFailedBatch` methods. +The plugin provides a `getVectorizedPayload(payload)` function which returns a `vectorizedPayload` object exposing `search`, `findEmbeddingsByIds`, `queueEmbed`, `bulkEmbed`, and `retryFailedBatch` methods. 
#### Getting the Vectorized Payload Object @@ -883,6 +883,28 @@ const results = await vectorizedPayload.search({ }) ``` +#### `vectorizedPayload.findEmbeddingsByIds(params)` + +Fetch stored embedding records by primary key — **including the raw embedding vector**, which the normal search/query API never returns. The `id` of each record is whatever [`search()`](#vectorizedpayloadsearchparams) returns as `result.id`, so a search result round-trips directly. This is the building block for "more like this" flows. + +**Returns:** `Promise<Array<EmbeddingRecord>>` — `EmbeddingRecord` is the search result shape without `score` and with `embedding: number[]`. + +**Example:** + +```typescript +const [record] = await vectorizedPayload.findEmbeddingsByIds({ + knowledgePool: 'mainKnowledgePool', + ids: [''], +}) + +if (record) { + // record.embedding is the raw number[] vector — feed it back into search for "more like this" + console.log(record.embedding.length, record.chunkText) +} +``` + +Misses are dropped (the result may be shorter than `ids`), order is not guaranteed, and an empty `ids` array returns `[]` without touching the backend. + #### `vectorizedPayload.queueEmbed(params)` Manually queue a vectorization job for a document. diff --git a/adapters/README.md b/adapters/README.md index 096a6e4..3b28c84 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -110,6 +110,7 @@ import type { KnowledgePoolDynamicConfig, StoreChunkData, VectorSearchResult, + EmbeddingRecord, } from 'payloadcms-vectorize' export type DbAdapter = { @@ -150,6 +151,12 @@ export type DbAdapter = { limit?: number, where?: Where, ) => Promise<Array<VectorSearchResult>> + + findByIds: ( + payload: BasePayload, + poolName: KnowledgePoolName, + ids: string[], + ) => Promise<Array<EmbeddingRecord>> } ``` @@ -162,6 +169,7 @@ export type DbAdapter = { | `deleteChunks` | After a source document is deleted. | Remove every chunk where `sourceCollection === ... && docId === ...`. Must be safe to call when no chunks exist (no-op, no throw).
| | `hasEmbeddingVersion` | During bulk-embed planning, per candidate document. | Return `true` iff at least one chunk exists with the matching `(sourceCollection, docId, embeddingVersion)` triple. Must filter on **all three** — older `0.7.0` adapters that ignored `embeddingVersion` caused stale embeddings on model bumps. | | `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. | Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. | +| `findByIds` | Per `getVectorizedPayload().findEmbeddingsByIds()` call. | Fetch stored embedding records by primary key, **including the raw `embedding` vector** (which `search` never returns). Look up by the same `id` your `search` returns as `result.id`. Misses are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call; unknown or malformed ids are treated as misses (dropped), not raised as errors. | ### Error contract @@ -286,6 +294,12 @@ export const createYourDbVectorIntegration = ( // Return Array sorted by descending score. return [] }, + + findByIds: async (payload, poolName, ids) => { + // TODO: fetch stored records by primary key, including the raw `embedding` vector. + // Return Array. Unknown ids are misses (drop them, don't throw). + return [] + }, } return { adapter } @@ -361,6 +375,25 @@ export interface VectorSearchResult { /** Any extensionFields persisted via storeChunk must round-trip here. */ [key: string]: any } + +export interface EmbeddingRecord { + /** Embedding record ID — the same value your adapter returns as VectorSearchResult.id. */ + id: string + /** Source collection slug (echoed from StoreChunkData). */ + sourceCollection: string + /** Source document ID (echoed from StoreChunkData). */ + docId: string + /** Chunk index within the source document. 
*/ + chunkIndex: number + /** The original chunk text. */ + chunkText: string + /** Embedding model/version string. */ + embeddingVersion: string + /** The raw embedding vector — never returned by `search`. */ + embedding: number[] + /** Any extensionFields persisted via storeChunk round-trip here. */ + [key: string]: any +} ``` | Field | Required | Notes | @@ -371,6 +404,8 @@ export interface VectorSearchResult { | `chunkText`, `embeddingVersion` | yes | Same. | | `extensionFields.*` | optional | Whatever the user passed in `extensionFields` must be queryable via `where`. | +> `EmbeddingRecord` (returned by `findByIds`) is `VectorSearchResult` without `score` and with the raw `embedding: number[]`. + ## Testing your adapter The dev harness in [`dev/`](../dev) runs the integration suite against any adapter you wire up. To test a new adapter: From ffb58d34c101f65153817c4332ab3d3c225daaaf Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 14:50:38 +0700 Subject: [PATCH 09/17] docs: scope findByIds malformed-id behavior per adapter --- adapters/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adapters/README.md b/adapters/README.md index 3b28c84..1d57eda 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -169,7 +169,7 @@ export type DbAdapter = { | `deleteChunks` | After a source document is deleted. | Remove every chunk where `sourceCollection === ... && docId === ...`. Must be safe to call when no chunks exist (no-op, no throw). | | `hasEmbeddingVersion` | During bulk-embed planning, per candidate document. | Return `true` iff at least one chunk exists with the matching `(sourceCollection, docId, embeddingVersion)` triple. Must filter on **all three** — older `0.7.0` adapters that ignored `embeddingVersion` caused stale embeddings on model bumps. | | `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. 
| Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. | -| `findByIds` | Per `getVectorizedPayload().findEmbeddingsByIds()` call. | Fetch stored embedding records by primary key, **including the raw `embedding` vector** (which `search` never returns). Look up by the same `id` your `search` returns as `result.id`. Misses are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call; unknown or malformed ids are treated as misses (dropped), not raised as errors. | +| `findByIds` | Per `getVectorizedPayload().findEmbeddingsByIds()` call. | Fetch stored embedding records by primary key, **including the raw `embedding` vector** (which `search` never returns). Look up by the same `id` your `search` returns as `result.id`. Unknown ids are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call. Adapters with a strict id format (pg integer PK, MongoDB `ObjectId`) also drop *malformed* ids as misses without erroring; adapters keyed on an opaque id (CF's composite vector id) forward ids to the backend as-is, so a backend that rejects a malformed id may surface that error. 
| ### Error contract From fb98908ad183fdd5dbbc15169a17f305a85c971d Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:22:04 +0700 Subject: [PATCH 10/17] docs(cf): restore JSDoc on retained types after binding swap --- adapters/cf/src/types.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/adapters/cf/src/types.ts b/adapters/cf/src/types.ts index e10e441..a0fd6c7 100644 --- a/adapters/cf/src/types.ts +++ b/adapters/cf/src/types.ts @@ -2,6 +2,10 @@ import type { BasePayload } from 'payload' import { getVectorizedPayload } from 'payloadcms-vectorize' +/** + * Retrieve the Cloudflare Vectorize binding from a Payload instance. + * Throws if the binding is not found. + */ export function getVectorizeBinding(payload: BasePayload): Vectorize { const binding = getVectorizedPayload(payload)?.getDbAdapterCustom() ?._vectorizeBinding as Vectorize | undefined @@ -11,10 +15,17 @@ export function getVectorizeBinding(payload: BasePayload): Vectorize { return binding } +/** + * Configuration for a knowledge pool in Cloudflare Vectorize + */ export interface CloudflareVectorizePoolConfig { + /** Vector dimensions for this pool (must match embedding model output) */ dims: number } +/** + * All knowledge pools configuration for Cloudflare Vectorize + */ export type KnowledgePoolsConfig = Record /** @deprecated Use the official `Vectorize` type from `@cloudflare/workers-types`. */ From 39c17dc372a291711519aaca9d30c943acf958a3 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:49:31 +0700 Subject: [PATCH 11/17] build: stop root tsconfig from typechecking adapter sources The CF adapter's '/// <reference types="@cloudflare/workers-types" />' pulls in Workers ambient globals that redefine Request/Response (where .json() returns unknown, not the DOM's any).
The root tsconfig included ./adapters/*/src/**, so 'tsc --noEmit' (build:types:all) leaked those globals into core endpoint/admin code and failed typecheck. Adapters are already typechecked independently via their own tsconfig.build.json in the CI build job, so root coverage of adapter sources was redundant. --- tsconfig.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsconfig.json b/tsconfig.json index 8c2fdbc..2622313 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -22,5 +22,5 @@ } ] }, - "include": ["./src/**/*.ts", "./src/**/*.tsx", "./adapters/*/src/**/*.ts", "./dev/next-env.d.ts"] + "include": ["./src/**/*.ts", "./src/**/*.tsx", "./dev/next-env.d.ts"] } From 963a5d1c47ac66de9df7becde477649dfd5f242e Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Wed, 3 Jun 2026 19:59:25 +0700 Subject: [PATCH 12/17] refactor(cf): narrow Vectorize binding to the methods the adapter uses Depend on Pick via a named VectorizeBinding type instead of the full 8-method Vectorize contract. env.VECTORIZE remains assignable; CloudflareVectorizeBinding is kept as a deprecated alias for back-compat. 
--- adapters/cf/src/index.ts | 6 +++--- adapters/cf/src/types.ts | 13 +++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/adapters/cf/src/index.ts b/adapters/cf/src/index.ts index dd78a3e..f3a51bb 100644 --- a/adapters/cf/src/index.ts +++ b/adapters/cf/src/index.ts @@ -1,7 +1,7 @@ import type { CollectionSlug } from 'payload' import type { DbAdapter } from 'payloadcms-vectorize' import { getVectorizeBinding } from './types.js' -import type { CloudflareVectorizeBinding, KnowledgePoolsConfig } from './types.js' +import type { CloudflareVectorizeBinding, KnowledgePoolsConfig, VectorizeBinding } from './types.js' import cfMappingsCollection, { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' import embed from './embed.js' import search from './search.js' @@ -13,7 +13,7 @@ interface CloudflareVectorizeConfig { /** Knowledge pools configuration with their dimensions */ config: KnowledgePoolsConfig /** Cloudflare Vectorize binding for vector storage */ - binding: CloudflareVectorizeBinding + binding: VectorizeBinding } /** @@ -134,5 +134,5 @@ export const createCloudflareVectorizeIntegration = ( } export { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' -export type { CloudflareVectorizeBinding, KnowledgePoolsConfig } +export type { CloudflareVectorizeBinding, KnowledgePoolsConfig, VectorizeBinding } export type { KnowledgePoolsConfig as KnowledgePoolConfig } diff --git a/adapters/cf/src/types.ts b/adapters/cf/src/types.ts index a0fd6c7..1cecab2 100644 --- a/adapters/cf/src/types.ts +++ b/adapters/cf/src/types.ts @@ -2,13 +2,18 @@ import type { BasePayload } from 'payload' import { getVectorizedPayload } from 'payloadcms-vectorize' +/** + * The subset of the Cloudflare `Vectorize` binding used by this adapter. + */ +export type VectorizeBinding = Pick + /** * Retrieve the Cloudflare Vectorize binding from a Payload instance. * Throws if the binding is not found. 
*/ -export function getVectorizeBinding(payload: BasePayload): Vectorize { +export function getVectorizeBinding(payload: BasePayload): VectorizeBinding { const binding = getVectorizedPayload(payload)?.getDbAdapterCustom() - ?._vectorizeBinding as Vectorize | undefined + ?._vectorizeBinding as VectorizeBinding | undefined if (!binding) { throw new Error('[@payloadcms-vectorize/cf] Cloudflare Vectorize binding not found') } @@ -28,5 +33,5 @@ export interface CloudflareVectorizePoolConfig { */ export type KnowledgePoolsConfig = Record -/** @deprecated Use the official `Vectorize` type from `@cloudflare/workers-types`. */ -export type CloudflareVectorizeBinding = Vectorize +/** @deprecated Use {@link VectorizeBinding}. */ +export type CloudflareVectorizeBinding = VectorizeBinding From ba5e92f7a708a6fd8f73ccf29b5b912a88eb6a6b Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 6 Jun 2026 17:52:31 +0700 Subject: [PATCH 13/17] feat(find-by-ids): rename to findByIds with opt-in populateEmbedding Rename `payload.findEmbeddingsByIds` -> `payload.findByIds` and add an opt-in `populateEmbedding?: boolean` (default false). `EmbeddingRecord.embedding` is now optional and only returned when populateEmbedding is true. Each backend honors the flag at the source where possible: pg skips selecting the embedding column, mongodb uses { projection: { embedding: 0 } }, and CF strips values post-fetch (getByIds always returns them). DbAdapter.findByIds gains the populateEmbedding param; the shared mock and adapters README follow. Specs split into a populateEmbedding:true case (keeps the full-vector assertions) plus a default-omits-embedding case. 
--- adapters/README.md | 10 +++--- adapters/cf/dev/specs/adapter.spec.ts | 32 ++++++++++++++++++-- adapters/cf/src/findByIds.ts | 3 +- adapters/mongodb/dev/specs/findByIds.spec.ts | 18 ++++++++--- adapters/mongodb/src/findByIds.ts | 14 ++++++--- adapters/mongodb/src/index.ts | 3 +- adapters/pg/dev/specs/findByIds.spec.ts | 17 ++++++++--- adapters/pg/src/findByIds.ts | 10 ++++-- dev/helpers/mockAdapter.ts | 3 +- dev/specs/vectorizedPayload.spec.ts | 31 ++++++++++++++----- src/index.ts | 13 ++++++-- src/types.ts | 6 ++-- 12 files changed, 124 insertions(+), 36 deletions(-) diff --git a/adapters/README.md b/adapters/README.md index 1d57eda..a600310 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -156,6 +156,7 @@ export type DbAdapter = { payload: BasePayload, poolName: KnowledgePoolName, ids: string[], + populateEmbedding?: boolean, ) => Promise> } ``` @@ -169,7 +170,7 @@ export type DbAdapter = { | `deleteChunks` | After a source document is deleted. | Remove every chunk where `sourceCollection === ... && docId === ...`. Must be safe to call when no chunks exist (no-op, no throw). | | `hasEmbeddingVersion` | During bulk-embed planning, per candidate document. | Return `true` iff at least one chunk exists with the matching `(sourceCollection, docId, embeddingVersion)` triple. Must filter on **all three** — older `0.7.0` adapters that ignored `embeddingVersion` caused stale embeddings on model bumps. | | `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. | Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. | -| `findByIds` | Per `getVectorizedPayload().findEmbeddingsByIds()` call. | Fetch stored embedding records by primary key, **including the raw `embedding` vector** (which `search` never returns). Look up by the same `id` your `search` returns as `result.id`. 
Unknown ids are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call. Adapters with a strict id format (pg integer PK, MongoDB `ObjectId`) also drop *malformed* ids as misses without erroring; adapters keyed on an opaque id (CF's composite vector id) forward ids to the backend as-is, so a backend that rejects a malformed id may surface that error. | +| `findByIds` | Per `getVectorizedPayload().findByIds()` call. | Fetch stored embedding records by primary key. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: `{ projection: { embedding: 0 } }`); CF's `getByIds` always returns values, so omit them post-fetch. Look up by the same `id` your `search` returns as `result.id`. Unknown ids are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call. Adapters with a strict id format (pg integer PK, MongoDB `ObjectId`) also drop *malformed* ids as misses without erroring; adapters keyed on an opaque id (CF's composite vector id) forward ids to the backend as-is, so a backend that rejects a malformed id may surface that error. | ### Error contract @@ -295,8 +296,9 @@ export const createYourDbVectorIntegration = ( return [] }, - findByIds: async (payload, poolName, ids) => { - // TODO: fetch stored records by primary key, including the raw `embedding` vector. + findByIds: async (payload, poolName, ids, populateEmbedding = false) => { + // TODO: fetch stored records by primary key. Include the raw `embedding` vector + // only when `populateEmbedding` is true (default false); skip reading it otherwise. // Return Array. Unknown ids are misses (drop them, don't throw). 
return [] }, @@ -404,7 +406,7 @@ export interface EmbeddingRecord { | `chunkText`, `embeddingVersion` | yes | Same. | | `extensionFields.*` | optional | Whatever the user passed in `extensionFields` must be queryable via `where`. | -> `EmbeddingRecord` (returned by `findByIds`) is `VectorSearchResult` without `score` and with the raw `embedding: number[]`. +> `EmbeddingRecord` (returned by `findByIds`) is `VectorSearchResult` without `score` and with an optional raw `embedding?: number[]` — present only when `findByIds` is called with `populateEmbedding: true`. ## Testing your adapter diff --git a/adapters/cf/dev/specs/adapter.spec.ts b/adapters/cf/dev/specs/adapter.spec.ts index 32091d0..dac497f 100644 --- a/adapters/cf/dev/specs/adapter.spec.ts +++ b/adapters/cf/dev/specs/adapter.spec.ts @@ -440,7 +440,7 @@ describe('createCloudflareVectorizeIntegration', () => { }) describe('findByIds', () => { - test('returns full EmbeddingRecord including embedding values', async () => { + test('returns full EmbeddingRecord including embedding values when populateEmbedding is true', async () => { const mockBinding = createMockCloudflareBinding() const { adapter } = createCloudflareVectorizeIntegration({ config: { default: { dims: DIMS } }, @@ -460,7 +460,7 @@ describe('createCloudflareVectorizeIntegration', () => { }) const id = 'default:posts:doc-1:0' - const records = await adapter.findByIds(mockPayload, 'default', [id]) + const records = await adapter.findByIds(mockPayload, 'default', [id], true) expect(records).toHaveLength(1) const [r] = records expect(r.id).toBe(id) @@ -472,6 +472,34 @@ describe('createCloudflareVectorizeIntegration', () => { expect((r as any).category).toBe('science') }) + test('omits embedding values by default', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: DIMS } }, + binding: mockBinding as any, + }) + const mockPayload = 
createMockPayload(mockBinding) + + await adapter.storeChunk(mockPayload, 'default', { + sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'find me', + embeddingVersion: 'v1', + embedding: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + extensionFields: { category: 'science' }, + }) + + const id = 'default:posts:doc-1:0' + const records = await adapter.findByIds(mockPayload, 'default', [id]) + expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(id) + expect(r.embedding).toBeUndefined() + expect(r.chunkText).toBe('find me') + expect((r as any).category).toBe('science') + }) + test('drops misses', async () => { const mockBinding = createMockCloudflareBinding() const { adapter } = createCloudflareVectorizeIntegration({ diff --git a/adapters/cf/src/findByIds.ts b/adapters/cf/src/findByIds.ts index 35638f5..de7cf62 100644 --- a/adapters/cf/src/findByIds.ts +++ b/adapters/cf/src/findByIds.ts @@ -8,6 +8,7 @@ export default async ( payload: BasePayload, _poolName: KnowledgePoolName, ids: string[], + populateEmbedding = false, ): Promise> => { if (ids.length === 0) return [] @@ -32,7 +33,7 @@ export default async ( : parseInt(String(metadata.chunkIndex ?? '0'), 10), chunkText: String(metadata.chunkText ?? ''), embeddingVersion: String(metadata.embeddingVersion ?? ''), - embedding: Array.from(vector.values ?? []), + ...(populateEmbedding ? { embedding: Array.from(vector.values ?? 
[]) } : {}), ...extensionFields, } }) diff --git a/adapters/mongodb/dev/specs/findByIds.spec.ts b/adapters/mongodb/dev/specs/findByIds.spec.ts index 87d04ac..7cea989 100644 --- a/adapters/mongodb/dev/specs/findByIds.spec.ts +++ b/adapters/mongodb/dev/specs/findByIds.spec.ts @@ -54,19 +54,29 @@ describe('mongodb findByIds', () => { await teardownDbs(payload, MONGO_URI, DB) }) - test('returns full EmbeddingRecord including numeric embedding array', async () => { - const records = await adapter.findByIds(payload, 'default', [embeddingId]) + test('returns full EmbeddingRecord including numeric embedding array when populateEmbedding is true', async () => { + const records = await adapter.findByIds(payload, 'default', [embeddingId], true) expect(records).toHaveLength(1) const [r] = records expect(r.id).toBe(embeddingId) expect(Array.isArray(r.embedding)).toBe(true) - expect(r.embedding.length).toBe(DIMS) - expect(r.embedding.every((n) => typeof n === 'number')).toBe(true) + expect(r.embedding!.length).toBe(DIMS) + expect(r.embedding!.every((n) => typeof n === 'number')).toBe(true) expect(r.sourceCollection).toBe('posts') expect(r.chunkText).toBe('find me') expect(r.embeddingVersion).toBe(testEmbeddingVersion) }) + test('omits the embedding array by default', async () => { + const records = await adapter.findByIds(payload, 'default', [embeddingId]) + expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(embeddingId) + expect(r.embedding).toBeUndefined() + expect(r.sourceCollection).toBe('posts') + expect(r.chunkText).toBe('find me') + }) + test('includes extension fields', async () => { const [r] = await adapter.findByIds(payload, 'default', [embeddingId]) expect((r as any).category).toBe('science') diff --git a/adapters/mongodb/src/findByIds.ts b/adapters/mongodb/src/findByIds.ts index 3704714..d8b3b03 100644 --- a/adapters/mongodb/src/findByIds.ts +++ b/adapters/mongodb/src/findByIds.ts @@ -18,6 +18,7 @@ export async function findByIdsImpl( _payload: 
BasePayload, poolName: string, ids: string[], + populateEmbedding = false, ): Promise { if (ids.length === 0) return [] @@ -35,13 +36,16 @@ export async function findByIdsImpl( const docs = await client .db(ctx.dbName) .collection(cfg.collectionName) - .find({ _id: { $in: objectIds } }) + .find({ _id: { $in: objectIds } }, populateEmbedding ? {} : { projection: { embedding: 0 } }) .toArray() - return docs.map((doc) => mapDocToRecord(doc as Record)) + return docs.map((doc) => mapDocToRecord(doc as Record, populateEmbedding)) } -function mapDocToRecord(doc: Record): EmbeddingRecord { +function mapDocToRecord( + doc: Record, + populateEmbedding: boolean, +): EmbeddingRecord { const extensionFields = Object.fromEntries( Object.entries(doc).filter(([k]) => !RESERVED_AND_META.has(k)), ) @@ -53,7 +57,9 @@ function mapDocToRecord(doc: Record): EmbeddingRecord { typeof doc.chunkIndex === 'number' ? doc.chunkIndex : Number(doc.chunkIndex ?? 0), chunkText: String(doc.chunkText ?? ''), embeddingVersion: String(doc.embeddingVersion ?? ''), - embedding: Array.isArray(doc.embedding) ? (doc.embedding as number[]) : [], + ...(populateEmbedding + ? { embedding: Array.isArray(doc.embedding) ? 
(doc.embedding as number[]) : [] } + : {}), ...extensionFields, } } diff --git a/adapters/mongodb/src/index.ts b/adapters/mongodb/src/index.ts index b1c0914..5a0d2d2 100644 --- a/adapters/mongodb/src/index.ts +++ b/adapters/mongodb/src/index.ts @@ -91,7 +91,8 @@ export const createMongoVectorIntegration = ( search: (payload, queryEmbedding, poolName, limit, where) => searchImpl(getCtx(), payload, queryEmbedding, poolName, limit, where), - findByIds: (payload, poolName, ids) => findByIdsImpl(getCtx(), payload, poolName, ids), + findByIds: (payload, poolName, ids, populateEmbedding) => + findByIdsImpl(getCtx(), payload, poolName, ids, populateEmbedding), } return { adapter } diff --git a/adapters/pg/dev/specs/findByIds.spec.ts b/adapters/pg/dev/specs/findByIds.spec.ts index c1c2323..eeb5cf8 100644 --- a/adapters/pg/dev/specs/findByIds.spec.ts +++ b/adapters/pg/dev/specs/findByIds.spec.ts @@ -74,19 +74,28 @@ describe('pg findByIds', () => { await destroyPayload(payload) }) - test('returns full EmbeddingRecord including numeric embedding array', async () => { - const records = await integration.adapter.findByIds(payload, 'default', [embeddingId]) + test('returns full EmbeddingRecord including numeric embedding array when populateEmbedding is true', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId], true) expect(records).toHaveLength(1) const [r] = records expect(r.id).toBe(embeddingId) expect(Array.isArray(r.embedding)).toBe(true) - expect(r.embedding.length).toBe(DIMS) - expect(r.embedding.every((n) => typeof n === 'number')).toBe(true) + expect(r.embedding!.length).toBe(DIMS) + expect(r.embedding!.every((n) => typeof n === 'number')).toBe(true) expect(r.sourceCollection).toBe('posts') expect(typeof r.chunkText).toBe('string') expect(r.embeddingVersion).toBe(testEmbeddingVersion) }) + test('omits the embedding array by default', async () => { + const records = await integration.adapter.findByIds(payload, 'default', 
[embeddingId]) + expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(embeddingId) + expect(r.embedding).toBeUndefined() + expect(r.sourceCollection).toBe('posts') + }) + test('includes extension fields when the pool defines them', async () => { const [r] = await integration.adapter.findByIds(payload, 'default', [embeddingId]) expect((r as any).category).toBe('science') diff --git a/adapters/pg/src/findByIds.ts b/adapters/pg/src/findByIds.ts index 76a9e9d..5c2f4e3 100644 --- a/adapters/pg/src/findByIds.ts +++ b/adapters/pg/src/findByIds.ts @@ -8,6 +8,7 @@ export default async ( payload: BasePayload, poolName: KnowledgePoolName, ids: string[], + populateEmbedding = false, ): Promise> => { if (ids.length === 0) return [] @@ -37,7 +38,9 @@ export default async ( const selectObj: Record = { id: table.id, - embedding: table.embedding, + } + if (populateEmbedding) { + selectObj.embedding = table.embedding } for (const field of collectionConfig.fields ?? []) { if (typeof field === 'object' && 'name' in field) { @@ -51,12 +54,13 @@ export default async ( } const rows = await drizzle.select(selectObj).from(table).where(inArray(table.id, numericIds)) - return mapRowsToRecords(rows, collectionConfig) + return mapRowsToRecords(rows, collectionConfig, populateEmbedding) } function mapRowsToRecords( rows: Record[], collectionConfig: SanitizedCollectionConfig, + populateEmbedding: boolean, ): Array { const numberFields = new Set() for (const field of collectionConfig.fields) { @@ -75,7 +79,7 @@ function mapRowsToRecords( docId: String(rawDocId), chunkIndex: typeof rawChunkIndex === 'number' ? rawChunkIndex : parseInt(String(rawChunkIndex), 10), - embedding: parseEmbedding(row.embedding), + ...(populateEmbedding ? 
{ embedding: parseEmbedding(row.embedding) } : {}), } as EmbeddingRecord for (const fieldName of numberFields) { diff --git a/dev/helpers/mockAdapter.ts b/dev/helpers/mockAdapter.ts index 932a1bd..80053a8 100644 --- a/dev/helpers/mockAdapter.ts +++ b/dev/helpers/mockAdapter.ts @@ -200,6 +200,7 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = payload: BasePayload, poolName: KnowledgePoolName, ids: string[], + populateEmbedding = false, ): Promise => { const records: EmbeddingRecord[] = [] for (const id of ids) { @@ -220,7 +221,7 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = } = doc as any records.push({ id: stored.id, - embedding: stored.embedding, + ...(populateEmbedding ? { embedding: stored.embedding } : {}), ...docFields, } as EmbeddingRecord) } catch (_e) { diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 7895bd5..4f986a8 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -200,7 +200,7 @@ describe('VectorizedPayload', () => { }) }) - describe('findEmbeddingsByIds method', () => { + describe('findByIds method', () => { let embeddingId: string beforeAll(async () => { @@ -217,29 +217,44 @@ describe('VectorizedPayload', () => { embeddingId = String(rows.docs[0].id) }) - test('payload has findEmbeddingsByIds method', () => { + test('payload has findByIds method', () => { const vectorizedPayload = getVectorizedPayload(payload) - expect(typeof vectorizedPayload!.findEmbeddingsByIds).toBe('function') + expect(typeof vectorizedPayload!.findByIds).toBe('function') }) - test('returns the full EmbeddingRecord including the embedding vector', async () => { + test('returns the full EmbeddingRecord including the embedding vector when populateEmbedding is true', async () => { const vectorizedPayload = getVectorizedPayload(payload)! 
- const records = await vectorizedPayload.findEmbeddingsByIds({ + const records = await vectorizedPayload.findByIds({ knowledgePool: 'default', ids: [embeddingId], + populateEmbedding: true, }) expect(records).toHaveLength(1) const [record] = records expect(record.id).toBe(embeddingId) expect(Array.isArray(record.embedding)).toBe(true) - expect(record.embedding.length).toBe(DIMS) + expect(record.embedding!.length).toBe(DIMS) + expect(typeof record.sourceCollection).toBe('string') + expect(typeof record.chunkText).toBe('string') + }) + + test('omits the embedding vector by default', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! + const records = await vectorizedPayload.findByIds({ + knowledgePool: 'default', + ids: [embeddingId], + }) + expect(records).toHaveLength(1) + const [record] = records + expect(record.id).toBe(embeddingId) + expect(record.embedding).toBeUndefined() expect(typeof record.sourceCollection).toBe('string') expect(typeof record.chunkText).toBe('string') }) test('drops unknown ids (result length < ids length)', async () => { const vectorizedPayload = getVectorizedPayload(payload)! - const records = await vectorizedPayload.findEmbeddingsByIds({ + const records = await vectorizedPayload.findByIds({ knowledgePool: 'default', ids: [embeddingId, 'definitely-not-an-id-999999'], }) @@ -249,7 +264,7 @@ describe('VectorizedPayload', () => { test('empty ids returns []', async () => { const vectorizedPayload = getVectorizedPayload(payload)! 
- const records = await vectorizedPayload.findEmbeddingsByIds({ + const records = await vectorizedPayload.findByIds({ knowledgePool: 'default', ids: [], }) diff --git a/src/index.ts b/src/index.ts index 6e77714..45a961e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -357,9 +357,18 @@ export default (pluginOptions: PayloadcmsVectorizeConfig) => params.limit, params.where, ), - findEmbeddingsByIds: (params: { knowledgePool: KnowledgePoolName; ids: string[] }) => { + findByIds: (params: { + knowledgePool: KnowledgePoolName + ids: string[] + populateEmbedding?: boolean + }) => { if (params.ids.length === 0) return Promise.resolve([]) - return pluginOptions.dbAdapter.findByIds(payload, params.knowledgePool, params.ids) + return pluginOptions.dbAdapter.findByIds( + payload, + params.knowledgePool, + params.ids, + params.populateEmbedding ?? false, + ) }, queueEmbed: async ( params: diff --git a/src/types.ts b/src/types.ts index 0701068..09f685b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -57,9 +57,10 @@ export type VectorizedPayload = { _isBulkEmbedEnabled: (knowledgePool: KnowledgePoolName) => boolean getDbAdapterCustom: () => Record | undefined search: (params: VectorSearchQuery) => Promise> - findEmbeddingsByIds: (params: { + findByIds: (params: { knowledgePool: KnowledgePoolName ids: string[] + populateEmbedding?: boolean }) => Promise> queueEmbed: ( params: @@ -333,7 +334,7 @@ export interface EmbeddingRecord { chunkIndex: number chunkText: string embeddingVersion: string - embedding: number[] + embedding?: number[] [key: string]: any } @@ -449,5 +450,6 @@ export type DbAdapter = { payload: BasePayload, poolName: KnowledgePoolName, ids: string[], + populateEmbedding?: boolean, ) => Promise> } From e72adf4b8f4d85a015d859600156c9682edc57be Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 6 Jun 2026 18:32:04 +0700 Subject: [PATCH 14/17] docs: align README method name + EmbeddingRecord with shipped findByIds 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root README advertised a `findEmbeddingsByIds` method that doesn't exist — the shipped public method is `findByIds`. Rename the method reference and example, document the `populateEmbedding?` param (default false), and fix the adapters/README EmbeddingRecord interface block to `embedding?: number[]` (optional, present only when populateEmbedding: true). --- README.md | 15 +++++++++------ adapters/README.md | 5 +++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 384a33d..fb4fba5 100644 --- a/README.md +++ b/README.md @@ -832,7 +832,7 @@ curl -X POST http://localhost:3000/api/vector-retry-failed-batch \ ### Local API -The plugin provides a `getVectorizedPayload(payload)` function which returns a `vectorizedPayload` object exposing `search`, `findEmbeddingsByIds`, `queueEmbed`, `bulkEmbed`, and `retryFailedBatch` methods. +The plugin provides a `getVectorizedPayload(payload)` function which returns a `vectorizedPayload` object exposing `search`, `findByIds`, `queueEmbed`, `bulkEmbed`, and `retryFailedBatch` methods. #### Getting the Vectorized Payload Object @@ -883,23 +883,26 @@ const results = await vectorizedPayload.search({ }) ``` -#### `vectorizedPayload.findEmbeddingsByIds(params)` +#### `vectorizedPayload.findByIds(params)` -Fetch stored embedding records by primary key — **including the raw embedding vector**, which the normal search/query API never returns. The `id` of each record is whatever [`search()`](#vectorizedpayloadsearchparams) returns as `result.id`, so a search result round-trips directly. This is the building block for "more like this" flows. +Fetch stored embedding records by primary key. The `id` of each record is whatever [`search()`](#vectorizedpayloadsearchparams) returns as `result.id`, so a search result round-trips directly. 
Pass `populateEmbedding: true` to also get the raw embedding vector back (the normal search/query API never returns it) — the building block for "more like this" flows. It defaults to `false`, so by default you get the record's text and metadata without the heavy vector. -**Returns:** `Promise<Array<EmbeddingRecord>>` — `EmbeddingRecord` is the search result shape without `score` and with `embedding: number[]`. +**Params:** `{ knowledgePool: string; ids: string[]; populateEmbedding?: boolean }` (`populateEmbedding` defaults to `false`). + +**Returns:** `Promise<Array<EmbeddingRecord>>` — `EmbeddingRecord` is the search result shape without `score` and with an optional `embedding?: number[]`, present only when `populateEmbedding: true`. **Example:** ```typescript -const [record] = await vectorizedPayload.findEmbeddingsByIds({ +const [record] = await vectorizedPayload.findByIds({ knowledgePool: 'mainKnowledgePool', ids: [''], + populateEmbedding: true, }) if (record) { // record.embedding is the raw number[] vector — feed it back into search for "more like this" - console.log(record.embedding.length, record.chunkText) + console.log(record.embedding!.length, record.chunkText) } ``` diff --git a/adapters/README.md b/adapters/README.md index a600310..d33a85a 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -391,8 +391,9 @@ export interface EmbeddingRecord { chunkText: string /** Embedding model/version string. */ embeddingVersion: string - /** The raw embedding vector — never returned by `search`. */ - embedding: number[] + /** The raw embedding vector — never returned by `search`, and only present + * when `findByIds` is called with `populateEmbedding: true`. */ + embedding?: number[] /** Any extensionFields persisted via storeChunk round-trip here. 
*/ [key: string]: any } From 4011e91b4d86f41ff8cf393cb97d4fe1b9ca7250 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 6 Jun 2026 21:28:55 +0700 Subject: [PATCH 15/17] fix(pg): support non-integer (uuid) ids in findByIds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit findByIds filtered ids through /^\d+$/ and mapped to Number, hardcoding an integer primary key. The embeddings collection defines no custom id, so under postgresAdapter({ idType: 'uuid' }) every embedding id is a uuid — the filter dropped all of them and findByIds returned [] for ids that exist, while search() round-tripped the same uuids fine. Pass ids straight to inArray; Postgres casts the text params to the column type, supporting both integer and uuid PKs. Well-formed but nonexistent ids are still absent from results; a malformed id now surfaces a backend error rather than being silently dropped (documented in adapters/README). Adds a uuid-idType regression spec. --- adapters/README.md | 2 +- adapters/pg/dev/specs/findByIds.spec.ts | 94 +++++++++++++++++++++++++ adapters/pg/src/findByIds.ts | 5 +- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/adapters/README.md b/adapters/README.md index d33a85a..37467fe 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -170,7 +170,7 @@ export type DbAdapter = { | `deleteChunks` | After a source document is deleted. | Remove every chunk where `sourceCollection === ... && docId === ...`. Must be safe to call when no chunks exist (no-op, no throw). | | `hasEmbeddingVersion` | During bulk-embed planning, per candidate document. | Return `true` iff at least one chunk exists with the matching `(sourceCollection, docId, embeddingVersion)` triple. Must filter on **all three** — older `0.7.0` adapters that ignored `embeddingVersion` caused stale embeddings on model bumps. 
| | `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. | Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. | -| `findByIds` | Per `getVectorizedPayload().findByIds()` call. | Fetch stored embedding records by primary key. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: `{ projection: { embedding: 0 } }`); CF's `getByIds` always returns values, so omit them post-fetch. Look up by the same `id` your `search` returns as `result.id`. Unknown ids are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call. Adapters with a strict id format (pg integer PK, MongoDB `ObjectId`) also drop *malformed* ids as misses without erroring; adapters keyed on an opaque id (CF's composite vector id) forward ids to the backend as-is, so a backend that rejects a malformed id may surface that error. | +| `findByIds` | Per `getVectorizedPayload().findByIds()` call. | Fetch stored embedding records by primary key. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: `{ projection: { embedding: 0 } }`); CF's `getByIds` always returns values, so omit them post-fetch. Look up by the same `id` your `search` returns as `result.id`. Well-formed but nonexistent ids are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call. 
Whether a *malformed* id (wrong shape for the key type) is dropped or surfaces an error is adapter-specific: an adapter that validates the id shape itself (MongoDB drops non-24-hex ids via an `ObjectId` guard) treats it as a miss; an adapter that forwards ids straight to the backend (pg passes them to the `IN` query, supporting both integer and `uuid` PKs; CF forwards its composite vector id) lets the backend reject a malformed id, which may surface as an error. | ### Error contract diff --git a/adapters/pg/dev/specs/findByIds.spec.ts b/adapters/pg/dev/specs/findByIds.spec.ts index eeb5cf8..67a2930 100644 --- a/adapters/pg/dev/specs/findByIds.spec.ts +++ b/adapters/pg/dev/specs/findByIds.spec.ts @@ -112,3 +112,97 @@ describe('pg findByIds', () => { expect(records).toEqual([]) }) }) + +describe('pg findByIds (uuid idType)', () => { + let payload: Payload + const dbName = 'pg_find_by_ids_uuid_test' + let embeddingId: string + + beforeAll(async () => { + await createTestDb({ dbName }) + const config = await buildDummyConfig({ + jobs: { tasks: [], autoRun: [{ cron: '*/5 * * * * *', limit: 10 }] }, + collections: [ + { slug: 'posts', fields: [ + { name: 'title', type: 'text' }, + { name: 'category', type: 'text' }, + ] }, + ], + db: postgresAdapter({ + idType: 'uuid', + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}` }, + }), + plugins: [ + plugin({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => { + const chunks: Array<{ chunk: string; category?: string }> = [] + if (doc.title) { + for (const chunk of chunkText(doc.title)) { + chunks.push({ chunk, category: doc.category || 'general' }) + } + } + return chunks + }, + }, + }, + extensionFields: [{ name: 'category', type: 'text' }], + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: 
makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + }) + payload = await getPayload({ config, key: `pg-find-by-ids-uuid-${Date.now()}`, cron: true }) + + const post = await payload.create({ + collection: 'posts', + data: { title: 'Find me by uuid', category: 'science' }, + }) + await waitForVectorizationJobs(payload) + const rows = await payload.find({ + collection: 'default' as any, + where: { docId: { equals: String(post.id) } }, + limit: 1, + }) + embeddingId = String(rows.docs[0].id) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + test('embedding id is a uuid, not a numeric PK', () => { + expect(embeddingId).toMatch( + /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i, + ) + }) + + test('findByIds resolves a uuid id (regression: numeric-only filter dropped uuids)', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId], true) + expect(records).toHaveLength(1) + const [r] = records + expect(r.id).toBe(embeddingId) + expect(Array.isArray(r.embedding)).toBe(true) + expect(r.embedding!.length).toBe(DIMS) + expect((r as any).category).toBe('science') + }) + + test('drops a well-formed but nonexistent uuid', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [ + embeddingId, + '00000000-0000-0000-0000-000000000000', + ]) + expect(records).toHaveLength(1) + expect(records[0].id).toBe(embeddingId) + }) +}) diff --git a/adapters/pg/src/findByIds.ts b/adapters/pg/src/findByIds.ts index 5c2f4e3..3dc26b4 100644 --- a/adapters/pg/src/findByIds.ts +++ b/adapters/pg/src/findByIds.ts @@ -33,9 +33,6 @@ export default async ( ) } - const numericIds = ids.filter((id) => /^\d+$/.test(id)).map(Number) - if (numericIds.length === 0) return [] - const selectObj: Record = { id: table.id, } @@ -53,7 +50,7 @@ export default async ( } } - const rows = await drizzle.select(selectObj).from(table).where(inArray(table.id, numericIds)) + const rows = await 
drizzle.select(selectObj).from(table).where(inArray(table.id, ids)) return mapRowsToRecords(rows, collectionConfig, populateEmbedding) } From 1a9063d059baeac248fb3a10e608f59a7632415f Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 6 Jun 2026 21:39:11 +0700 Subject: [PATCH 16/17] fix(pg): coerce nullable text fields to "" in findByIds and search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit chunkText and embeddingVersion are not required in the embeddings schema, so a null column was spread through raw as `null`, violating EmbeddingRecord / VectorSearchResult (`chunkText: string`). CF and MongoDB both coerce via String(x ?? '') → '', so identical data round-tripped as '' on those adapters but null on pg; a consumer doing record.chunkText.length crashed only on pg. Coerce sourceCollection/chunkText/embeddingVersion via String(x ?? '') in both mapRowsToRecords (findByIds) and mapRowsToResults (search) so pg matches the declared types and the other adapters. Adds a regression test. 
--- adapters/pg/dev/specs/findByIds.spec.ts | 17 +++++++++++++++++ adapters/pg/src/findByIds.ts | 5 ++++- adapters/pg/src/search.ts | 5 ++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/adapters/pg/dev/specs/findByIds.spec.ts b/adapters/pg/dev/specs/findByIds.spec.ts index 67a2930..beb35c2 100644 --- a/adapters/pg/dev/specs/findByIds.spec.ts +++ b/adapters/pg/dev/specs/findByIds.spec.ts @@ -1,6 +1,8 @@ import type { Payload } from 'payload' import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { postgresAdapter } from '@payloadcms/db-postgres' +import { eq } from '@payloadcms/db-postgres/drizzle' +import { getEmbeddingsTable } from '../../src/drizzle.js' import { buildDummyConfig, integration, plugin, DIMS } from './constants.js' import { createTestDb, destroyPayload, waitForVectorizationJobs } from './utils.js' import { getPayload } from 'payload' @@ -111,6 +113,21 @@ describe('pg findByIds', () => { const records = await integration.adapter.findByIds(payload, 'default', []) expect(records).toEqual([]) }) + + test('coerces null chunkText/embeddingVersion to "" (EmbeddingRecord type)', async () => { + // These columns are not required in the embeddings schema, so a row can have + // nulls. Set them directly and confirm findByIds returns '' (parity with cf/mongo), + // not null — which would violate EmbeddingRecord's `chunkText: string`. + const table = getEmbeddingsTable('default')! 
+ await (payload.db as any).drizzle + .update(table) + .set({ chunkText: null, embeddingVersion: null }) + .where(eq(table.id, Number(embeddingId))) + + const [r] = await integration.adapter.findByIds(payload, 'default', [embeddingId]) + expect(r.chunkText).toBe('') + expect(r.embeddingVersion).toBe('') + }) }) describe('pg findByIds (uuid idType)', () => { diff --git a/adapters/pg/src/findByIds.ts b/adapters/pg/src/findByIds.ts index 3dc26b4..cf64dc3 100644 --- a/adapters/pg/src/findByIds.ts +++ b/adapters/pg/src/findByIds.ts @@ -73,9 +73,12 @@ function mapRowsToRecords( const record = { ...row, id: String(row.id), - docId: String(rawDocId), + sourceCollection: String(row.sourceCollection ?? ''), + docId: String(rawDocId ?? ''), chunkIndex: typeof rawChunkIndex === 'number' ? rawChunkIndex : parseInt(String(rawChunkIndex), 10), + chunkText: String(row.chunkText ?? ''), + embeddingVersion: String(row.embeddingVersion ?? ''), ...(populateEmbedding ? { embedding: parseEmbedding(row.embedding) } : {}), } as EmbeddingRecord diff --git a/adapters/pg/src/search.ts b/adapters/pg/src/search.ts index ce31c56..54dcc20 100644 --- a/adapters/pg/src/search.ts +++ b/adapters/pg/src/search.ts @@ -303,10 +303,13 @@ function mapRowsToResults( const result = { ...row, id: String(row.id), - docId: String(rawDocId), + sourceCollection: String(row.sourceCollection ?? ''), + docId: String(rawDocId ?? ''), score: typeof rawScore === 'number' ? rawScore : parseFloat(String(rawScore)), chunkIndex: typeof rawChunkIndex === 'number' ? rawChunkIndex : parseInt(String(rawChunkIndex), 10), + chunkText: String(row.chunkText ?? ''), + embeddingVersion: String(row.embeddingVersion ?? 
''), } as VectorSearchResult // Ensure any number fields from the schema are numbers in the result From c357d789eabe129e27dcbf3352865d1f4c336ee4 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:50:45 +0700 Subject: [PATCH 17/17] feat(find-by-ids): return a map keyed by id instead of an array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit findByIds now returns Record<string, EmbeddingRecord | undefined> instead of Array<EmbeddingRecord>. Order isn't preserved by any backend and a lookup may miss, so an array forced callers to re-join by id and made misses a silent gap. Keying by the requested id makes the round-trip O(1) (records[searchHit.id]), order irrelevant, and a miss an explicit undefined. Every requested id is a key. Unify the malformed-id contract: unknown AND malformed ids map to undefined, never throw. pg now filters ids that don't match the PK column type (getSQLType: numeric for integer/serial, uuid-shaped for uuid) before the IN query, so a bad id is a miss instead of a cast error that poisoned the batch — matching mongo (non-24-hex drop) and cf (unknown ids absent from getByIds). Stop the mock adapter from swallowing real errors: only Payload NotFound is treated as a miss; everything else rethrows. Docs + specs updated across all adapters; note that key order is not input order (integer-like keys sort first) so callers must look up by id. 
--- README.md | 10 ++-- adapters/README.md | 9 ++-- adapters/cf/dev/specs/adapter.spec.ts | 21 ++++---- adapters/cf/src/findByIds.ts | 15 +++--- adapters/mongodb/dev/specs/findByIds.spec.ts | 26 +++++----- adapters/mongodb/src/findByIds.ts | 14 ++++-- adapters/pg/dev/specs/findByIds.spec.ts | 52 +++++++++++++------- adapters/pg/src/findByIds.ts | 32 ++++++++++-- dev/helpers/mockAdapter.ts | 42 +++++++++------- dev/specs/vectorizedPayload.spec.ts | 21 ++++---- src/index.ts | 2 +- src/types.ts | 4 +- 12 files changed, 159 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index fb4fba5..5499040 100644 --- a/README.md +++ b/README.md @@ -889,24 +889,26 @@ Fetch stored embedding records by primary key. The `id` of each record is whatev **Params:** `{ knowledgePool: string; ids: string[]; populateEmbedding?: boolean }` (`populateEmbedding` defaults to `false`). -**Returns:** `Promise>` — `EmbeddingRecord` is the search result shape without `score` and with an optional `embedding?: number[]`, present only when `populateEmbedding: true`. +**Returns:** `Promise>` — an object keyed by the ids you passed in. Each requested id is present as a key; a found record is the value, and an unknown or malformed id maps to `undefined`. `EmbeddingRecord` is the search result shape without `score` and with an optional `embedding?: number[]`, present only when `populateEmbedding: true`. **Example:** ```typescript -const [record] = await vectorizedPayload.findByIds({ +const id = '' +const records = await vectorizedPayload.findByIds({ knowledgePool: 'mainKnowledgePool', - ids: [''], + ids: [id], populateEmbedding: true, }) +const record = records[id] if (record) { // record.embedding is the raw number[] vector — feed it back into search for "more like this" console.log(record.embedding!.length, record.chunkText) } ``` -Misses are dropped (the result may be shorter than `ids`), order is not guaranteed, and an empty `ids` array returns `[]` without touching the backend. 
+Because the result is keyed by id, a search result round-trips directly (`records[searchHit.id]`) and there's no positional alignment to worry about — look records up by id rather than relying on key order. Unknown or malformed ids map to `undefined` (never throw), and an empty `ids` array returns `{}` without touching the backend. #### `vectorizedPayload.queueEmbed(params)` diff --git a/adapters/README.md b/adapters/README.md index 37467fe..9b9cbac 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -157,7 +157,7 @@ export type DbAdapter = { poolName: KnowledgePoolName, ids: string[], populateEmbedding?: boolean, - ) => Promise> + ) => Promise> } ``` @@ -170,7 +170,7 @@ export type DbAdapter = { | `deleteChunks` | After a source document is deleted. | Remove every chunk where `sourceCollection === ... && docId === ...`. Must be safe to call when no chunks exist (no-op, no throw). | | `hasEmbeddingVersion` | During bulk-embed planning, per candidate document. | Return `true` iff at least one chunk exists with the matching `(sourceCollection, docId, embeddingVersion)` triple. Must filter on **all three** — older `0.7.0` adapters that ignored `embeddingVersion` caused stale embeddings on model bumps. | | `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. | Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. | -| `findByIds` | Per `getVectorizedPayload().findByIds()` call. | Fetch stored embedding records by primary key. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: `{ projection: { embedding: 0 } }`); CF's `getByIds` always returns values, so omit them post-fetch. 
Look up by the same `id` your `search` returns as `result.id`. Well-formed but nonexistent ids are dropped (result length may be `< ids.length`); order is not guaranteed; empty `ids` returns `[]` without a backend call. Whether a *malformed* id (wrong shape for the key type) is dropped or surfaces an error is adapter-specific: an adapter that validates the id shape itself (MongoDB drops non-24-hex ids via an `ObjectId` guard) treats it as a miss; an adapter that forwards ids straight to the backend (pg passes them to the `IN` query, supporting both integer and `uuid` PKs; CF forwards its composite vector id) lets the backend reject a malformed id, which may surface as an error. | +| `findByIds` | Per `getVectorizedPayload().findByIds()` call. | Fetch stored embedding records by primary key. **Return an object keyed by the ids you were given:** every requested id must be present as a key, with a found record as the value and `undefined` for any id that didn't resolve. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: `{ projection: { embedding: 0 } }`); CF's `getByIds` always returns values, so omit them post-fetch. Look up by the same `id` your `search` returns as `result.id`. Unknown **and** malformed ids must map to `undefined` — never throw for a bad id. Validate the id shape against your key type before querying so a malformed id can't error the whole batch (MongoDB drops non-24-hex ids; pg drops ids that don't match the PK column type — numeric for integer PKs, uuid-shaped for `uuid` PKs — before the `IN` query; CF's ids are arbitrary strings, so an unknown one is simply absent from `getByIds`). Empty `ids` returns `{}` without a backend call. 
| ### Error contract @@ -299,8 +299,9 @@ export const createYourDbVectorIntegration = ( findByIds: async (payload, poolName, ids, populateEmbedding = false) => { // TODO: fetch stored records by primary key. Include the raw `embedding` vector // only when `populateEmbedding` is true (default false); skip reading it otherwise. - // Return Array. Unknown ids are misses (drop them, don't throw). - return [] + // Return an object keyed by every requested id: a record for hits, `undefined` + // for unknown or malformed ids (never throw for a bad id). + return Object.fromEntries(ids.map((id) => [id, undefined])) }, } diff --git a/adapters/cf/dev/specs/adapter.spec.ts b/adapters/cf/dev/specs/adapter.spec.ts index dac497f..ee988be 100644 --- a/adapters/cf/dev/specs/adapter.spec.ts +++ b/adapters/cf/dev/specs/adapter.spec.ts @@ -461,8 +461,8 @@ describe('createCloudflareVectorizeIntegration', () => { const id = 'default:posts:doc-1:0' const records = await adapter.findByIds(mockPayload, 'default', [id], true) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([id]) + const r = records[id]! expect(r.id).toBe(id) expect(r.embedding).toEqual(embedding) expect(r.sourceCollection).toBe('posts') @@ -492,15 +492,15 @@ describe('createCloudflareVectorizeIntegration', () => { const id = 'default:posts:doc-1:0' const records = await adapter.findByIds(mockPayload, 'default', [id]) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([id]) + const r = records[id]! 
expect(r.id).toBe(id) expect(r.embedding).toBeUndefined() expect(r.chunkText).toBe('find me') expect((r as any).category).toBe('science') }) - test('drops misses', async () => { + test('maps misses to undefined', async () => { const mockBinding = createMockCloudflareBinding() const { adapter } = createCloudflareVectorizeIntegration({ config: { default: { dims: DIMS } }, @@ -520,11 +520,14 @@ describe('createCloudflareVectorizeIntegration', () => { 'default:posts:doc-1:0', 'default:posts:nope:0', ]) - expect(records).toHaveLength(1) - expect(records[0].id).toBe('default:posts:doc-1:0') + expect(Object.keys(records).sort()).toEqual( + ['default:posts:doc-1:0', 'default:posts:nope:0'].sort(), + ) + expect(records['default:posts:doc-1:0']!.id).toBe('default:posts:doc-1:0') + expect(records['default:posts:nope:0']).toBeUndefined() }) - test('empty ids returns []', async () => { + test('empty ids returns {}', async () => { const mockBinding = createMockCloudflareBinding() const { adapter } = createCloudflareVectorizeIntegration({ config: { default: { dims: DIMS } }, @@ -532,7 +535,7 @@ describe('createCloudflareVectorizeIntegration', () => { }) const mockPayload = createMockPayload(mockBinding) const records = await adapter.findByIds(mockPayload, 'default', []) - expect(records).toEqual([]) + expect(records).toEqual({}) }) }) }) diff --git a/adapters/cf/src/findByIds.ts b/adapters/cf/src/findByIds.ts index de7cf62..c8f2104 100644 --- a/adapters/cf/src/findByIds.ts +++ b/adapters/cf/src/findByIds.ts @@ -9,21 +9,23 @@ export default async ( _poolName: KnowledgePoolName, ids: string[], populateEmbedding = false, -): Promise> => { - if (ids.length === 0) return [] +): Promise> => { + const result: Record = {} + for (const id of ids) result[id] = undefined + if (ids.length === 0) return result const binding = getVectorizeBinding(payload) try { const vectors = await binding.getByIds(ids) - if (!vectors) return [] + if (!vectors) return result - return vectors.map((vector) => { 
+ for (const vector of vectors) { const metadata = (vector.metadata || {}) as Record const extensionFields = Object.fromEntries( Object.entries(metadata).filter(([k]) => !RESERVED_METADATA.includes(k)), ) - return { + result[vector.id] = { id: vector.id, sourceCollection: String(metadata.sourceCollection ?? ''), docId: String(metadata.docId ?? ''), @@ -36,7 +38,8 @@ export default async ( ...(populateEmbedding ? { embedding: Array.from(vector.values ?? []) } : {}), ...extensionFields, } - }) + } + return result } catch (e) { const errorMessage = e instanceof Error ? e.message : String(e) payload.logger.error(`[@payloadcms-vectorize/cf] findByIds failed: ${errorMessage}`) diff --git a/adapters/mongodb/dev/specs/findByIds.spec.ts b/adapters/mongodb/dev/specs/findByIds.spec.ts index 7cea989..231a0e4 100644 --- a/adapters/mongodb/dev/specs/findByIds.spec.ts +++ b/adapters/mongodb/dev/specs/findByIds.spec.ts @@ -56,8 +56,8 @@ describe('mongodb findByIds', () => { test('returns full EmbeddingRecord including numeric embedding array when populateEmbedding is true', async () => { const records = await adapter.findByIds(payload, 'default', [embeddingId], true) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const r = records[embeddingId]! expect(r.id).toBe(embeddingId) expect(Array.isArray(r.embedding)).toBe(true) expect(r.embedding!.length).toBe(DIMS) @@ -69,8 +69,8 @@ describe('mongodb findByIds', () => { test('omits the embedding array by default', async () => { const records = await adapter.findByIds(payload, 'default', [embeddingId]) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const r = records[embeddingId]! 
expect(r.id).toBe(embeddingId) expect(r.embedding).toBeUndefined() expect(r.sourceCollection).toBe('posts') @@ -78,22 +78,26 @@ describe('mongodb findByIds', () => { }) test('includes extension fields', async () => { - const [r] = await adapter.findByIds(payload, 'default', [embeddingId]) - expect((r as any).category).toBe('science') + const records = await adapter.findByIds(payload, 'default', [embeddingId]) + expect((records[embeddingId] as any).category).toBe('science') }) - test('drops misses and invalid ids without throwing', async () => { + test('maps misses and invalid ids to undefined without throwing', async () => { const records = await adapter.findByIds(payload, 'default', [ embeddingId, '000000000000000000000000', 'not-an-object-id', ]) - expect(records).toHaveLength(1) - expect(records[0].id).toBe(embeddingId) + expect(Object.keys(records).sort()).toEqual( + [embeddingId, '000000000000000000000000', 'not-an-object-id'].sort(), + ) + expect(records[embeddingId]!.id).toBe(embeddingId) + expect(records['000000000000000000000000']).toBeUndefined() + expect(records['not-an-object-id']).toBeUndefined() }) - test('empty ids returns []', async () => { + test('empty ids returns {}', async () => { const records = await adapter.findByIds(payload, 'default', []) - expect(records).toEqual([]) + expect(records).toEqual({}) }) }) diff --git a/adapters/mongodb/src/findByIds.ts b/adapters/mongodb/src/findByIds.ts index d8b3b03..fe90203 100644 --- a/adapters/mongodb/src/findByIds.ts +++ b/adapters/mongodb/src/findByIds.ts @@ -19,8 +19,10 @@ export async function findByIdsImpl( poolName: string, ids: string[], populateEmbedding = false, -): Promise { - if (ids.length === 0) return [] +): Promise> { + const result: Record = {} + for (const id of ids) result[id] = undefined + if (ids.length === 0) return result const cfg = ctx.pools[poolName] if (!cfg) { @@ -30,7 +32,7 @@ export async function findByIdsImpl( } const objectIds = ids.filter((id) => HEX24.test(id)).map((id) 
=> new ObjectId(id)) - if (objectIds.length === 0) return [] + if (objectIds.length === 0) return result const client = await getMongoClient(ctx.uri) const docs = await client @@ -39,7 +41,11 @@ export async function findByIdsImpl( .find({ _id: { $in: objectIds } }, populateEmbedding ? {} : { projection: { embedding: 0 } }) .toArray() - return docs.map((doc) => mapDocToRecord(doc as Record, populateEmbedding)) + for (const doc of docs) { + const record = mapDocToRecord(doc as Record, populateEmbedding) + result[record.id] = record + } + return result } function mapDocToRecord( diff --git a/adapters/pg/dev/specs/findByIds.spec.ts b/adapters/pg/dev/specs/findByIds.spec.ts index beb35c2..f03a260 100644 --- a/adapters/pg/dev/specs/findByIds.spec.ts +++ b/adapters/pg/dev/specs/findByIds.spec.ts @@ -78,8 +78,8 @@ describe('pg findByIds', () => { test('returns full EmbeddingRecord including numeric embedding array when populateEmbedding is true', async () => { const records = await integration.adapter.findByIds(payload, 'default', [embeddingId], true) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const r = records[embeddingId]! expect(r.id).toBe(embeddingId) expect(Array.isArray(r.embedding)).toBe(true) expect(r.embedding!.length).toBe(DIMS) @@ -91,27 +91,35 @@ describe('pg findByIds', () => { test('omits the embedding array by default', async () => { const records = await integration.adapter.findByIds(payload, 'default', [embeddingId]) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const r = records[embeddingId]! 
expect(r.id).toBe(embeddingId) expect(r.embedding).toBeUndefined() expect(r.sourceCollection).toBe('posts') }) test('includes extension fields when the pool defines them', async () => { - const [r] = await integration.adapter.findByIds(payload, 'default', [embeddingId]) - expect((r as any).category).toBe('science') + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId]) + expect((records[embeddingId] as any).category).toBe('science') }) - test('drops misses', async () => { + test('maps a well-formed but nonexistent id to undefined', async () => { const records = await integration.adapter.findByIds(payload, 'default', [embeddingId, '999999']) - expect(records).toHaveLength(1) - expect(records[0].id).toBe(embeddingId) + expect(Object.keys(records).sort()).toEqual([embeddingId, '999999'].sort()) + expect(records[embeddingId]!.id).toBe(embeddingId) + expect(records['999999']).toBeUndefined() + }) + + test('maps a malformed (non-numeric) id to undefined instead of throwing', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId, 'not-an-id']) + expect(Object.keys(records).sort()).toEqual([embeddingId, 'not-an-id'].sort()) + expect(records[embeddingId]!.id).toBe(embeddingId) + expect(records['not-an-id']).toBeUndefined() }) - test('empty ids returns []', async () => { + test('empty ids returns {}', async () => { const records = await integration.adapter.findByIds(payload, 'default', []) - expect(records).toEqual([]) + expect(records).toEqual({}) }) test('coerces null chunkText/embeddingVersion to "" (EmbeddingRecord type)', async () => { @@ -124,7 +132,7 @@ describe('pg findByIds', () => { .set({ chunkText: null, embeddingVersion: null }) .where(eq(table.id, Number(embeddingId))) - const [r] = await integration.adapter.findByIds(payload, 'default', [embeddingId]) + const r = (await integration.adapter.findByIds(payload, 'default', [embeddingId]))[embeddingId]! 
expect(r.chunkText).toBe('') expect(r.embeddingVersion).toBe('') }) @@ -206,20 +214,30 @@ describe('pg findByIds (uuid idType)', () => { test('findByIds resolves a uuid id (regression: numeric-only filter dropped uuids)', async () => { const records = await integration.adapter.findByIds(payload, 'default', [embeddingId], true) - expect(records).toHaveLength(1) - const [r] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const r = records[embeddingId]! expect(r.id).toBe(embeddingId) expect(Array.isArray(r.embedding)).toBe(true) expect(r.embedding!.length).toBe(DIMS) expect((r as any).category).toBe('science') }) - test('drops a well-formed but nonexistent uuid', async () => { + test('maps a well-formed but nonexistent uuid to undefined', async () => { const records = await integration.adapter.findByIds(payload, 'default', [ embeddingId, '00000000-0000-0000-0000-000000000000', ]) - expect(records).toHaveLength(1) - expect(records[0].id).toBe(embeddingId) + expect(Object.keys(records).sort()).toEqual( + [embeddingId, '00000000-0000-0000-0000-000000000000'].sort(), + ) + expect(records[embeddingId]!.id).toBe(embeddingId) + expect(records['00000000-0000-0000-0000-000000000000']).toBeUndefined() + }) + + test('maps a malformed (non-uuid) id to undefined instead of throwing', async () => { + const records = await integration.adapter.findByIds(payload, 'default', [embeddingId, '999999']) + expect(Object.keys(records).sort()).toEqual([embeddingId, '999999'].sort()) + expect(records[embeddingId]!.id).toBe(embeddingId) + expect(records['999999']).toBeUndefined() }) }) diff --git a/adapters/pg/src/findByIds.ts b/adapters/pg/src/findByIds.ts index cf64dc3..9e51aeb 100644 --- a/adapters/pg/src/findByIds.ts +++ b/adapters/pg/src/findByIds.ts @@ -9,8 +9,10 @@ export default async ( poolName: KnowledgePoolName, ids: string[], populateEmbedding = false, -): Promise> => { - if (ids.length === 0) return [] +): Promise> => { + const result: Record = {} + for (const id 
of ids) result[id] = undefined + if (ids.length === 0) return result const isPostgres = payload.db?.pool?.query || payload.db?.drizzle if (!isPostgres) { @@ -33,6 +35,12 @@ export default async ( ) } + // Drop ids that can't match the primary-key column type before querying, so a + // malformed id is treated as a miss instead of making Postgres reject the cast + // and throw for the whole batch. + const queryableIds = ids.filter((id) => idMatchesPkType(table.id, id)) + if (queryableIds.length === 0) return result + const selectObj: Record = { id: table.id, } @@ -50,8 +58,24 @@ export default async ( } } - const rows = await drizzle.select(selectObj).from(table).where(inArray(table.id, ids)) - return mapRowsToRecords(rows, collectionConfig, populateEmbedding) + const rows = await drizzle.select(selectObj).from(table).where(inArray(table.id, queryableIds)) + for (const record of mapRowsToRecords(rows, collectionConfig, populateEmbedding)) { + result[record.id] = record + } + return result +} + +const UUID = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i + +function idMatchesPkType(idColumn: { getSQLType?: () => string }, id: string): boolean { + const sqlType = idColumn.getSQLType?.() ?? 
'' + if (sqlType === 'integer' || sqlType === 'serial' || sqlType === 'bigint' || sqlType === 'bigserial') { + return /^\d+$/.test(id) + } + if (sqlType === 'uuid') { + return UUID.test(id) + } + return true } function mapRowsToRecords( diff --git a/dev/helpers/mockAdapter.ts b/dev/helpers/mockAdapter.ts index 80053a8..0659f1d 100644 --- a/dev/helpers/mockAdapter.ts +++ b/dev/helpers/mockAdapter.ts @@ -201,31 +201,37 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = poolName: KnowledgePoolName, ids: string[], populateEmbedding = false, - ): Promise => { - const records: EmbeddingRecord[] = [] + ): Promise> => { + const records: Record = {} for (const id of ids) { + records[id] = undefined const stored = storage.get(`${poolName}:${id}`) if (!stored) continue + let doc: Record | null try { - const doc = await payload.findByID({ + doc = (await payload.findByID({ collection: poolName as CollectionSlug, id: stored.id, - }) - if (!doc) continue - const { - id: _id, - createdAt: _createdAt, - updatedAt: _updatedAt, - embedding: _embedding, - ...docFields - } = doc as any - records.push({ - id: stored.id, - ...(populateEmbedding ? { embedding: stored.embedding } : {}), - ...docFields, - } as EmbeddingRecord) - } catch (_e) { + })) as Record | null + } catch (e) { + if (e instanceof Error && e.name === 'NotFound') { + continue + } + throw e } + if (!doc) continue + const { + id: _id, + createdAt: _createdAt, + updatedAt: _updatedAt, + embedding: _embedding, + ...docFields + } = doc + records[id] = { + id: stored.id, + ...(populateEmbedding ? 
{ embedding: stored.embedding } : {}), + ...docFields, + } as EmbeddingRecord } return records }, diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 4f986a8..f4c36b4 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -229,8 +229,8 @@ describe('VectorizedPayload', () => { ids: [embeddingId], populateEmbedding: true, }) - expect(records).toHaveLength(1) - const [record] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const record = records[embeddingId]! expect(record.id).toBe(embeddingId) expect(Array.isArray(record.embedding)).toBe(true) expect(record.embedding!.length).toBe(DIMS) @@ -244,31 +244,34 @@ describe('VectorizedPayload', () => { knowledgePool: 'default', ids: [embeddingId], }) - expect(records).toHaveLength(1) - const [record] = records + expect(Object.keys(records)).toEqual([embeddingId]) + const record = records[embeddingId]! expect(record.id).toBe(embeddingId) expect(record.embedding).toBeUndefined() expect(typeof record.sourceCollection).toBe('string') expect(typeof record.chunkText).toBe('string') }) - test('drops unknown ids (result length < ids length)', async () => { + test('maps unknown ids to undefined (every requested id is a key)', async () => { const vectorizedPayload = getVectorizedPayload(payload)! const records = await vectorizedPayload.findByIds({ knowledgePool: 'default', ids: [embeddingId, 'definitely-not-an-id-999999'], }) - expect(records).toHaveLength(1) - expect(records[0].id).toBe(embeddingId) + expect(Object.keys(records).sort()).toEqual( + [embeddingId, 'definitely-not-an-id-999999'].sort(), + ) + expect(records[embeddingId]!.id).toBe(embeddingId) + expect(records['definitely-not-an-id-999999']).toBeUndefined() }) - test('empty ids returns []', async () => { + test('empty ids returns {}', async () => { const vectorizedPayload = getVectorizedPayload(payload)! 
const records = await vectorizedPayload.findByIds({ knowledgePool: 'default', ids: [], }) - expect(records).toEqual([]) + expect(records).toEqual({}) }) }) diff --git a/src/index.ts b/src/index.ts index 45a961e..7621b9c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -362,7 +362,7 @@ export default (pluginOptions: PayloadcmsVectorizeConfig) => ids: string[] populateEmbedding?: boolean }) => { - if (params.ids.length === 0) return Promise.resolve([]) + if (params.ids.length === 0) return Promise.resolve({}) return pluginOptions.dbAdapter.findByIds( payload, params.knowledgePool, diff --git a/src/types.ts b/src/types.ts index 09f685b..2fb10b1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -61,7 +61,7 @@ export type VectorizedPayload = { knowledgePool: KnowledgePoolName ids: string[] populateEmbedding?: boolean - }) => Promise> + }) => Promise> queueEmbed: ( params: | { @@ -451,5 +451,5 @@ export type DbAdapter = { poolName: KnowledgePoolName, ids: string[], populateEmbedding?: boolean, - ) => Promise> + ) => Promise> }