data-fair · albanm · Jun 5, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/api/doc/settings/put-req/.type/index.d.ts b/api/doc/settings/put-req/.type/index.d.ts
@@ -67,6 +67,8 @@ export type InputPricePer1MTokens2 = number;
 export type OutputPricePer1MTokens2 = number;
 export type InputPricePer1MTokens3 = number;
 export type OutputPricePer1MTokens3 = number;
+export type InputPricePer1MTokens4 = number;
+export type OutputPricePer1MTokens4 = number;
 export type Unlimited = boolean;
 /**
  * Weekly limit = monthly / 2, daily limit = monthly / 4
@@ -170,6 +172,7 @@ export type Models = {
   tools?: Tools;
   summarizer?: Summarizer;
   evaluator?: Evaluator;
+  moderator?: Moderator;
   [k: string]: unknown;
 }
 /**
@@ -264,6 +267,29 @@ export type Model3 = {
   };
   [k: string]: unknown;
 }
+/**
+ *
+ * The "gatekeeper." Classifies each new user message for profanity, prompt-injection, persona override, and out-of-scope requests. Should be fast and cheap — it sits on the critical path to the first response token.
+ *
+ * Recommendations: a small/fast model, e.g. Claude 4.5 Haiku, GPT-5.4 Mini, Mistral Small 4, or a dedicated moderation classifier.
+ */
+export type Moderator = {
+  model?: Model4;
+  inputPricePerMillion?: InputPricePer1MTokens4;
+  outputPricePerMillion?: OutputPricePer1MTokens4;
+  [k: string]: unknown;
+}
+export type Model4 = {
+  id: ModelID;
+  name: Name;
+  provider: {
+    type: ProviderType9;
+    name: ProviderName;
+    id: ProviderID9;
+    [k: string]: unknown;
+  };
+  [k: string]: unknown;
+}
 export type RoleQuotas = {
   global: GlobalQuotas;
   admin: AdminQuotas;
@@ -316,7 +342,7 @@ export type RoleQuota = {
  * This interface was referenced by `SettingsPut`'s JSON-Schema
  * via the `definition` "Model".
  */
-export type Model4 = {
+export type Model5 = {
   id: ModelID;
   name: Name;
   provider: {

diff --git a/api/doc/settings/put-req/.type/validate.js b/api/doc/settings/put-req/.type/validate.js
diff --git a/api/src/gateway/router.ts b/api/src/gateway/router.ts
@@ -46,22 +46,27 @@ function buildUsage (usage: LanguageModelUsage | undefined) {
 const router = Router()
 export default router
 
-const MODEL_IDS = ['assistant', 'evaluator', 'summarizer', 'tools'] as const
+const MODEL_IDS = ['assistant', 'evaluator', 'summarizer', 'tools', 'moderator'] as const
 type ModelId = typeof MODEL_IDS[number]
 
 function isValidModelId (id: string): id is ModelId {
   return MODEL_IDS.includes(id as ModelId)
 }
 
 function getModelConfig (settings: Settings, modelId: ModelId) {
-  const modelEntry = settings.models[modelId]
-  const fallback = settings.models.assistant
-  const modelConfig = modelEntry?.model || fallback?.model
-  if (!modelConfig) throw new Error(`No model configured for ${modelId}`)
-  const source = modelEntry?.model ? modelEntry : fallback
-  const inputPricePerMillion = source?.inputPricePerMillion ?? 0
-  const outputPricePerMillion = source?.outputPricePerMillion ?? 0
-  return { modelConfig, inputPricePerMillion, outputPricePerMillion }
+  // Resolution order: the moderator tries its own cheap dedicated model first,
+  // then the summarizer, then the assistant as the guaranteed last resort.
+  // Every other role falls back directly to the assistant.
+  const { models } = settings
+  const candidates = modelId === 'moderator'
+    ? [models.moderator, models.summarizer, models.assistant]
+    : [models[modelId], models.assistant]
+  const resolved = candidates.find(candidate => candidate?.model)
+  if (!resolved?.model) throw new Error(`No model configured for ${modelId}`)
+  const modelConfig = resolved.model
+  // prices come from the entry that supplied the model; an unset price counts as 0
+  const { inputPricePerMillion, outputPricePerMillion } = resolved
+  return { modelConfig, inputPricePerMillion: inputPricePerMillion ?? 0, outputPricePerMillion: outputPricePerMillion ?? 0 }
 }
 
 async function getModelForGateway (settings: Settings, modelId: ModelId) {

diff --git a/api/src/models/mock-model.ts b/api/src/models/mock-model.ts
@@ -136,6 +136,18 @@ function processMockSummarizerPrompt (): MockPromptResult {
   return { type: 'text', text: 'Summary: conversation covered the main topics discussed.' }
 }
 
+/**
+ * mock-moderator: deterministic moderation verdict, serialized as JSON text.
+ * Blocks any message matching "jailbreak" or "ignore (all|previous) instructions"
+ * (case-insensitive) and allows everything else.
+ * Used by moderation tests.
+ */
+function processMockModeratorPrompt (lastMessage: string): MockPromptResult {
+  const blocked = /jailbreak|ignore (all|previous) instructions/i.test(lastMessage)
+  const text = blocked ? '{"action":"block","category":"prompt-injection","reason":"mock block"}' : '{"action":"allow"}'
+  return { type: 'text', text }
+}
+
 /**
  * Exploration test seam: when the request advertises a `select_tools` tool, emit a
  * deterministic select_tools call choosing every tool named inside the prompt's
@@ -165,6 +177,8 @@ function processForModel (modelId: string, options: { prompt: string | Array<any
       return processMockToolsPrompt(lastMessage, options.prompt)
     case 'mock-summarizer':
       return processMockSummarizerPrompt()
+    case 'mock-moderator':
+      return processMockModeratorPrompt(lastMessage)
     default:
       return processMockPrompt(lastMessage, options.prompt)
   }

diff --git a/api/types/settings/schema.js b/api/types/settings/schema.js
@@ -794,6 +794,48 @@ Recommendations: Claude Opus 4.6, GPT-5.4 (Reasoning), DeepSeek-R1, Pharia-1-LLM
               minimum: 0
             }
           }
+        },
+        moderator: {
+          type: 'object',
+          title: 'Moderator',
+          description: `
+The "gatekeeper." Classifies each new user message for profanity, prompt-injection, persona override, and out-of-scope requests. Should be fast and cheap — it sits on the critical path to the first response token.
+
+Recommendations: a small/fast model, e.g. Claude 4.5 Haiku, GPT-5.4 Mini, Mistral Small 4, or a dedicated moderation classifier.`,
+          'x-i18n-title': {
+            en: 'Moderator',
+            fr: 'Modérateur'
+          },
+          'x-i18n-description': {
+            en: 'The "gatekeeper." Classifies each new user message for profanity, prompt-injection, persona override, and out-of-scope requests. Should be fast and cheap — it sits on the critical path to the first response token.\n\nRecommendations: a small/fast model, e.g. Claude 4.5 Haiku, GPT-5.4 Mini, Mistral Small 4, or a dedicated moderation classifier.',
+            fr: 'Le « gardien ». Classe chaque nouveau message utilisateur (grossièretés, injection de prompt, usurpation de persona, demandes hors périmètre). Doit être rapide et peu coûteux — il se trouve sur le chemin critique vers le premier token de réponse.\n\nRecommandations : un modèle petit et rapide, par ex. Claude 4.5 Haiku, GPT-5.4 Mini, Mistral Small 4, ou un classifieur de modération dédié.'
+          },
+          layout: {
+            comp: 'card',
+            children: [{ key: 'model' }, { key: 'inputPricePerMillion', cols: 6 }, { key: 'outputPricePerMillion', cols: 6 }],
+            cols: 6
+          },
+          properties: {
+            model: {
+              $ref: '#/definitions/Model',
+              title: 'Model',
+              'x-i18n-title': { en: 'Model', fr: 'Modèle' }
+            },
+            inputPricePerMillion: {
+              type: 'number',
+              title: 'Input price (per 1M tokens)',
+              'x-i18n-title': { en: 'Input price (per 1M tokens)', fr: "Prix d'entrée (par million de tokens)" },
+              default: 0,
+              minimum: 0
+            },
+            outputPricePerMillion: {
+              type: 'number',
+              title: 'Output price (per 1M tokens)',
+              'x-i18n-title': { en: 'Output price (per 1M tokens)', fr: 'Prix de sortie (par million de tokens)' },
+              default: 0,
+              minimum: 0
+            }
+          }
         }
       }
     },

diff --git a/docs/architecture.md b/docs/architecture.md
@@ -61,7 +61,7 @@ sequenceDiagram
 - `api/src/gateway/operations.ts` — OpenAI ↔ AI SDK message/tool format conversion
 - `ui/src/composables/use-agent-chat.ts` — client-side history management
 
-**Model IDs are roles, not model names.** The client requests `assistant`, `tools`, `summarizer`, or `evaluator` — the server resolves which provider/model to use from settings.
+**Model IDs are roles, not model names.** The client requests `assistant`, `tools`, `summarizer`, `evaluator`, or `moderator` — the server resolves which provider/model to use from settings.
 
 ---
 
@@ -168,6 +168,7 @@ graph LR
 | `tools` | Structured data / tool-calling specialist | 0.5 |
 | `summarizer` | Context compaction | 0.5 |
 | `evaluator` | Quality control / reasoning | 1.0 |
+| `moderator` | Input moderation guard (fast/cheap) | 0.5 |
 
 Each owner (user or organization) configures their own providers and model assignments. API keys are **encrypted at rest** (AES-256-CBC) and obfuscated in API responses. Model lists are fetched from provider APIs with **5-minute memoized caching**.
 
@@ -299,7 +300,50 @@ graph TB
 
 ---
 
-## 8. Progressive Tool Disclosure (Tool Exploration)
+## 8. Input Moderation Guard
+
+A per-message guard protects the **UI-integrated assistant** from abuse — profanity, prompt-injection attempts, persona/identity override, and out-of-scope requests that deviate from the agent's mission. It is **always on** and runs **concurrently** with the assistant turn, only withholding the first visible output byte; the request itself is never delayed.
+
+```mermaid
+sequenceDiagram
+  participant Chat as use-agent-chat
+  participant GW as Gateway (/v1/chat/completions)
+  participant LLM as Resolved model
+
+  Note over Chat: on user submit, both start in parallel
+  Chat->>GW: POST model=assistant, stream=true
+  Chat->>GW: POST model=moderator, stream=false
+  GW->>GW: role + quota check (both calls)
+  GW->>LLM: generateText (moderator→summarizer→assistant)
+  LLM-->>GW: verdict text
+  GW->>GW: recordUsage (metered)
+  GW-->>Chat: completion JSON
+  Chat->>Chat: parseModerationVerdict (client-side, 1.5s fail-open)
+  GW-->>Chat: SSE chunks (buffered until verdict)
+  alt allow
+    Chat->>Chat: flush + stream normally
+  else block
+    Chat->>Chat: abort stream, drop user msg, show hardcoded refusal
+  end
+```
+
+**Reuses the gateway.** There is no dedicated moderation endpoint. The client issues a second, non-streaming gateway call with `model: 'moderator'`, which resolves **moderator → summarizer → assistant** server-side (`getModelConfig`). Because it goes through the gateway, moderation inherits the same role checks, quota checks, and usage recording as any other model call — every user message therefore costs two metered calls (moderation + assistant).
+
+**Advisory, not a security boundary.** The gate lives in the client orchestration loop. A direct or anonymous call straight to the gateway's `assistant` model bypasses moderation entirely; that is by design and governed by auth/quotas. The moderation prompt and verdict parser live in the browser (`ui/src/composables/moderation.ts`).
+
+**Fail-open everywhere.** A client-side 1.5s timeout, any transport/HTTP error (including a quota 429 on the moderation call), and any unparseable model output all let the message through exactly as an `allow` verdict would (the trace records these cases as `skip`). Moderation never blocks the user on an internal failure.
+
+**Hardcoded refusal.** Blocked messages show a fixed, localized refusal (en/fr) supplied by the chat component; it is not configurable. The model's `category`/`reason` are recorded in the trace but never shown to the user.
+
+**Input only (v1).** The moderator sees the new user message plus the agent mission (system prompt) — not the full history. No output moderation, no tool-result / indirect-injection coverage, no multi-turn jailbreak detection. A block is enforced before any assistant text is shown, but if a turn's first action is a tool call, the tool may already have executed by the time the verdict arrives — moderation does not roll back tool side effects.
+
+**Observable, client-side only.** Every decision — `allow`, `skip` (fail-open), and `block` — is recorded in the session trace (`SessionRecorder.recordModerationDecision`), together with the model's `category` and `reason` when the verdict carries them, and is viewable in the debug dialog. Tracing is ephemeral and client-only.
+
+**Key files:**
+- `api/src/gateway/router.ts` — resolves the `moderator` role and meters the call
+- `ui/src/composables/moderation.ts` — moderation prompt + tolerant verdict parser
+- `ui/src/composables/use-agent-chat.ts` — parallel gate, withholding the first byte, block → refusal
+
+---
+
+## 9. Progressive Tool Disclosure (Tool Exploration)
 
 When many tools are registered (or they churn as the user navigates), sending every tool's full schema on every request bloats context and destabilises any prompt cache. An **opt-in** exploration mode replaces "send all tools" with "discover on demand": the assistant sees only a single always-on `explore_tools` tool plus a catalog of tool *names*, and must call `explore_tools` to make the tools it needs callable.