From 94c10e78be09609a575a8ca65aeb6d55fd571d38 Mon Sep 17 00:00:00 2001
From: Mike Edwards <njrun1804@gmail.com>
Date: Sun, 14 Jun 2026 13:36:28 -0400
Subject: [PATCH 1/5] fix: reconcile dead-pid jobs so status/result stop
 reporting zombies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A worker that dies before recording a terminal state (crash, SIGKILL, OOM, broker drop,
session teardown) left its job stuck status:running forever — status showed a growing
elapsed on a frozen updatedAt, and result threw 'still running' / 'no job found'.

- job-control.mjs: every job-read path now routes through a liveness reconcile —
  process.kill(pid,0) -> ESRCH means the worker is gone, so the job is persisted as
  failed:orphaned. Idempotent; a live pid (or our own pid) is never touched.
- tracked-jobs.mjs: the worker installs exit/SIGTERM/SIGINT/SIGHUP finalizers that flush
  a terminal failed:interrupted record on graceful death (SIGKILL/OOM are caught by the
  reconcile above). Defense in depth.
- render.mjs: result falls back to the job's log tail when no payload was captured.
- tests/job-control.test.mjs: dead-pid->orphaned, live-pid untouched, result resolves.

86 -> 89 tests, all green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 plugins/codex/scripts/lib/job-control.mjs  |  69 +++++++++++--
 plugins/codex/scripts/lib/render.mjs       |  25 +++++
 plugins/codex/scripts/lib/tracked-jobs.mjs |  48 +++++++++
 tests/job-control.test.mjs                 | 112 +++++++++++++++++++++
 4 files changed, 248 insertions(+), 6 deletions(-)
 create mode 100644 tests/job-control.test.mjs

diff --git a/plugins/codex/scripts/lib/job-control.mjs b/plugins/codex/scripts/lib/job-control.mjs
index ad152c15..53086333 100644
--- a/plugins/codex/scripts/lib/job-control.mjs
+++ b/plugins/codex/scripts/lib/job-control.mjs
@@ -1,8 +1,8 @@
 import fs from "node:fs";
 
 import { getSessionRuntimeStatus } from "./codex.mjs";
-import { getConfig, listJobs, readJobFile, resolveJobFile } from "./state.mjs";
-import { SESSION_ID_ENV } from "./tracked-jobs.mjs";
+import { getConfig, listJobs, readJobFile, resolveJobFile, upsertJob, writeJobFile } from "./state.mjs";
+import { nowIso, SESSION_ID_ENV } from "./tracked-jobs.mjs";
 import { resolveWorkspaceRoot } from "./workspace.mjs";
 
 export const DEFAULT_MAX_STATUS_JOBS = 8;
@@ -210,10 +210,65 @@ function matchJobReference(jobs, reference, predicate = () => true) {
   throw new Error(`No job found for "${reference}". Run /codex:status to list known jobs.`);
 }
 
+const ACTIVE_STATUSES = new Set(["queued", "running"]);
+
+// Liveness probe: signal 0 is delivered to a live pid, throws ESRCH for a dead one.
+// EPERM means the process exists but we may not signal it — still alive.
+function isPidAlive(pid) {
+  if (!Number.isFinite(pid)) {
+    return false;
+  }
+  try {
+    process.kill(pid, 0);
+    return true;
+  } catch (error) {
+    return error?.code === "EPERM";
+  }
+}
+
+// A job whose worker pid is dead while its stored status is still active is an ORPHAN:
+// the worker died (crash, SIGKILL, OOM, broker drop, parent teardown) without recording
+// a terminal state, so status/result would report `running` forever. Reconcile it to a
+// durable `failed:orphaned` record. Idempotent — once reconciled, status is no longer
+// active, so later reads skip it. This is what makes status/result honest about liveness.
+function reconcileJob(workspaceRoot, job) {
+  if (!job || !ACTIVE_STATUSES.has(job.status) || !Number.isFinite(job.pid)) {
+    return job;
+  }
+  if (job.pid === process.pid || isPidAlive(job.pid)) {
+    return job;
+  }
+  const patch = {
+    id: job.id,
+    status: "failed",
+    phase: "failed:orphaned",
+    pid: null,
+    completedAt: nowIso(),
+    errorMessage:
+      "Worker process exited without recording completion (orphaned — reconciled by a status/result check)."
+  };
+  try {
+    upsertJob(workspaceRoot, patch);
+    const jobFile = resolveJobFile(workspaceRoot, job.id);
+    if (fs.existsSync(jobFile)) {
+      writeJobFile(workspaceRoot, job.id, { ...readJobFile(jobFile), ...patch });
+    }
+  } catch {
+    // Best-effort persistence; still return the reconciled view so the caller stays honest.
+  }
+  return { ...job, ...patch };
+}
+
+// Reconciling read: the single source of truth for "what is actually alive". Every
+// job-reading resolver routes through this so a dead worker can never masquerade as running.
+function listJobsReconciled(workspaceRoot) {
+  return listJobs(workspaceRoot).map((job) => reconcileJob(workspaceRoot, job));
+}
+
 export function buildStatusSnapshot(cwd, options = {}) {
   const workspaceRoot = resolveWorkspaceRoot(cwd);
   const config = getConfig(workspaceRoot);
-  const jobs = sortJobsNewestFirst(filterJobsForCurrentSession(listJobs(workspaceRoot), options));
+  const jobs = sortJobsNewestFirst(filterJobsForCurrentSession(listJobsReconciled(workspaceRoot), options));
   const maxJobs = options.maxJobs ?? DEFAULT_MAX_STATUS_JOBS;
   const maxProgressLines = options.maxProgressLines ?? DEFAULT_MAX_PROGRESS_LINES;
 
@@ -241,7 +296,7 @@ export function buildStatusSnapshot(cwd, options = {}) {
 
 export function buildSingleJobSnapshot(cwd, reference, options = {}) {
   const workspaceRoot = resolveWorkspaceRoot(cwd);
-  const jobs = sortJobsNewestFirst(listJobs(workspaceRoot));
+  const jobs = sortJobsNewestFirst(listJobsReconciled(workspaceRoot));
   const selected = matchJobReference(jobs, reference);
   if (!selected) {
     throw new Error(`No job found for "${reference}". Run /codex:status to inspect known jobs.`);
@@ -255,7 +310,9 @@ export function buildSingleJobSnapshot(cwd, reference, options = {}) {
 
 export function resolveResultJob(cwd, reference) {
   const workspaceRoot = resolveWorkspaceRoot(cwd);
-  const jobs = sortJobsNewestFirst(reference ? listJobs(workspaceRoot) : filterJobsForCurrentSession(listJobs(workspaceRoot)));
+  const jobs = sortJobsNewestFirst(
+    reference ? listJobsReconciled(workspaceRoot) : filterJobsForCurrentSession(listJobsReconciled(workspaceRoot))
+  );
   const selected = matchJobReference(
     jobs,
     reference,
@@ -280,7 +337,7 @@ export function resolveResultJob(cwd, reference) {
 
 export function resolveCancelableJob(cwd, reference, options = {}) {
   const workspaceRoot = resolveWorkspaceRoot(cwd);
-  const jobs = sortJobsNewestFirst(listJobs(workspaceRoot));
+  const jobs = sortJobsNewestFirst(listJobsReconciled(workspaceRoot));
   const activeJobs = jobs.filter((job) => job.status === "queued" || job.status === "running");
 
   if (reference) {
diff --git a/plugins/codex/scripts/lib/render.mjs b/plugins/codex/scripts/lib/render.mjs
index 2ec18523..c84bb9c3 100644
--- a/plugins/codex/scripts/lib/render.mjs
+++ b/plugins/codex/scripts/lib/render.mjs
@@ -1,3 +1,23 @@
+import fs from "node:fs";
+
+// When a job has no captured result payload (e.g. it died before producing one and was
+// reconciled to failed:orphaned), surface the tail of its log so the user still gets the
+// real work / partial output instead of an empty "No result" message.
+function readLogTail(logFile, maxLines = 40) {
+  if (!logFile || !fs.existsSync(logFile)) {
+    return null;
+  }
+  try {
+    const lines = fs
+      .readFileSync(logFile, "utf8")
+      .split(/\r?\n/)
+      .filter((line) => line.length > 0);
+    return lines.length ? lines.slice(-maxLines).join("\n") : null;
+  } catch {
+    return null;
+  }
+}
+
 function severityRank(severity) {
   switch (severity) {
     case "critical":
@@ -442,6 +462,11 @@ export function renderStoredJobResult(job, storedJob) {
     lines.push("", "No captured result payload was stored for this job.");
   }
 
+  const logTail = readLogTail(job.logFile ?? storedJob?.logFile);
+  if (logTail) {
+    lines.push("", "--- last log lines (no result payload was captured) ---", logTail);
+  }
+
   return `${lines.join("\n").trimEnd()}\n`;
 }
 
diff --git a/plugins/codex/scripts/lib/tracked-jobs.mjs b/plugins/codex/scripts/lib/tracked-jobs.mjs
index 90286901..9318ec73 100644
--- a/plugins/codex/scripts/lib/tracked-jobs.mjs
+++ b/plugins/codex/scripts/lib/tracked-jobs.mjs
@@ -151,6 +151,52 @@ export async function runTrackedJob(job, runner, options = {}) {
   writeJobFile(job.workspaceRoot, job.id, runningRecord);
   upsertJob(job.workspaceRoot, runningRecord);
 
+  // Crash safety: if this worker dies before recording a terminal state — SIGTERM/SIGINT/
+  // SIGHUP, an uncaught error, or a plain process exit while still "running" — flush a
+  // terminal `failed:interrupted` record so status/result never see a permanent "running"
+  // ghost. (SIGKILL/OOM cannot be trapped here; those are caught by the pid-liveness
+  // reconcile in job-control.mjs. Defense in depth.)
+  let settled = false;
+  const finalizeInterrupted = () => {
+    if (settled) {
+      return;
+    }
+    const existing = readStoredJobOrNull(job.workspaceRoot, job.id);
+    if (!existing || (existing.status !== "running" && existing.status !== "queued")) {
+      return;
+    }
+    settled = true;
+    const patch = {
+      status: "failed",
+      phase: "failed:interrupted",
+      pid: null,
+      completedAt: nowIso(),
+      errorMessage: "Worker process terminated before completion."
+    };
+    try {
+      writeJobFile(job.workspaceRoot, job.id, { ...existing, ...patch });
+      upsertJob(job.workspaceRoot, { id: job.id, ...patch });
+    } catch {
+      // best-effort
+    }
+  };
+  const onSignal = (signal) => {
+    finalizeInterrupted();
+    process.exit(signal === "SIGINT" ? 130 : 143);
+  };
+  const trappedSignals = ["SIGTERM", "SIGINT", "SIGHUP"];
+  process.on("exit", finalizeInterrupted);
+  for (const signal of trappedSignals) {
+    process.on(signal, onSignal);
+  }
+  const disarm = () => {
+    settled = true;
+    process.removeListener("exit", finalizeInterrupted);
+    for (const signal of trappedSignals) {
+      process.removeListener(signal, onSignal);
+    }
+  };
+
   try {
     const execution = await runner();
     const completionStatus = execution.exitStatus === 0 ? "completed" : "failed";
@@ -200,5 +246,7 @@ export async function runTrackedJob(job, runner, options = {}) {
       completedAt
     });
     throw error;
+  } finally {
+    disarm();
   }
 }
diff --git a/tests/job-control.test.mjs b/tests/job-control.test.mjs
new file mode 100644
index 00000000..ecec0878
--- /dev/null
+++ b/tests/job-control.test.mjs
@@ -0,0 +1,112 @@
+import fs from "node:fs";
+import { spawn } from "node:child_process";
+import test from "node:test";
+import assert from "node:assert/strict";
+
+import { makeTempDir } from "./helpers.mjs";
+import { resolveJobLogFile, upsertJob, writeJobFile } from "../plugins/codex/scripts/lib/state.mjs";
+import { buildStatusSnapshot, resolveResultJob } from "../plugins/codex/scripts/lib/job-control.mjs";
+
+// These cover the liveness-reconcile fix: a job whose worker pid is dead but whose stored
+// status is still active must not masquerade as `running` forever (the zombie bug).
+function withWorkspace(fn) {
+  const prevData = process.env.CLAUDE_PLUGIN_DATA;
+  const prevSession = process.env.CODEX_COMPANION_SESSION_ID;
+  process.env.CLAUDE_PLUGIN_DATA = makeTempDir();
+  delete process.env.CODEX_COMPANION_SESSION_ID; // avoid session-scoped filtering in tests
+  const workspace = makeTempDir();
+  try {
+    return fn(workspace);
+  } finally {
+    if (prevData === undefined) {
+      delete process.env.CLAUDE_PLUGIN_DATA;
+    } else {
+      process.env.CLAUDE_PLUGIN_DATA = prevData;
+    }
+    if (prevSession === undefined) {
+      delete process.env.CODEX_COMPANION_SESSION_ID;
+    } else {
+      process.env.CODEX_COMPANION_SESSION_ID = prevSession;
+    }
+  }
+}
+
+const DEAD_PID = 999999; // beyond the macOS/Linux pid range — guaranteed not running
+
+test("status reconciles a running job with a dead worker pid to failed:orphaned", () => {
+  withWorkspace((workspace) => {
+    const logFile = resolveJobLogFile(workspace, "task-dead");
+    fs.writeFileSync(logFile, "[t] Running command: pytest\n[t] partial work captured\n");
+    const record = {
+      id: "task-dead",
+      status: "running",
+      pid: DEAD_PID,
+      startedAt: new Date().toISOString(),
+      logFile,
+      jobClass: "task"
+    };
+    upsertJob(workspace, record);
+    writeJobFile(workspace, "task-dead", record);
+
+    const snapshot = buildStatusSnapshot(workspace);
+    assert.equal(
+      snapshot.running.some((job) => job.id === "task-dead"),
+      false,
+      "a dead-pid job must not remain in running[]"
+    );
+    const finished = [snapshot.latestFinished, ...snapshot.recent].filter(Boolean);
+    const dead = finished.find((job) => job.id === "task-dead");
+    assert.ok(dead, "the dead job must be reconciled into a terminal bucket");
+    assert.equal(dead.status, "failed");
+    assert.equal(dead.phase, "failed:orphaned");
+
+    // Idempotent: a second read must not error or flip it back to running.
+    const second = buildStatusSnapshot(workspace);
+    assert.equal(second.running.some((job) => job.id === "task-dead"), false);
+  });
+});
+
+test("status leaves a running job with a live worker pid untouched", () => {
+  withWorkspace((workspace) => {
+    const child = spawn("sleep", ["10"]);
+    try {
+      upsertJob(workspace, {
+        id: "task-live",
+        status: "running",
+        pid: child.pid,
+        startedAt: new Date().toISOString(),
+        jobClass: "task"
+      });
+      const snapshot = buildStatusSnapshot(workspace);
+      assert.equal(
+        snapshot.running.some((job) => job.id === "task-live" && job.status === "running"),
+        true,
+        "a live-pid job must stay running"
+      );
+    } finally {
+      child.kill();
+    }
+  });
+});
+
+test("result resolves a reconciled orphan instead of throwing 'still running'", () => {
+  withWorkspace((workspace) => {
+    const logFile = resolveJobLogFile(workspace, "task-orphan");
+    fs.writeFileSync(logFile, "[t] FINDING: example at foo.js:12\n");
+    const record = {
+      id: "task-orphan",
+      status: "running",
+      pid: DEAD_PID,
+      startedAt: new Date().toISOString(),
+      logFile,
+      jobClass: "task",
+      title: "Codex Task"
+    };
+    upsertJob(workspace, record);
+    writeJobFile(workspace, "task-orphan", record);
+
+    const { job } = resolveResultJob(workspace, "task-orphan");
+    assert.equal(job.status, "failed");
+    assert.equal(job.phase, "failed:orphaned");
+  });
+});

From ee34bb6ecca457b98a3f10b5912a51a4802b8ee4 Mon Sep 17 00:00:00 2001
From: Mike Edwards <njrun1804@gmail.com>
Date: Sun, 14 Jun 2026 14:12:23 -0400
Subject: [PATCH 2/5] harden(codex): wall-clock turn timeout so a hung turn
 can't zombie the worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A single app-server turn that never reports completion would await forever. The
liveness-reconcile in job-control only reaps DEAD workers — a turn that hangs while the
pid stays alive slips past it. captureTurn now races state.completion against a wall-clock
deadline (CODEX_TURN_TIMEOUT_MS, default 45m, 0 disables); on expiry it best-effort
turn/interrupts the stuck turn and throws, so the job finalizes failed instead of hanging.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 plugins/codex/scripts/lib/codex.mjs | 51 +++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs
index f2fe88bd..a2b9006f 100644
--- a/plugins/codex/scripts/lib/codex.mjs
+++ b/plugins/codex/scripts/lib/codex.mjs
@@ -550,6 +550,23 @@ function applyTurnNotification(state, message) {
   }
 }
 
+const DEFAULT_TURN_TIMEOUT_MS = 45 * 60 * 1000;
+
+// Wall-clock ceiling for a single app-server turn. A turn that never reports completion would
+// hang the worker forever — and because the worker pid stays alive, the liveness-reconcile in
+// job-control cannot catch it (that only reaps DEAD workers). CODEX_TURN_TIMEOUT_MS overrides; 0 disables.
+function resolveTurnTimeoutMs(options = {}) {
+  const raw = options.turnTimeoutMs ?? process.env.CODEX_TURN_TIMEOUT_MS;
+  if (raw === undefined || raw === null || raw === "") {
+    return DEFAULT_TURN_TIMEOUT_MS;
+  }
+  const value = Number(raw);
+  if (!Number.isFinite(value) || value < 0) {
+    return DEFAULT_TURN_TIMEOUT_MS;
+  }
+  return value;
+}
+
 async function captureTurn(client, threadId, startRequest, options = {}) {
   const state = createTurnCaptureState(threadId, options);
   const previousHandler = client.notificationHandler;
@@ -597,6 +614,40 @@ async function captureTurn(client, threadId, startRequest, options = {}) {
       completeTurn(state, response.turn);
     }
 
+    const turnTimeoutMs = resolveTurnTimeoutMs(options);
+    if (turnTimeoutMs > 0) {
+      let timeoutTimer = null;
+      const timeout = new Promise((_, reject) => {
+        timeoutTimer = setTimeout(() => {
+          timeoutTimer = null;
+          reject(
+            new Error(
+              `Codex turn exceeded ${Math.round(turnTimeoutMs / 1000)}s without completing — interrupted and marked failed (set CODEX_TURN_TIMEOUT_MS=0 to disable).`
+            )
+          );
+        }, turnTimeoutMs);
+        timeoutTimer.unref?.();
+      });
+      try {
+        return await Promise.race([state.completion, timeout]);
+      } catch (error) {
+        if (state.turnId) {
+          // Best-effort: tell the app server to stop the stuck turn before we surface the failure.
+          try {
+            await client.request("turn/interrupt", {
+              threadId: state.threadId,
+              turnId: state.turnId
+            });
+          } catch {}
+        }
+        throw error;
+      } finally {
+        if (timeoutTimer) {
+          clearTimeout(timeoutTimer);
+        }
+      }
+    }
+
     return await state.completion;
   } finally {
     clearCompletionTimer(state);

From b77d6dc8175ec6c5bf295610043a8882c5122449 Mon Sep 17 00:00:00 2001
From: Mike Edwards <njrun1804@gmail.com>
Date: Sun, 14 Jun 2026 14:12:23 -0400
Subject: [PATCH 3/5] =?UTF-8?q?feat(codex):=20design-critique=20+=20house?=
 =?UTF-8?q?=20review=20overlay=20=E2=80=94=20the=20two=20second-family=20j?=
 =?UTF-8?q?obs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex is used as a second model family for exactly two jobs; make both first-class and "like us":

(a) Design critique — /codex:critique + codex-critique subagent + handleCritique run the new
    design-critique prompt READ-ONLY, with a hard mandate to ground every load-bearing claim in
    the code (file:line) AND the LIVE database (cite the query + result) — not prose-only.
(b) Code review — a prompt-overlay seam (loadPromptTemplate appends prompts/overlays/<name>.md
    inside <house_conventions>) carries our adversarial-review rubric ON TOP of the upstream
    prompt, so "git pull upstream" never conflicts on the scaffold. The rubric: contradict don't
    echo, correctness-first, file:line, demand check.sh proof, flag scope creep.

Also makes the brittle exact-list command test tolerant of fork additions, and adds
critique-and-overlays.test.mjs to lock in both jobs + the timeout. 96/96 green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 plugins/codex/agents/codex-critique.md        | 33 +++++++
 plugins/codex/commands/critique.md            | 23 +++++
 plugins/codex/prompts/design-critique.md      | 49 ++++++++++
 .../prompts/overlays/adversarial-review.md    | 16 ++++
 plugins/codex/scripts/codex-companion.mjs     | 64 +++++++++++++
 plugins/codex/scripts/lib/prompts.mjs         | 14 ++-
 tests/commands.test.mjs                       | 11 ++-
 tests/critique-and-overlays.test.mjs          | 96 +++++++++++++++++++
 8 files changed, 303 insertions(+), 3 deletions(-)
 create mode 100644 plugins/codex/agents/codex-critique.md
 create mode 100644 plugins/codex/commands/critique.md
 create mode 100644 plugins/codex/prompts/design-critique.md
 create mode 100644 plugins/codex/prompts/overlays/adversarial-review.md
 create mode 100644 tests/critique-and-overlays.test.mjs

diff --git a/plugins/codex/agents/codex-critique.md b/plugins/codex/agents/codex-critique.md
new file mode 100644
index 00000000..d9bbd8ed
--- /dev/null
+++ b/plugins/codex/agents/codex-critique.md
@@ -0,0 +1,33 @@
+---
+name: codex-critique
+description: Use to hand a DESIGN (not a finished diff) to Codex for an independent, code- and data-grounded critique. Codex reads the repo AND queries the live database, then reports where the design is wrong, unjustified, or unsupported by what is actually there.
+model: sonnet
+tools: Bash
+skills:
+  - codex-cli-runtime
+---
+
+You are a thin forwarding wrapper around the Codex companion `critique` runtime.
+
+Your only job is to forward the user's design-critique request to the Codex companion script. Do not do anything else.
+
+Selection guidance:
+
+- Use this subagent when Claude has produced or endorsed a DESIGN and wants a second model family to attack it before it gets built — not for reviewing a finished diff (that is `adversarial-review`) and not for running a task (that is `codex-rescue`).
+- The whole point is independence: Codex reads the code and queries the database itself, so do not pre-digest the design for it.
+
+Forwarding rules:
+
+- Use exactly one `Bash` call to invoke `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" critique ...`.
+- The `critique` run is READ-ONLY by design (a critique never edits). Never add `--write`.
+- If the user did not explicitly choose `--background` or `--wait`, prefer background — a design critique reads the whole repo and queries the database, and usually runs long.
+- Pass the user's design reference through unchanged: the design text itself as the positional argument, or `--prompt-file <path>` when they point at a design-doc file.
+- If the user supplies `--focus "<text>"`, forward it as `--focus "<text>"` — it steers what the critique scrutinizes.
+- Treat `--effort <value>` and `--model <value>` as runtime controls and forward them; do not include them in the design text. If the user asks for `spark`, map it to `--model gpt-5.3-codex-spark`.
+- Do not reshape, summarize, or pre-analyze the design yourself. Do not inspect the repository, read files, grep, monitor progress, poll status, fetch results, or do any follow-up work of your own. The design-critique prompt and Codex do the analysis.
+- Return the stdout of the `codex-companion` command exactly as-is.
+- If the Bash call fails or Codex cannot be invoked, return nothing.
+
+Response style:
+
+- Do not add commentary before or after the forwarded `codex-companion` output.
diff --git a/plugins/codex/commands/critique.md b/plugins/codex/commands/critique.md
new file mode 100644
index 00000000..3e5ce6fc
--- /dev/null
+++ b/plugins/codex/commands/critique.md
@@ -0,0 +1,23 @@
+---
+description: Hand a DESIGN to Codex (a second model family) for an independent critique with full read access to the code AND the live database
+argument-hint: "[--background|--wait] [--model <model|spark>] [--effort <none|minimal|low|medium|high|xhigh>] [--focus \"what to scrutinize\"] <design-doc path, or the design to critique>"
+allowed-tools: Bash(node:*), Agent
+---
+
+Invoke the `codex:codex-critique` subagent via the `Agent` tool (`subagent_type: "codex:codex-critique"`), forwarding the raw user request as the prompt.
+`codex:codex-critique` is a subagent, not a skill — do not call `Skill(codex:codex-critique)` (no such skill) or `Skill(codex:critique)` (that re-enters this command and hangs the session). The command runs inline so the `Agent` tool stays in scope; forked general-purpose subagents do not expose it.
+The final user-visible response must be Codex's output verbatim.
+
+This is for critiquing a DESIGN before it is built — Codex reads the codebase and queries the live database to check the design's claims. For reviewing finished work, use `/codex:adversarial-review`; for delegating a task, use `/codex:rescue`.
+
+Raw user request:
+$ARGUMENTS
+
+Execution mode:
+
+- If the request includes `--background`, run the `codex:codex-critique` subagent in the background.
+- If the request includes `--wait`, run the `codex:codex-critique` subagent in the foreground.
+- If neither flag is present, default to background — a design critique reads the whole repo and queries the database, and usually runs long.
+- `--background` and `--wait` are execution flags for Claude Code. Do not forward them to `critique`, and do not treat them as part of the design text.
+- `--model`, `--effort`, and `--focus` are runtime-selection flags. Preserve them for the forwarded `critique` call, but do not treat them as part of the design text.
+- If the user did not supply a design, ask what design Codex should critique (a design-doc path or the design itself).
diff --git a/plugins/codex/prompts/design-critique.md b/plugins/codex/prompts/design-critique.md
new file mode 100644
index 00000000..97e84ebd
--- /dev/null
+++ b/plugins/codex/prompts/design-critique.md
@@ -0,0 +1,49 @@
+<role>
+You are Codex, a SECOND model family, critiquing a DESIGN — not reviewing a finished diff.
+A different model (Claude) produced or endorsed this design. Your entire value is independence:
+echoing its conclusions is worthless. Find where the design is wrong, unjustified, or unsupported by
+what is actually in the codebase and the database.
+</role>
+
+<what_you_have>
+You have full read access to the repository AND, where a database / MCP tool is available, the LIVE data.
+USE BOTH. A design critique that only reasons about the prose is half a critique.
+- Read the code the design touches and confirm the design's claims about what already exists.
+- Query the database (e.g. `scripts/sql.sh "SELECT ..."` or the repo's MCP) to check the design's
+  assumptions about the DATA: counts, the n, nulls, distributions, whether a claimed relationship or
+  baseline is even present. A design that asserts something about the data is only as good as the query
+  that confirms it. Cite the query and its result in your finding.
+</what_you_have>
+
+<method>
+For each load-bearing claim or decision in the design:
+1. State the claim in one line.
+2. Verify it against the code (`file:line`) and/or the data (the query + its result).
+3. If it holds, say so in one line and move on. If it does NOT hold, that is your highest-value finding —
+   lead with it.
+Hunt specifically for: assumptions the data does not support; a simpler design the existing code already
+enables; hidden coupling or a confound the design ignores; an estimand / metric that means something
+different than the design assumes; scope the design under- or over-reaches; a failure mode at the
+empty-state, the n=small case, or under real data skew.
+</method>
+
+<stance>
+Default to skepticism. Do not credit good intent, partial coverage, or likely follow-up. Ground every
+point in a row, a query result, or a `file:line` — never a vibe. Prefer one well-evidenced structural
+objection over many shallow ones. If, after you actually checked the code AND the data, the design holds,
+say so plainly and state exactly what you verified (the files you read, the queries you ran).
+</stance>
+
+<output>
+Lead with a one-line verdict: does this design hold up, or not, and the single biggest reason. Then the
+findings, most-load-bearing first, each with: the claim, what you checked (file:line and/or query+result),
+and what the design should change. Be terse. Surface what is missing, not just what is wrong.
+</output>
+
+<design>
+{{DESIGN_INPUT}}
+</design>
+
+<focus>
+{{USER_FOCUS}}
+</focus>
diff --git a/plugins/codex/prompts/overlays/adversarial-review.md b/plugins/codex/prompts/overlays/adversarial-review.md
new file mode 100644
index 00000000..8ab171f2
--- /dev/null
+++ b/plugins/codex/prompts/overlays/adversarial-review.md
@@ -0,0 +1,16 @@
+House review rubric — apply ON TOP of the adversarial stance above. This is a second-family
+code review of completed work; the author and the first model (Claude) already believe it is right.
+
+- You are a SECOND model family. Your value is to CONTRADICT, not echo. Hunt for what the author and
+  the first model would rubber-stamp. If you only agree, you added nothing — find the thing they missed.
+- Order findings by what bites: correctness and regressions first, then reuse / simplification /
+  efficiency / altitude. Style or naming only when it causes a real bug.
+- Every finding cites `file:line` and is defensible from the actual code or a tool output. Never invent
+  a path, a line, or runtime behavior you cannot support.
+- Severity-rank, and prefer one strong, real finding over a pile of weak ones. If it is genuinely safe
+  after you tried to break it, say so plainly and return no findings — do not manufacture filler.
+- Verification expectation: a claim of "works / passing / fixed" must be backed by command output. Flag
+  any change that edits code but does not run the repo's own check (e.g. `scripts/check.sh`) or whose
+  tests do not actually exercise the changed behavior.
+- Scope discipline: changes beyond what the task asked for (bonus refactors, "while I'm here" edits) are
+  a finding, not a courtesy.
diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs
index 35222fd5..609d74ef 100644
--- a/plugins/codex/scripts/codex-companion.mjs
+++ b/plugins/codex/scripts/codex-companion.mjs
@@ -792,6 +792,67 @@ async function handleTask(argv) {
   );
 }
 
+// Design critique (the second-family, code+DB-grounded critique of a DESIGN — not a diff).
+// Same task engine as handleTask, but the prompt wraps the user's design in the design-critique
+// template and the run is READ-ONLY (a critique never edits). The design reference is read the
+// same way a task prompt is (positional text or --prompt-file).
+async function handleCritique(argv) {
+  const { options, positionals } = parseCommandInput(argv, {
+    valueOptions: ["model", "effort", "cwd", "prompt-file", "focus"],
+    booleanOptions: ["json", "background"],
+    aliasMap: {
+      m: "model"
+    }
+  });
+
+  const cwd = resolveCommandCwd(options);
+  const workspaceRoot = resolveCommandWorkspace(options);
+  const model = normalizeRequestedModel(options.model);
+  const effort = normalizeReasoningEffort(options.effort);
+  const designInput = readTaskPrompt(cwd, options, positionals);
+  requireTaskRequest(designInput, false);
+
+  const prompt = interpolateTemplate(loadPromptTemplate(ROOT_DIR, "design-critique"), {
+    DESIGN_INPUT: designInput,
+    USER_FOCUS: typeof options.focus === "string" ? options.focus : ""
+  });
+  const taskMetadata = buildTaskRunMetadata({ prompt, resumeLast: false });
+
+  if (options.background) {
+    ensureCodexAvailable(cwd);
+    const job = buildTaskJob(workspaceRoot, taskMetadata, false);
+    const request = buildTaskRequest({
+      cwd,
+      model,
+      effort,
+      prompt,
+      write: false,
+      resumeLast: false,
+      jobId: job.id
+    });
+    const { payload } = enqueueBackgroundTask(cwd, job, request);
+    outputCommandResult(payload, renderQueuedTaskLaunch(payload), options.json);
+    return;
+  }
+
+  const job = buildTaskJob(workspaceRoot, taskMetadata, false);
+  await runForegroundCommand(
+    job,
+    (progress) =>
+      executeTaskRun({
+        cwd,
+        model,
+        effort,
+        prompt,
+        write: false,
+        resumeLast: false,
+        jobId: job.id,
+        onProgress: progress
+      }),
+    { json: options.json }
+  );
+}
+
 async function handleTaskWorker(argv) {
   const { options } = parseCommandInput(argv, {
     valueOptions: ["cwd", "job-id"]
@@ -1000,6 +1061,9 @@ async function main() {
     case "task":
       await handleTask(argv);
       break;
+    case "critique":
+      await handleCritique(argv);
+      break;
     case "task-worker":
       await handleTaskWorker(argv);
       break;
diff --git a/plugins/codex/scripts/lib/prompts.mjs b/plugins/codex/scripts/lib/prompts.mjs
index 20108150..4a9026bd 100644
--- a/plugins/codex/scripts/lib/prompts.mjs
+++ b/plugins/codex/scripts/lib/prompts.mjs
@@ -3,7 +3,19 @@ import path from "node:path";
 
 export function loadPromptTemplate(rootDir, name) {
   const promptPath = path.join(rootDir, "prompts", `${name}.md`);
-  return fs.readFileSync(promptPath, "utf8");
+  let template = fs.readFileSync(promptPath, "utf8");
+  // House overlay (ours, not upstream's): if prompts/overlays/<name>.md exists, append it so
+  // our review/critique conventions ride ON TOP of the upstream prompt without editing it —
+  // `git pull upstream` never conflicts on the prompt scaffold; only this loader is a small,
+  // stable fork change. The overlay carries no {{TOKENS}}, so interpolation leaves it intact.
+  const overlayPath = path.join(rootDir, "prompts", "overlays", `${name}.md`);
+  if (fs.existsSync(overlayPath)) {
+    const overlay = fs.readFileSync(overlayPath, "utf8").trim();
+    if (overlay) {
+      template += `\n\n<house_conventions>\n${overlay}\n</house_conventions>\n`;
+    }
+  }
+  return template;
 }
 
 export function interpolateTemplate(template, variables) {
diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs
index 3724ffa4..653f484a 100644
--- a/tests/commands.test.mjs
+++ b/tests/commands.test.mjs
@@ -72,7 +72,12 @@ test("adversarial review command uses AskUserQuestion and background Bash while
 
 test("continue is not exposed as a user-facing command", () => {
   const commandFiles = fs.readdirSync(path.join(PLUGIN_ROOT, "commands")).sort();
-  assert.deepEqual(commandFiles, [
+  // The real invariant: `continue` is folded into `rescue`, never its own command.
+  assert.equal(commandFiles.includes("continue.md"), false);
+  // The upstream command set must all still be present. This fork may ADD commands
+  // (e.g. critique.md) on top — that must not break this guard, and the guard must
+  // not have to be re-edited on every upstream sync.
+  for (const expected of [
     "adversarial-review.md",
     "cancel.md",
     "rescue.md",
@@ -80,7 +85,9 @@ test("continue is not exposed as a user-facing command", () => {
     "review.md",
     "setup.md",
     "status.md"
-  ]);
+  ]) {
+    assert.equal(commandFiles.includes(expected), true, `missing command: ${expected}`);
+  }
 });
 
 test("rescue command absorbs continue semantics", () => {
diff --git a/tests/critique-and-overlays.test.mjs b/tests/critique-and-overlays.test.mjs
new file mode 100644
index 00000000..e5ae295a
--- /dev/null
+++ b/tests/critique-and-overlays.test.mjs
@@ -0,0 +1,96 @@
+import fs from "node:fs";
+import path from "node:path";
+import test from "node:test";
+import assert from "node:assert/strict";
+import { fileURLToPath } from "node:url";
+
+import { loadPromptTemplate } from "../plugins/codex/scripts/lib/prompts.mjs";
+
+const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
+const PLUGIN_ROOT = path.join(ROOT, "plugins", "codex");
+
+function read(relativePath) {
+  return fs.readFileSync(path.join(PLUGIN_ROOT, relativePath), "utf8");
+}
+
+// Job (a): design critique. Codex is a second model family used to critique a DESIGN with full
+// read access to the code AND the live database — before the design is built.
+test("critique command forwards to the codex-critique subagent and stays read-only", () => {
+  const command = read("commands/critique.md");
+  assert.match(command, /subagent_type: "codex:codex-critique"/);
+  assert.match(command, /do not call `Skill\(codex:codex-critique\)`/i);
+  assert.match(command, /allowed-tools:\s*Bash\(node:\*\),\s*Agent/);
+  assert.match(command, /critiquing a DESIGN before it is built/i);
+  assert.match(command, /default to background/i);
+  assert.match(command, /--focus/);
+  assert.match(command, /The final user-visible response must be Codex's output verbatim/i);
+});
+
+test("codex-critique agent is a thin forwarder to `critique` that never writes", () => {
+  const agent = read("agents/codex-critique.md");
+  assert.match(agent, /thin forwarding wrapper/i);
+  assert.match(agent, /codex-companion\.mjs" critique \.\.\./);
+  assert.match(agent, /READ-ONLY by design/i);
+  assert.match(agent, /Never add `--write`/i);
+  // It must NOT do its own analysis — independence is the whole point.
+  assert.match(agent, /Do not reshape, summarize, or pre-analyze the design yourself/i);
+  assert.match(agent, /Return the stdout of the `codex-companion` command exactly as-is/i);
+  assert.match(agent, /codex-cli-runtime/);
+});
+
+test("the design-critique prompt mandates grounding in BOTH the code and the live database", () => {
+  const prompt = read("prompts/design-critique.md");
+  assert.match(prompt, /SECOND model family/);
+  assert.match(prompt, /echoing its conclusions is worthless/i);
+  // The database-grounding mandate is the thing that makes this critique better than prose-only.
+  assert.match(prompt, /LIVE data/i);
+  assert.match(prompt, /Query the database/i);
+  assert.match(prompt, /Cite the query and its result/i);
+  assert.match(prompt, /file:line/);
+  assert.match(prompt, /\{\{DESIGN_INPUT\}\}/);
+  assert.match(prompt, /\{\{USER_FOCUS\}\}/);
+});
+
+test("the companion exposes a read-only `critique` entrypoint", () => {
+  const companion = read("scripts/codex-companion.mjs");
+  assert.match(companion, /case "critique":/);
+  assert.match(companion, /async function handleCritique\(argv\)/);
+  assert.match(companion, /loadPromptTemplate\(ROOT_DIR, "design-critique"\)/);
+  // handleCritique must run write:false (a critique never edits).
+  assert.doesNotMatch(companion, /handleCritique[\s\S]*?write:\s*true/);
+});
+
+// The overlay seam: house conventions ride ON TOP of upstream prompts so `git pull upstream`
+// never conflicts on the prompt scaffold. This is the durable "make it like us" mechanism.
+test("the prompt overlay seam appends house conventions only when an overlay exists", () => {
+  // adversarial-review HAS an overlay -> house_conventions block is appended.
+  const reviewed = loadPromptTemplate(PLUGIN_ROOT, "adversarial-review");
+  assert.match(reviewed, /<house_conventions>/);
+  assert.match(reviewed, /SECOND model family/);
+  assert.match(reviewed, /CONTRADICT, not echo/i);
+
+  // design-critique has NO overlay -> no house_conventions wrapper is added.
+  const critique = loadPromptTemplate(PLUGIN_ROOT, "design-critique");
+  assert.doesNotMatch(critique, /<house_conventions>/);
+});
+
+// Job (b): code review on completed work. The house review rubric is carried by the overlay.
+test("the adversarial-review overlay carries the house code-review rubric", () => {
+  const overlay = read("prompts/overlays/adversarial-review.md");
+  assert.match(overlay, /SECOND model family/);
+  assert.match(overlay, /CONTRADICT, not echo/i);
+  assert.match(overlay, /correctness and regressions first/i);
+  assert.match(overlay, /file:line/);
+  assert.match(overlay, /scripts\/check\.sh/);
+  assert.match(overlay, /Scope discipline/i);
+});
+
+// Robustness: a turn that never completes would hang the worker forever (pid stays alive, so the
+// liveness-reconcile can't catch it). A wall-clock turn timeout interrupts and fails it instead.
+test("captureTurn has a wall-clock turn timeout that interrupts the stuck turn", () => {
+  const codex = read("scripts/lib/codex.mjs");
+  assert.match(codex, /CODEX_TURN_TIMEOUT_MS/);
+  assert.match(codex, /function resolveTurnTimeoutMs/);
+  assert.match(codex, /Promise\.race\(\[state\.completion, timeout\]\)/);
+  assert.match(codex, /turn\/interrupt/);
+});

From 8fe24679520f46a6e8b7a7dc4eb6add349885444 Mon Sep 17 00:00:00 2001
From: Mike Edwards <njrun1804@gmail.com>
Date: Sun, 14 Jun 2026 14:39:52 -0400
Subject: [PATCH 4/5] retire(codex): /codex:review folds into
 adversarial-review; document the two-job model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex is a second model family for exactly two jobs — critique a design before building it
(/codex:critique), and adversarially review completed work (/codex:adversarial-review). The
plain /codex:review command was a strict subset of adversarial-review (same engine, same git
targets, just no focus text and no house rubric), so it only created "which review?" confusion.

- Remove the /codex:review command (the user-facing path). The companion `review` engine stays —
  the optional Stop review-gate uses it internally (runtime tests still cover it).
- README: drop the review section + refs, add a /codex:critique section, frame the two-job model
  up front. adversarial-review is now "the one review command".
- Skill routing + result.md follow-ups point at adversarial-review / critique.
- Tests updated to match. 95/95 green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                                     | 49 ++++++++-------
 plugins/codex/commands/adversarial-review.md  |  4 +-
 plugins/codex/commands/result.md              |  2 +-
 plugins/codex/commands/review.md              | 61 -------------------
 .../codex/skills/gpt-5-4-prompting/SKILL.md   |  2 +-
 tests/commands.test.mjs                       | 36 +----------
 6 files changed, 31 insertions(+), 123 deletions(-)
 delete mode 100644 plugins/codex/commands/review.md

diff --git a/README.md b/README.md
index 458c39fb..1aa9ac90 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Codex plugin for Claude Code
 
-Use Codex from inside Claude Code for code reviews or to delegate tasks to Codex.
+Use Codex from inside Claude Code as a second model family — to critique a design before you build it, adversarially review completed work, or delegate a task to Codex.
 
 This plugin is for Claude Code users who want an easy way to start using Codex from the workflow
 they already have.
@@ -9,9 +9,11 @@ they already have.
 
 ## What You Get
 
-- `/codex:review` for a normal read-only Codex review
-- `/codex:adversarial-review` for a steerable challenge review
-- `/codex:rescue`, `/codex:status`, `/codex:result`, and `/codex:cancel` to delegate work and manage background jobs
+Codex is a **second model family** you reach for in two situations, plus a workhorse for delegation:
+
+- `/codex:critique` — hand it a **design, before you build it**. Codex reads the code *and* queries the live database, then attacks the design where the data doesn't back it up.
+- `/codex:adversarial-review` — an adversarial review of **completed work**. The one review command: it challenges the approach, not just the diff.
+- `/codex:rescue`, `/codex:status`, `/codex:result`, and `/codex:cancel` — delegate a task to Codex and manage background jobs.
 
 ## Requirements
 
@@ -62,41 +64,32 @@ If Codex is installed but not logged in yet, run:
 After install, you should see:
 
 - the slash commands listed below
-- the `codex:codex-rescue` subagent in `/agents`
+- the `codex:codex-rescue` and `codex:codex-critique` subagents in `/agents`
 
 One simple first run is:
 
 ```bash
-/codex:review --background
+/codex:adversarial-review --background
 /codex:status
 /codex:result
 ```
 
 ## Usage
 
-### `/codex:review`
+### `/codex:critique`
 
-Runs a normal Codex review on your current work. It gives you the same quality of code review as running `/review` inside Codex directly.
+Hands Codex a **design — before you build it** — for an independent, second-family critique. Codex has full read access to the repository *and*, where a database or MCP tool is available, the live data. It verifies each load-bearing claim in the design against the code (`file:line`) and the data (the actual query and its result), and leads with where the design does *not* hold up.
 
-> [!NOTE]
-> Code review especially for multi-file changes might take a while. It's generally recommended to run it in the background.
+Reach for this when Claude has produced or endorsed a design and you want a different model to attack it before it gets built — not to review a finished diff (that's [`/codex:adversarial-review`](#codexadversarial-review)) and not to run a task (that's [`/codex:rescue`](#codexrescue)).
 
-Use it when you want:
-
-- a review of your current uncommitted changes
-- a review of your branch compared to a base branch like `main`
-
-Use `--base <ref>` for branch review. It also supports `--wait` and `--background`. It is not steerable and does not take custom focus text. Use [`/codex:adversarial-review`](#codexadversarial-review) when you want to challenge a specific decision or risk area.
-
-Examples:
+Pass the design inline, or point at a design doc:
 
 ```bash
-/codex:review
-/codex:review --base main
-/codex:review --background
+/codex:critique docs/design/new-cache-layer.md
+/codex:critique --focus "the n is tiny — does the data even support this split?" we plan to ...
 ```
 
-This command is read-only and will not perform any changes. When run in the background you can use [`/codex:status`](#codexstatus) to check on the progress and [`/codex:cancel`](#codexcancel) to cancel the ongoing task.
+It is read-only and runs in the background by default — it reads the whole repo and queries the database, so it usually runs long. Use [`/codex:status`](#codexstatus) and [`/codex:result`](#codexresult) to follow it.
 
 ### `/codex:adversarial-review`
 
@@ -104,8 +97,8 @@ Runs a **steerable** review that questions the chosen implementation and design.
 
 It can be used to pressure-test assumptions, tradeoffs, failure modes, and whether a different approach would have been safer or simpler.
 
-It uses the same review target selection as `/codex:review`, including `--base <ref>` for branch review.
-It also supports `--wait` and `--background`. Unlike `/codex:review`, it can take extra focus text after the flags.
+It reviews your current uncommitted changes by default, or your branch against a base with `--base <ref>`.
+It also supports `--wait` and `--background`, and takes extra focus text after the flags to steer what it scrutinizes.
 
 Use it when you want:
 
@@ -223,10 +216,16 @@ When the review gate is enabled, the plugin uses a `Stop` hook to run a targeted
 
 ## Typical Flows
 
+### Critique A Design Before Building
+
+```bash
+/codex:critique docs/design/new-cache-layer.md
+```
+
 ### Review Before Shipping
 
 ```bash
-/codex:review
+/codex:adversarial-review
 ```
 
 ### Hand A Problem To Codex
diff --git a/plugins/codex/commands/adversarial-review.md b/plugins/codex/commands/adversarial-review.md
index da440ab4..7ac4fdc1 100644
--- a/plugins/codex/commands/adversarial-review.md
+++ b/plugins/codex/commands/adversarial-review.md
@@ -39,10 +39,10 @@ Argument handling:
 - Do not strip `--wait` or `--background` yourself.
 - Do not weaken the adversarial framing or rewrite the user's focus text.
 - The companion script parses `--wait` and `--background`, but Claude Code's `Bash(..., run_in_background: true)` is what actually detaches the run.
-- `/codex:adversarial-review` uses the same review target selection as `/codex:review`.
+- `/codex:adversarial-review` is the one review command for completed work.
 - It supports working-tree review, branch review, and `--base <ref>`.
 - It does not support `--scope staged` or `--scope unstaged`.
-- Unlike `/codex:review`, it can still take extra focus text after the flags.
+- It takes extra focus text after the flags to steer what it scrutinizes.
 
 Foreground flow:
 - Run:
diff --git a/plugins/codex/commands/result.md b/plugins/codex/commands/result.md
index 3abc2d93..75013d4b 100644
--- a/plugins/codex/commands/result.md
+++ b/plugins/codex/commands/result.md
@@ -12,4 +12,4 @@ Present the full command output to the user. Do not summarize or condense it. Pr
 - The complete result payload, including verdict, summary, findings, details, artifacts, and next steps
 - File paths and line numbers exactly as reported
 - Any error messages or parse errors
-- Follow-up commands such as `/codex:status <id>` and `/codex:review`
+- Follow-up commands such as `/codex:status <id>` and `/codex:adversarial-review`
diff --git a/plugins/codex/commands/review.md b/plugins/codex/commands/review.md
deleted file mode 100644
index fb70a487..00000000
--- a/plugins/codex/commands/review.md
+++ /dev/null
@@ -1,61 +0,0 @@
----
-description: Run a Codex code review against local git state
-argument-hint: '[--wait|--background] [--base <ref>] [--scope auto|working-tree|branch]'
-disable-model-invocation: true
-allowed-tools: Read, Glob, Grep, Bash(node:*), Bash(git:*), AskUserQuestion
----
-
-Run a Codex review through the shared built-in reviewer.
-
-Raw slash-command arguments:
-`$ARGUMENTS`
-
-Core constraint:
-- This command is review-only.
-- Do not fix issues, apply patches, or suggest that you are about to make changes.
-- Your only job is to run the review and return Codex's output verbatim to the user.
-
-Execution mode rules:
-- If the raw arguments include `--wait`, do not ask. Run the review in the foreground.
-- If the raw arguments include `--background`, do not ask. Run the review in a Claude background task.
-- Otherwise, estimate the review size before asking:
-  - For working-tree review, start with `git status --short --untracked-files=all`.
-  - For working-tree review, also inspect both `git diff --shortstat --cached` and `git diff --shortstat`.
-  - For base-branch review, use `git diff --shortstat <base>...HEAD`.
-  - Treat untracked files or directories as reviewable work even when `git diff --shortstat` is empty.
-  - Only conclude there is nothing to review when the relevant working-tree status is empty or the explicit branch diff is empty.
-  - Recommend waiting only when the review is clearly tiny, roughly 1-2 files total and no sign of a broader directory-sized change.
-  - In every other case, including unclear size, recommend background.
-  - When in doubt, run the review instead of declaring that there is nothing to review.
-- Then use `AskUserQuestion` exactly once with two options, putting the recommended option first and suffixing its label with `(Recommended)`:
-  - `Wait for results`
-  - `Run in background`
-
-Argument handling:
-- Preserve the user's arguments exactly.
-- Do not strip `--wait` or `--background` yourself.
-- Do not add extra review instructions or rewrite the user's intent.
-- The companion script parses `--wait` and `--background`, but Claude Code's `Bash(..., run_in_background: true)` is what actually detaches the run.
-- `/codex:review` is native-review only. It does not support staged-only review, unstaged-only review, or extra focus text.
-- If the user needs custom review instructions or more adversarial framing, they should use `/codex:adversarial-review`.
-
-Foreground flow:
-- Run:
-```bash
-node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" review "$ARGUMENTS"
-```
-- Return the command stdout verbatim, exactly as-is.
-- Do not paraphrase, summarize, or add commentary before or after it.
-- Do not fix any issues mentioned in the review output.
-
-Background flow:
-- Launch the review with `Bash` in the background:
-```typescript
-Bash({
-  command: `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" review "$ARGUMENTS"`,
-  description: "Codex review",
-  run_in_background: true
-})
-```
-- Do not call `BashOutput` or wait for completion in this turn.
-- After launching the command, tell the user: "Codex review started in the background. Check `/codex:status` for progress."
diff --git a/plugins/codex/skills/gpt-5-4-prompting/SKILL.md b/plugins/codex/skills/gpt-5-4-prompting/SKILL.md
index 16669d92..9d17c213 100644
--- a/plugins/codex/skills/gpt-5-4-prompting/SKILL.md
+++ b/plugins/codex/skills/gpt-5-4-prompting/SKILL.md
@@ -31,7 +31,7 @@ When to add blocks:
 - Write-capable tasks: add `action_safety` so Codex stays narrow and avoids unrelated refactors.
 
 How to choose prompt shape:
-- Use built-in `review` or `adversarial-review` commands when the job is reviewing local git changes. Those prompts already carry the review contract.
+- Use the `adversarial-review` command when the job is reviewing completed local git changes, and `critique` when the job is attacking a design before it is built. Those prompts already carry the review/critique contract.
 - Use `task` when the task is diagnosis, planning, research, or implementation and you need to control the prompt more directly.
 - Use `task --resume-last` for follow-up instructions on the same Codex thread. Send only the delta instruction instead of restating the whole prompt unless the direction changed materially.
 
diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs
index 653f484a..f7162be3 100644
--- a/tests/commands.test.mjs
+++ b/tests/commands.test.mjs
@@ -11,34 +11,6 @@ function read(relativePath) {
   return fs.readFileSync(path.join(PLUGIN_ROOT, relativePath), "utf8");
 }
 
-test("review command uses AskUserQuestion and background Bash while staying review-only", () => {
-  const source = read("commands/review.md");
-  assert.match(source, /AskUserQuestion/);
-  assert.match(source, /\bBash\(/);
-  assert.match(source, /Do not fix issues/i);
-  assert.match(source, /review-only/i);
-  assert.match(source, /return Codex's output verbatim to the user/i);
-  assert.match(source, /```bash/);
-  assert.match(source, /```typescript/);
-  assert.match(source, /review "\$ARGUMENTS"/);
-  assert.match(source, /\[--scope auto\|working-tree\|branch\]/);
-  assert.match(source, /run_in_background:\s*true/);
-  assert.match(source, /command:\s*`node "\$\{CLAUDE_PLUGIN_ROOT\}\/scripts\/codex-companion\.mjs" review "\$ARGUMENTS"`/);
-  assert.match(source, /description:\s*"Codex review"/);
-  assert.match(source, /Do not call `BashOutput`/);
-  assert.match(source, /Return the command stdout verbatim, exactly as-is/i);
-  assert.match(source, /git status --short --untracked-files=all/);
-  assert.match(source, /git diff --shortstat/);
-  assert.match(source, /Treat untracked files or directories as reviewable work/i);
-  assert.match(source, /Recommend waiting only when the review is clearly tiny, roughly 1-2 files total/i);
-  assert.match(source, /In every other case, including unclear size, recommend background/i);
-  assert.match(source, /The companion script parses `--wait` and `--background`/i);
-  assert.match(source, /Claude Code's `Bash\(..., run_in_background: true\)` is what actually detaches the run/i);
-  assert.match(source, /When in doubt, run the review/i);
-  assert.match(source, /\(Recommended\)/);
-  assert.match(source, /does not support staged-only review, unstaged-only review, or extra focus text/i);
-});
-
 test("adversarial review command uses AskUserQuestion and background Bash while staying review-only", () => {
   const source = read("commands/adversarial-review.md");
   assert.match(source, /AskUserQuestion/);
@@ -64,10 +36,10 @@ test("adversarial review command uses AskUserQuestion and background Bash while
   assert.match(source, /Claude Code's `Bash\(..., run_in_background: true\)` is what actually detaches the run/i);
   assert.match(source, /When in doubt, run the review/i);
   assert.match(source, /\(Recommended\)/);
-  assert.match(source, /uses the same review target selection as `\/codex:review`/i);
+  assert.match(source, /is the one review command for completed work/i);
   assert.match(source, /supports working-tree review, branch review, and `--base <ref>`/i);
   assert.match(source, /does not support `--scope staged` or `--scope unstaged`/i);
-  assert.match(source, /can still take extra focus text after the flags/i);
+  assert.match(source, /takes extra focus text after the flags/i);
 });
 
 test("continue is not exposed as a user-facing command", () => {
@@ -82,7 +54,6 @@ test("continue is not exposed as a user-facing command", () => {
     "cancel.md",
     "rescue.md",
     "result.md",
-    "review.md",
     "setup.md",
     "status.md"
   ]) {
@@ -165,9 +136,8 @@ test("rescue command absorbs continue semantics", () => {
   assert.match(readme, /`spark`, the plugin maps that to `gpt-5\.3-codex-spark`/i);
   assert.match(readme, /continue a previous Codex task/i);
   assert.match(readme, /### `\/codex:setup`/);
-  assert.match(readme, /### `\/codex:review`/);
+  assert.match(readme, /### `\/codex:critique`/);
   assert.match(readme, /### `\/codex:adversarial-review`/);
-  assert.match(readme, /uses the same review target selection as `\/codex:review`/i);
   assert.match(readme, /--base main challenge whether this was the right caching and retry design/);
   assert.match(readme, /### `\/codex:rescue`/);
   assert.match(readme, /### `\/codex:status`/);

From 332f95735f9513e9f4959c1acb6acf60ee69903f Mon Sep 17 00:00:00 2001
From: Mike Edwards <njrun1804@gmail.com>
Date: Sun, 14 Jun 2026 16:41:47 -0400
Subject: [PATCH 5/5] fix(critique): run /codex:critique in foreground; stop
 detaching into an orphaned job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

/codex:critique previously defaulted to --background, which detached the
review into a separate Codex job and returned an instant stub — the
subagent finished with nothing and no real completion was produced.

- agent: always foreground; strip --background/--wait (Claude-side dispatch
  controls); set max Bash timeout so long critiques are not cut short
- handleCritique: parse+ignore --background/--wait (matches the review
  commands); remove the detach path; also fixes --wait leaking into design text
- README: returns the critique directly; drop stale status/result polling note
- bump 1.0.4 -> 1.0.5 + CHANGELOG so the plugin cache refreshes on update

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                                 |  2 +-
 plugins/codex/.claude-plugin/plugin.json  |  2 +-
 plugins/codex/CHANGELOG.md                |  6 ++++++
 plugins/codex/agents/codex-critique.md    |  4 +++-
 plugins/codex/scripts/codex-companion.mjs | 25 +++++++----------------
 5 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 1aa9ac90..1a1124e0 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ Pass the design inline, or point at a design doc:
 /codex:critique --focus "the n is tiny — does the data even support this split?" we plan to ...
 ```
 
-It is read-only and runs in the background by default — it reads the whole repo and queries the database, so it usually runs long. Use [`/codex:status`](#codexstatus) and [`/codex:result`](#codexresult) to follow it.
+It is read-only and returns the critique directly. Because it reads the whole repo and queries the database, it usually runs long, so by default it runs as a background subagent and notifies you when the critique is ready; add `--wait` to run it inline instead.
 
 ### `/codex:adversarial-review`
 
diff --git a/plugins/codex/.claude-plugin/plugin.json b/plugins/codex/.claude-plugin/plugin.json
index da262028..168b342d 100644
--- a/plugins/codex/.claude-plugin/plugin.json
+++ b/plugins/codex/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "codex",
-  "version": "1.0.4",
+  "version": "1.0.5",
   "description": "Use Codex from Claude Code to review code or delegate tasks.",
   "author": {
     "name": "OpenAI"
diff --git a/plugins/codex/CHANGELOG.md b/plugins/codex/CHANGELOG.md
index d647561b..be52c3df 100644
--- a/plugins/codex/CHANGELOG.md
+++ b/plugins/codex/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 1.0.5
+
+- fix(critique): `/codex:critique` now runs in the foreground and returns the critique directly. It previously detached the work into a background Codex job and returned an instant stub, so the subagent "finished" with nothing and never produced a real completion. Backgrounding is now purely a subagent-dispatch concern.
+- The `codex-critique` agent always runs `critique` in the foreground, strips `--background`/`--wait` (Claude-side dispatch controls), and sets a max Bash timeout so long critiques aren't cut short.
+- `handleCritique` parses but ignores `--background`/`--wait` (matching the review commands), removing the detach path and fixing `--wait` leaking into the design text.
+
 ## 1.0.0
 
 - Initial version of the Codex plugin for Claude Code
diff --git a/plugins/codex/agents/codex-critique.md b/plugins/codex/agents/codex-critique.md
index d9bbd8ed..d1032d9c 100644
--- a/plugins/codex/agents/codex-critique.md
+++ b/plugins/codex/agents/codex-critique.md
@@ -19,8 +19,10 @@ Selection guidance:
 Forwarding rules:
 
 - Use exactly one `Bash` call to invoke `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" critique ...`.
+- A critique reads the whole repo and queries the database, so it usually runs long. Set the `Bash` call's `timeout` to `600000` (the maximum) so the run is not cut short.
 - The `critique` run is READ-ONLY by design (a critique never edits). Never add `--write`.
-- If the user did not explicitly choose `--background` or `--wait`, prefer background — a design critique reads the whole repo and queries the database, and usually runs long.
+- Always run `critique` in the foreground so the actual critique comes back as this subagent's result. Never add `--background`. Detaching would return an instant stub and orphan the real review, and this subagent is forbidden from polling or fetching it.
+- `--background` and `--wait` are Claude-side dispatch controls, not `critique` flags. If the request contains either, strip it and do not forward it — backgrounding is the dispatcher's job. The subagent staying alive until Codex finishes is what makes its completion (and the returned critique) meaningful.
 - Pass the user's design reference through unchanged: the design text itself as the positional argument, or `--prompt-file <path>` when they point at a design-doc file.
 - If the user supplies `--focus "<text>"`, forward it as `--focus "<text>"` — it steers what the critique scrutinizes.
 - Treat `--effort <value>` and `--model <value>` as runtime controls and forward them; do not include them in the design text. If the user asks for `spark`, map it to `--model gpt-5.3-codex-spark`.
diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs
index 609d74ef..0e3d5ff8 100644
--- a/plugins/codex/scripts/codex-companion.mjs
+++ b/plugins/codex/scripts/codex-companion.mjs
@@ -796,10 +796,16 @@ async function handleTask(argv) {
 // Same task engine as handleTask, but the prompt wraps the user's design in the design-critique
 // template and the run is READ-ONLY (a critique never edits). The design reference is read the
 // same way a task prompt is (positional text or --prompt-file).
+//
+// Unlike `task`, critique always runs in the foreground (like the review commands): it is only
+// ever invoked by the `codex:codex-critique` subagent, so backgrounding it would detach the work
+// into an orphaned job and return an instant stub. Backgrounding a critique is a Claude-side
+// dispatch concern (run the subagent in the background). `--background`/`--wait` are parsed only
+// so they cannot leak into the design text, then ignored.
 async function handleCritique(argv) {
   const { options, positionals } = parseCommandInput(argv, {
     valueOptions: ["model", "effort", "cwd", "prompt-file", "focus"],
-    booleanOptions: ["json", "background"],
+    booleanOptions: ["json", "background", "wait"],
     aliasMap: {
       m: "model"
     }
@@ -818,23 +824,6 @@ async function handleCritique(argv) {
   });
   const taskMetadata = buildTaskRunMetadata({ prompt, resumeLast: false });
 
-  if (options.background) {
-    ensureCodexAvailable(cwd);
-    const job = buildTaskJob(workspaceRoot, taskMetadata, false);
-    const request = buildTaskRequest({
-      cwd,
-      model,
-      effort,
-      prompt,
-      write: false,
-      resumeLast: false,
-      jobId: job.id
-    });
-    const { payload } = enqueueBackgroundTask(cwd, job, request);
-    outputCommandResult(payload, renderQueuedTaskLaunch(payload), options.json);
-    return;
-  }
-
   const job = buildTaskJob(workspaceRoot, taskMetadata, false);
   await runForegroundCommand(
     job,