Skip to content

Commit 9796e04

Browse files
veksen and claude committed
fix: run ANALYZE at startup and use real relpages for stats override
PostgreSQL's planner ignores pg_class.relpages for tables with data — it reads actual disk pages via RelationGetNumberOfBlocks(). The old fromAssumption(reltuples=10000, relpages=1) caused the planner to estimate tuples as actual_pages × 10000 / 1, inflating row estimates by up to 74x (e.g. 740,000 instead of 10,000 for a 10K-row table). Fix: run ANALYZE before reading statistics to populate pg_statistic deterministically, then build a fromStatisticsExport mode that pairs reltuples=10,000 with the real relpages from pg_class. This makes the planner formula produce exactly 10,000 regardless of actual data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 70cb54a commit 9796e04

3 files changed

Lines changed: 353 additions & 11 deletions

File tree

src/build-stats.test.ts

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
import { test, expect, beforeEach, afterEach } from "vitest";
2+
import { buildStatsFromDatabase } from "./build-stats.ts";
3+
import { connectToSource } from "./sql/postgresjs.ts";
4+
import { Connectable } from "./sync/connectable.ts";
5+
import {
6+
IndexOptimizer,
7+
PostgresQueryBuilder,
8+
Statistics,
9+
type Postgres,
10+
} from "@query-doctor/core";
11+
12+
// Name of the throwaway database created fresh for every test.
const TEST_DB = "querydoctor_test";
// Base server URL; the database name is appended as the path segment.
const PG_URL = "postgresql://localhost:5432";

// Connection to the per-test database; assigned in beforeEach.
let db: Postgres;
16+
17+
async function execOnAdmin(sql: string) {
18+
const admin = connectToSource(Connectable.fromString(`${PG_URL}/postgres`));
19+
try {
20+
await admin.exec(sql);
21+
} finally {
22+
await (admin as unknown as { close(): Promise<void> }).close();
23+
}
24+
}
25+
26+
// Fresh database per test: drop any leftover copy, recreate, and connect.
beforeEach(async () => {
  await execOnAdmin(`DROP DATABASE IF EXISTS ${TEST_DB}`);
  await execOnAdmin(`CREATE DATABASE ${TEST_DB}`);
  db = connectToSource(Connectable.fromString(`${PG_URL}/${TEST_DB}`));
});

// Close the test connection first, then drop the database it pointed at.
afterEach(async () => {
  await (db as unknown as { close(): Promise<void> }).close();
  await execOnAdmin(`DROP DATABASE IF EXISTS ${TEST_DB}`);
});
36+
37+
test("sets reltuples to 10,000 for tables below threshold, preserves real relpages", async () => {
38+
await db.exec(`
39+
CREATE TABLE users(id serial PRIMARY KEY, name text, email text);
40+
CREATE INDEX users_email_idx ON users(email);
41+
INSERT INTO users (name, email)
42+
SELECT 'user_' || i, 'user_' || i || '@example.com'
43+
FROM generate_series(1, 1000) AS i;
44+
ANALYZE;
45+
`);
46+
47+
const mode = await buildStatsFromDatabase(db);
48+
49+
expect(mode.kind).toBe("fromStatisticsExport");
50+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
51+
52+
const usersStats = mode.stats.find((s) => s.tableName === "users");
53+
expect(usersStats).toBeDefined();
54+
// 1000 rows is below the 5,000 threshold → bumped to 10,000
55+
expect(usersStats!.reltuples).toBe(10_000);
56+
// 1000 rows should produce more than 1 page
57+
expect(usersStats!.relpages).toBeGreaterThan(1);
58+
59+
// Verify indexes are included
60+
const emailIdx = usersStats!.indexes.find(
61+
(i) => i.indexName === "users_email_idx",
62+
);
63+
expect(emailIdx).toBeDefined();
64+
expect(emailIdx!.relpages).toBeGreaterThanOrEqual(1);
65+
});
66+
67+
test("clamps relpages to at least 1 for empty tables", async () => {
68+
await db.exec(`
69+
CREATE TABLE empty_table(id serial PRIMARY KEY, data text);
70+
ANALYZE;
71+
`);
72+
73+
const mode = await buildStatsFromDatabase(db);
74+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
75+
76+
const stats = mode.stats.find((s) => s.tableName === "empty_table");
77+
expect(stats).toBeDefined();
78+
expect(stats!.reltuples).toBe(10_000);
79+
expect(stats!.relpages).toBeGreaterThanOrEqual(1);
80+
});
81+
82+
test("density stays realistic regardless of actual row count", async () => {
83+
// This is the core bug: with fromAssumption(reltuples=10000, relpages=1),
84+
// PostgreSQL calculates estimated_tuples = actual_pages * 10000 / 1,
85+
// inflating estimates proportionally to actual data volume.
86+
//
87+
// buildStatsFromDatabase fixes this by using the real relpages so that
88+
// estimated_tuples = actual_pages * 10000 / actual_relpages ≈ 10000.
89+
await db.exec(`
90+
CREATE TABLE orders(id serial PRIMARY KEY, user_id int, total numeric);
91+
CREATE INDEX orders_user_id_idx ON orders(user_id);
92+
INSERT INTO orders (user_id, total)
93+
SELECT (random() * 1000)::int, random() * 100
94+
FROM generate_series(1, 10000);
95+
ANALYZE;
96+
`);
97+
98+
const mode = await buildStatsFromDatabase(db);
99+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
100+
101+
const ordersStats = mode.stats.find((s) => s.tableName === "orders");
102+
expect(ordersStats).toBeDefined();
103+
104+
// The key invariant: reltuples / relpages should give a reasonable
105+
// density, NOT the broken 10000/1 = 10000 tuples-per-page.
106+
const density = ordersStats!.reltuples / ordersStats!.relpages;
107+
// Real density for a table with int+int+numeric columns is roughly
108+
// 50-200 tuples per page. The override should preserve this ratio.
109+
expect(density).toBeLessThan(500);
110+
expect(density).toBeGreaterThan(10);
111+
});
112+
113+
test("groups indexes by their parent table", async () => {
114+
await db.exec(`
115+
CREATE TABLE products(id serial PRIMARY KEY, name text, price numeric);
116+
CREATE INDEX products_name_idx ON products(name);
117+
CREATE INDEX products_price_idx ON products(price);
118+
CREATE TABLE categories(id serial PRIMARY KEY, label text);
119+
ANALYZE;
120+
`);
121+
122+
const mode = await buildStatsFromDatabase(db);
123+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
124+
125+
const products = mode.stats.find((s) => s.tableName === "products");
126+
expect(products).toBeDefined();
127+
const indexNames = products!.indexes.map((i) => i.indexName).sort();
128+
expect(indexNames).toContain("products_name_idx");
129+
expect(indexNames).toContain("products_price_idx");
130+
expect(indexNames).toContain("products_pkey");
131+
132+
const categories = mode.stats.find((s) => s.tableName === "categories");
133+
expect(categories).toBeDefined();
134+
const catIndexNames = categories!.indexes.map((i) => i.indexName);
135+
expect(catIndexNames).toContain("categories_pkey");
136+
expect(catIndexNames).not.toContain("products_name_idx");
137+
});
138+
139+
test("planner estimates 10,000 rows with only 1 row seeded", async () => {
140+
// This is the end-to-end proof: seed 1 row, run ANALYZE,
141+
// build stats, feed them through core's restoreStats + EXPLAIN,
142+
// and verify the planner sees ~10,000 rows — not 1.
143+
await db.exec(`
144+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
145+
INSERT INTO widgets (user_id, name) VALUES ('00000000-0000-0000-0000-000000000001', 'w1');
146+
ANALYZE;
147+
`);
148+
149+
const mode = await buildStatsFromDatabase(db);
150+
const stats = await Statistics.fromPostgres(db, mode);
151+
const existingIndexes = await stats.getExistingIndexes();
152+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
153+
154+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
155+
const plan = await optimizer.testQueryWithStats(builder);
156+
157+
// The planner's "Plan Rows" should be exactly 10,000 — NOT 1.
158+
const estimatedRows = plan.Plan["Plan Rows"];
159+
expect(estimatedRows).toBe(10_000);
160+
});
161+
162+
test("planner estimates 10,000 rows with 10,000 rows seeded", async () => {
163+
// Same test but with 10,000 actual rows — the estimate should be
164+
// the same, proving the stats override works regardless of actual data.
165+
await db.exec(`
166+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
167+
INSERT INTO widgets (user_id, name)
168+
SELECT gen_random_uuid(), 'widget_' || i
169+
FROM generate_series(1, 10000) AS i;
170+
ANALYZE;
171+
`);
172+
173+
const mode = await buildStatsFromDatabase(db);
174+
const stats = await Statistics.fromPostgres(db, mode);
175+
const existingIndexes = await stats.getExistingIndexes();
176+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
177+
178+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
179+
const plan = await optimizer.testQueryWithStats(builder);
180+
181+
const estimatedRows = plan.Plan["Plan Rows"];
182+
expect(estimatedRows).toBe(10_000);
183+
});
184+
185+
test("planner estimates 10,000 rows even with 50,000 rows seeded", async () => {
186+
await db.exec(`
187+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
188+
INSERT INTO widgets (user_id, name)
189+
SELECT gen_random_uuid(), 'widget_' || i
190+
FROM generate_series(1, 50000) AS i;
191+
ANALYZE;
192+
`);
193+
194+
const mode = await buildStatsFromDatabase(db);
195+
const stats = await Statistics.fromPostgres(db, mode);
196+
const existingIndexes = await stats.getExistingIndexes();
197+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
198+
199+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
200+
const plan = await optimizer.testQueryWithStats(builder);
201+
202+
const estimatedRows = plan.Plan["Plan Rows"];
203+
expect(estimatedRows).toBe(10_000);
204+
});
205+
206+
test("BUG: fromAssumption(relpages=1) inflates estimates with real data", async () => {
207+
// Demonstrates the bug in core's fromAssumption mode.
208+
// With 10,000 rows seeded (~74 real pages), the planner calculates:
209+
// estimated_tuples = actual_pages × reltuples ÷ relpages
210+
// = 74 × 10,000 ÷ 1 = 740,000
211+
// The estimate is wildly inflated — 74x the correct value.
212+
await db.exec(`
213+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
214+
INSERT INTO widgets (user_id, name)
215+
SELECT gen_random_uuid(), 'widget_' || i
216+
FROM generate_series(1, 10000) AS i;
217+
ANALYZE;
218+
`);
219+
220+
const brokenMode = Statistics.defaultStatsMode; // fromAssumption(10000, 1)
221+
const stats = await Statistics.fromPostgres(db, brokenMode);
222+
const existingIndexes = await stats.getExistingIndexes();
223+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
224+
225+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
226+
const plan = await optimizer.testQueryWithStats(builder);
227+
228+
const estimatedRows = plan.Plan["Plan Rows"];
229+
// With the bug, this is ~740,000 — NOT 10,000.
230+
expect(estimatedRows).toBeGreaterThan(100_000);
231+
});
232+
233+
test("leaves columns null so ANALYZE pg_statistic entries persist", async () => {
234+
await db.exec(`
235+
CREATE TABLE items(id serial PRIMARY KEY, label text);
236+
INSERT INTO items (label) SELECT 'item_' || i FROM generate_series(1, 100) AS i;
237+
ANALYZE;
238+
`);
239+
240+
const mode = await buildStatsFromDatabase(db);
241+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
242+
243+
const items = mode.stats.find((s) => s.tableName === "items");
244+
expect(items).toBeDefined();
245+
// columns must be null — core's restoreStats only overwrites pg_statistic
246+
// when columns are provided. Leaving them null means the ANALYZE-populated
247+
// statistics persist across the rolled-back transaction.
248+
expect(items!.columns).toBeNull();
249+
});

src/build-stats.ts

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import {
2+
type Postgres,
3+
Statistics,
4+
type StatisticsMode,
5+
} from "@query-doctor/core";
6+
7+
// Fixed row-count assumption applied to every exported table; matches the
// reltuples=10,000 used by the old fromAssumption default so cost
// comparisons stay on the same scale.
const DEFAULT_RELTUPLES = 10_000;
8+
9+
/**
10+
* Build a `fromStatisticsExport` stats mode from the live database.
11+
*
12+
* PostgreSQL's planner ignores `pg_class.relpages` for tables with data on
13+
* disk — it reads the actual page count via `RelationGetNumberOfBlocks()`.
14+
* It then estimates tuples as:
15+
*
16+
* estimated_tuples = actual_pages × pg_class.reltuples ÷ pg_class.relpages
17+
*
18+
* The old `fromAssumption` default (reltuples=10 000, relpages=1) causes a
19+
* massive inflation when tables have real data (e.g. 167 pages → 1.67 M
20+
* estimated tuples).
21+
*
22+
* By reading the real `relpages` from pg_class (after ANALYZE) and pairing
23+
* it with a correct reltuples, the formula produces the correct estimate
24+
* regardless of actual data volume. Column-level statistics (`pg_statistic`)
25+
* are left untouched — ANALYZE already populated them.
26+
*
27+
* All tables are assumed to have 10,000 rows regardless of actual data.
28+
*/
29+
export async function buildStatsFromDatabase(
30+
db: Postgres,
31+
): Promise<StatisticsMode> {
32+
type TableRow = {
33+
tableName: string;
34+
schemaName: string;
35+
relpages: number;
36+
relallvisible: number;
37+
};
38+
type IndexRow = TableRow & { indexName: string; reltuples: number };
39+
40+
const [tables, indexes] = await Promise.all([
41+
db.exec<TableRow>(`
42+
SELECT c.relname AS "tableName",
43+
n.nspname AS "schemaName",
44+
c.relpages::int AS "relpages",
45+
c.relallvisible::int AS "relallvisible"
46+
FROM pg_class c
47+
JOIN pg_namespace n ON n.oid = c.relnamespace
48+
WHERE c.relkind = 'r'
49+
AND n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
50+
`),
51+
db.exec<IndexRow>(`
52+
SELECT t.relname AS "tableName",
53+
n.nspname AS "schemaName",
54+
i.relname AS "indexName",
55+
i.reltuples::real AS "reltuples",
56+
i.relpages::int AS "relpages",
57+
i.relallvisible::int AS "relallvisible"
58+
FROM pg_index ix
59+
JOIN pg_class t ON t.oid = ix.indrelid
60+
JOIN pg_class i ON i.oid = ix.indexrelid
61+
JOIN pg_namespace n ON n.oid = t.relnamespace
62+
WHERE n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
63+
`),
64+
]);
65+
66+
const indexesByTable = new Map<string, IndexRow[]>();
67+
for (const idx of indexes) {
68+
const key = `${idx.schemaName}.${idx.tableName}`;
69+
const list = indexesByTable.get(key) ?? [];
70+
list.push(idx);
71+
indexesByTable.set(key, list);
72+
}
73+
74+
const stats = tables.map((t) => ({
75+
tableName: t.tableName,
76+
schemaName: t.schemaName,
77+
reltuples: DEFAULT_RELTUPLES,
78+
relpages: Math.max(1, t.relpages),
79+
relallvisible: t.relallvisible ?? 0,
80+
columns: null,
81+
indexes: (
82+
indexesByTable.get(`${t.schemaName}.${t.tableName}`) ?? []
83+
).map((i) => ({
84+
indexName: i.indexName,
85+
relpages: Math.max(1, i.relpages),
86+
reltuples: i.reltuples,
87+
relallvisible: i.relallvisible ?? 0,
88+
})),
89+
}));
90+
91+
return Statistics.statsModeFromExport(stats);
92+
}

src/runner.ts

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import { env } from "./env.ts";
3939
import { connectToSource } from "./sql/postgresjs.ts";
4040
import { parse } from "@libpg-query/parser";
4141
import { Connectable } from "./sync/connectable.ts";
42+
import { buildStatsFromDatabase } from "./build-stats.ts";
4243

4344
export class Runner {
4445
private readonly seenQueries = new Set<string>();
@@ -66,7 +67,14 @@ export class Runner {
6667
ignoredQueryHashes?: string[];
6768
}) {
6869
const db = connectToSource(options.postgresUrl);
69-
const statisticsMode = Runner.decideStatisticsMode(options.statisticsPath);
70+
// Run ANALYZE before reading statistics so pg_statistic (column-level
71+
// stats like n_distinct) is populated deterministically from the current
72+
// data. Without this, autovacuum may or may not have analyzed tables,
73+
// causing the same query to produce different EXPLAIN costs across runs.
74+
await db.exec("ANALYZE");
75+
const statisticsMode = options.statisticsPath
76+
? Runner.decideStatisticsMode(options.statisticsPath)
77+
: await buildStatsFromDatabase(db);
7078
const stats = await Statistics.fromPostgres(db, statisticsMode);
7179
const existingIndexes = await stats.getExistingIndexes();
7280
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
@@ -461,19 +469,12 @@ export class Runner {
461469
console.log();
462470
}
463471

464-
private static decideStatisticsMode(path?: string): StatisticsMode {
465-
if (path) {
466-
const data = Runner.readStatisticsFile(path);
467-
return Statistics.statsModeFromExport(data);
468-
} else {
469-
return Statistics.defaultStatsMode;
470-
}
471-
}
472-
private static readStatisticsFile(path: string): ExportedStats[] {
472+
private static decideStatisticsMode(path: string): StatisticsMode {
473473
const data = readFileSync(path);
474474
const json = JSON.parse(new TextDecoder().decode(data));
475-
return ExportedStats.array().parse(json);
475+
return Statistics.statsModeFromExport(ExportedStats.array().parse(json));
476476
}
477+
477478
}
478479

479480
export type QueryProcessResult =

0 commit comments

Comments
 (0)