Skip to content

Commit 9796e04

Browse files
veksen and claude committed
fix: run ANALYZE at startup and use real relpages for stats override
PostgreSQL's planner ignores pg_class.relpages for tables with data — it reads actual disk pages via RelationGetNumberOfBlocks(). The old fromAssumption(reltuples=10000, relpages=1) caused the planner to estimate tuples as actual_pages × 10000 / 1, inflating row estimates by up to 74x (e.g. 740,000 instead of 10,000 for a 10K-row table). Fix: run ANALYZE before reading statistics to populate pg_statistic deterministically, then build a fromStatisticsExport mode that pairs reltuples=10,000 with the real relpages from pg_class. This makes the planner formula produce exactly 10,000 regardless of actual data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 70cb54a commit 9796e04

3 files changed

Lines changed: 353 additions & 11 deletions

File tree

src/build-stats.test.ts

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
import { test, expect, beforeEach, afterEach } from "vitest";
2+
import { buildStatsFromDatabase } from "./build-stats.ts";
3+
import { connectToSource } from "./sql/postgresjs.ts";
4+
import { Connectable } from "./sync/connectable.ts";
5+
import {
6+
IndexOptimizer,
7+
PostgresQueryBuilder,
8+
Statistics,
9+
type Postgres,
10+
} from "@query-doctor/core";
11+
12+
// Name of the throwaway database created fresh for every test.
const TEST_DB = "querydoctor_test";
// Base server URL; the database name is appended as the path segment.
const PG_URL = "postgresql://localhost:5432";

// Connection to the per-test database; assigned in beforeEach.
let db: Postgres;
16+
17+
async function execOnAdmin(sql: string) {
18+
const admin = connectToSource(Connectable.fromString(`${PG_URL}/postgres`));
19+
try {
20+
await admin.exec(sql);
21+
} finally {
22+
await (admin as unknown as { close(): Promise<void> }).close();
23+
}
24+
}
25+
26+
// Fresh database per test: drop any leftover copy, recreate, and connect.
beforeEach(async () => {
  await execOnAdmin(`DROP DATABASE IF EXISTS ${TEST_DB}`);
  await execOnAdmin(`CREATE DATABASE ${TEST_DB}`);
  db = connectToSource(Connectable.fromString(`${PG_URL}/${TEST_DB}`));
});

// Close the test connection first, then drop the database it pointed at.
afterEach(async () => {
  await (db as unknown as { close(): Promise<void> }).close();
  await execOnAdmin(`DROP DATABASE IF EXISTS ${TEST_DB}`);
});
36+
37+
test("sets reltuples to 10,000 for tables below threshold, preserves real relpages", async () => {
38+
await db.exec(`
39+
CREATE TABLE users(id serial PRIMARY KEY, name text, email text);
40+
CREATE INDEX users_email_idx ON users(email);
41+
INSERT INTO users (name, email)
42+
SELECT 'user_' || i, 'user_' || i || '@example.com'
43+
FROM generate_series(1, 1000) AS i;
44+
ANALYZE;
45+
`);
46+
47+
const mode = await buildStatsFromDatabase(db);
48+
49+
expect(mode.kind).toBe("fromStatisticsExport");
50+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
51+
52+
const usersStats = mode.stats.find((s) => s.tableName === "users");
53+
expect(usersStats).toBeDefined();
54+
// 1000 rows is below the 5,000 threshold → bumped to 10,000
55+
expect(usersStats!.reltuples).toBe(10_000);
56+
// 1000 rows should produce more than 1 page
57+
expect(usersStats!.relpages).toBeGreaterThan(1);
58+
59+
// Verify indexes are included
60+
const emailIdx = usersStats!.indexes.find(
61+
(i) => i.indexName === "users_email_idx",
62+
);
63+
expect(emailIdx).toBeDefined();
64+
expect(emailIdx!.relpages).toBeGreaterThanOrEqual(1);
65+
});
66+
67+
test("clamps relpages to at least 1 for empty tables", async () => {
68+
await db.exec(`
69+
CREATE TABLE empty_table(id serial PRIMARY KEY, data text);
70+
ANALYZE;
71+
`);
72+
73+
const mode = await buildStatsFromDatabase(db);
74+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
75+
76+
const stats = mode.stats.find((s) => s.tableName === "empty_table");
77+
expect(stats).toBeDefined();
78+
expect(stats!.reltuples).toBe(10_000);
79+
expect(stats!.relpages).toBeGreaterThanOrEqual(1);
80+
});
81+
82+
test("density stays realistic regardless of actual row count", async () => {
83+
// This is the core bug: with fromAssumption(reltuples=10000, relpages=1),
84+
// PostgreSQL calculates estimated_tuples = actual_pages * 10000 / 1,
85+
// inflating estimates proportionally to actual data volume.
86+
//
87+
// buildStatsFromDatabase fixes this by using the real relpages so that
88+
// estimated_tuples = actual_pages * 10000 / actual_relpages ≈ 10000.
89+
await db.exec(`
90+
CREATE TABLE orders(id serial PRIMARY KEY, user_id int, total numeric);
91+
CREATE INDEX orders_user_id_idx ON orders(user_id);
92+
INSERT INTO orders (user_id, total)
93+
SELECT (random() * 1000)::int, random() * 100
94+
FROM generate_series(1, 10000);
95+
ANALYZE;
96+
`);
97+
98+
const mode = await buildStatsFromDatabase(db);
99+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
100+
101+
const ordersStats = mode.stats.find((s) => s.tableName === "orders");
102+
expect(ordersStats).toBeDefined();
103+
104+
// The key invariant: reltuples / relpages should give a reasonable
105+
// density, NOT the broken 10000/1 = 10000 tuples-per-page.
106+
const density = ordersStats!.reltuples / ordersStats!.relpages;
107+
// Real density for a table with int+int+numeric columns is roughly
108+
// 50-200 tuples per page. The override should preserve this ratio.
109+
expect(density).toBeLessThan(500);
110+
expect(density).toBeGreaterThan(10);
111+
});
112+
113+
test("groups indexes by their parent table", async () => {
114+
await db.exec(`
115+
CREATE TABLE products(id serial PRIMARY KEY, name text, price numeric);
116+
CREATE INDEX products_name_idx ON products(name);
117+
CREATE INDEX products_price_idx ON products(price);
118+
CREATE TABLE categories(id serial PRIMARY KEY, label text);
119+
ANALYZE;
120+
`);
121+
122+
const mode = await buildStatsFromDatabase(db);
123+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
124+
125+
const products = mode.stats.find((s) => s.tableName === "products");
126+
expect(products).toBeDefined();
127+
const indexNames = products!.indexes.map((i) => i.indexName).sort();
128+
expect(indexNames).toContain("products_name_idx");
129+
expect(indexNames).toContain("products_price_idx");
130+
expect(indexNames).toContain("products_pkey");
131+
132+
const categories = mode.stats.find((s) => s.tableName === "categories");
133+
expect(categories).toBeDefined();
134+
const catIndexNames = categories!.indexes.map((i) => i.indexName);
135+
expect(catIndexNames).toContain("categories_pkey");
136+
expect(catIndexNames).not.toContain("products_name_idx");
137+
});
138+
139+
test("planner estimates 10,000 rows with only 1 row seeded", async () => {
140+
// This is the end-to-end proof: seed 1 row, run ANALYZE,
141+
// build stats, feed them through core's restoreStats + EXPLAIN,
142+
// and verify the planner sees ~10,000 rows — not 1.
143+
await db.exec(`
144+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
145+
INSERT INTO widgets (user_id, name) VALUES ('00000000-0000-0000-0000-000000000001', 'w1');
146+
ANALYZE;
147+
`);
148+
149+
const mode = await buildStatsFromDatabase(db);
150+
const stats = await Statistics.fromPostgres(db, mode);
151+
const existingIndexes = await stats.getExistingIndexes();
152+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
153+
154+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
155+
const plan = await optimizer.testQueryWithStats(builder);
156+
157+
// The planner's "Plan Rows" should be exactly 10,000 — NOT 1.
158+
const estimatedRows = plan.Plan["Plan Rows"];
159+
expect(estimatedRows).toBe(10_000);
160+
});
161+
162+
test("planner estimates 10,000 rows with 10,000 rows seeded", async () => {
163+
// Same test but with 10,000 actual rows — the estimate should be
164+
// the same, proving the stats override works regardless of actual data.
165+
await db.exec(`
166+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
167+
INSERT INTO widgets (user_id, name)
168+
SELECT gen_random_uuid(), 'widget_' || i
169+
FROM generate_series(1, 10000) AS i;
170+
ANALYZE;
171+
`);
172+
173+
const mode = await buildStatsFromDatabase(db);
174+
const stats = await Statistics.fromPostgres(db, mode);
175+
const existingIndexes = await stats.getExistingIndexes();
176+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
177+
178+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
179+
const plan = await optimizer.testQueryWithStats(builder);
180+
181+
const estimatedRows = plan.Plan["Plan Rows"];
182+
expect(estimatedRows).toBe(10_000);
183+
});
184+
185+
test("planner estimates 10,000 rows even with 50,000 rows seeded", async () => {
186+
await db.exec(`
187+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
188+
INSERT INTO widgets (user_id, name)
189+
SELECT gen_random_uuid(), 'widget_' || i
190+
FROM generate_series(1, 50000) AS i;
191+
ANALYZE;
192+
`);
193+
194+
const mode = await buildStatsFromDatabase(db);
195+
const stats = await Statistics.fromPostgres(db, mode);
196+
const existingIndexes = await stats.getExistingIndexes();
197+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
198+
199+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
200+
const plan = await optimizer.testQueryWithStats(builder);
201+
202+
const estimatedRows = plan.Plan["Plan Rows"];
203+
expect(estimatedRows).toBe(10_000);
204+
});
205+
206+
test("BUG: fromAssumption(relpages=1) inflates estimates with real data", async () => {
207+
// Demonstrates the bug in core's fromAssumption mode.
208+
// With 10,000 rows seeded (~74 real pages), the planner calculates:
209+
// estimated_tuples = actual_pages × reltuples ÷ relpages
210+
// = 74 × 10,000 ÷ 1 = 740,000
211+
// The estimate is wildly inflated — 74x the correct value.
212+
await db.exec(`
213+
CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
214+
INSERT INTO widgets (user_id, name)
215+
SELECT gen_random_uuid(), 'widget_' || i
216+
FROM generate_series(1, 10000) AS i;
217+
ANALYZE;
218+
`);
219+
220+
const brokenMode = Statistics.defaultStatsMode; // fromAssumption(10000, 1)
221+
const stats = await Statistics.fromPostgres(db, brokenMode);
222+
const existingIndexes = await stats.getExistingIndexes();
223+
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
224+
225+
const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
226+
const plan = await optimizer.testQueryWithStats(builder);
227+
228+
const estimatedRows = plan.Plan["Plan Rows"];
229+
// With the bug, this is ~740,000 — NOT 10,000.
230+
expect(estimatedRows).toBeGreaterThan(100_000);
231+
});
232+
233+
test("leaves columns null so ANALYZE pg_statistic entries persist", async () => {
234+
await db.exec(`
235+
CREATE TABLE items(id serial PRIMARY KEY, label text);
236+
INSERT INTO items (label) SELECT 'item_' || i FROM generate_series(1, 100) AS i;
237+
ANALYZE;
238+
`);
239+
240+
const mode = await buildStatsFromDatabase(db);
241+
if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");
242+
243+
const items = mode.stats.find((s) => s.tableName === "items");
244+
expect(items).toBeDefined();
245+
// columns must be null — core's restoreStats only overwrites pg_statistic
246+
// when columns are provided. Leaving them null means the ANALYZE-populated
247+
// statistics persist across the rolled-back transaction.
248+
expect(items!.columns).toBeNull();
249+
});

src/build-stats.ts

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import {
2+
type Postgres,
3+
Statistics,
4+
type StatisticsMode,
5+
} from "@query-doctor/core";
6+
7+
// Fixed row-count assumption applied to every exported table; matches the
// reltuples=10,000 used by the old fromAssumption default so cost
// comparisons stay on the same scale.
const DEFAULT_RELTUPLES = 10_000;
8+
9+
/**
10+
* Build a `fromStatisticsExport` stats mode from the live database.
11+
*
12+
* PostgreSQL's planner ignores `pg_class.relpages` for tables with data on
13+
* disk — it reads the actual page count via `RelationGetNumberOfBlocks()`.
14+
* It then estimates tuples as:
15+
*
16+
* estimated_tuples = actual_pages × pg_class.reltuples ÷ pg_class.relpages
17+
*
18+
* The old `fromAssumption` default (reltuples=10 000, relpages=1) causes a
19+
* massive inflation when tables have real data (e.g. 167 pages → 1.67 M
20+
* estimated tuples).
21+
*
22+
* By reading the real `relpages` from pg_class (after ANALYZE) and pairing
23+
* it with a correct reltuples, the formula produces the correct estimate
24+
* regardless of actual data volume. Column-level statistics (`pg_statistic`)
25+
* are left untouched — ANALYZE already populated them.
26+
*
27+
* All tables are assumed to have 10,000 rows regardless of actual data.
28+
*/
29+
export async function buildStatsFromDatabase(
30+
db: Postgres,
31+
): Promise<StatisticsMode> {
32+
type TableRow = {
33+
tableName: string;
34+
schemaName: string;
35+
relpages: number;
36+
relallvisible: number;
37+
};
38+
type IndexRow = TableRow & { indexName: string; reltuples: number };
39+
40+
const [tables, indexes] = await Promise.all([
41+
db.exec<TableRow>(`
42+
SELECT c.relname AS "tableName",
43+
n.nspname AS "schemaName",
44+
c.relpages::int AS "relpages",
45+
c.relallvisible::int AS "relallvisible"
46+
FROM pg_class c
47+
JOIN pg_namespace n ON n.oid = c.relnamespace
48+
WHERE c.relkind = 'r'
49+
AND n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
50+
`),
51+
db.exec<IndexRow>(`
52+
SELECT t.relname AS "tableName",
53+
n.nspname AS "schemaName",
54+
i.relname AS "indexName",
55+
i.reltuples::real AS "reltuples",
56+
i.relpages::int AS "relpages",
57+
i.relallvisible::int AS "relallvisible"
58+
FROM pg_index ix
59+
JOIN pg_class t ON t.oid = ix.indrelid
60+
JOIN pg_class i ON i.oid = ix.indexrelid
61+
JOIN pg_namespace n ON n.oid = t.relnamespace
62+
WHERE n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
63+
`),
64+
]);
65+
66+
const indexesByTable = new Map<string, IndexRow[]>();
67+
for (const idx of indexes) {
68+
const key = `${idx.schemaName}.${idx.tableName}`;
69+
const list = indexesByTable.get(key) ?? [];
70+
list.push(idx);
71+
indexesByTable.set(key, list);
72+
}
73+
74+
const stats = tables.map((t) => ({
75+
tableName: t.tableName,
76+
schemaName: t.schemaName,
77+
reltuples: DEFAULT_RELTUPLES,
78+
relpages: Math.max(1, t.relpages),
79+
relallvisible: t.relallvisible ?? 0,
80+
columns: null,
81+
indexes: (
82+
indexesByTable.get(`${t.schemaName}.${t.tableName}`) ?? []
83+
).map((i) => ({
84+
indexName: i.indexName,
85+
relpages: Math.max(1, i.relpages),
86+
reltuples: i.reltuples,
87+
relallvisible: i.relallvisible ?? 0,
88+
})),
89+
}));
90+
91+
return Statistics.statsModeFromExport(stats);
92+
}

src/runner.ts

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import { env } from "./env.ts";
3939
import { connectToSource } from "./sql/postgresjs.ts";
4040
import { parse } from "@libpg-query/parser";
4141
import { Connectable } from "./sync/connectable.ts";
42+
import { buildStatsFromDatabase } from "./build-stats.ts";
4243

4344
export class Runner {
4445
private readonly seenQueries = new Set<string>();
@@ -66,7 +67,14 @@ export class Runner {
6667
ignoredQueryHashes?: string[];
6768
}) {
6869
const db = connectToSource(options.postgresUrl);
69-
const statisticsMode = Runner.decideStatisticsMode(options.statisticsPath);
70+
// Run ANALYZE before reading statistics so pg_statistic (column-level
71+
// stats like n_distinct) is populated deterministically from the current
72+
// data. Without this, autovacuum may or may not have analyzed tables,
73+
// causing the same query to produce different EXPLAIN costs across runs.
74+
await db.exec("ANALYZE");
75+
const statisticsMode = options.statisticsPath
76+
? Runner.decideStatisticsMode(options.statisticsPath)
77+
: await buildStatsFromDatabase(db);
7078
const stats = await Statistics.fromPostgres(db, statisticsMode);
7179
const existingIndexes = await stats.getExistingIndexes();
7280
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
@@ -461,19 +469,12 @@ export class Runner {
461469
console.log();
462470
}
463471

464-
private static decideStatisticsMode(path?: string): StatisticsMode {
465-
if (path) {
466-
const data = Runner.readStatisticsFile(path);
467-
return Statistics.statsModeFromExport(data);
468-
} else {
469-
return Statistics.defaultStatsMode;
470-
}
471-
}
472-
private static readStatisticsFile(path: string): ExportedStats[] {
472+
private static decideStatisticsMode(path: string): StatisticsMode {
473473
const data = readFileSync(path);
474474
const json = JSON.parse(new TextDecoder().decode(data));
475-
return ExportedStats.array().parse(json);
475+
return Statistics.statsModeFromExport(ExportedStats.array().parse(json));
476476
}
477+
477478
}
478479

479480
export type QueryProcessResult =

0 commit comments

Comments
 (0)