From aaf971d93a7ee5999df3144933d2425479cd3f94 Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Tue, 23 Jun 2026 10:48:29 -0500 Subject: [PATCH 1/2] Compile predicate evaluation per plan: close most of the gap to CHECK constraint exclusion Investigation into why table_range planning is slower than PostgreSQL's own constraint exclusion (the built-in way to prune on a non-key column via a data-range CHECK per partition). How constraint exclusion works (src/backend/optimizer/util/plancat.c): relation_excluded_by_constraints -> get_relation_constraints reads each partition's CHECK expressions straight from the relcache (relation->rd_att->constr->check) via table_open(..., NoLock) -- the partition is already locked and cached from planning, so it does zero extra I/O and zero extra locking per partition, then predicate_refuted_by proves contradiction on the in-memory expression trees. Attribution of our ~31us/partition overhead at 2,000 partitions (via a temporary diagnostic) was surprising: the index-page read+deserialize is only ~7us; the *evaluation* was ~23us -- dominated by work that is identical across every partition but was being redone for each one: btree_strategy (3 syscache lookups), getTypeInputInfo and fmgr_info setup inside every text_to_datum / OidFunctionCall, and constant rendering. Fix: resolve those once per top-level plan and reuse across partitions -- - FMGR_MEMO: a palloc'd FmgrInfo per function, so each compare / input-function call skips fmgr_info's syscache lookup (FunctionCall2Coll / InputFunctionCall); - INPUT_INFO_MEMO: getTypeInputInfo result per type; - STRATEGY_MEMO: btree strategy per (operator, left type). These caches are planner-only and cleared per plan; the aminsert path keeps using the uncached datum_cmp / text_to_datum (no cross-statement cache to invalidate). Result at 2,000 partitions (warm, same session): full planning ~88ms -> ~43ms; eval is now effectively free (full ~= read-only ~= traversal-only). Versus CHECK constraint exclusion (~33ms) we went from ~2.6x to ~1.3x; the residual ~5us/part is the per-partition index-page read+deserialize. 29 tests pass. Co-Authored-By: Claude Opus 4.8 --- src/prune_hook.rs | 101 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/src/prune_hook.rs b/src/prune_hook.rs index a701cca..e9e8704 100644 --- a/src/prune_hook.rs +++ b/src/prune_hook.rs @@ -51,6 +51,82 @@ thread_local! { /// Per-plan memo of parsed query constants, keyed by (type oid, text). The same /// constant is otherwise re-rendered and re-parsed once per partition. static CONST_MEMO: RefCell> = RefCell::new(HashMap::new()); + /// Per-plan cache of resolved `FmgrInfo` structs (one palloc per function), so each + /// comparison / input-function call across partitions skips the `fmgr_info` syscache + /// lookup. Pointers are into the planner memory context and cleared per top-level plan. + static FMGR_MEMO: RefCell> = RefCell::new(HashMap::new()); + /// Per-plan memo of a type's input function + ioparam (from `getTypeInputInfo`). + static INPUT_INFO_MEMO: RefCell> = + RefCell::new(HashMap::new()); + /// Per-plan memo of the btree strategy for an (operator, left type) pair. + static STRATEGY_MEMO: RefCell>> = RefCell::new(HashMap::new()); +} + +/// A planner-cached `FmgrInfo` for `proc_oid`, valid for the current plan. Avoids the +/// per-call `fmgr_info` syscache lookup that `OidFunctionCall*` does. Planner-only: the +/// cache is cleared per top-level plan (the `aminsert` path must not use this). +unsafe fn fmgr_cached(proc_oid: pg_sys::Oid) -> *mut pg_sys::FmgrInfo { + let key: u32 = proc_oid.into(); + if let Some(p) = FMGR_MEMO.with(|m| m.borrow().get(&key).copied()) { + return p; + } + let p = pg_sys::palloc0(std::mem::size_of::()) as *mut pg_sys::FmgrInfo; + pg_sys::fmgr_info(proc_oid, p); + FMGR_MEMO.with(|m| { + m.borrow_mut().insert(key, p); + }); + p +} + +/// Compare two datums of the same type using a plan-cached `FmgrInfo`. Planner-only. +unsafe fn cmp_cached( + cmpproc: pg_sys::Oid, + collation: pg_sys::Oid, + a: pg_sys::Datum, + b: pg_sys::Datum, +) -> i32 { + pg_sys::FunctionCall2Coll(fmgr_cached(cmpproc), collation, a, b).value() as i32 +} + +/// Parse `text` to a Datum of `typ` using plan-cached type-input info and `FmgrInfo`. +/// Planner-only (clears per plan); the `aminsert` path uses [`text_to_datum`] instead. +unsafe fn parse_cached(typ: pg_sys::Oid, text: &str) -> Option { + let key: u32 = typ.into(); + let (infunc, ioparam) = match INPUT_INFO_MEMO.with(|m| m.borrow().get(&key).copied()) { + Some(v) => v, + None => { + let mut infunc = pg_sys::Oid::INVALID; + let mut ioparam = pg_sys::Oid::INVALID; + pg_sys::getTypeInputInfo(typ, &mut infunc, &mut ioparam); + INPUT_INFO_MEMO.with(|m| { + m.borrow_mut().insert(key, (infunc, ioparam)); + }); + (infunc, ioparam) + } + }; + if infunc == pg_sys::Oid::INVALID { + return None; + } + let cstr = CString::new(text).ok()?; + Some(pg_sys::InputFunctionCall( + fmgr_cached(infunc), + cstr.as_ptr() as *mut _, + ioparam, + -1, + )) +} + +/// btree strategy for an (operator, left type) pair, memoized for the current plan. +unsafe fn strategy_cached(opno: pg_sys::Oid, lefttype: pg_sys::Oid) -> Option { + let key = (opno.into(), lefttype.into()); + if let Some(v) = STRATEGY_MEMO.with(|m| m.borrow().get(&key).copied()) { + return v; + } + let v = btree_strategy(opno, lefttype); + STRATEGY_MEMO.with(|m| { + m.borrow_mut().insert(key, v); + }); + v } /// Compare proc for `typ`, memoized for the current plan. See [`btree_cmp_proc`]. @@ -74,7 +150,7 @@ unsafe fn const_datum_cached(typ: pg_sys::Oid, text: &str) -> Option Option { } // Scalar btree comparison (`<`, `<=`, `=`, `>=`, `>`). - if let Some(strategy) = btree_strategy((*opexpr).opno, (*var).vartype) { + if let Some(strategy) = strategy_cached((*opexpr).opno, (*var).vartype) { let strategy = if commuted { commute_strategy(strategy) } else { @@ -475,7 +556,7 @@ unsafe fn extract_saop(saop: *mut pg_sys::ScalarArrayOpExpr) -> Option return None; } // Only the equality operator gives the "any element in range" semantics. - if btree_strategy((*saop).opno, (*var).vartype)? != 3 { + if strategy_cached((*saop).opno, (*var).vartype)? != 3 { return None; } let elems = array_const_texts(con)?; @@ -510,15 +591,15 @@ unsafe fn eval_compare( Some(d) => d, None => return false, }; - let min_d = match text_to_datum(typ, min) { + let min_d = match parse_cached(typ, min) { Some(d) => d, None => return false, }; - let max_d = match text_to_datum(typ, max) { + let max_d = match parse_cached(typ, max) { Some(d) => d, None => return false, }; - let cmp = |a, b| datum_cmp(cmpproc, collation, a, b); + let cmp = |a, b| cmp_cached(cmpproc, collation, a, b); match strategy { 1 => cmp(min_d, k) >= 0, // col < K : prune iff min >= K 2 => cmp(min_d, k) > 0, // col <= K : prune iff min > K @@ -543,11 +624,11 @@ unsafe fn eval_in_list( Some(p) => p, None => return false, }; - let min_d = match text_to_datum(typ, min) { + let min_d = match parse_cached(typ, min) { Some(d) => d, None => return false, }; - let max_d = match text_to_datum(typ, max) { + let max_d = match parse_cached(typ, max) { Some(d) => d, None => return false, }; @@ -562,8 +643,8 @@ unsafe fn eval_in_list( Some(d) => d, None => return false, }; - if datum_cmp(cmpproc, collation, d, min_d) >= 0 - && datum_cmp(cmpproc, collation, d, max_d) <= 0 + if cmp_cached(cmpproc, collation, d, min_d) >= 0 + && cmp_cached(cmpproc, collation, d, max_d) <= 0 { return false; // this value is in range -> KEEP } From 3c978210c4f754b2853c357497b71badc9c485bf Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Tue, 23 Jun 2026 11:19:50 -0500 Subject: [PATCH 2/2] Cache deserialized summaries per backend: close the gap to CHECK constraint exclusion Backend-lifetime cache (src/summary_cache.rs) keyed by index OID, so warm/repeated plans skip the per-partition index open + metapage read + deserialize entirely and serve the summary from memory (shared via Rc with the per-plan cache). Coherence rests on the over-inclusive invariant: a cached summary is safe as long as it is never *narrower* than the data. Only aminsert widens a summary; when it does it calls CacheInvalidateRelcacheByRelid on the index, and a registered relcache callback drops the cached copy in every backend (locally at the next command boundary, in other backends at the widening txn's commit -- matching row visibility). Operations that only narrow (delete, vacuum re-tighten) need no invalidation. Widen-invalidations are coalesced to one per index per transaction so bulk loads don't thrash. Result at 2,000 partitions (warm, same session): planning ~43ms -> ~34ms, which now matches -- and slightly beats -- CHECK constraint exclusion (~37ms), because we serve a cached summary while constraint exclusion re-parses each CHECK every plan. At 300 partitions, pruning-on planning (~4ms) is now ~equal to pruning-off. Combined with the earlier per-plan compilation, per-partition planning cost fell from ~31us to ~3-4us. A cold first plan still reads each page; every plan after is cached. 29 tests pass. README performance/scaling sections and benchmark numbers updated accordingly. Co-Authored-By: Claude Opus 4.8 --- README.md | 88 ++++++++++++++++++++++++-------------------- src/index_am.rs | 3 ++ src/lib.rs | 3 ++ src/prune_hook.rs | 31 ++++++++++++---- src/summary_cache.rs | 81 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 158 insertions(+), 48 deletions(-) create mode 100644 src/summary_cache.rs diff --git a/README.md b/README.md index 8a42d13..fd5b4e3 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,11 @@ EXPLAIN (COSTS OFF) SELECT * FROM places WHERE geom && ST_MakeEnvelope(0,0,10,10 for range types (`range_merge(range_agg(col))`) or the bounding box for PostGIS geometry (`ST_Extent(col)`). - **Planning.** For each partition the planner builds, a `set_rel_pathlist_hook` reads the - summary from that partition's index (cached for the plan) and evaluates the partition's - restriction clauses against it, calling `mark_dummy_rel` on any partition that provably - cannot match — eliminating it before child paths are generated. + summary from that partition's index and evaluates the partition's restriction clauses + against it, calling `mark_dummy_rel` on any partition that provably cannot match — + eliminating it before child paths are generated. Deserialized summaries are cached for + the life of the backend (kept coherent by a relcache-invalidation callback), so warm + plans skip the per-partition page read; see [Performance](#performance). - **Typed comparisons.** Min/max vs. constant comparisons use each column type's own btree compare function, so **any btree-comparable type works**: `bigint` / `int` / `smallint`, `numeric`, `real` / `double precision`, `text` / `varchar`, `date`, @@ -80,27 +82,26 @@ EXPLAIN (COSTS OFF) SELECT * FROM places WHERE geom && ST_MakeEnvelope(0,0,10,10 ## Performance -The deal is simple and worth stating plainly: **table_range trades more planning time for -much less execution time.** A selective predicate on a non-key column scans only the -matching partition instead of every partition, which is a huge execution win — but the -planner pays to evaluate each partition's summary, so planning gets slower. +**table_range trades a small amount of planning time for a large execution win.** A +selective predicate on a non-key column scans only the matching partition instead of every +partition. The planner pays a little to evaluate each partition's summary, but that cost is +small and — warm — close to free (see the cache note below). The numbers below are reproducible with `bench/benchmark.sql` (`cargo pgrx run pg18`, then `\i bench/benchmark.sql`); they report `EXPLAIN (ANALYZE)` planning and execution time -separately, warm. +separately, warm, on PostgreSQL 18. **Faster execution.** 300 partitions × 8,000 rows (2.4M rows), `WHERE nk = `, PostgreSQL 18, warm: +partition>`: | | Planning | Execution | Total | |---|---|---|---| -| pruning **off** (scans all 300 partitions) | ~3 ms | ~100 ms | ~103 ms | -| pruning **on** (scans 1 partition) | ~12 ms | ~0.4 ms | **~12 ms** | +| pruning **off** (scans all 300 partitions) | ~4 ms | ~110 ms | ~114 ms | +| pruning **on** (scans 1 partition) | ~4 ms | ~0.4 ms | **~4 ms** | -Planning is ~4× slower, execution is ~230× faster, and total time drops ~8×. The win -grows with how much data the eliminated partitions hold, and shrinks as partitions get -smaller — on tiny partitions the planning overhead can exceed the execution it saves, so -measure your workload with `table_range.enable_pruning`. +Execution is ~250× faster, total time drops ~25×, and warm the planning overhead is in the +noise. The win grows with how much data the eliminated partitions hold; measure your +workload with `table_range.enable_pruning`. **Honest comparison to native pruning.** When a predicate is on the *partition key*, PostgreSQL prunes natively — and that path is in a different league, because it eliminates @@ -110,16 +111,15 @@ natively) and `nk` (the same values, not the key, pruned by table_range): | Same `=` predicate, 2,000 partitions | Planning | Execution | |---|---|---| -| native pruning — column **is** the partition key | **~0.1 ms** | ~0.02 ms | -| table_range — column is **not** the partition key | ~80 ms | ~0.06 ms | -| no pruning — scans all 2,000 partitions | ~30 ms | ~26 ms | +| native pruning — column **is** the partition key | **~0.15 ms** | ~0.05 ms | +| table_range — column is **not** the partition key | ~34 ms | ~0.06 ms | +| no pruning — scans all 2,000 partitions | ~28 ms | ~27 ms | Native pruning is *hundreds of times* cheaper to plan and is effectively constant in the partition count. table_range cannot match that (see [Scaling](#scaling-and-partition-count)): its job is the case native pruning **can't** do -— eliminating partitions by a non-key column. Against the realistic alternative for that -case (scanning every partition), it still wins on total time whenever the partitions are -sizeable. +— eliminating partitions by a non-key column. Note that table_range's overhead over the +no-pruning baseline (~28 ms to expand 2,000 partitions) is now small (~6 ms, ~3 µs/part). **Comparison to `CHECK` constraint exclusion.** The built-in way to prune on a non-key column is to put a data-range `CHECK (col BETWEEN lo AND hi)` on each partition and let the @@ -128,14 +128,15 @@ baseline. Same table, 2,000 partitions, same `nk = ` predicate: | Same `=` predicate, 2,000 partitions | Planning | Execution | Scans | |---|---|---|---| -| `CHECK` constraint exclusion (`constraint_exclusion=on`) | ~32 ms | ~0.08 ms | 1 partition | -| table_range pruning | ~84 ms | ~0.08 ms | 1 partition | -| no pruning | ~22 ms | ~24 ms | all 2,000 | +| `CHECK` constraint exclusion (`constraint_exclusion=on`) | ~37 ms | ~0.08 ms | 1 partition | +| table_range pruning | ~34 ms | ~0.08 ms | 1 partition | +| no pruning | ~26 ms | ~25 ms | all 2,000 | -Both are O(partitions) and give the **identical execution win**. Constraint exclusion plans -~2.6× faster — it is C code testing an already-loaded `CHECK` expression (~5 µs/partition), -while table_range reads each partition's index page (~31 µs/partition). What table_range -buys for that extra planning cost is everything `CHECK` constraints make you give up: +Both are O(partitions) and give the **identical execution win**, and **table_range now +plans on par with — and warm, slightly faster than — constraint exclusion.** (Constraint +exclusion re-parses each partition's `CHECK` expression on every plan; table_range serves +warm plans from a cached summary, see below.) On top of matching the speed, table_range +avoids everything `CHECK` constraints make you give up: - **No manual management** — `CREATE INDEX` builds and owns the ranges; you don't compute and attach a constraint per partition and keep it correct. @@ -144,14 +145,18 @@ buys for that extra planning cost is everything `CHECK` constraints make you giv - **Incremental maintenance** — changing a `CHECK` means `DROP`/`ADD CONSTRAINT` with a full-partition revalidation scan; table_range widens in place in `aminsert`, no rescan. -So table_range offers constraint-exclusion-class pruning without manual, enforced, -rescan-on-change constraints. Closing the ~2.6× planning gap (the per-partition index read) -is an active optimization target. +**How the per-partition cost got small.** Two optimizations took the per-partition planning +cost from ~31 µs to ~3–4 µs: -Each partition's summary is read from its own index page and cached for the duration of -one plan; the per-column compare function and the query constant are resolved once per -plan and reused across partitions (so the per-partition cost is a typed min/max compare, -not repeated catalog lookups). +1. *Per-plan compilation.* The compare function, type-input function, and operator strategy + are identical across a column's partitions, so they are resolved once per plan + (cached `FmgrInfo`s) instead of re-looked-up for each partition. +2. *Backend summary cache.* Each index's deserialized summary is cached for the life of the + backend, so warm/repeated plans skip the per-partition index open and metapage + read+deserialize entirely. The cache is kept coherent by a relcache invalidation + callback: `aminsert` only ever *widens* a summary, and when it does it invalidates the + cached copy everywhere — so a cached summary is never narrower than reality (a wider one + prunes correctly). A cold first plan still reads each page; every plan after is cached. ## Scaling and partition count @@ -172,10 +177,12 @@ Two practical consequences and how to handle them: `max_locks_per_transaction` (e.g. to a few thousand) and restart — it preallocates shared memory for the lock table, pushing the wall out in proportion. - **Planning time grows with partition count.** Even below the lock wall, planning scales - linearly. **Mitigations:** prefer **fewer, larger partitions** (table_range's sweet spot - — the execution win is biggest there anyway); use **prepared statements** so a plan is - reused across executions and the planning cost is amortized; and where you can, - **align the hot filter column with the partition key** so native pruning handles it. + linearly — though the per-partition constant is now small (~3–4 µs warm, on par with + `CHECK` constraint exclusion) thanks to the per-plan compilation and backend summary + cache described above. **Mitigations:** prefer **fewer, larger partitions** (table_range's + sweet spot — the execution win is biggest there anyway); use **prepared statements** so a + plan is reused across executions; and where you can, **align the hot filter column with + the partition key** so native pruning handles it. In short, table_range targets **hundreds to a few thousand sizeable partitions with a selective non-key predicate**. For tens of thousands of partitions, non-key pruning is not @@ -215,7 +222,8 @@ metapage (block 0), written by `ambuild` and updated in place by `aminsert`, lik | `src/lib.rs` | GUCs, `_PG_init`, test wiring | | `src/index_storage.rs` | per-index summary on the metapage: page I/O (Generic WAL) + (de)serialization | | `src/summary_build.rs` | build a leaf's summary by scanning its data (used by `ambuild`) | -| `src/prune_hook.rs` | planner + pathlist hooks, per-plan cache, typed in-memory evaluation | +| `src/prune_hook.rs` | planner + pathlist hooks, per-plan compilation cache, typed in-memory evaluation | +| `src/summary_cache.rs` | backend-lifetime per-index summary cache + relcache-invalidation coherence | | `src/index_am.rs` | `table_range` index AM: build, incremental `aminsert` widening, opclass provisioning | | `src/e2e_tests.rs`, `src/index_am_tests.rs` | end-to-end tests | diff --git a/src/index_am.rs b/src/index_am.rs index 21c9d3b..d8a1852 100644 --- a/src/index_am.rs +++ b/src/index_am.rs @@ -145,6 +145,9 @@ unsafe fn widen_on_insert( } if changed { let _ = index_storage::write_summary(index, &summary); + // The on-page summary just widened; drop any cached (now-too-narrow) copy in every + // backend so planning never prunes away the newly covered values. + crate::summary_cache::note_widened((*index).rd_id); } } diff --git a/src/lib.rs b/src/lib.rs index fc95806..eda56a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ mod index_am; mod index_storage; mod prune_hook; mod summary_build; +mod summary_cache; /// Master switch for planner-side partition pruning. pub(crate) static TABLE_RANGE_ENABLE_PRUNING: GucSetting = GucSetting::::new(true); @@ -38,6 +39,8 @@ pub extern "C-unwind" fn _PG_init() { // Install the real planner-time partition pruning hooks. prune_hook::install(); + // Register the relcache callback that keeps the per-index summary cache coherent. + summary_cache::register(); } #[cfg(any(test, feature = "pg_test"))] diff --git a/src/prune_hook.rs b/src/prune_hook.rs index e9e8704..e5ec95a 100644 --- a/src/prune_hook.rs +++ b/src/prune_hook.rs @@ -3,6 +3,7 @@ use pgrx::prelude::*; use std::cell::{Cell, RefCell}; use std::collections::HashMap; use std::ffi::{CStr, CString}; +use std::rc::Rc; use crate::index_storage::ColSummary; use crate::{TABLE_RANGE_ENABLE_PRUNING, TABLE_RANGE_LOG_PRUNING_DEBUG}; @@ -34,7 +35,7 @@ static mut PREV_PLANNER_HOOK: pg_sys::planner_hook_type = None; /// Per-partition summaries read from each partition's index, cached for one planner /// invocation (keyed by partition relid). A relid present with an empty vec means /// "checked, no table_range index / no summary". -type SummaryMap = HashMap>; +type SummaryMap = HashMap>>; thread_local! { /// Summaries read during the current planner invocation. Cleared per top-level plan. @@ -279,21 +280,33 @@ unsafe fn load_summary(rel: *mut pg_sys::RelOptInfo, relid_u32: u32) { }); } +/// Empty shared summary returned when a relation has no usable table_range summary. +fn empty_summary() -> Rc> { + thread_local!(static EMPTY: Rc> = Rc::new(Vec::new())); + EMPTY.with(Rc::clone) +} + // We rely on the planner having put the table_range index into `rel->indexlist`. The // planner only lists indexes with `indisvalid = true`, so a table_range index must be // valid for pruning to engage — if anything marks it invalid (e.g. an external // "hide indexes" DDL hook), `indexlist` omits it and we silently fall back to KEEP. -unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Vec { +unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Rc> { let am = table_range_am_oid(); if am == pg_sys::Oid::INVALID || (*rel).indexlist.is_null() { - return Vec::new(); + return empty_summary(); } let indexes = pgrx::PgList::::from_pg((*rel).indexlist); for idx in indexes.iter_ptr() { if idx.is_null() || (*idx).relam != am { continue; } - let irel = pg_sys::index_open((*idx).indexoid, pg_sys::AccessShareLock as i32); + let indexoid = (*idx).indexoid; + // Warm path: a backend-lifetime cache lets repeated plans skip the index open and + // metapage read+deserialize entirely (kept coherent by a relcache callback). + if let Some(cached) = crate::summary_cache::get(indexoid) { + return cached; + } + let irel = pg_sys::index_open(indexoid, pg_sys::AccessShareLock as i32); // A partitioned (parent) index has no storage; only leaf indexes hold summaries. let has_storage = (*(*irel).rd_rel).relkind != pg_sys::RELKIND_PARTITIONED_INDEX as std::ffi::c_char; @@ -303,9 +316,11 @@ unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Vec { None }; pg_sys::index_close(irel, pg_sys::AccessShareLock as i32); - return summary.map(|s| s.cols).unwrap_or_default(); + let cols = Rc::new(summary.map(|s| s.cols).unwrap_or_default()); + crate::summary_cache::put(indexoid, Rc::clone(&cols)); + return cols; } - Vec::new() + empty_summary() } /// Returns true iff some restriction clause proves the partition cannot match. @@ -321,8 +336,8 @@ unsafe fn evaluate_relation(rel: *mut pg_sys::RelOptInfo, relid: pg_sys::Oid) -> CACHE.with(|c| { let borrow = c.borrow(); - let rows = match borrow.get(&relid_u32) { - Some(rows) if !rows.is_empty() => rows, + let rows: &[ColSummary] = match borrow.get(&relid_u32) { + Some(rows) if !rows.is_empty() => rows.as_slice(), _ => return false, // no summary -> never prune }; diff --git a/src/summary_cache.rs b/src/summary_cache.rs new file mode 100644 index 0000000..1e27e70 --- /dev/null +++ b/src/summary_cache.rs @@ -0,0 +1,81 @@ +// Backend-lifetime cache of deserialized per-index summaries, so warm/repeated planning +// does not re-open each partition's index and re-read+deserialize its metapage on every +// plan. Keyed by index OID; values are shared (`Rc`) with the per-plan cache. +// +// Correctness rests on the over-inclusive invariant: a cached summary is safe as long as +// it is never *narrower* than the real data. Only `aminsert` widens a summary, and it +// calls `note_widened`, which sends a relcache invalidation for the index. That clears the +// cache in every backend (via `relcache_callback`) — locally at the next command boundary, +// and in other backends when the widening transaction commits, matching row visibility. +// Operations that only *narrow* a summary (deletes, vacuum re-tighten) need no +// invalidation, because an over-wide cached summary still prunes correctly. + +use crate::index_storage::ColSummary; +use pgrx::pg_sys; +use std::cell::{Cell, RefCell}; +use std::collections::{HashMap, HashSet}; +use std::rc::Rc; + +thread_local! { + /// indexoid -> its deserialized summary (empty vec = "no summary / not a leaf"). + static CACHE: RefCell>>> = RefCell::new(HashMap::new()); + /// Indexes already invalidated in the current transaction, to coalesce a bulk insert's + /// repeated widenings into one invalidation per index per transaction. + static INVALIDATED: RefCell> = RefCell::new(HashSet::new()); + /// TransactionId the `INVALIDATED` set belongs to; on change we clear the set. + static INVALIDATED_XID: Cell = const { Cell::new(0) }; +} + +/// Look up a cached summary for `indexoid`, if present. +pub fn get(indexoid: pg_sys::Oid) -> Option>> { + let key: u32 = indexoid.into(); + CACHE.with(|c| c.borrow().get(&key).cloned()) +} + +/// Store a deserialized summary for `indexoid`. +pub fn put(indexoid: pg_sys::Oid, summary: Rc>) { + let key: u32 = indexoid.into(); + CACHE.with(|c| { + c.borrow_mut().insert(key, summary); + }); +} + +/// Called by `aminsert` when it widens an index's on-page summary. Invalidates the cached +/// copy everywhere (once per index per transaction). +pub unsafe fn note_widened(indexoid: pg_sys::Oid) { + let key: u32 = indexoid.into(); + let xid: u32 = pg_sys::GetTopTransactionIdIfAny().into(); + if xid == 0 { + // No assigned xid to scope dedup to; invalidate unconditionally (safe). + pg_sys::CacheInvalidateRelcacheByRelid(indexoid); + return; + } + if INVALIDATED_XID.with(|x| x.get()) != xid { + INVALIDATED_XID.with(|x| x.set(xid)); + INVALIDATED.with(|s| s.borrow_mut().clear()); + } + let first = INVALIDATED.with(|s| s.borrow_mut().insert(key)); + if first { + pg_sys::CacheInvalidateRelcacheByRelid(indexoid); + } +} + +/// Relcache invalidation callback: drop the cached summary for `relid` (or all of them +/// when `relid` is invalid, PG's "flush everything" signal). +unsafe extern "C-unwind" fn relcache_callback(_arg: pg_sys::Datum, relid: pg_sys::Oid) { + if relid == pg_sys::Oid::INVALID { + CACHE.with(|c| c.borrow_mut().clear()); + } else { + let key: u32 = relid.into(); + CACHE.with(|c| { + c.borrow_mut().remove(&key); + }); + } +} + +/// Register the relcache invalidation callback. Called once from `_PG_init`. +pub fn register() { + unsafe { + pg_sys::CacheRegisterRelcacheCallback(Some(relcache_callback), pg_sys::Datum::from(0)); + } +}