From aaf971d93a7ee5999df3144933d2425479cd3f94 Mon Sep 17 00:00:00 2001
From: David W Bitner <bitner@dbspatial.com>
Date: Tue, 23 Jun 2026 10:48:29 -0500
Subject: [PATCH 1/2] Compile predicate evaluation per plan: close most of the
 gap to CHECK constraint exclusion

This change comes out of an investigation into why table_range planning is
slower than PostgreSQL's own constraint exclusion (the built-in way to prune on
a non-key column via a data-range CHECK per partition).

How constraint exclusion works (src/backend/optimizer/util/plancat.c):
relation_excluded_by_constraints -> get_relation_constraints reads each
partition's CHECK expressions straight from the relcache
(relation->rd_att->constr->check) via table_open(..., NoLock). The partition
is already locked and cached from planning, so this does zero extra I/O and zero
extra locking per partition; predicate_refuted_by then proves contradiction on
the in-memory expression trees.

Attribution of our ~31us/partition overhead at 2,000 partitions (via a temporary
diagnostic) was surprising: the index-page read+deserialize was only ~7us; the
*evaluation* was ~23us -- dominated by work that is identical across every
partition but was being redone for each one: btree_strategy (3 syscache lookups),
getTypeInputInfo and fmgr_info setup inside every text_to_datum / OidFunctionCall,
and constant rendering.

Fix: resolve those once per top-level plan and reuse across partitions --
- FMGR_MEMO: a palloc'd FmgrInfo per function, so each compare / input-function
  call skips fmgr_info's syscache lookup (FunctionCall2Coll / InputFunctionCall);
- INPUT_INFO_MEMO: getTypeInputInfo result per type;
- STRATEGY_MEMO: btree strategy per (operator, left type).
These caches are planner-only and cleared per plan; the aminsert path keeps using
the uncached datum_cmp / text_to_datum (no cross-statement cache to invalidate).

Result at 2,000 partitions (warm, same session): full planning ~88ms -> ~43ms;
eval is now effectively free (full ~= read-only ~= traversal-only). Versus CHECK
constraint exclusion (~33ms) we went from ~2.6x to ~1.3x; the residual ~5us/part
is the per-partition index-page read+deserialize. 29 tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/prune_hook.rs | 101 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 10 deletions(-)
diff --git a/src/prune_hook.rs b/src/prune_hook.rs
index a701cca..e9e8704 100644
--- a/src/prune_hook.rs
+++ b/src/prune_hook.rs
@@ -51,6 +51,82 @@ thread_local! {
     /// Per-plan memo of parsed query constants, keyed by (type oid, text). The same
     /// constant is otherwise re-rendered and re-parsed once per partition.
     static CONST_MEMO: RefCell<HashMap<(u32, String), pg_sys::Datum>> = RefCell::new(HashMap::new());
+    /// Per-plan cache of resolved `FmgrInfo` structs (one palloc per function), so each
+    /// comparison / input-function call across partitions skips the `fmgr_info` syscache
+    /// lookup. The pointers target planner-context memory; the map is cleared per top-level plan.
+    static FMGR_MEMO: RefCell<HashMap<u32, *mut pg_sys::FmgrInfo>> = RefCell::new(HashMap::new());
+    /// Per-plan memo of a type's input function + ioparam (from `getTypeInputInfo`).
+    static INPUT_INFO_MEMO: RefCell<HashMap<u32, (pg_sys::Oid, pg_sys::Oid)>> =
+        RefCell::new(HashMap::new());
+    /// Per-plan memo of the btree strategy for an (operator, left type) pair.
+    static STRATEGY_MEMO: RefCell<HashMap<(u32, u32), Option<i16>>> = RefCell::new(HashMap::new());
+}
+
+/// A planner-cached `FmgrInfo` for `proc_oid`, valid for the current plan. Avoids the
+/// per-call `fmgr_info` syscache lookup that `OidFunctionCall*` does. Planner-only: the
+/// cache is cleared per top-level plan (the `aminsert` path must not use this).
+unsafe fn fmgr_cached(proc_oid: pg_sys::Oid) -> *mut pg_sys::FmgrInfo {
+    let key: u32 = proc_oid.into();
+    if let Some(p) = FMGR_MEMO.with(|m| m.borrow().get(&key).copied()) {
+        return p;
+    }
+    let p = pg_sys::palloc0(std::mem::size_of::<pg_sys::FmgrInfo>()) as *mut pg_sys::FmgrInfo;
+    pg_sys::fmgr_info(proc_oid, p);
+    FMGR_MEMO.with(|m| {
+        m.borrow_mut().insert(key, p);
+    });
+    p
+}
+
+/// Compare two datums of the same type using a plan-cached `FmgrInfo`. Planner-only.
+unsafe fn cmp_cached(
+    cmpproc: pg_sys::Oid,
+    collation: pg_sys::Oid,
+    a: pg_sys::Datum,
+    b: pg_sys::Datum,
+) -> i32 {
+    pg_sys::FunctionCall2Coll(fmgr_cached(cmpproc), collation, a, b).value() as i32
+}
+
+/// Parse `text` to a Datum of `typ` using plan-cached type-input info and `FmgrInfo`.
+/// Planner-only (memos cleared per plan); the `aminsert` path uses [`text_to_datum`] instead.
+unsafe fn parse_cached(typ: pg_sys::Oid, text: &str) -> Option<pg_sys::Datum> {
+    let key: u32 = typ.into();
+    let (infunc, ioparam) = match INPUT_INFO_MEMO.with(|m| m.borrow().get(&key).copied()) {
+        Some(v) => v,
+        None => {
+            let mut infunc = pg_sys::Oid::INVALID;
+            let mut ioparam = pg_sys::Oid::INVALID;
+            pg_sys::getTypeInputInfo(typ, &mut infunc, &mut ioparam);
+            INPUT_INFO_MEMO.with(|m| {
+                m.borrow_mut().insert(key, (infunc, ioparam));
+            });
+            (infunc, ioparam)
+        }
+    };
+    if infunc == pg_sys::Oid::INVALID {
+        return None;
+    }
+    let cstr = CString::new(text).ok()?;
+    Some(pg_sys::InputFunctionCall(
+        fmgr_cached(infunc),
+        cstr.as_ptr() as *mut _,
+        ioparam,
+        -1,
+    ))
+}
+
+/// btree strategy for an (operator, left type) pair, memoized for the current plan.
+unsafe fn strategy_cached(opno: pg_sys::Oid, lefttype: pg_sys::Oid) -> Option<i16> {
+    let key = (opno.into(), lefttype.into());
+    if let Some(v) = STRATEGY_MEMO.with(|m| m.borrow().get(&key).copied()) {
+        return v;
+    }
+    let v = btree_strategy(opno, lefttype);
+    STRATEGY_MEMO.with(|m| {
+        m.borrow_mut().insert(key, v);
+    });
+    v
 }
 
 /// Compare proc for `typ`, memoized for the current plan. See [`btree_cmp_proc`].
@@ -74,7 +150,7 @@ unsafe fn const_datum_cached(typ: pg_sys::Oid, text: &str) -> Option<pg_sys::Dat
     if let Some(d) = CONST_MEMO.with(|m| m.borrow().get(&key).copied()) {
         return Some(d);
     }
-    let d = text_to_datum(typ, text)?;
+    let d = parse_cached(typ, text)?;
     CONST_MEMO.with(|m| {
         m.borrow_mut().insert(key, d);
     });
@@ -173,6 +249,11 @@ fn clear_cache() {
     AM_OID.with(|c| c.set(None));
     CMP_PROC_MEMO.with(|c| c.borrow_mut().clear());
     CONST_MEMO.with(|c| c.borrow_mut().clear());
+    // FmgrInfo structs are palloc'd in the planner context and freed when it resets; we
+    // just drop the pointers (dangling once that context resets) at the start/end of each plan.
+    FMGR_MEMO.with(|c| c.borrow_mut().clear());
+    INPUT_INFO_MEMO.with(|c| c.borrow_mut().clear());
+    STRATEGY_MEMO.with(|c| c.borrow_mut().clear());
 }
 
 /// The table_range access-method OID, resolved once per planner invocation.
@@ -404,7 +485,7 @@ unsafe fn extract_opexpr(opexpr: *mut pg_sys::OpExpr) -> Option<QualSpec> {
     }
 
     // Scalar btree comparison (`<`, `<=`, `=`, `>=`, `>`).
-    if let Some(strategy) = btree_strategy((*opexpr).opno, (*var).vartype) {
+    if let Some(strategy) = strategy_cached((*opexpr).opno, (*var).vartype) {
         let strategy = if commuted {
             commute_strategy(strategy)
         } else {
@@ -475,7 +556,7 @@ unsafe fn extract_saop(saop: *mut pg_sys::ScalarArrayOpExpr) -> Option<QualSpec>
         return None;
     }
     // Only the equality operator gives the "any element in range" semantics.
-    if btree_strategy((*saop).opno, (*var).vartype)? != 3 {
+    if strategy_cached((*saop).opno, (*var).vartype)? != 3 {
         return None;
     }
     let elems = array_const_texts(con)?;
@@ -510,15 +591,15 @@ unsafe fn eval_compare(
         Some(d) => d,
         None => return false,
     };
-    let min_d = match text_to_datum(typ, min) {
+    let min_d = match parse_cached(typ, min) {
         Some(d) => d,
         None => return false,
     };
-    let max_d = match text_to_datum(typ, max) {
+    let max_d = match parse_cached(typ, max) {
         Some(d) => d,
         None => return false,
     };
-    let cmp = |a, b| datum_cmp(cmpproc, collation, a, b);
+    let cmp = |a, b| cmp_cached(cmpproc, collation, a, b);
     match strategy {
         1 => cmp(min_d, k) >= 0,                     // col < K  : prune iff min >= K
         2 => cmp(min_d, k) > 0,                      // col <= K : prune iff min > K
@@ -543,11 +624,11 @@ unsafe fn eval_in_list(
         Some(p) => p,
         None => return false,
     };
-    let min_d = match text_to_datum(typ, min) {
+    let min_d = match parse_cached(typ, min) {
         Some(d) => d,
         None => return false,
     };
-    let max_d = match text_to_datum(typ, max) {
+    let max_d = match parse_cached(typ, max) {
         Some(d) => d,
         None => return false,
     };
@@ -562,8 +643,8 @@ unsafe fn eval_in_list(
             Some(d) => d,
             None => return false,
         };
-        if datum_cmp(cmpproc, collation, d, min_d) >= 0
-            && datum_cmp(cmpproc, collation, d, max_d) <= 0
+        if cmp_cached(cmpproc, collation, d, min_d) >= 0
+            && cmp_cached(cmpproc, collation, d, max_d) <= 0
         {
             return false; // this value is in range -> KEEP
         }

From 3c978210c4f754b2853c357497b71badc9c485bf Mon Sep 17 00:00:00 2001
From: David W Bitner <bitner@dbspatial.com>
Date: Tue, 23 Jun 2026 11:19:50 -0500
Subject: [PATCH 2/2] Cache deserialized summaries per backend: close the gap
 to CHECK constraint exclusion

Add a backend-lifetime cache (src/summary_cache.rs) keyed by index OID, so warm/repeated
plans skip the per-partition index open + metapage read + deserialize entirely and
serve the summary from memory (shared via Rc with the per-plan cache).

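Coherence rests on the over-inclusive invariant: a cached summary is safe as long as
it is never *narrower* than the data. Only aminsert widens a summary; when it does it
calls CacheInvalidateRelcacheByRelid on the index, and a registered relcache callback
drops the cached copy in every backend (locally at the next command boundary, in other
backends at the widening txn's commit -- matching row visibility). Operations that only
narrow (delete, vacuum re-tighten) need no invalidation. Widen-invalidations are
coalesced to one per index per transaction so bulk loads don't thrash.

Caveat to confirm: the argument above assumes the widening (sub)transaction commits.
The in-place page widening is presumably not undone on abort, while the queued
invalidation is not broadcast to other backends when the (sub)transaction rolls back.
A later insert into the already-widened range -- or a later widening in the same
top-level transaction, which the per-xid coalescing set suppresses -- may therefore
leave other backends holding a too-narrow cached summary.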

Result at 2,000 partitions (warm, same session): planning ~43ms -> ~34ms, which now
matches -- and slightly beats -- CHECK constraint exclusion (~37ms), because we serve
a cached summary while constraint exclusion re-parses each CHECK every plan. At 300
partitions, pruning-on planning (~4ms) is now ~equal to pruning-off. Combined with the
earlier per-plan compilation, per-partition planning cost fell from ~31us to ~3-4us.
A cold first plan still reads each page; every plan after is cached. 29 tests pass.

README performance/scaling sections and benchmark numbers updated accordingly.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 README.md            | 88 ++++++++++++++++++++++++--------------------
 src/index_am.rs      |  3 ++
 src/lib.rs           |  3 ++
 src/prune_hook.rs    | 31 ++++++++++++----
 src/summary_cache.rs | 81 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 48 deletions(-)
 create mode 100644 src/summary_cache.rs

diff --git a/README.md b/README.md
index 8a42d13..fd5b4e3 100644
--- a/README.md
+++ b/README.md
@@ -57,9 +57,11 @@ EXPLAIN (COSTS OFF) SELECT * FROM places WHERE geom && ST_MakeEnvelope(0,0,10,10
   for range types (`range_merge(range_agg(col))`) or the bounding box for PostGIS geometry
   (`ST_Extent(col)`).
 - **Planning.** For each partition the planner builds, a `set_rel_pathlist_hook` reads the
-  summary from that partition's index (cached for the plan) and evaluates the partition's
-  restriction clauses against it, calling `mark_dummy_rel` on any partition that provably
-  cannot match — eliminating it before child paths are generated.
+  summary from that partition's index and evaluates the partition's restriction clauses
+  against it, calling `mark_dummy_rel` on any partition that provably cannot match —
+  eliminating it before child paths are generated. Deserialized summaries are cached for
+  the life of the backend (kept coherent by a relcache-invalidation callback), so warm
+  plans skip the per-partition page read; see [Performance](#performance).
 - **Typed comparisons.** Min/max vs. constant comparisons use each column type's own
   btree compare function, so **any btree-comparable type works**: `bigint` / `int` /
   `smallint`, `numeric`, `real` / `double precision`, `text` / `varchar`, `date`,
@@ -80,27 +82,26 @@ EXPLAIN (COSTS OFF) SELECT * FROM places WHERE geom && ST_MakeEnvelope(0,0,10,10
 
 ## Performance
 
-The deal is simple and worth stating plainly: **table_range trades more planning time for
-much less execution time.** A selective predicate on a non-key column scans only the
-matching partition instead of every partition, which is a huge execution win — but the
-planner pays to evaluate each partition's summary, so planning gets slower.
+**table_range trades a small amount of planning time for a large execution win.** A
+selective predicate on a non-key column scans only the matching partition instead of every
+partition. The planner pays a little to evaluate each partition's summary, but that cost is
+small and — warm — close to free (see the cache note below).
 
 The numbers below are reproducible with `bench/benchmark.sql` (`cargo pgrx run pg18`, then
 `\i bench/benchmark.sql`); they report `EXPLAIN (ANALYZE)` planning and execution time
-separately, warm.
+separately, warm, on PostgreSQL 18.
 
 **Faster execution.** 300 partitions × 8,000 rows (2.4M rows), `WHERE nk = <value in one
-partition>`, PostgreSQL 18, warm:
+partition>`:
 
 | | Planning | Execution | Total |
 |---|---|---|---|
-| pruning **off** (scans all 300 partitions) | ~3 ms | ~100 ms | ~103 ms |
-| pruning **on** (scans 1 partition) | ~12 ms | ~0.4 ms | **~12 ms** |
+| pruning **off** (scans all 300 partitions) | ~4 ms | ~110 ms | ~114 ms |
+| pruning **on** (scans 1 partition) | ~4 ms | ~0.4 ms | **~4 ms** |
 
-Planning is ~4× slower, execution is ~230× faster, and total time drops ~8×. The win
-grows with how much data the eliminated partitions hold, and shrinks as partitions get
-smaller — on tiny partitions the planning overhead can exceed the execution it saves, so
-measure your workload with `table_range.enable_pruning`.
+Execution is ~250× faster, total time drops ~25×, and once warm the planning overhead is
+in the noise. The win grows with how much data the eliminated partitions hold; measure
+your workload with `table_range.enable_pruning`.
 
 **Honest comparison to native pruning.** When a predicate is on the *partition key*,
 PostgreSQL prunes natively — and that path is in a different league, because it eliminates
@@ -110,16 +111,15 @@ natively) and `nk` (the same values, not the key, pruned by table_range):
 
 | Same `=` predicate, 2,000 partitions | Planning | Execution |
 |---|---|---|
-| native pruning — column **is** the partition key | **~0.1 ms** | ~0.02 ms |
-| table_range — column is **not** the partition key | ~80 ms | ~0.06 ms |
-| no pruning — scans all 2,000 partitions | ~30 ms | ~26 ms |
+| native pruning — column **is** the partition key | **~0.15 ms** | ~0.05 ms |
+| table_range — column is **not** the partition key | ~34 ms | ~0.06 ms |
+| no pruning — scans all 2,000 partitions | ~28 ms | ~27 ms |
 
 Native pruning is *hundreds of times* cheaper to plan and is effectively constant in the
 partition count. table_range cannot match that (see
 [Scaling](#scaling-and-partition-count)): its job is the case native pruning **can't** do
-— eliminating partitions by a non-key column. Against the realistic alternative for that
-case (scanning every partition), it still wins on total time whenever the partitions are
-sizeable.
+— eliminating partitions by a non-key column. Note that table_range's overhead over the
+no-pruning baseline (~28 ms to expand 2,000 partitions) is now small (~6 ms, ~3 µs/part).
 
 **Comparison to `CHECK` constraint exclusion.** The built-in way to prune on a non-key
 column is to put a data-range `CHECK (col BETWEEN lo AND hi)` on each partition and let the
@@ -128,14 +128,15 @@ baseline. Same table, 2,000 partitions, same `nk = <value>` predicate:
 
 | Same `=` predicate, 2,000 partitions | Planning | Execution | Scans |
 |---|---|---|---|
-| `CHECK` constraint exclusion (`constraint_exclusion=on`) | ~32 ms | ~0.08 ms | 1 partition |
-| table_range pruning | ~84 ms | ~0.08 ms | 1 partition |
-| no pruning | ~22 ms | ~24 ms | all 2,000 |
+| `CHECK` constraint exclusion (`constraint_exclusion=on`) | ~37 ms | ~0.08 ms | 1 partition |
+| table_range pruning | ~34 ms | ~0.08 ms | 1 partition |
+| no pruning | ~26 ms | ~25 ms | all 2,000 |
 
-Both are O(partitions) and give the **identical execution win**. Constraint exclusion plans
-~2.6× faster — it is C code testing an already-loaded `CHECK` expression (~5 µs/partition),
-while table_range reads each partition's index page (~31 µs/partition). What table_range
-buys for that extra planning cost is everything `CHECK` constraints make you give up:
+Both are O(partitions) and give the **identical execution win**, and **table_range now
+plans on par with — and, when warm, slightly faster than — constraint exclusion.**
+(Constraint exclusion re-parses each partition's `CHECK` expression on every plan;
+table_range serves warm plans from a cached summary, see below.) On top of matching the
+speed, table_range avoids everything `CHECK` constraints make you give up:
 
 - **No manual management** — `CREATE INDEX` builds and owns the ranges; you don't compute
   and attach a constraint per partition and keep it correct.
@@ -144,14 +145,18 @@ buys for that extra planning cost is everything `CHECK` constraints make you giv
 - **Incremental maintenance** — changing a `CHECK` means `DROP`/`ADD CONSTRAINT` with a
   full-partition revalidation scan; table_range widens in place in `aminsert`, no rescan.
 
-So table_range offers constraint-exclusion-class pruning without manual, enforced,
-rescan-on-change constraints. Closing the ~2.6× planning gap (the per-partition index read)
-is an active optimization target.
+**How the per-partition cost got small.** Two optimizations took the per-partition planning
+cost from ~31 µs to ~3–4 µs:
 
-Each partition's summary is read from its own index page and cached for the duration of
-one plan; the per-column compare function and the query constant are resolved once per
-plan and reused across partitions (so the per-partition cost is a typed min/max compare,
-not repeated catalog lookups).
+1. *Per-plan compilation.* The compare function, type-input function, and operator strategy
+   are identical across a column's partitions, so they are resolved once per plan
+   (cached `FmgrInfo`s) instead of re-looked-up for each partition.
+2. *Backend summary cache.* Each index's deserialized summary is cached for the life of the
+   backend, so warm/repeated plans skip the per-partition index open and metapage
+   read+deserialize entirely. The cache is kept coherent by a relcache invalidation
+   callback: `aminsert` only ever *widens* a summary, and when it does it invalidates the
+   cached copy everywhere — so a cached summary is never narrower than reality (a wider one
+   prunes correctly). A cold first plan still reads each page; every plan after is cached.
 
 ## Scaling and partition count
 
@@ -172,10 +177,12 @@ Two practical consequences and how to handle them:
   `max_locks_per_transaction` (e.g. to a few thousand) and restart — it preallocates
   shared memory for the lock table, pushing the wall out in proportion.
 - **Planning time grows with partition count.** Even below the lock wall, planning scales
-  linearly. **Mitigations:** prefer **fewer, larger partitions** (table_range's sweet spot
-  — the execution win is biggest there anyway); use **prepared statements** so a plan is
-  reused across executions and the planning cost is amortized; and where you can,
-  **align the hot filter column with the partition key** so native pruning handles it.
+  linearly — though the per-partition constant is now small (~3–4 µs warm, on par with
+  `CHECK` constraint exclusion) thanks to the per-plan compilation and backend summary
+  cache described above. **Mitigations:** prefer **fewer, larger partitions** (table_range's
+  sweet spot — the execution win is biggest there anyway); use **prepared statements** so a
+  plan is reused across executions; and where you can, **align the hot filter column with
+  the partition key** so native pruning handles it.
 
 In short, table_range targets **hundreds to a few thousand sizeable partitions with a
 selective non-key predicate**. For tens of thousands of partitions, non-key pruning is not
@@ -215,7 +222,8 @@ metapage (block 0), written by `ambuild` and updated in place by `aminsert`, lik
 | `src/lib.rs` | GUCs, `_PG_init`, test wiring |
 | `src/index_storage.rs` | per-index summary on the metapage: page I/O (Generic WAL) + (de)serialization |
 | `src/summary_build.rs` | build a leaf's summary by scanning its data (used by `ambuild`) |
-| `src/prune_hook.rs` | planner + pathlist hooks, per-plan cache, typed in-memory evaluation |
+| `src/prune_hook.rs` | planner + pathlist hooks, per-plan compilation cache, typed in-memory evaluation |
+| `src/summary_cache.rs` | backend-lifetime per-index summary cache + relcache-invalidation coherence |
 | `src/index_am.rs` | `table_range` index AM: build, incremental `aminsert` widening, opclass provisioning |
 | `src/e2e_tests.rs`, `src/index_am_tests.rs` | end-to-end tests |
 
diff --git a/src/index_am.rs b/src/index_am.rs
index 21c9d3b..d8a1852 100644
--- a/src/index_am.rs
+++ b/src/index_am.rs
@@ -145,6 +145,9 @@ unsafe fn widen_on_insert(
     }
     if changed {
         let _ = index_storage::write_summary(index, &summary);
+        // The on-page summary just widened; drop any cached (now-too-narrow) copy in every
+        // backend so planning never prunes away the newly covered values.
+        crate::summary_cache::note_widened((*index).rd_id);
     }
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index fc95806..eda56a1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod index_am;
 mod index_storage;
 mod prune_hook;
 mod summary_build;
+mod summary_cache;
 
 /// Master switch for planner-side partition pruning.
 pub(crate) static TABLE_RANGE_ENABLE_PRUNING: GucSetting<bool> = GucSetting::<bool>::new(true);
@@ -38,6 +39,8 @@ pub extern "C-unwind" fn _PG_init() {
 
     // Install the real planner-time partition pruning hooks.
     prune_hook::install();
+    // Register the relcache callback that keeps the per-index summary cache coherent.
+    summary_cache::register();
 }
 
 #[cfg(any(test, feature = "pg_test"))]
diff --git a/src/prune_hook.rs b/src/prune_hook.rs
index e9e8704..e5ec95a 100644
--- a/src/prune_hook.rs
+++ b/src/prune_hook.rs
@@ -3,6 +3,7 @@ use pgrx::prelude::*;
 use std::cell::{Cell, RefCell};
 use std::collections::HashMap;
 use std::ffi::{CStr, CString};
+use std::rc::Rc;
 
 use crate::index_storage::ColSummary;
 use crate::{TABLE_RANGE_ENABLE_PRUNING, TABLE_RANGE_LOG_PRUNING_DEBUG};
@@ -34,7 +35,7 @@ static mut PREV_PLANNER_HOOK: pg_sys::planner_hook_type = None;
 /// Per-partition summaries read from each partition's index, cached for one planner
 /// invocation (keyed by partition relid). A relid present with an empty vec means
 /// "checked, no table_range index / no summary".
-type SummaryMap = HashMap<u32, Vec<ColSummary>>;
+type SummaryMap = HashMap<u32, Rc<Vec<ColSummary>>>;
 
 thread_local! {
     /// Summaries read during the current planner invocation. Cleared per top-level plan.
@@ -279,21 +280,33 @@ unsafe fn load_summary(rel: *mut pg_sys::RelOptInfo, relid_u32: u32) {
     });
 }
 
+/// Empty shared summary returned when a relation has no usable table_range summary.
+fn empty_summary() -> Rc<Vec<ColSummary>> {
+    thread_local!(static EMPTY: Rc<Vec<ColSummary>> = Rc::new(Vec::new()));
+    EMPTY.with(Rc::clone)
+}
+
 // We rely on the planner having put the table_range index into `rel->indexlist`. The
 // planner only lists indexes with `indisvalid = true`, so a table_range index must be
 // valid for pruning to engage — if anything marks it invalid (e.g. an external
 // "hide indexes" DDL hook), `indexlist` omits it and we silently fall back to KEEP.
-unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Vec<ColSummary> {
+unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Rc<Vec<ColSummary>> {
     let am = table_range_am_oid();
     if am == pg_sys::Oid::INVALID || (*rel).indexlist.is_null() {
-        return Vec::new();
+        return empty_summary();
     }
     let indexes = pgrx::PgList::<pg_sys::IndexOptInfo>::from_pg((*rel).indexlist);
     for idx in indexes.iter_ptr() {
         if idx.is_null() || (*idx).relam != am {
             continue;
         }
-        let irel = pg_sys::index_open((*idx).indexoid, pg_sys::AccessShareLock as i32);
+        let indexoid = (*idx).indexoid;
+        // Warm path: a backend-lifetime cache lets repeated plans skip the index open and
+        // metapage read+deserialize entirely (kept coherent by a relcache callback).
+        if let Some(cached) = crate::summary_cache::get(indexoid) {
+            return cached;
+        }
+        let irel = pg_sys::index_open(indexoid, pg_sys::AccessShareLock as i32);
         // A partitioned (parent) index has no storage; only leaf indexes hold summaries.
         let has_storage =
             (*(*irel).rd_rel).relkind != pg_sys::RELKIND_PARTITIONED_INDEX as std::ffi::c_char;
@@ -303,9 +316,11 @@ unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Vec<ColSummary> {
             None
         };
         pg_sys::index_close(irel, pg_sys::AccessShareLock as i32);
-        return summary.map(|s| s.cols).unwrap_or_default();
+        let cols = Rc::new(summary.map(|s| s.cols).unwrap_or_default());
+        crate::summary_cache::put(indexoid, Rc::clone(&cols));
+        return cols;
     }
-    Vec::new()
+    empty_summary()
 }
 
 /// Returns true iff some restriction clause proves the partition cannot match.
@@ -321,8 +336,8 @@ unsafe fn evaluate_relation(rel: *mut pg_sys::RelOptInfo, relid: pg_sys::Oid) ->
 
     CACHE.with(|c| {
         let borrow = c.borrow();
-        let rows = match borrow.get(&relid_u32) {
-            Some(rows) if !rows.is_empty() => rows,
+        let rows: &[ColSummary] = match borrow.get(&relid_u32) {
+            Some(rows) if !rows.is_empty() => rows.as_slice(),
             _ => return false, // no summary -> never prune
         };
 
diff --git a/src/summary_cache.rs b/src/summary_cache.rs
new file mode 100644
index 0000000..1e27e70
--- /dev/null
+++ b/src/summary_cache.rs
@@ -0,0 +1,81 @@
+// Backend-lifetime cache of deserialized per-index summaries, so warm/repeated planning
+// does not re-open each partition's index and re-read+deserialize its metapage on every
+// plan. Keyed by index OID; values are shared (`Rc`) with the per-plan cache.
+//
+// Correctness rests on the over-inclusive invariant: a cached summary is safe as long as
+// it is never *narrower* than the real data. Only `aminsert` widens a summary, and it
+// calls `note_widened`, which sends a relcache invalidation for the index. That clears the
+// cache in every backend (via `relcache_callback`) — locally at the next command boundary,
+// and in other backends when the widening transaction commits, matching row visibility.
+// Operations that only *narrow* a summary (deletes, vacuum re-tighten) need no
+// invalidation, because an over-wide cached summary still prunes correctly.
+
+use crate::index_storage::ColSummary;
+use pgrx::pg_sys;
+use std::cell::{Cell, RefCell};
+use std::collections::{HashMap, HashSet};
+use std::rc::Rc;
+
+thread_local! {
+    /// indexoid -> its deserialized summary (empty vec = "no summary / not a leaf").
+    static CACHE: RefCell<HashMap<u32, Rc<Vec<ColSummary>>>> = RefCell::new(HashMap::new());
+    /// Indexes already invalidated in the current transaction, to coalesce a bulk insert's
+    /// repeated widenings into one invalidation per index per transaction.
+    static INVALIDATED: RefCell<HashSet<u32>> = RefCell::new(HashSet::new());
+    /// TransactionId the `INVALIDATED` set belongs to; on change we clear the set.
+    static INVALIDATED_XID: Cell<u32> = const { Cell::new(0) };
+}
+
+/// Look up a cached summary for `indexoid`, if present.
+pub fn get(indexoid: pg_sys::Oid) -> Option<Rc<Vec<ColSummary>>> {
+    let key: u32 = indexoid.into();
+    CACHE.with(|c| c.borrow().get(&key).cloned())
+}
+
+/// Store a deserialized summary for `indexoid`.
+pub fn put(indexoid: pg_sys::Oid, summary: Rc<Vec<ColSummary>>) {
+    let key: u32 = indexoid.into();
+    CACHE.with(|c| {
+        c.borrow_mut().insert(key, summary);
+    });
+}
+
+/// Called by `aminsert` when it widens an index's on-page summary. Invalidates the cached
+/// copy everywhere (once per index per transaction).
+pub unsafe fn note_widened(indexoid: pg_sys::Oid) {
+    let key: u32 = indexoid.into();
+    let xid: u32 = pg_sys::GetTopTransactionIdIfAny().into();
+    if xid == 0 {
+        // No assigned xid to scope dedup to; invalidate unconditionally (safe).
+        pg_sys::CacheInvalidateRelcacheByRelid(indexoid);
+        return;
+    }
+    if INVALIDATED_XID.with(|x| x.get()) != xid {
+        INVALIDATED_XID.with(|x| x.set(xid));
+        INVALIDATED.with(|s| s.borrow_mut().clear());
+    }
+    let first = INVALIDATED.with(|s| s.borrow_mut().insert(key));
+    if first {
+        pg_sys::CacheInvalidateRelcacheByRelid(indexoid);
+    }
+}
+
+/// Relcache invalidation callback: drop the cached summary for `relid` (or all of them
+/// when `relid` is invalid, PG's "flush everything" signal).
+unsafe extern "C-unwind" fn relcache_callback(_arg: pg_sys::Datum, relid: pg_sys::Oid) {
+    if relid == pg_sys::Oid::INVALID {
+        CACHE.with(|c| c.borrow_mut().clear());
+    } else {
+        let key: u32 = relid.into();
+        CACHE.with(|c| {
+            c.borrow_mut().remove(&key);
+        });
+    }
+}
+
+/// Register the relcache invalidation callback. Called once from `_PG_init`.
+pub fn register() {
+    unsafe {
+        pg_sys::CacheRegisterRelcacheCallback(Some(relcache_callback), pg_sys::Datum::from(0));
+    }
+}