From a883b098a36b933e6da315dd6883d8c549aef9dc Mon Sep 17 00:00:00 2001
From: AmitMY <amit@nagish.com>
Date: Fri, 26 Jun 2026 16:46:03 +0200
Subject: [PATCH] perf: intern per-character Nodes in characters()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

characters() allocated a fresh Node for every character. Share one immutable
Node per distinct character through a lazily filled cache (the analogue of
_BYTE_NODES for bytes, filled on demand because any Unicode character may
occur). Nodes are frozen, so sharing is safe and does not depend on memo or
settings state.

This deduplicates repeated characters (saving memory) and lets equal
characters compare by identity, so merge's tuple comparisons hit CPython's
identity short-circuit (saving time) — the same effect #36 gave the utf8 byte
layer. On repeated multilingual text with units="characters": about 36% less
time and about 45% less peak memory. Output is identical; 137 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 complex_tokenization/graphs/units.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/complex_tokenization/graphs/units.py b/complex_tokenization/graphs/units.py
index c29cf92..c44af32 100644
--- a/complex_tokenization/graphs/units.py
+++ b/complex_tokenization/graphs/units.py
@@ -1,4 +1,5 @@
 from collections.abc import Callable
+from functools import cache
 
 import regex
 
@@ -27,8 +28,16 @@ def _get_handler(cluster: str) -> Callable[[str], GraphVertex] | None:
     return None
 
 
+# Share one immutable Node per distinct character (like _BYTE_NODES for bytes,
+# but filled lazily: any code point may occur). Deduplicates repeats and lets
+# equal characters compare by identity. Nodes are frozen, so sharing is safe.
+@cache
+def _char_node(char: str) -> Node:
+    return Node(char.encode("utf-8"))
+
+
 def characters(s: str) -> GraphVertex:
-    nodes = [Node(c.encode("utf-8")) for c in s]
+    nodes = [_char_node(c) for c in s]
 
     if len(nodes) == 1:
         return nodes[0]