From a883b098a36b933e6da315dd6883d8c549aef9dc Mon Sep 17 00:00:00 2001 From: AmitMY Date: Fri, 26 Jun 2026 16:46:03 +0200 Subject: [PATCH] perf: intern per-character Nodes in characters() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit characters() allocated a fresh Node per character. Share one immutable Node per character via a lazy cache (the unbounded analogue of _BYTE_NODES for bytes). Nodes are frozen, so sharing is safe and has no memo/settings dependency. This deduplicates repeated characters (saving memory) and lets equal characters compare by identity, so merge's tuple comparisons hit CPython's identity short-circuit (saving time) — the same effect #36 gave the UTF-8 byte layer. On repeated multilingual text with units="characters": about 36% less time and about 45% less peak memory. Output is identical; 137 tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- complex_tokenization/graphs/units.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/complex_tokenization/graphs/units.py b/complex_tokenization/graphs/units.py index c29cf92..c44af32 100644 --- a/complex_tokenization/graphs/units.py +++ b/complex_tokenization/graphs/units.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from functools import cache import regex @@ -27,8 +28,16 @@ def _get_handler(cluster: str) -> Callable[[str], GraphVertex] | None: return None +# Share one immutable Node per character (like _BYTE_NODES for bytes, but +# unbounded so cached lazily): dedups repeated characters and lets equal ones +# compare by identity. Nodes are frozen, so sharing is safe. +@cache +def _char_node(char: str) -> Node: + return Node(char.encode("utf-8")) + + def characters(s: str) -> GraphVertex: - nodes = [Node(c.encode("utf-8")) for c in s] + nodes = [_char_node(c) for c in s] if len(nodes) == 1: return nodes[0]