Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion complex_tokenization/graphs/units.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections.abc import Callable
from functools import cache

import regex

Expand Down Expand Up @@ -27,8 +28,16 @@ def _get_handler(cluster: str) -> Callable[[str], GraphVertex] | None:
return None


# Share one immutable Node per character (like _BYTE_NODES for bytes, but
# unbounded so cached lazily): dedups repeated characters and lets equal ones
# compare by identity. Nodes are frozen, so sharing is safe.
@cache
def _char_node(char: str) -> Node:
    """Return the shared ``Node`` holding the UTF-8 bytes of ``char``.

    Memoized with ``functools.cache`` (no size limit), so every call with an
    equal ``char`` gets back the same ``Node`` object instead of a fresh one.
    """
    utf8_bytes = char.encode("utf-8")
    return Node(utf8_bytes)


def characters(s: str) -> GraphVertex:
nodes = [Node(c.encode("utf-8")) for c in s]
nodes = [_char_node(c) for c in s]

if len(nodes) == 1:
return nodes[0]
Expand Down
Loading