From 82a3ce4ec30591c6d4b8eb0fde34a84400645a37 Mon Sep 17 00:00:00 2001
From: Kyrian Obikwelu
Date: Wed, 17 Jun 2026 21:43:53 +0100
Subject: [PATCH] fix: swap encode/decode to read correct vocab direction

Store the vocabulary as two explicit maps, token => ID and ID => token,
so that encode() looks tokens up in the former and decode() looks IDs up
in the latter. getVocab(), getVocabSize() and getConfig() all report the
token => ID map.

Also drop PREG_SPLIT_NO_EMPTY, a preg_split() flag, from the
preg_match_all() call in DigitsPreTokenizer.

---
 src/Models/FallbackModel.php             | 112 ++++++++++++++++-------
 src/PreTokenizers/DigitsPreTokenizer.php |   2 +-
 2 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/src/Models/FallbackModel.php b/src/Models/FallbackModel.php
index 01af33f..bfa8714 100644
--- a/src/Models/FallbackModel.php
+++ b/src/Models/FallbackModel.php
@@ -6,40 +6,55 @@
 
 use Codewithkyrian\Tokenizers\Contracts\ModelInterface;
 
+/**
+ * A minimal vocabulary-mapping model with no subword algorithm.
+ *
+ * Used for tokenizers where tokens map 1:1 to characters or bytes
+ * without BPE, WordPiece, or Unigram segmentation — typically CTC
+ * models such as Wav2Vec2.
+ *
+ * `tokenize()` is an identity transform; `encode()` and `decode()`
+ * are simple dictionary lookups.
+ */
 class FallbackModel implements ModelInterface
 {
     /**
+     * Maps token strings to integer IDs.
+     *
+     * @var array
+     */
+    protected array $tokenToId = [];
+
+    /**
+     * Maps integer IDs back to token strings.
+     *
      * @var array
      */
-    protected array $vocab = [];
+    protected array $idToToken = [];
 
     /**
-     * @var array
+     * The unknown-token string, used as a fallback when a token
+     * or ID cannot be found in the vocabulary.
      */
-    protected array $vocabReversed = [];
     protected ?string $unkToken;
 
     /**
-     * @param array       $vocab    the vocabulary
-     * @param null|string $unkToken the unknown token
+     * @param array       $vocab    Token → ID mapping (e.g. `['e' => 5, …]`)
+     * @param null|string $unkToken Fallback string for unknown tokens / IDs
      */
-    public function __construct(
-        array $vocab = [],
-        ?string $unkToken = null
-    ) {
+    public function __construct(array $vocab = [], ?string $unkToken = null)
+    {
+        $this->tokenToId = $vocab;
+        $this->idToToken = array_flip($this->tokenToId);
         $this->unkToken = $unkToken;
-
-        // Populate vocab
-        foreach ($vocab as $token => $id) {
-            $this->vocab[$token] = $id;
-            $this->vocabReversed[$id] = $token;
-        }
     }
 
     /**
-     * @param string[] $messages the messages to tokenize
+     * Identity transform — returns tokens unchanged.
+     *
+     * @param string[] $messages Input token strings
      *
-     * @return string[]
+     * @return string[] Same token strings, unchanged
      */
     public function tokenize(array $messages): array
     {
@@ -47,63 +62,92 @@ public function tokenize(array $messages): array
     }
 
     /**
-     * @param string[] $tokens the tokens to encode
+     * Convert token strings to their integer IDs.
      *
-     * @return int[]
+     * Unknown tokens resolve to the unk-token's ID if available,
+     * otherwise to 0.
+     *
+     * @param string[] $tokens Token strings to encode
+     *
+     * @return int[] Integer token IDs
      */
     public function encode(array $tokens): array
     {
-        return array_map(function ($token) {
-            return $this->vocabReversed[$token] ?? $this->vocabReversed[$this->unkToken] ?? 0;
-        }, $tokens);
+        return array_map(
+            fn (string $token): int => $this->tokenToId[$token]
+                ?? $this->tokenToId[$this->unkToken]
+                ?? 0,
+            $tokens,
+        );
     }
 
     /**
-     * @param int[] $ids the IDs to decode
+     * Convert integer IDs back to their token strings.
+     *
+     * Unknown IDs resolve to the unk-token string if available,
+     * otherwise to an empty string.
      *
-     * @return string[]
+     * @param int[] $ids Integer token IDs to decode
+     *
+     * @return string[] Token strings
      */
     public function decode(array $ids): array
     {
-        return array_map(fn ($id) => $this->vocab[$id] ?? $this->unkToken ?? '', $ids);
+        return array_map(
+            fn (int $id): string => $this->idToToken[$id]
+                ?? $this->unkToken
+                ?? '',
+            $ids,
+        );
     }
 
     /**
+     * Return the full token → ID vocabulary.
+     *
      * @return array
      */
     public function getVocab(): array
     {
-        return $this->vocab;
+        return $this->tokenToId;
     }
 
+    /**
+     * Return the number of tokens in the vocabulary.
+     */
     public function getVocabSize(): int
     {
-        return \count($this->vocab);
+        return \count($this->tokenToId);
     }
 
     /**
-     * @param string $token the token to add
-     * @param int    $id    the ID of the token
+     * Add a token or override an existing one.
+     *
+     * @param string $token The token string
+     * @param int    $id    The integer ID to assign
      */
     public function addToken(string $token, int $id): void
     {
-        $this->vocab[$id] = $token;
-        $this->vocabReversed[$token] = $id;
+        $this->tokenToId[$token] = $id;
+        $this->idToToken[$id] = $token;
     }
 
+    /**
+     * Return configuration, a single config key, or a default value.
+     *
+     * @return ($key is null ? array : mixed)
+     */
     public function getConfig(?string $key = null, mixed $default = null): mixed
     {
         if (null !== $key) {
             return match ($key) {
-                'vocab' => $this->vocab,
+                'vocab' => $this->tokenToId,
                 'unk_token' => $this->unkToken,
                 default => $default,
             };
         }
 
-        // 2. Full Config Reconstruction
         return [
-            'vocab' => $this->vocab,
+            'vocab' => $this->tokenToId,
             'unk_token' => $this->unkToken,
         ];
     }
diff --git a/src/PreTokenizers/DigitsPreTokenizer.php b/src/PreTokenizers/DigitsPreTokenizer.php
index 79a7a47..4dc7d2e 100644
--- a/src/PreTokenizers/DigitsPreTokenizer.php
+++ b/src/PreTokenizers/DigitsPreTokenizer.php
@@ -28,7 +28,7 @@ public function preTokenize(array|string $text, array $options = []): array
             return $result;
         }
 
-        preg_match_all($this->pattern, $text, $matches, \PREG_SPLIT_NO_EMPTY);
+        preg_match_all($this->pattern, $text, $matches);
 
         return $matches[0] ?? [];
     }