Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 78 additions & 34 deletions src/Models/FallbackModel.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,104 +6,148 @@

use Codewithkyrian\Tokenizers\Contracts\ModelInterface;

/**
* A minimal vocabulary-mapping model with no subword algorithm.
*
* Used for tokenizers where tokens map 1:1 to characters or bytes
* without BPE, WordPiece, or Unigram segmentation — typically CTC
* models such as Wav2Vec2.
*
* `tokenize()` is an identity transform; `encode()` and `decode()`
* are simple dictionary lookups.
*/
class FallbackModel implements ModelInterface
{
/**
 * Forward lookup: maps token strings to integer IDs.
 *
 * PHP normalises numeric-string keys (e.g. '7') to integers, so keys
 * are not guaranteed to be strings at runtime.
 *
 * @var array<string, int>
 */
protected array $tokenToId = [];

/**
 * Reverse lookup: maps integer IDs back to token strings.
 *
 * @var array<int, string>
 */
protected array $idToToken = [];

/**
 * The unknown-token string, used as a fallback when a token
 * or ID cannot be found in the vocabulary. Null when no fallback
 * token is configured.
 */
protected ?string $unkToken;

/**
 * Build both lookup directions from a token → ID vocabulary.
 *
 * When several tokens share one ID, the last one listed wins in the
 * ID → token direction.
 *
 * @param array<string, int> $vocab    Token → ID mapping (e.g. `['e' => 5, …]`)
 * @param null|string        $unkToken Fallback string for unknown tokens / IDs
 */
public function __construct(array $vocab = [], ?string $unkToken = null)
{
    $this->tokenToId = $vocab;
    $this->unkToken = $unkToken;

    // PHP stores numeric-string keys such as '7' as integers, so a plain
    // array_flip() would place an int in the ID → token map. Cast every
    // key back to a string so decode() can always return strings.
    $this->idToToken = [];
    foreach ($vocab as $token => $id) {
        $this->idToToken[$id] = (string) $token;
    }
}

/**
 * Identity transform — returns the given tokens unchanged.
 *
 * No subword segmentation is applied; the input array is handed
 * back as-is.
 *
 * @param string[] $messages Input token strings
 *
 * @return string[] The same token strings, in the same order
 */
public function tokenize(array $messages): array
{
return $messages;
}

/**
 * Convert token strings to their integer IDs.
 *
 * Unknown tokens resolve to the unk-token's ID if an unk token is
 * configured and present in the vocabulary, otherwise to 0.
 *
 * @param string[] $tokens Token strings to encode
 *
 * @return int[] Integer token IDs, in input order
 */
public function encode(array $tokens): array
{
    // Resolve the fallback once rather than per token. The explicit null
    // check matters: a null array offset is treated as the '' key, which
    // would wrongly pick up an empty-string token's ID when no unk token
    // is configured.
    $fallbackId = null !== $this->unkToken
        ? ($this->tokenToId[$this->unkToken] ?? 0)
        : 0;

    return array_map(
        fn (string $token): int => $this->tokenToId[$token] ?? $fallbackId,
        $tokens,
    );
}

/**
 * Convert integer IDs back to their token strings.
 *
 * Unknown IDs resolve to the unk-token string if one is configured,
 * otherwise to an empty string.
 *
 * @param int[] $ids Integer token IDs to decode
 *
 * @return string[] Token strings, in input order
 */
public function decode(array $ids): array
{
    $fallback = $this->unkToken ?? '';

    return array_map(
        // The cast guards the declared string return: a numeric token that
        // passed through PHP's array-key normalisation is stored as an int.
        fn (int $id): string => (string) ($this->idToToken[$id] ?? $fallback),
        $ids,
    );
}

/**
 * Return the full ID → token vocabulary.
 *
 * NOTE(review): getConfig('vocab') exposes the opposite (token → ID)
 * direction; confirm which direction ModelInterface callers expect here.
 *
 * @return array<int, string>
 */
public function getVocab(): array
{
    return $this->idToToken;
}

/**
 * Return the number of entries in the ID → token map.
 *
 * Tokens that share an ID are counted once, because the count is taken
 * over distinct IDs.
 */
public function getVocabSize(): int
{
    return \count($this->idToToken);
}

/**
 * Add a token or override an existing one.
 *
 * Both lookup directions are written. When an existing token is given a
 * new ID, its previous ID → token entry is left in place.
 *
 * @param string $token The token string
 * @param int    $id    The integer ID to assign
 */
public function addToken(string $token, int $id): void
{
    $this->tokenToId[$token] = $id;
    $this->idToToken[$id] = $token;
}

/**
 * Return configuration, a single config key, or a default value.
 *
 * Known keys are `vocab` (the token → ID map) and `unk_token`.
 * Any other key yields `$default`.
 *
 * @param null|string $key     Config key to read, or null for the whole config
 * @param mixed       $default Value returned when `$key` is not a known key
 *
 * @return ($key is null ? array<string, mixed> : mixed)
 */
public function getConfig(?string $key = null, mixed $default = null): mixed
{
    // Single source of truth, so keyed lookups and the full config can
    // never disagree about what 'vocab' holds.
    $config = [
        'vocab' => $this->tokenToId,
        'unk_token' => $this->unkToken,
    ];

    if (null === $key) {
        return $config;
    }

    // array_key_exists rather than ??, so a null unk_token is returned
    // as null instead of being replaced by $default.
    return \array_key_exists($key, $config) ? $config[$key] : $default;
}
Expand Down
2 changes: 1 addition & 1 deletion src/PreTokenizers/DigitsPreTokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public function preTokenize(array|string $text, array $options = []): array
return $result;
}

preg_match_all($this->pattern, $text, $matches, \PREG_SPLIT_NO_EMPTY);
preg_match_all($this->pattern, $text, $matches);

return $matches[0] ?? [];
}
Expand Down
Loading