From fd9d5fba5e7a7c06dd52337ac0625706147f21dd Mon Sep 17 00:00:00 2001 From: Kyrian Obikwelu Date: Fri, 6 Feb 2026 11:08:20 +0100 Subject: [PATCH] feat: implement getConfig for tokenizer and all components --- src/Contracts/DecoderInterface.php | 10 ++ src/Contracts/ModelInterface.php | 10 +- src/Contracts/NormalizerInterface.php | 10 ++ src/Contracts/PostProcessorInterface.php | 10 ++ src/Contracts/PreTokenizerInterface.php | 10 ++ src/DataStructures/AddedToken.php | 18 +++- src/Decoders/BPEDecoder.php | 16 ++++ src/Decoders/ByteFallbackDecoder.php | 9 ++ src/Decoders/ByteLevelDecoder.php | 13 ++- src/Decoders/CTCDecoder.php | 20 ++++ src/Decoders/DecoderSequence.php | 16 ++++ src/Decoders/FuseDecoder.php | 16 ++++ src/Decoders/MetaspaceDecoder.php | 18 ++++ src/Decoders/ReplaceDecoder.php | 18 ++++ src/Decoders/StripDecoder.php | 20 ++++ src/Decoders/WordPieceDecoder.php | 18 ++++ src/Factories/DecoderFactory.php | 4 +- src/Factories/NormalizerFactory.php | 5 + src/Factories/PostProcessorFactory.php | 5 +- src/Factories/PreTokenizerFactory.php | 5 + src/Models/AbstractModel.php | 11 --- src/Models/BPEModel.php | 34 ++++++- src/Models/FallbackModel.php | 16 +++- src/Models/UnigramModel.php | 35 +++++++ src/Models/WordPieceModel.php | 22 +++++ src/Normalizers/BertNormalizer.php | 22 +++++ src/Normalizers/LowercaseNormalizer.php | 9 ++ src/Normalizers/NFCNormalizer.php | 9 ++ src/Normalizers/NFKCNormalizer.php | 9 ++ src/Normalizers/NFKDNormalizer.php | 9 ++ src/Normalizers/NormalizerSequence.php | 16 ++++ src/Normalizers/PassThroughNormalizer.php | 5 + src/Normalizers/PrecompiledNormalizer.php | 22 ++++- src/Normalizers/PrependNormalizer.php | 16 ++++ src/Normalizers/ReplaceNormalizer.php | 18 ++++ src/Normalizers/StripAccentsNormalizer.php | 9 ++ src/Normalizers/StripNormalizer.php | 18 ++++ src/PostProcessors/BertPostProcessor.php | 18 ++++ src/PostProcessors/ByteLevelPostProcessor.php | 20 ++++ src/PostProcessors/DefaultPostProcessor.php | 5 + 
src/PostProcessors/PostProcessorSequence.php | 16 ++++ src/PostProcessors/RobertaPostProcessor.php | 22 +++++ src/PostProcessors/TemplatePostProcessor.php | 18 ++++ src/PreTokenizers/BertPreTokenizer.php | 9 ++ src/PreTokenizers/ByteLevelPreTokenizer.php | 20 ++++ src/PreTokenizers/DigitsPreTokenizer.php | 16 ++++ src/PreTokenizers/FixedLengthPreTokenizer.php | 16 ++++ src/PreTokenizers/IdentityPreTokenizer.php | 5 + src/PreTokenizers/MetaspacePreTokenizer.php | 22 +++++ src/PreTokenizers/PreTokenizerSequence.php | 16 ++++ src/PreTokenizers/PunctuationPreTokenizer.php | 16 ++++ src/PreTokenizers/ReplacePreTokenizer.php | 18 ++++ src/PreTokenizers/SplitPreTokenizer.php | 18 ++++ src/PreTokenizers/WhitespacePreTokenizer.php | 9 ++ src/PreTokenizers/WhitespaceSplit.php | 9 ++ src/Tokenizer.php | 94 ++++++++++++------- tests/Feature/GetConfigTest.php | 62 ++++++++++++ tests/Unit/TokenizerBuilderTest.php | 28 +++++- 58 files changed, 916 insertions(+), 72 deletions(-) create mode 100644 tests/Feature/GetConfigTest.php diff --git a/src/Contracts/DecoderInterface.php b/src/Contracts/DecoderInterface.php index 72edba3..d70f0ec 100644 --- a/src/Contracts/DecoderInterface.php +++ b/src/Contracts/DecoderInterface.php @@ -12,4 +12,14 @@ interface DecoderInterface * @param string[] $tokens */ public function decode(array $tokens): string; + + /** + * Get configuration value(s). + * + * @param null|string $key The configuration key. If null, returns all config. 
+ * @param mixed $default The default value if the key doesn't exist + * + * @return mixed the configuration value, or full config array if $key is null + */ + public function getConfig(?string $key = null, mixed $default = null): mixed; } diff --git a/src/Contracts/ModelInterface.php b/src/Contracts/ModelInterface.php index f92dc11..553720d 100644 --- a/src/Contracts/ModelInterface.php +++ b/src/Contracts/ModelInterface.php @@ -54,10 +54,12 @@ public function getVocabSize(): int; public function addToken(string $token, int $id): void; /** - * Get the end of word suffix, if any. - * Only some models (like BPE) have this property. + * Get configuration value(s). * - * @return null|string the end of word suffix + * @param null|string $key The configuration key (e.g., 'dropout'). If null, returns all config. + * @param mixed $default The default value if the key doesn't exist (ignored when $key is null) + * + * @return mixed the configuration value, or full config array if $key is null */ - public function getEndOfWordSuffix(): ?string; + public function getConfig(?string $key = null, mixed $default = null): mixed; } diff --git a/src/Contracts/NormalizerInterface.php b/src/Contracts/NormalizerInterface.php index 43d8e45..2f21c56 100644 --- a/src/Contracts/NormalizerInterface.php +++ b/src/Contracts/NormalizerInterface.php @@ -7,4 +7,14 @@ interface NormalizerInterface { public function normalize(string $text): string; + + /** + * Get configuration value(s). + * + * @param null|string $key The configuration key. If null, returns all config. 
+ * @param mixed $default The default value if the key doesn't exist + * + * @return mixed the configuration value, or full config array if $key is null + */ + public function getConfig(?string $key = null, mixed $default = null): mixed; } diff --git a/src/Contracts/PostProcessorInterface.php b/src/Contracts/PostProcessorInterface.php index 1386573..dbbd98c 100644 --- a/src/Contracts/PostProcessorInterface.php +++ b/src/Contracts/PostProcessorInterface.php @@ -15,4 +15,14 @@ interface PostProcessorInterface * @return array{0: string[], 1: int[]} the processed tokens and type IDs */ public function process(array $tokens, ?array $pair = null, bool $addSpecialTokens = true): array; + + /** + * Get configuration value(s). + * + * @param null|string $key The configuration key. If null, returns all config. + * @param mixed $default The default value if the key doesn't exist + * + * @return mixed the configuration value, or full config array if $key is null + */ + public function getConfig(?string $key = null, mixed $default = null): mixed; } diff --git a/src/Contracts/PreTokenizerInterface.php b/src/Contracts/PreTokenizerInterface.php index 6d2b206..4a3c19b 100644 --- a/src/Contracts/PreTokenizerInterface.php +++ b/src/Contracts/PreTokenizerInterface.php @@ -15,4 +15,14 @@ interface PreTokenizerInterface * @return string[] */ public function preTokenize(array|string $text, array $options = []): array; + + /** + * Get configuration value(s). + * + * @param null|string $key The configuration key. If null, returns all config. 
+ * @param mixed $default The default value if the key doesn't exist + * + * @return mixed the configuration value, or full config array if $key is null + */ + public function getConfig(?string $key = null, mixed $default = null): mixed; } diff --git a/src/DataStructures/AddedToken.php b/src/DataStructures/AddedToken.php index 11fbaaa..d0a5d50 100644 --- a/src/DataStructures/AddedToken.php +++ b/src/DataStructures/AddedToken.php @@ -10,7 +10,7 @@ * - Whether they should only match single words * - Whether to include any whitespace on its left or right. */ -readonly class AddedToken +class AddedToken implements \JsonSerializable { public function __construct( /** @@ -58,4 +58,20 @@ public static function fromArray(array $data): self $data['special'] ?? false, ); } + + /** + * @return array + */ + public function jsonSerialize(): array + { + return [ + 'id' => $this->id, + 'content' => $this->content, + 'single_word' => $this->singleWord, + 'lstrip' => $this->lStrip, + 'rstrip' => $this->rStrip, + 'normalized' => $this->normalized, + 'special' => $this->special, + ]; + } } diff --git a/src/Decoders/BPEDecoder.php b/src/Decoders/BPEDecoder.php index 08dd5ef..d9e9f4d 100644 --- a/src/Decoders/BPEDecoder.php +++ b/src/Decoders/BPEDecoder.php @@ -8,6 +8,22 @@ class BPEDecoder extends BaseDecoder { public function __construct(protected string $suffix = '') {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'BPEDecoder', + 'suffix' => $this->suffix, + default => $default, + }; + } + + return [ + 'type' => 'BPEDecoder', + 'suffix' => $this->suffix, + ]; + } + protected function processTokens(array $tokens): array { $decoded = []; diff --git a/src/Decoders/ByteFallbackDecoder.php b/src/Decoders/ByteFallbackDecoder.php index e904e3c..e4fe776 100644 --- a/src/Decoders/ByteFallbackDecoder.php +++ b/src/Decoders/ByteFallbackDecoder.php @@ -6,6 +6,15 @@ class ByteFallbackDecoder extends 
BaseDecoder { + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 'ByteFallback' : $default; + } + + return ['type' => 'ByteFallback']; + } + protected function processTokens(array $tokens): array { $newTokens = []; diff --git a/src/Decoders/ByteLevelDecoder.php b/src/Decoders/ByteLevelDecoder.php index 9422c88..3356d44 100644 --- a/src/Decoders/ByteLevelDecoder.php +++ b/src/Decoders/ByteLevelDecoder.php @@ -275,11 +275,16 @@ public function __construct(protected array $addedTokens = [], protected ?string /** * Convert an array of tokens to a string by decoding each byte. - * - * @param string[] $tokens array of tokens to be decoded - * - * @return string the decoded string */ + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 'ByteLevel' : $default; + } + + return ['type' => 'ByteLevel']; + } + public function decode(array $tokens): string { $decoded = parent::decode($tokens); diff --git a/src/Decoders/CTCDecoder.php b/src/Decoders/CTCDecoder.php index 71841ea..f7e92cf 100644 --- a/src/Decoders/CTCDecoder.php +++ b/src/Decoders/CTCDecoder.php @@ -14,6 +14,26 @@ public function __construct( protected bool $cleanup = true ) {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'CTC', + 'pad_token' => $this->padToken, + 'word_delimiter_token' => $this->wordDelimiterToken, + 'cleanup' => $this->cleanup, + default => $default, + }; + } + + return [ + 'type' => 'CTC', + 'pad_token' => $this->padToken, + 'word_delimiter_token' => $this->wordDelimiterToken, + 'cleanup' => $this->cleanup, + ]; + } + protected function processTokens(array $tokens): array { if (empty($tokens)) { diff --git a/src/Decoders/DecoderSequence.php b/src/Decoders/DecoderSequence.php index 7009236..a080d4b 100644 --- 
a/src/Decoders/DecoderSequence.php +++ b/src/Decoders/DecoderSequence.php @@ -11,6 +11,22 @@ class DecoderSequence extends BaseDecoder */ public function __construct(protected array $decoders) {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Sequence', + 'decoders' => array_map(static fn (BaseDecoder $d) => $d->getConfig(), $this->decoders), + default => $default, + }; + } + + return [ + 'type' => 'Sequence', + 'decoders' => array_map(static fn (BaseDecoder $d) => $d->getConfig(), $this->decoders), + ]; + } + protected function processTokens(array $tokens): array { return array_reduce( diff --git a/src/Decoders/FuseDecoder.php b/src/Decoders/FuseDecoder.php index d3f0962..fcc4829 100644 --- a/src/Decoders/FuseDecoder.php +++ b/src/Decoders/FuseDecoder.php @@ -14,6 +14,22 @@ public function __construct( protected string $separator = '' ) {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Fuse', + 'separator' => $this->separator, + default => $default, + }; + } + + return [ + 'type' => 'Fuse', + 'separator' => $this->separator, + ]; + } + protected function processTokens(array $tokens): array { return [implode($this->separator, $tokens)]; diff --git a/src/Decoders/MetaspaceDecoder.php b/src/Decoders/MetaspaceDecoder.php index ab9eb30..bb7d9fc 100644 --- a/src/Decoders/MetaspaceDecoder.php +++ b/src/Decoders/MetaspaceDecoder.php @@ -11,6 +11,24 @@ public function __construct( protected bool $addPrefixSpace = true ) {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Metaspace', + 'replacement' => $this->replacement, + 'add_prefix_space' => $this->addPrefixSpace, + default => $default, + }; + } + + return [ + 'type' => 'Metaspace', + 'replacement' => $this->replacement, + 
'add_prefix_space' => $this->addPrefixSpace, + ]; + } + protected function processTokens(array $tokens): array { $result = []; diff --git a/src/Decoders/ReplaceDecoder.php b/src/Decoders/ReplaceDecoder.php index ad9a5c6..f7ca1af 100644 --- a/src/Decoders/ReplaceDecoder.php +++ b/src/Decoders/ReplaceDecoder.php @@ -12,6 +12,24 @@ public function __construct( protected string $replacement = '' ) {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Replace', + 'pattern' => $this->regex ? ['Regex' => $this->regex] : ['String' => $this->subString], + 'content' => $this->replacement, + default => $default, + }; + } + + return [ + 'type' => 'Replace', + 'pattern' => $this->regex ? ['Regex' => $this->regex] : ['String' => $this->subString], + 'content' => $this->replacement, + ]; + } + protected function processTokens(array $tokens): array { return array_map(function ($token) { diff --git a/src/Decoders/StripDecoder.php b/src/Decoders/StripDecoder.php index 4067934..0bd11d2 100644 --- a/src/Decoders/StripDecoder.php +++ b/src/Decoders/StripDecoder.php @@ -12,6 +12,26 @@ public function __construct( protected int $stop ) {} + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Strip', + 'content' => $this->content, + 'start' => $this->start, + 'stop' => $this->stop, + default => $default, + }; + } + + return [ + 'type' => 'Strip', + 'content' => $this->content, + 'start' => $this->start, + 'stop' => $this->stop, + ]; + } + protected function processTokens(array $tokens): array { return array_map(function ($token) { diff --git a/src/Decoders/WordPieceDecoder.php b/src/Decoders/WordPieceDecoder.php index 5955599..aa11136 100644 --- a/src/Decoders/WordPieceDecoder.php +++ b/src/Decoders/WordPieceDecoder.php @@ -13,6 +13,24 @@ public function __construct( protected bool $cleanup = true ) {} + public 
function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'WordPiece', + 'prefix' => $this->prefix, + 'cleanup' => $this->cleanup, + default => $default, + }; + } + + return [ + 'type' => 'WordPiece', + 'prefix' => $this->prefix, + 'cleanup' => $this->cleanup, + ]; + } + protected function processTokens(array $tokens): array { $decodedTokens = []; diff --git a/src/Factories/DecoderFactory.php b/src/Factories/DecoderFactory.php index 89c5b01..ecc6bf4 100644 --- a/src/Factories/DecoderFactory.php +++ b/src/Factories/DecoderFactory.php @@ -25,10 +25,10 @@ class DecoderFactory * @param array $addedTokens Optional. Only needed for ByteLevelDecoder. * @param null|string $endOfWordSuffix Optional. Only needed for ByteLevelDecoder. */ - public static function create(array $config, array $addedTokens = [], ?string $endOfWordSuffix = null): ?DecoderInterface + public static function create(array $config, array $addedTokens = [], ?string $endOfWordSuffix = null): DecoderInterface { if (empty($config)) { - return null; + return new FuseDecoder(' '); } $type = $config['type'] ?? 
null; diff --git a/src/Factories/NormalizerFactory.php b/src/Factories/NormalizerFactory.php index f119931..9c8be8f 100644 --- a/src/Factories/NormalizerFactory.php +++ b/src/Factories/NormalizerFactory.php @@ -11,6 +11,7 @@ use Codewithkyrian\Tokenizers\Normalizers\NFKCNormalizer; use Codewithkyrian\Tokenizers\Normalizers\NFKDNormalizer; use Codewithkyrian\Tokenizers\Normalizers\NormalizerSequence; +use Codewithkyrian\Tokenizers\Normalizers\PassThroughNormalizer; use Codewithkyrian\Tokenizers\Normalizers\PrecompiledNormalizer; use Codewithkyrian\Tokenizers\Normalizers\PrependNormalizer; use Codewithkyrian\Tokenizers\Normalizers\ReplaceNormalizer; @@ -24,6 +25,10 @@ class NormalizerFactory */ public static function create(array $config): NormalizerInterface { + if (empty($config)) { + return new PassThroughNormalizer(); + } + $type = $config['type'] ?? null; return match ($type) { diff --git a/src/Factories/PostProcessorFactory.php b/src/Factories/PostProcessorFactory.php index cc2e880..53faef0 100644 --- a/src/Factories/PostProcessorFactory.php +++ b/src/Factories/PostProcessorFactory.php @@ -7,6 +7,7 @@ use Codewithkyrian\Tokenizers\Contracts\PostProcessorInterface; use Codewithkyrian\Tokenizers\PostProcessors\BertPostProcessor; use Codewithkyrian\Tokenizers\PostProcessors\ByteLevelPostProcessor; +use Codewithkyrian\Tokenizers\PostProcessors\DefaultPostProcessor; use Codewithkyrian\Tokenizers\PostProcessors\PostProcessorSequence; use Codewithkyrian\Tokenizers\PostProcessors\RobertaPostProcessor; use Codewithkyrian\Tokenizers\PostProcessors\TemplatePostProcessor; @@ -16,10 +17,10 @@ class PostProcessorFactory /** * @param array $config the post-processor configuration */ - public static function create(array $config): ?PostProcessorInterface + public static function create(array $config): PostProcessorInterface { if (empty($config)) { - return null; + return new DefaultPostProcessor(); } $type = $config['type'] ?? 
null; diff --git a/src/Factories/PreTokenizerFactory.php b/src/Factories/PreTokenizerFactory.php index 9c1340b..e318aa1 100644 --- a/src/Factories/PreTokenizerFactory.php +++ b/src/Factories/PreTokenizerFactory.php @@ -9,6 +9,7 @@ use Codewithkyrian\Tokenizers\PreTokenizers\ByteLevelPreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\DigitsPreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\FixedLengthPreTokenizer; +use Codewithkyrian\Tokenizers\PreTokenizers\IdentityPreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\MetaspacePreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\PreTokenizerSequence; use Codewithkyrian\Tokenizers\PreTokenizers\PunctuationPreTokenizer; @@ -24,6 +25,10 @@ class PreTokenizerFactory */ public static function create(array $config): PreTokenizerInterface { + if (empty($config)) { + return new IdentityPreTokenizer(); + } + $type = $config['type'] ?? null; return match ($type) { diff --git a/src/Models/AbstractModel.php b/src/Models/AbstractModel.php index c7a6540..4e0ae29 100644 --- a/src/Models/AbstractModel.php +++ b/src/Models/AbstractModel.php @@ -80,17 +80,6 @@ public function addToken(string $token, int $id): void $this->vocab[$id] = $token; } - /** - * Get the end of word suffix, if any. - * Default implementation returns null. Models like BPE override this. 
- * - * @return null|string the end of word suffix - */ - public function getEndOfWordSuffix(): ?string - { - return null; - } - /** * @param string[] $tokens * diff --git a/src/Models/BPEModel.php b/src/Models/BPEModel.php index 273434f..1c2ffba 100644 --- a/src/Models/BPEModel.php +++ b/src/Models/BPEModel.php @@ -78,11 +78,6 @@ public function __construct( $this->cache = new LRUCache($this->cacheCapacity); } - public function getEndOfWordSuffix(): ?string - { - return $this->endOfWordSuffix; - } - /** * @param string[] $tokens * @@ -138,6 +133,35 @@ public function clearCache(): void $this->cache->clear(); } + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'BPE', + 'vocab' => $this->tokenToIds, + 'merges' => $this->merges, + 'unk_token' => $this->unkToken, + 'end_of_word_suffix' => $this->endOfWordSuffix, + 'continuing_subword_suffix' => $this->continuingSubwordSuffix, + 'byte_fallback' => $this->byteFallback, + 'ignore_merges' => $this->ignoreMerges, + 'cache_capacity' => $this->cacheCapacity, + default => $default, + }; + } + + return [ + 'type' => 'BPE', + 'vocab' => $this->tokenToIds, + 'merges' => $this->merges, + 'unk_token' => $this->unkToken, + 'end_of_word_suffix' => $this->endOfWordSuffix, + 'continuing_subword_suffix' => $this->continuingSubwordSuffix, + 'byte_fallback' => $this->byteFallback, + 'ignore_merges' => $this->ignoreMerges, + ]; + } + /** * @param \SplPriorityQueue $queue the queue to add the node to * @param BPENode $node the node to add to the queue diff --git a/src/Models/FallbackModel.php b/src/Models/FallbackModel.php index be1f4c0..01af33f 100644 --- a/src/Models/FallbackModel.php +++ b/src/Models/FallbackModel.php @@ -91,8 +91,20 @@ public function addToken(string $token, int $id): void $this->vocabReversed[$token] = $id; } - public function getEndOfWordSuffix(): ?string + public function getConfig(?string $key = null, mixed $default = null): 
mixed { - return null; + if (null !== $key) { + return match ($key) { + 'vocab' => $this->vocab, + 'unk_token' => $this->unkToken, + default => $default, + }; + } + + // 2. Full Config Reconstruction + return [ + 'vocab' => $this->vocab, + 'unk_token' => $this->unkToken, + ]; } } diff --git a/src/Models/UnigramModel.php b/src/Models/UnigramModel.php index a15c195..594d9be 100644 --- a/src/Models/UnigramModel.php +++ b/src/Models/UnigramModel.php @@ -66,6 +66,26 @@ public function __construct( $this->fuseUnk = true; } + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Unigram', + 'vocab' => $this->getVocabWithScores(), + 'unk_id' => $this->unkTokenId, + 'eos_token' => $this->eosToken, + default => $default, + }; + } + + return [ + 'type' => 'Unigram', + 'vocab' => $this->getVocabWithScores(), + 'unk_id' => $this->unkTokenId, + 'eos_token' => $this->eosToken, + ]; + } + /** * @param string[] $tokens the tokens to tokenize * @@ -116,6 +136,21 @@ public function populateNodes(TokenLattice $lattice): void } } + /** + * Reconstructs the vocabulary array as [ [token, score], ... ]. 
+ * + * @return array + */ + protected function getVocabWithScores(): array + { + $vocab = []; + foreach ($this->vocab as $i => $token) { + $vocab[] = [$token, $this->scores[$i]]; + } + + return $vocab; + } + /** * @param string $normalized the normalized string to tokenize * diff --git a/src/Models/WordPieceModel.php b/src/Models/WordPieceModel.php index 9b4bd1a..1c1666b 100644 --- a/src/Models/WordPieceModel.php +++ b/src/Models/WordPieceModel.php @@ -90,4 +90,26 @@ public function tokenize(array $tokens): array return $outputTokens; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'WordPiece', + 'vocab' => $this->tokenToIds, + 'unk_token' => $this->unkToken, + 'max_input_chars_per_word' => $this->maxInputCharsPerWord, + 'continuing_subword_prefix' => $this->continuingSubwordPrefix, + default => $default, + }; + } + + return [ + 'type' => 'WordPiece', + 'vocab' => $this->tokenToIds, + 'unk_token' => $this->unkToken, + 'max_input_chars_per_word' => $this->maxInputCharsPerWord, + 'continuing_subword_prefix' => $this->continuingSubwordPrefix, + ]; + } } diff --git a/src/Normalizers/BertNormalizer.php b/src/Normalizers/BertNormalizer.php index 2a27944..4530acc 100644 --- a/src/Normalizers/BertNormalizer.php +++ b/src/Normalizers/BertNormalizer.php @@ -57,6 +57,28 @@ public function tokenizeChineseChars(string $text): string return implode('', $output); } + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'BertNormalizer', + 'clean_text' => $this->cleanText, + 'handle_chinese_chars' => $this->handleChineseChars, + 'strip_accents' => $this->stripAccents, + 'lowercase' => $this->lowercase, + default => $default, + }; + } + + return [ + 'type' => 'BertNormalizer', + 'clean_text' => $this->cleanText, + 'handle_chinese_chars' => $this->handleChineseChars, + 'strip_accents' => 
$this->stripAccents, + 'lowercase' => $this->lowercase, + ]; + } + protected function doCleanText(string $text): string { $output = []; diff --git a/src/Normalizers/LowercaseNormalizer.php b/src/Normalizers/LowercaseNormalizer.php index 5e10253..117fb63 100644 --- a/src/Normalizers/LowercaseNormalizer.php +++ b/src/Normalizers/LowercaseNormalizer.php @@ -12,4 +12,13 @@ public function normalize(string $text): string { return mb_strtolower($text); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 'Lowercase' : $default; + } + + return ['type' => 'Lowercase']; + } } diff --git a/src/Normalizers/NFCNormalizer.php b/src/Normalizers/NFCNormalizer.php index 0fe7f50..9ea834e 100644 --- a/src/Normalizers/NFCNormalizer.php +++ b/src/Normalizers/NFCNormalizer.php @@ -12,4 +12,13 @@ public function normalize(string $text): string { return normalizer_normalize($text, \Normalizer::NFC); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 'NFC' : $default; + } + + return ['type' => 'NFC']; + } } diff --git a/src/Normalizers/NFKCNormalizer.php b/src/Normalizers/NFKCNormalizer.php index 3f788bf..b92e0ee 100644 --- a/src/Normalizers/NFKCNormalizer.php +++ b/src/Normalizers/NFKCNormalizer.php @@ -12,4 +12,13 @@ public function normalize(string $text): string { return normalizer_normalize($text, \Normalizer::NFKC); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 
'NFKC' : $default; + } + + return ['type' => 'NFKC']; + } } diff --git a/src/Normalizers/NFKDNormalizer.php b/src/Normalizers/NFKDNormalizer.php index d6ad542..30ffa78 100644 --- a/src/Normalizers/NFKDNormalizer.php +++ b/src/Normalizers/NFKDNormalizer.php @@ -12,4 +12,13 @@ public function normalize(string $text): string { return normalizer_normalize($text, \Normalizer::NFKD); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 'NFKD' : $default; + } + + return ['type' => 'NFKD']; + } } diff --git a/src/Normalizers/NormalizerSequence.php b/src/Normalizers/NormalizerSequence.php index 8117e0c..35447c8 100644 --- a/src/Normalizers/NormalizerSequence.php +++ b/src/Normalizers/NormalizerSequence.php @@ -21,4 +21,20 @@ public function normalize(string $text): string $text ); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + if ('normalizers' === $key) { + return array_map(static fn (NormalizerInterface $n) => $n->getConfig(), $this->normalizers); + } + + return 'type' === $key ? 'Sequence' : $default; + } + + return [ + 'type' => 'Sequence', + 'normalizers' => array_map(static fn (NormalizerInterface $n) => $n->getConfig(), $this->normalizers), + ]; + } } diff --git a/src/Normalizers/PassThroughNormalizer.php b/src/Normalizers/PassThroughNormalizer.php index 6ba75de..c1743c4 100644 --- a/src/Normalizers/PassThroughNormalizer.php +++ b/src/Normalizers/PassThroughNormalizer.php @@ -16,4 +16,9 @@ public function normalize(string $text): string { return $text; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return null === $key ? 
[] : $default; + } } diff --git a/src/Normalizers/PrecompiledNormalizer.php b/src/Normalizers/PrecompiledNormalizer.php index 5d37126..7e010e9 100644 --- a/src/Normalizers/PrecompiledNormalizer.php +++ b/src/Normalizers/PrecompiledNormalizer.php @@ -9,14 +9,12 @@ class PrecompiledNormalizer implements NormalizerInterface { - private string $precompiled_charsmap; private string $normalized; private DoubleArray $trie; - public function __construct(string $precompiledCharsmap) + public function __construct(private string $precompiledCharsmap) { - $this->precompiled_charsmap = $precompiledCharsmap; - $this->parse(base64_decode($this->precompiled_charsmap)); + $this->parse(base64_decode($precompiledCharsmap)); } public function normalize(string $text): string @@ -69,6 +67,22 @@ public function normalize(string $text): string return $text; } + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Precompiled', + 'precompiled_charsmap' => $this->precompiledCharsmap, + default => $default, + }; + } + + return [ + 'type' => 'Precompiled', + 'precompiled_charsmap' => $this->precompiledCharsmap, + ]; + } + private function parse(string $precompiled_charsmap): void { $trie_size = unpack('V', substr($precompiled_charsmap, 0, 4))[1]; diff --git a/src/Normalizers/PrependNormalizer.php b/src/Normalizers/PrependNormalizer.php index 86a9574..c65a0f2 100644 --- a/src/Normalizers/PrependNormalizer.php +++ b/src/Normalizers/PrependNormalizer.php @@ -14,4 +14,20 @@ public function normalize(string $text): string { return $this->prepend.$text; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Prepend', + 'prepend' => $this->prepend, + default => $default, + }; + } + + return [ + 'type' => 'Prepend', + 'prepend' => $this->prepend, + ]; + } } diff --git a/src/Normalizers/ReplaceNormalizer.php 
b/src/Normalizers/ReplaceNormalizer.php index 67d8729..8c60cfc 100644 --- a/src/Normalizers/ReplaceNormalizer.php +++ b/src/Normalizers/ReplaceNormalizer.php @@ -26,4 +26,22 @@ public function normalize(string $text): string return $text; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Replace', + 'pattern' => $this->regex ? ['Regex' => $this->regex] : ['String' => $this->subString], + 'content' => $this->replacement, + default => $default, + }; + } + + return [ + 'type' => 'Replace', + 'pattern' => $this->regex ? ['Regex' => $this->regex] : ['String' => $this->subString], + 'content' => $this->replacement, + ]; + } } diff --git a/src/Normalizers/StripAccentsNormalizer.php b/src/Normalizers/StripAccentsNormalizer.php index 5a9e631..14c389b 100644 --- a/src/Normalizers/StripAccentsNormalizer.php +++ b/src/Normalizers/StripAccentsNormalizer.php @@ -22,4 +22,13 @@ public function normalize(string $text): string { return preg_replace('/[\x{0300}-\x{036f}]/u', '', $text); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return 'type' === $key ? 
'StripAccents' : $default; + } + + return ['type' => 'StripAccents']; + } } diff --git a/src/Normalizers/StripNormalizer.php b/src/Normalizers/StripNormalizer.php index 7db5968..7d6915e 100644 --- a/src/Normalizers/StripNormalizer.php +++ b/src/Normalizers/StripNormalizer.php @@ -27,4 +27,22 @@ public function normalize(string $text): string return $text; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Strip', + 'strip_left' => $this->stripLeft, + 'strip_right' => $this->stripRight, + default => $default, + }; + } + + return [ + 'type' => 'Strip', + 'strip_left' => $this->stripLeft, + 'strip_right' => $this->stripRight, + ]; + } } diff --git a/src/PostProcessors/BertPostProcessor.php b/src/PostProcessors/BertPostProcessor.php index e9aae86..9d3af91 100644 --- a/src/PostProcessors/BertPostProcessor.php +++ b/src/PostProcessors/BertPostProcessor.php @@ -33,4 +33,22 @@ public function process(array $tokens, ?array $pair = null, bool $addSpecialToke return [$processedTokens, $tokenTypeIds]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'BertProcessing', + 'sep' => [$this->sep, 0], // Best effort reconstruction + 'cls' => [$this->cls, 0], + default => $default, + }; + } + + return [ + 'type' => 'BertProcessing', + 'sep' => [$this->sep, 0], + 'cls' => [$this->cls, 0], + ]; + } } diff --git a/src/PostProcessors/ByteLevelPostProcessor.php b/src/PostProcessors/ByteLevelPostProcessor.php index e123459..28e15a5 100644 --- a/src/PostProcessors/ByteLevelPostProcessor.php +++ b/src/PostProcessors/ByteLevelPostProcessor.php @@ -34,4 +34,24 @@ public function process(array $tokens, ?array $tokenPair = null, bool $addSpecia return [$tokens, array_fill(0, \count($tokens), 0)]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + 
return match ($key) { + 'type' => 'ByteLevel', + 'trim_offsets' => $this->trimOffsets, + 'use_regex' => $this->useRegex, + 'add_prefix_space' => $this->addPrefixSpace, + default => $default, + }; + } + + return [ + 'type' => 'ByteLevel', + 'trim_offsets' => $this->trimOffsets, + 'use_regex' => $this->useRegex, + 'add_prefix_space' => $this->addPrefixSpace, + ]; + } } diff --git a/src/PostProcessors/DefaultPostProcessor.php b/src/PostProcessors/DefaultPostProcessor.php index c2009c5..af88006 100644 --- a/src/PostProcessors/DefaultPostProcessor.php +++ b/src/PostProcessors/DefaultPostProcessor.php @@ -26,4 +26,9 @@ public function process(array $tokens, ?array $pair = null, bool $addSpecialToke return [$tokens, $typeIds]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return null === $key ? null : $default; + } } diff --git a/src/PostProcessors/PostProcessorSequence.php b/src/PostProcessors/PostProcessorSequence.php index 981f850..d2e4cca 100644 --- a/src/PostProcessors/PostProcessorSequence.php +++ b/src/PostProcessors/PostProcessorSequence.php @@ -55,4 +55,20 @@ public function process(array $tokens, ?array $tokenPair = null, bool $addSpecia return [$tokens, $tokenTypeIds ?? array_fill(0, \count($tokens), 0)]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + if ('processors' === $key) { + return array_map(static fn (PostProcessorInterface $p) => $p->getConfig(), $this->processors); + } + + return 'type' === $key ? 
'Sequence' : $default; + } + + return [ + 'type' => 'Sequence', + 'processors' => array_map(static fn (PostProcessorInterface $p) => $p->getConfig(), $this->processors), + ]; + } } diff --git a/src/PostProcessors/RobertaPostProcessor.php b/src/PostProcessors/RobertaPostProcessor.php index 07924b2..580eaab 100644 --- a/src/PostProcessors/RobertaPostProcessor.php +++ b/src/PostProcessors/RobertaPostProcessor.php @@ -14,4 +14,26 @@ public function __construct( ) { parent::__construct($sep, $cls); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'RobertaProcessing', + 'sep' => [$this->sep, 0], + 'cls' => [$this->cls, 0], + 'trim_offsets' => $this->trimOffsets, + 'add_prefix_space' => $this->addPrefixSpace, + default => $default, + }; + } + + return [ + 'type' => 'RobertaProcessing', + 'sep' => [$this->sep, 0], + 'cls' => [$this->cls, 0], + 'trim_offsets' => $this->trimOffsets, + 'add_prefix_space' => $this->addPrefixSpace, + ]; + } } diff --git a/src/PostProcessors/TemplatePostProcessor.php b/src/PostProcessors/TemplatePostProcessor.php index 3034bde..52b0ff1 100644 --- a/src/PostProcessors/TemplatePostProcessor.php +++ b/src/PostProcessors/TemplatePostProcessor.php @@ -49,4 +49,22 @@ public function process(array $tokens, ?array $pair = null, bool $addSpecialToke return [$processedTokens, $typeIds]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'TemplateProcessing', + 'single' => $this->single, + 'pair' => $this->pair, + default => $default, + }; + } + + return [ + 'type' => 'TemplateProcessing', + 'single' => $this->single, + 'pair' => $this->pair, + ]; + } } diff --git a/src/PreTokenizers/BertPreTokenizer.php b/src/PreTokenizers/BertPreTokenizer.php index 50e9b77..2b97416 100644 --- a/src/PreTokenizers/BertPreTokenizer.php +++ b/src/PreTokenizers/BertPreTokenizer.php @@ 
-37,6 +37,15 @@ public function preTokenize(array|string $text, array $options = []): array return $this->bertPreTokenize($text); } + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null === $key) { + return ['type' => 'BertPreTokenizer']; + } + + return 'type' === $key ? 'BertPreTokenizer' : $default; + } + /** * @param string $text the text to pre-tokenize * diff --git a/src/PreTokenizers/ByteLevelPreTokenizer.php b/src/PreTokenizers/ByteLevelPreTokenizer.php index 0a86faf..6af75f9 100644 --- a/src/PreTokenizers/ByteLevelPreTokenizer.php +++ b/src/PreTokenizers/ByteLevelPreTokenizer.php @@ -310,4 +310,24 @@ public function preTokenize(array|string $text, array $options = []): array return implode('', $bytes); }, $tokens); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'ByteLevel', + 'add_prefix_space' => $this->addPrefixSpace, + 'trim_offsets' => $this->trimOffsets, + 'use_regex' => $this->useRegex, + default => $default, + }; + } + + return [ + 'type' => 'ByteLevel', + 'add_prefix_space' => $this->addPrefixSpace, + 'trim_offsets' => $this->trimOffsets, + 'use_regex' => $this->useRegex, + ]; + } } diff --git a/src/PreTokenizers/DigitsPreTokenizer.php b/src/PreTokenizers/DigitsPreTokenizer.php index d639119..79a7a47 100644 --- a/src/PreTokenizers/DigitsPreTokenizer.php +++ b/src/PreTokenizers/DigitsPreTokenizer.php @@ -32,4 +32,20 @@ public function preTokenize(array|string $text, array $options = []): array return $matches[0] ?? 
[]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Digits', + 'individual_digits' => $this->individualDigits, + default => $default, + }; + } + + return [ + 'type' => 'Digits', + 'individual_digits' => $this->individualDigits, + ]; + } } diff --git a/src/PreTokenizers/FixedLengthPreTokenizer.php b/src/PreTokenizers/FixedLengthPreTokenizer.php index de87b60..7b009ed 100644 --- a/src/PreTokenizers/FixedLengthPreTokenizer.php +++ b/src/PreTokenizers/FixedLengthPreTokenizer.php @@ -35,4 +35,20 @@ public function preTokenize(array|string $text, array $options = []): array return $tokens; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'FixedLength', + 'length' => $this->length, + default => $default, + }; + } + + return [ + 'type' => 'FixedLength', + 'length' => $this->length, + ]; + } } diff --git a/src/PreTokenizers/IdentityPreTokenizer.php b/src/PreTokenizers/IdentityPreTokenizer.php index 80324a0..f2eb5c3 100644 --- a/src/PreTokenizers/IdentityPreTokenizer.php +++ b/src/PreTokenizers/IdentityPreTokenizer.php @@ -20,4 +20,9 @@ public function preTokenize(array|string $text, array $options = []): array return [$text]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return null === $key ? 
[] : $default; + } } diff --git a/src/PreTokenizers/MetaspacePreTokenizer.php b/src/PreTokenizers/MetaspacePreTokenizer.php index 9954d4a..56569a5 100644 --- a/src/PreTokenizers/MetaspacePreTokenizer.php +++ b/src/PreTokenizers/MetaspacePreTokenizer.php @@ -43,4 +43,26 @@ public function preTokenize(array|string $text, array $options = []): array return [$normalized]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Metaspace', + 'replacement' => $this->replacement, + 'add_prefix_space' => $this->addPrefixSpace, + 'str_rep' => $this->strRep, + 'prepend_scheme' => $this->prependScheme, + default => $default, + }; + } + + return [ + 'type' => 'Metaspace', + 'replacement' => $this->replacement, + 'add_prefix_space' => $this->addPrefixSpace, + 'str_rep' => $this->strRep, + 'prepend_scheme' => $this->prependScheme, + ]; + } } diff --git a/src/PreTokenizers/PreTokenizerSequence.php b/src/PreTokenizers/PreTokenizerSequence.php index 16819aa..6221fe2 100644 --- a/src/PreTokenizers/PreTokenizerSequence.php +++ b/src/PreTokenizers/PreTokenizerSequence.php @@ -21,4 +21,20 @@ public function preTokenize(array|string $text, array $options = []): array \is_array($text) ? $text : [$text] ); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + if ('pretokenizers' === $key) { + return array_map(static fn (PreTokenizerInterface $pt) => $pt->getConfig(), $this->preTokenizers); + } + + return 'type' === $key ? 
'Sequence' : $default; + } + + return [ + 'type' => 'Sequence', + 'pretokenizers' => array_map(static fn (PreTokenizerInterface $pt) => $pt->getConfig(), $this->preTokenizers), + ]; + } } diff --git a/src/PreTokenizers/PunctuationPreTokenizer.php b/src/PreTokenizers/PunctuationPreTokenizer.php index ddc8b05..e13ea92 100644 --- a/src/PreTokenizers/PunctuationPreTokenizer.php +++ b/src/PreTokenizers/PunctuationPreTokenizer.php @@ -31,4 +31,20 @@ public function preTokenize(array|string $text, array $options = []): array return $matches[0] ?? []; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Punctuation', + 'behavior' => $this->behavior, + default => $default, + }; + } + + return [ + 'type' => 'Punctuation', + 'behavior' => $this->behavior, + ]; + } } diff --git a/src/PreTokenizers/ReplacePreTokenizer.php b/src/PreTokenizers/ReplacePreTokenizer.php index 5bae604..2280a21 100644 --- a/src/PreTokenizers/ReplacePreTokenizer.php +++ b/src/PreTokenizers/ReplacePreTokenizer.php @@ -30,4 +30,22 @@ public function preTokenize(array|string $text, array $options = []): array return [str_replace($this->pattern, $this->content, $text)]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Replace', + 'pattern' => ['String' => $this->pattern], + 'content' => $this->content, + default => $default, + }; + } + + return [ + 'type' => 'Replace', + 'pattern' => ['String' => $this->pattern], + 'content' => $this->content, + ]; + } } diff --git a/src/PreTokenizers/SplitPreTokenizer.php b/src/PreTokenizers/SplitPreTokenizer.php index b4e76e6..e717e48 100644 --- a/src/PreTokenizers/SplitPreTokenizer.php +++ b/src/PreTokenizers/SplitPreTokenizer.php @@ -84,4 +84,22 @@ public function preTokenize(array|string $text, array $options = []): array return $result; } + + public function getConfig(?string 
$key = null, mixed $default = null): mixed + { + if (null !== $key) { + return match ($key) { + 'type' => 'Split', + 'pattern' => $this->pattern, // Ideally this should be the original pattern string, not compiled regex? + 'invert' => $this->invert, + default => $default, + }; + } + + return [ + 'type' => 'Split', + 'pattern' => $this->pattern, + 'invert' => $this->invert, + ]; + } } diff --git a/src/PreTokenizers/WhitespacePreTokenizer.php b/src/PreTokenizers/WhitespacePreTokenizer.php index ea608a8..af25363 100644 --- a/src/PreTokenizers/WhitespacePreTokenizer.php +++ b/src/PreTokenizers/WhitespacePreTokenizer.php @@ -21,4 +21,13 @@ public function preTokenize(array|string $text, array $options = []): array return preg_split('/\s+/u', $text, -1, \PREG_SPLIT_NO_EMPTY); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null === $key) { + return ['type' => 'Whitespace']; + } + + return 'type' === $key ? 'Whitespace' : $default; + } } diff --git a/src/PreTokenizers/WhitespaceSplit.php b/src/PreTokenizers/WhitespaceSplit.php index 16829fa..43d88b8 100644 --- a/src/PreTokenizers/WhitespaceSplit.php +++ b/src/PreTokenizers/WhitespaceSplit.php @@ -21,4 +21,13 @@ public function preTokenize(array|string $text, array $options = []): array return preg_split('/[\s\x{FFFD}]+/u', $text, flags: \PREG_SPLIT_NO_EMPTY); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + if (null === $key) { + return ['type' => 'WhitespaceSplit']; + } + + return 'type' === $key ? 
'WhitespaceSplit' : $default; + } } diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 90e60be..7d2223a 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -11,7 +11,6 @@ use Codewithkyrian\Tokenizers\Contracts\PreTokenizerInterface; use Codewithkyrian\Tokenizers\DataStructures\AddedToken; use Codewithkyrian\Tokenizers\DataStructures\DictionarySplitter; -use Codewithkyrian\Tokenizers\Decoders\FuseDecoder; use Codewithkyrian\Tokenizers\Factories\DecoderFactory; use Codewithkyrian\Tokenizers\Factories\ModelFactory; use Codewithkyrian\Tokenizers\Factories\NormalizerFactory; @@ -19,9 +18,6 @@ use Codewithkyrian\Tokenizers\Factories\PreTokenizerFactory; use Codewithkyrian\Tokenizers\Loaders\FileLoader; use Codewithkyrian\Tokenizers\Loaders\HubLoader; -use Codewithkyrian\Tokenizers\Normalizers\PassThroughNormalizer; -use Codewithkyrian\Tokenizers\PostProcessors\DefaultPostProcessor; -use Codewithkyrian\Tokenizers\PreTokenizers\IdentityPreTokenizer; use Codewithkyrian\Tokenizers\Utils\DecoderUtils; use Codewithkyrian\Tokenizers\Utils\NormalizerUtils; @@ -59,23 +55,6 @@ public function __construct( $this->modelMaxLength = null !== $maxLength ? (int) $maxLength : null; } - /** - * Get configuration value(s). - * - * @param null|string $key The configuration key (e.g., 'model_max_length', 'remove_space'). If null, returns all config. - * @param mixed $default The default value if the key doesn't exist (ignored when $key is null) - * - * @return mixed the configuration value, or full config array if $key is null - */ - public function getConfig(?string $key = null, mixed $default = null): mixed - { - if (null === $key) { - return $this->config; - } - - return $this->config[$key] ?? $default; - } - /** * Load a tokenizer from a file, the Hugging Face Hub, or a configuration array. * @@ -159,18 +138,14 @@ public static function fromConfig(array $config): self } } - $normalizer = isset($config['normalizer']) - ? 
NormalizerFactory::create($config['normalizer']) - : new PassThroughNormalizer(); - $preTokenizer = isset($config['pre_tokenizer']) - ? PreTokenizerFactory::create($config['pre_tokenizer']) - : new IdentityPreTokenizer(); - $postProcessor = isset($config['post_processor']) - ? PostProcessorFactory::create($config['post_processor']) - : new DefaultPostProcessor(); - $decoder = isset($config['decoder']) - ? DecoderFactory::create($config['decoder'], $addedTokens, $model->getEndOfWordSuffix()) - : new FuseDecoder(' '); + $normalizer = NormalizerFactory::create($config['normalizer'] ?? []); + $preTokenizer = PreTokenizerFactory::create($config['pre_tokenizer'] ?? []); + $postProcessor = PostProcessorFactory::create($config['post_processor'] ?? []); + $decoder = DecoderFactory::create( + $config['decoder'] ?? [], + $addedTokens, + $config['model']['end_of_word_suffix'] ?? null + ); $additionalSpecialTokens = $config['additional_special_tokens'] ?? []; $specialTokens = array_unique([...$specialTokens, ...$additionalSpecialTokens]); @@ -254,6 +229,59 @@ public function decode(array $ids, bool $skipSpecialTokens = true, ?bool $cleanu return $decoded; } + /** + * Get configuration value(s). + * + * @param null|string $key The configuration key (e.g., 'model_max_length', 'remove_space'). If null, returns all config. 
+     * @param mixed $default The default value if the key doesn't exist (ignored when $key is null)
+     *
+     * @return mixed the full config array when $key is null; a component's config or a single entry of it for "component" / "component.sub_key"; otherwise the flat config value or $default
+     */
+    public function getConfig(?string $key = null, mixed $default = null): mixed
+    {
+        if (null === $key) {
+            // Component and added-token entries come from the live objects and overwrite same-named raw entries.
+            $fullConfig = $this->config;
+            $fullConfig['model'] = $this->model->getConfig();
+            $fullConfig['normalizer'] = $this->normalizer->getConfig();
+            $fullConfig['pre_tokenizer'] = $this->preTokenizer->getConfig();
+            $fullConfig['post_processor'] = $this->postProcessor->getConfig();
+            $fullConfig['decoder'] = $this->decoder->getConfig();
+
+            $addedTokens = [];
+            foreach ($this->addedTokens as $token) {
+                $addedTokens[] = $token->jsonSerialize();
+            }
+            $fullConfig['added_tokens'] = $addedTokens;
+
+            return $fullConfig;
+        }
+
+        if (str_contains($key, '.')) {
+            [$component, $subKey] = explode('.', $key, 2);
+
+            return match ($component) {
+                'model' => $this->model->getConfig($subKey, $default),
+                'normalizer' => $this->normalizer->getConfig($subKey, $default),
+                'pre_tokenizer' => $this->preTokenizer->getConfig($subKey, $default),
+                'post_processor' => $this->postProcessor->getConfig($subKey, $default),
+                'decoder' => $this->decoder->getConfig($subKey, $default),
+                // Unknown prefix: treat the whole key as a flat config entry so keys that contain a dot still resolve.
+                default => $this->config[$key] ?? $default,
+            };
+        }
+
+        // A bare component name yields that component's full config; anything else is a flat config lookup.
+        return match ($key) {
+            'model' => $this->model->getConfig(),
+            'normalizer' => $this->normalizer->getConfig(),
+            'pre_tokenizer' => $this->preTokenizer->getConfig(),
+            'post_processor' => $this->postProcessor->getConfig(),
+            'decoder' => $this->decoder->getConfig(),
+            default => $this->config[$key] ?? $default,
+        };
+    }
+
     /**
      * Converts a text into a list of tokens.
* diff --git a/tests/Feature/GetConfigTest.php b/tests/Feature/GetConfigTest.php new file mode 100644 index 0000000..4e0829f --- /dev/null +++ b/tests/Feature/GetConfigTest.php @@ -0,0 +1,62 @@ + 0, 'b' => 1, '[UNK]' => 2]; + $merges = ['a b']; + $expectedMerges = [['a', 'b']]; + $model = new BPEModel( + vocab: $vocab, + merges: $merges, + unkToken: '[UNK]' + ); + + $normalizer = new LowercaseNormalizer(); + $preTokenizer = new WhitespacePreTokenizer(); + $postProcessor = new BertPostProcessor(sep: '[SEP]', cls: '[CLS]'); + $decoder = new BPEDecoder(suffix: ''); + + $tokenizer = new Tokenizer( + model: $model, + normalizer: $normalizer, + preTokenizer: $preTokenizer, + postProcessor: $postProcessor, + decoder: $decoder + ); + + // Test specific key access + expect($tokenizer->getConfig('model.vocab'))->toBe($vocab); + expect($tokenizer->getConfig('model.merges'))->toBe($expectedMerges); + expect($tokenizer->getConfig('model.unk_token'))->toBe('[UNK]'); + + expect($tokenizer->getConfig('normalizer.type'))->toBe('Lowercase'); + + expect($tokenizer->getConfig('pre_tokenizer.type'))->toBe('Whitespace'); + + // BertPostProcessor returns [token, id] structure for compatibility + expect($tokenizer->getConfig('post_processor.sep'))->toBe(['[SEP]', 0]); + expect($tokenizer->getConfig('post_processor.cls'))->toBe(['[CLS]', 0]); + + expect($tokenizer->getConfig('decoder.suffix'))->toBe(''); + + // Test component config retrieval + $modelConfig = $tokenizer->getConfig('model'); + expect($modelConfig['type'])->toBe('BPE'); + expect($modelConfig['vocab'])->toBe($vocab); + + // Test full config reconstruction + $fullConfig = $tokenizer->getConfig(); + + expect($fullConfig)->toBeArray(); + expect($fullConfig['model']['type'])->toBe('BPE'); + expect($fullConfig['normalizer']['type'])->toBe('Lowercase'); + expect($fullConfig['pre_tokenizer']['type'])->toBe('Whitespace'); + expect($fullConfig['post_processor']['type'])->toBe('BertProcessing'); + 
expect($fullConfig['decoder']['type'])->toBe('BPEDecoder'); +}); diff --git a/tests/Unit/TokenizerBuilderTest.php b/tests/Unit/TokenizerBuilderTest.php index ab36918..fec5966 100644 --- a/tests/Unit/TokenizerBuilderTest.php +++ b/tests/Unit/TokenizerBuilderTest.php @@ -31,12 +31,12 @@ public function tokenize(array $tokens): array public function encode(array $tokens): array { - return array_map(fn (string $token) => $this->reverse[$token] ?? 0, $tokens); + return array_map(fn(string $token) => $this->reverse[$token] ?? 0, $tokens); } public function decode(array $ids): array { - return array_map(fn (int $id) => $this->vocab[$id] ?? '', $ids); + return array_map(fn(int $id) => $this->vocab[$id] ?? '', $ids); } public function getVocab(): array @@ -55,9 +55,9 @@ public function addToken(string $token, int $id): void $this->reverse[$id] = $token; } - public function getEndOfWordSuffix(): ?string + public function getConfig(?string $key = null, mixed $default = null): mixed { - return null; + return []; } }; } @@ -69,6 +69,11 @@ public function normalize(string $text): string { return strtoupper($text); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return []; + } }; } @@ -79,6 +84,11 @@ public function preTokenize(array|string $text, array $options = []): array { return is_array($text) ? $text : preg_split('/\s+/', trim($text)); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return []; + } }; } @@ -91,6 +101,11 @@ public function process(array $tokens, ?array $pair = null, bool $addSpecialToke return [$tokens, $typeIds]; } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return []; + } }; } @@ -101,6 +116,11 @@ public function decode(array $tokens): string { return implode(' ', $tokens); } + + public function getConfig(?string $key = null, mixed $default = null): mixed + { + return []; + } }; }