diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 081212c..dc904c4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        php: ['8.1', '8.2', '8.3', '8.4']
+        php: ['8.2', '8.3', '8.4']
 
     steps:
       - name: Checkout
diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php
index 4bceb23..d6acb67 100644
--- a/.php-cs-fixer.php
+++ b/.php-cs-fixer.php
@@ -1,7 +1,7 @@
-    ->in(__DIR__);
+    ->in(__DIR__)->exclude('tests');
 
 return (new PhpCsFixer\Config())
     ->setParallelConfig(PhpCsFixer\Runner\Parallel\ParallelConfigFactory::detect())
diff --git a/README.md b/README.md
index 18ff7c3..7675fbd 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 - **Zero Hard Dependencies** — Core tokenization has no required dependencies. Optional HTTP client needed only for Hub downloads.
 - **Hub Compatible** — Load tokenizers directly from Hugging Face Hub or from local files.
 - **Fully Tested** — Validated against BERT, GPT-2, Llama, Gemma, Qwen, RoBERTa, ALBERT, and more.
-- **Modern PHP** — Built for PHP 8.1+ with strict types, readonly properties, and clean interfaces.
+- **Modern PHP** — Built for PHP 8.2+ with strict types, readonly classes, and clean interfaces.
 
 ## Installation
diff --git a/composer.json b/composer.json
index b0cf8c4..457685d 100644
--- a/composer.json
+++ b/composer.json
@@ -31,7 +31,7 @@
         }
     },
     "require": {
-        "php": "^8.1",
+        "php": "^8.2",
         "psr/http-client": "^1.0",
         "psr/http-factory": "^1.0",
         "php-http/discovery": "^1.19"
diff --git a/examples/document_chunking_pipeline.php b/examples/document_chunking_pipeline.php
index aafbf46..9abde6a 100644
--- a/examples/document_chunking_pipeline.php
+++ b/examples/document_chunking_pipeline.php
@@ -232,7 +232,7 @@ function createChunks(Tokenizer $tokenizer, string $text, int $maxTokens, int $o
         $originalWords = array_slice(explode(' ', $chunk), 0, 5);
         $decodedWords = explode(' ', $decoded);
 
-        $matchCount = count(array_filter($originalWords, fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
+        $matchCount = count(array_filter($originalWords, static fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
 
         if ($matchCount < 3) {
             echo '⚠️ Chunk '.($index + 1)." may have encoding issues\n";
diff --git a/examples/text_classification_preprocessing.php b/examples/text_classification_preprocessing.php
index 1c9f331..a75c423 100644
--- a/examples/text_classification_preprocessing.php
+++ b/examples/text_classification_preprocessing.php
@@ -138,8 +138,8 @@
     echo "  Structure: [CLS] premise [SEP] hypothesis [SEP]\n";
 
     // Type IDs distinguish between premise (0) and hypothesis (1)
-    $segment0Count = count(array_filter($encoding->typeIds, fn ($t) => 0 === $t));
-    $segment1Count = count(array_filter($encoding->typeIds, fn ($t) => 1 === $t));
+    $segment0Count = count(array_filter($encoding->typeIds, static fn ($t) => 0 === $t));
+    $segment1Count = count(array_filter($encoding->typeIds, static fn ($t) => 1 === $t));
 
     echo "  Segment A (premise) tokens: {$segment0Count}\n";
     echo "  Segment B (hypothesis) tokens: {$segment1Count}\n";
diff --git a/src/DataStructures/AddedToken.php b/src/DataStructures/AddedToken.php
index 7c37157..11fbaaa 100644
--- a/src/DataStructures/AddedToken.php
+++ b/src/DataStructures/AddedToken.php
@@ -10,7 +10,7 @@
  * - Whether they should only match single words
 * - Whether to include any whitespace on its left or right.
 */
-class AddedToken
+readonly class AddedToken
 {
     public function __construct(
         /**
diff --git a/src/DataStructures/TokenLattice.php b/src/DataStructures/TokenLattice.php
index f8165f9..0eb95a8 100644
--- a/src/DataStructures/TokenLattice.php
+++ b/src/DataStructures/TokenLattice.php
@@ -117,6 +117,6 @@ public function tokenIds(): array
     {
         $nodes = $this->viterbi();
 
-        return array_map(fn ($x) => $x->tokenId, $nodes);
+        return array_map(static fn ($x) => $x->tokenId, $nodes);
     }
 }
diff --git a/src/Decoders/ByteLevelDecoder.php b/src/Decoders/ByteLevelDecoder.php
index c5194b0..9422c88 100644
--- a/src/Decoders/ByteLevelDecoder.php
+++ b/src/Decoders/ByteLevelDecoder.php
@@ -302,7 +302,7 @@ protected function convertTokensToString(array $tokens): string
     {
         $text = implode('', $tokens);
         $textArray = preg_split('//u', $text, -1, \PREG_SPLIT_NO_EMPTY);
-        $byteArray = array_map(fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
+        $byteArray = array_map(static fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
         $binaryString = pack('C*', ...$byteArray);
 
         return mb_convert_encoding($binaryString, 'UTF-8');
diff --git a/src/Decoders/DecoderSequence.php b/src/Decoders/DecoderSequence.php
index 633dae2..7009236 100644
--- a/src/Decoders/DecoderSequence.php
+++ b/src/Decoders/DecoderSequence.php
@@ -15,7 +15,7 @@ protected function processTokens(array $tokens): array
     {
         return array_reduce(
             $this->decoders,
-            fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
+            static fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
             $tokens
         );
     }
diff --git a/src/Encoding.php b/src/Encoding.php
index 676582b..523bc9d 100644
--- a/src/Encoding.php
+++ b/src/Encoding.php
@@ -7,7 +7,7 @@
 /**
  * Represents the output of tokenization.
  */
-class Encoding
+readonly class Encoding
 {
     /**
      * @param int[] $ids The list of token IDs
@@ -15,8 +15,8 @@ class Encoding
      * @param int[] $typeIds The list of type IDs
      */
     public function __construct(
-        public readonly array $ids,
-        public readonly array $tokens,
-        public readonly array $typeIds = [],
+        public array $ids,
+        public array $tokens,
+        public array $typeIds = [],
     ) {}
 }
diff --git a/src/Factories/NormalizerFactory.php b/src/Factories/NormalizerFactory.php
index 8da9034..f119931 100644
--- a/src/Factories/NormalizerFactory.php
+++ b/src/Factories/NormalizerFactory.php
@@ -35,7 +35,7 @@ public static function create(array $config): NormalizerInterface
             ),
             'Lowercase' => new LowercaseNormalizer(),
             'Sequence' => new NormalizerSequence(
-                array_map(fn ($c) => self::create($c), $config['normalizers'] ?? [])
+                array_map(static fn ($c) => self::create($c), $config['normalizers'] ?? [])
             ),
             'Strip' => new StripNormalizer(
                 stripLeft: $config['strip_left'] ?? true,
diff --git a/src/Factories/PostProcessorFactory.php b/src/Factories/PostProcessorFactory.php
index b0c9c06..cc2e880 100644
--- a/src/Factories/PostProcessorFactory.php
+++ b/src/Factories/PostProcessorFactory.php
@@ -45,7 +45,7 @@ public static function create(array $config): ?PostProcessorInterface
                 pair: $config['pair']
             ),
             'Sequence' => new PostProcessorSequence(
-                array_map(fn ($c) => self::create($c), $config['processors'] ?? [])
+                array_map(static fn ($c) => self::create($c), $config['processors'] ?? [])
             ),
             default => throw new \Exception("Unknown post-processor type: {$type}"),
         };
diff --git a/src/Factories/PreTokenizerFactory.php b/src/Factories/PreTokenizerFactory.php
index f2d4443..b5dc1e3 100644
--- a/src/Factories/PreTokenizerFactory.php
+++ b/src/Factories/PreTokenizerFactory.php
@@ -55,7 +55,7 @@ public static function create(array $config): PreTokenizerInterface
                 invert: $config['invert'] ?? true
             ),
             'Sequence' => new PreTokenizerSequence(
-                array_map(fn ($c) => self::create($c), $config['pretokenizers'] ?? [])
+                array_map(static fn ($c) => self::create($c), $config['pretokenizers'] ?? [])
             ),
             default => throw new \Exception("Unknown pre-tokenizer type: {$type}"),
         };
diff --git a/src/Models/AbstractModel.php b/src/Models/AbstractModel.php
index 09ad61f..c7a6540 100644
--- a/src/Models/AbstractModel.php
+++ b/src/Models/AbstractModel.php
@@ -45,7 +45,7 @@ public function decode(array $ids): array
             $tokens[] = $this->vocab[$id] ?? $this->unkToken ?? null;
         }
 
-        return array_filter($tokens, fn ($t) => null !== $t);
+        return array_filter($tokens, static fn ($t) => null !== $t);
     }
 
     /**
@@ -65,7 +65,7 @@ public function encode(array $tokens): array
         }
 
         // Removing nulls in case unkTokenId is null (though should exist)
-        return array_filter($ids, fn ($id) => null !== $id);
+        return array_filter($ids, static fn ($id) => null !== $id);
     }
 
     /**
diff --git a/src/Models/BPEModel.php b/src/Models/BPEModel.php
index d47be28..273434f 100644
--- a/src/Models/BPEModel.php
+++ b/src/Models/BPEModel.php
@@ -59,7 +59,7 @@ public function __construct(
             $this->merges = $merges;
         } else {
             $this->merges = array_map(
-                fn ($merge) => \is_string($merge) ? explode(' ', $merge, 2) : $merge,
+                static fn ($merge) => \is_string($merge) ? explode(' ', $merge, 2) : $merge,
                 $merges
             );
         }
diff --git a/src/Normalizers/NormalizerSequence.php b/src/Normalizers/NormalizerSequence.php
index 183100c..8117e0c 100644
--- a/src/Normalizers/NormalizerSequence.php
+++ b/src/Normalizers/NormalizerSequence.php
@@ -17,7 +17,7 @@ public function normalize(string $text): string
     {
         return array_reduce(
             $this->normalizers,
-            fn (string $text, NormalizerInterface $normalizer) => $normalizer->normalize($text),
+            static fn (string $text, NormalizerInterface $normalizer) => $normalizer->normalize($text),
             $text
         );
     }
diff --git a/src/PreTokenizers/ByteLevelPreTokenizer.php b/src/PreTokenizers/ByteLevelPreTokenizer.php
index 985dd7d..0a86faf 100644
--- a/src/PreTokenizers/ByteLevelPreTokenizer.php
+++ b/src/PreTokenizers/ByteLevelPreTokenizer.php
@@ -303,9 +303,9 @@ public function preTokenize(array|string $text, array $options = []): array
         }
 
         // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-        return array_map(function ($token) {
+        return array_map(static function ($token) {
             $utf8Bytes = mb_convert_encoding($token, 'UTF-8');
-            $bytes = array_map(fn ($byte) => self::BYTES_TO_UNICODE[$byte], unpack('C*', $utf8Bytes));
+            $bytes = array_map(static fn ($byte) => self::BYTES_TO_UNICODE[$byte], unpack('C*', $utf8Bytes));
 
             return implode('', $bytes);
         }, $tokens);
diff --git a/src/PreTokenizers/PreTokenizerSequence.php b/src/PreTokenizers/PreTokenizerSequence.php
index bb4a0f3..16819aa 100644
--- a/src/PreTokenizers/PreTokenizerSequence.php
+++ b/src/PreTokenizers/PreTokenizerSequence.php
@@ -17,7 +17,7 @@ public function preTokenize(array|string $text, array $options = []): array
     {
         return array_reduce(
             $this->preTokenizers,
-            fn ($text, PreTokenizerInterface $preTokenizer) => $preTokenizer->preTokenize($text, $options),
+            static fn ($text, PreTokenizerInterface $preTokenizer) => $preTokenizer->preTokenize($text, $options),
             \is_array($text) ? $text : [$text]
         );
     }
diff --git a/src/Tokenizer.php b/src/Tokenizer.php
index d4c790e..90e60be 100644
--- a/src/Tokenizer.php
+++ b/src/Tokenizer.php
@@ -25,12 +25,12 @@
 use Codewithkyrian\Tokenizers\Utils\DecoderUtils;
 use Codewithkyrian\Tokenizers\Utils\NormalizerUtils;
 
-class Tokenizer
+readonly class Tokenizer
 {
     /**
      * The model's maximum sequence length (convenience accessor for config).
     */
-    public readonly ?int $modelMaxLength;
+    public ?int $modelMaxLength;
 
     protected DictionarySplitter $addedTokensSplitter;
 
     /**
@@ -44,11 +44,11 @@ class Tokenizer
      * @param array $config Additional configuration options
     */
    public function __construct(
-        protected ModelInterface $model,
-        protected NormalizerInterface $normalizer,
-        protected PreTokenizerInterface $preTokenizer,
-        protected PostProcessorInterface $postProcessor,
-        protected DecoderInterface $decoder,
+        public ModelInterface $model,
+        public NormalizerInterface $normalizer,
+        public PreTokenizerInterface $preTokenizer,
+        public PostProcessorInterface $postProcessor,
+        public DecoderInterface $decoder,
         protected array $specialTokens = [],
         protected array $addedTokens = [],
         protected array $config = []
diff --git a/tests/Pest.php b/tests/Pest.php
index 1fc0391..d5d9823 100644
--- a/tests/Pest.php
+++ b/tests/Pest.php
@@ -9,7 +9,7 @@
 */
 function modelTokenizationDataset(string $datasetClass, bool $withTextPair = false): Closure
 {
-    return function () use ($datasetClass, $withTextPair) {
+    return static function () use ($datasetClass, $withTextPair) {
         if (!class_exists($datasetClass) || !method_exists($datasetClass, 'data')) {
             return;
         }
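
Reviewer note (illustrative, not part of the patch): the diff applies two PHP 8.2 idioms throughout, class-level `readonly` and `static` closures. A minimal sketch of what each buys, using a hypothetical `Point` class rather than the library's own types:

<?php

declare(strict_types=1);

// PHP 8.2 `readonly class`: every declared property is implicitly readonly,
// so the per-property `readonly` keyword (the PHP 8.1 spelling Encoding used
// before this patch) can be dropped.
readonly class Point
{
    public function __construct(
        public float $x,
        public float $y,
    ) {}
}

$p = new Point(1.0, 2.0);
// $p->x = 3.0; // Fatal error: cannot modify readonly property Point::$x

// `static fn` / `static function` closures never bind $this. That documents
// that the callback touches no instance state and skips the object binding;
// class-scope access such as self::create() still works inside them.
$squares = array_map(static fn (int $n) => $n * $n, [1, 2, 3]); // [1, 4, 9]

The `static` conversions are behavior-preserving so long as the closures never reference $this, which is why they can be applied mechanically across the factories, sequences, and examples touched here.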