2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- php: ['8.1', '8.2', '8.3', '8.4']
+ php: ['8.2', '8.3', '8.4']

steps:
- name: Checkout
2 changes: 1 addition & 1 deletion .php-cs-fixer.php
@@ -1,7 +1,7 @@
<?php

$finder = (new PhpCsFixer\Finder())
- ->in(__DIR__);
+ ->in(__DIR__)->exclude('tests');

return (new PhpCsFixer\Config())
->setParallelConfig(PhpCsFixer\Runner\Parallel\ParallelConfigFactory::detect())
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@
- **Zero Hard Dependencies** — Core tokenization has no required dependencies. Optional HTTP client needed only for Hub downloads.
- **Hub Compatible** — Load tokenizers directly from Hugging Face Hub or from local files.
- **Fully Tested** — Validated against BERT, GPT-2, Llama, Gemma, Qwen, RoBERTa, ALBERT, and more.
- - **Modern PHP** — Built for PHP 8.1+ with strict types, readonly properties, and clean interfaces.
+ - **Modern PHP** — Built for PHP 8.2+ with strict types, readonly classes, and clean interfaces.

## Installation

2 changes: 1 addition & 1 deletion composer.json
@@ -31,7 +31,7 @@
}
},
"require": {
- "php": "^8.1",
+ "php": "^8.2",
"psr/http-client": "^1.0",
"psr/http-factory": "^1.0",
"php-http/discovery": "^1.19"
2 changes: 1 addition & 1 deletion examples/document_chunking_pipeline.php
@@ -232,7 +232,7 @@ function createChunks(Tokenizer $tokenizer, string $text, int $maxTokens, int $o
$originalWords = array_slice(explode(' ', $chunk), 0, 5);
$decodedWords = explode(' ', $decoded);

- $matchCount = count(array_filter($originalWords, fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
+ $matchCount = count(array_filter($originalWords, static fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));

if ($matchCount < 3) {
echo '⚠️ Chunk '.($index + 1)." may have encoding issues\n";
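The `fn` → `static fn` change above (and repeated throughout this diff) marks callbacks that never touch `$this` as static closures, so PHP skips binding the enclosing object to them. A minimal standalone sketch of the idiom — the class and data here are illustrative, not taken from the library:

```php
<?php

declare(strict_types=1);

final class WordFilter
{
    /**
     * @param string[] $words
     * @return string[]
     */
    public function nonEmpty(array $words): array
    {
        // A static closure cannot bind $this, which both documents and
        // enforces that the callback depends only on its own parameters.
        return array_filter($words, static fn (string $w) => '' !== $w);
    }
}

var_dump((new WordFilter())->nonEmpty(['alpha', '', 'beta'])); // keys 0 and 2 survive
```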
4 changes: 2 additions & 2 deletions examples/text_classification_preprocessing.php
@@ -138,8 +138,8 @@
echo " Structure: [CLS] premise [SEP] hypothesis [SEP]\n";

// Type IDs distinguish between premise (0) and hypothesis (1)
- $segment0Count = count(array_filter($encoding->typeIds, fn ($t) => 0 === $t));
- $segment1Count = count(array_filter($encoding->typeIds, fn ($t) => 1 === $t));
+ $segment0Count = count(array_filter($encoding->typeIds, static fn ($t) => 0 === $t));
+ $segment1Count = count(array_filter($encoding->typeIds, static fn ($t) => 1 === $t));

echo " Segment A (premise) tokens: {$segment0Count}\n";
echo " Segment B (hypothesis) tokens: {$segment1Count}\n";
2 changes: 1 addition & 1 deletion src/DataStructures/AddedToken.php
@@ -10,7 +10,7 @@
* - Whether they should only match single words
* - Whether to include any whitespace on its left or right.
*/
- class AddedToken
+ readonly class AddedToken
{
public function __construct(
/**
2 changes: 1 addition & 1 deletion src/DataStructures/TokenLattice.php
@@ -117,6 +117,6 @@ public function tokenIds(): array
{
$nodes = $this->viterbi();

- return array_map(fn ($x) => $x->tokenId, $nodes);
+ return array_map(static fn ($x) => $x->tokenId, $nodes);
}
}
2 changes: 1 addition & 1 deletion src/Decoders/ByteLevelDecoder.php
@@ -302,7 +302,7 @@ protected function convertTokensToString(array $tokens): string
{
$text = implode('', $tokens);
$textArray = preg_split('//u', $text, -1, \PREG_SPLIT_NO_EMPTY);
- $byteArray = array_map(fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
+ $byteArray = array_map(static fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
$binaryString = pack('C*', ...$byteArray);

return mb_convert_encoding($binaryString, 'UTF-8');
2 changes: 1 addition & 1 deletion src/Decoders/DecoderSequence.php
@@ -15,7 +15,7 @@ protected function processTokens(array $tokens): array
{
return array_reduce(
$this->decoders,
- fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
+ static fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
$tokens
);
}
8 changes: 4 additions & 4 deletions src/Encoding.php
@@ -7,16 +7,16 @@
/**
* Represents the output of tokenization.
*/
- class Encoding
+ readonly class Encoding
{
/**
* @param int[] $ids The list of token IDs
* @param string[] $tokens The list of tokens
* @param int[] $typeIds The list of type IDs
*/
public function __construct(
- public readonly array $ids,
- public readonly array $tokens,
- public readonly array $typeIds = [],
+ public array $ids,
+ public array $tokens,
+ public array $typeIds = [],
) {}
}
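Since PHP 8.2, a `readonly` class implicitly makes every property readonly, which is why the per-property `readonly` modifiers are dropped from the promoted constructor parameters above. A minimal sketch of the equivalence — `Point` is an illustrative class, not part of the library:

```php
<?php

declare(strict_types=1);

// Every property of a readonly class is implicitly readonly (PHP 8.2+).
readonly class Point
{
    public function __construct(
        public int $x, // behaves like `public readonly int $x`
        public int $y,
    ) {}
}

$p = new Point(1, 2);
echo $p->x;   // 1
// $p->x = 3; // Error: Cannot modify readonly property Point::$x
```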
2 changes: 1 addition & 1 deletion src/Factories/NormalizerFactory.php
@@ -35,7 +35,7 @@ public static function create(array $config): NormalizerInterface
),
'Lowercase' => new LowercaseNormalizer(),
'Sequence' => new NormalizerSequence(
- array_map(fn ($c) => self::create($c), $config['normalizers'] ?? [])
+ array_map(static fn ($c) => self::create($c), $config['normalizers'] ?? [])
),
'Strip' => new StripNormalizer(
stripLeft: $config['strip_left'] ?? true,
2 changes: 1 addition & 1 deletion src/Factories/PostProcessorFactory.php
@@ -45,7 +45,7 @@ public static function create(array $config): ?PostProcessorInterface
pair: $config['pair']
),
'Sequence' => new PostProcessorSequence(
- array_map(fn ($c) => self::create($c), $config['processors'] ?? [])
+ array_map(static fn ($c) => self::create($c), $config['processors'] ?? [])
),
default => throw new \Exception("Unknown post-processor type: {$type}"),
};
2 changes: 1 addition & 1 deletion src/Factories/PreTokenizerFactory.php
@@ -55,7 +55,7 @@ public static function create(array $config): PreTokenizerInterface
invert: $config['invert'] ?? true
),
'Sequence' => new PreTokenizerSequence(
- array_map(fn ($c) => self::create($c), $config['pretokenizers'] ?? [])
+ array_map(static fn ($c) => self::create($c), $config['pretokenizers'] ?? [])
),
default => throw new \Exception("Unknown pre-tokenizer type: {$type}"),
};
4 changes: 2 additions & 2 deletions src/Models/AbstractModel.php
@@ -45,7 +45,7 @@ public function decode(array $ids): array
$tokens[] = $this->vocab[$id] ?? $this->unkToken ?? null;
}

- return array_filter($tokens, fn ($t) => null !== $t);
+ return array_filter($tokens, static fn ($t) => null !== $t);
}

/**
@@ -65,7 +65,7 @@ public function encode(array $tokens): array
}

// Removing nulls in case unkTokenId is null (though should exist)
- return array_filter($ids, fn ($id) => null !== $id);
+ return array_filter($ids, static fn ($id) => null !== $id);
}

/**
2 changes: 1 addition & 1 deletion src/Models/BPEModel.php
@@ -59,7 +59,7 @@ public function __construct(
$this->merges = $merges;
} else {
$this->merges = array_map(
- fn ($merge) => \is_string($merge) ? explode(' ', $merge, 2) : $merge,
+ static fn ($merge) => \is_string($merge) ? explode(' ', $merge, 2) : $merge,
$merges
);
}
2 changes: 1 addition & 1 deletion src/Normalizers/NormalizerSequence.php
@@ -17,7 +17,7 @@ public function normalize(string $text): string
{
return array_reduce(
$this->normalizers,
- fn (string $text, NormalizerInterface $normalizer) => $normalizer->normalize($text),
+ static fn (string $text, NormalizerInterface $normalizer) => $normalizer->normalize($text),
$text
);
}
4 changes: 2 additions & 2 deletions src/PreTokenizers/ByteLevelPreTokenizer.php
@@ -303,9 +303,9 @@ public function preTokenize(array|string $text, array $options = []): array
}

// Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
- return array_map(function ($token) {
+ return array_map(static function ($token) {
$utf8Bytes = mb_convert_encoding($token, 'UTF-8');
- $bytes = array_map(fn ($byte) => self::BYTES_TO_UNICODE[$byte], unpack('C*', $utf8Bytes));
+ $bytes = array_map(static fn ($byte) => self::BYTES_TO_UNICODE[$byte], unpack('C*', $utf8Bytes));

return implode('', $bytes);
}, $tokens);
2 changes: 1 addition & 1 deletion src/PreTokenizers/PreTokenizerSequence.php
@@ -17,7 +17,7 @@ public function preTokenize(array|string $text, array $options = []): array
{
return array_reduce(
$this->preTokenizers,
- fn ($text, PreTokenizerInterface $preTokenizer) => $preTokenizer->preTokenize($text, $options),
+ static fn ($text, PreTokenizerInterface $preTokenizer) => $preTokenizer->preTokenize($text, $options),
\is_array($text) ? $text : [$text]
);
}
14 changes: 7 additions & 7 deletions src/Tokenizer.php
@@ -25,12 +25,12 @@
use Codewithkyrian\Tokenizers\Utils\DecoderUtils;
use Codewithkyrian\Tokenizers\Utils\NormalizerUtils;

- class Tokenizer
+ readonly class Tokenizer
{
/**
* The model's maximum sequence length (convenience accessor for config).
*/
- public readonly ?int $modelMaxLength;
+ public ?int $modelMaxLength;
protected DictionarySplitter $addedTokensSplitter;

/**
@@ -44,11 +44,11 @@ class Tokenizer
* @param array<string, mixed> $config Additional configuration options
*/
public function __construct(
- protected ModelInterface $model,
- protected NormalizerInterface $normalizer,
- protected PreTokenizerInterface $preTokenizer,
- protected PostProcessorInterface $postProcessor,
- protected DecoderInterface $decoder,
+ public ModelInterface $model,
+ public NormalizerInterface $normalizer,
+ public PreTokenizerInterface $preTokenizer,
+ public PostProcessorInterface $postProcessor,
+ public DecoderInterface $decoder,
protected array $specialTokens = [],
protected array $addedTokens = [],
protected array $config = []
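Promoting the pipeline components from `protected` to `public` on the now-readonly `Tokenizer` makes them inspectable from the outside while keeping them immutable. A hedged usage sketch — it assumes a `Tokenizer` instance is already constructed, and the `normalize()` call mirrors the `NormalizerInterface::normalize(string): string` signature visible in the NormalizerSequence hunk above:

```php
<?php

// Assumes $tokenizer is an already-loaded Codewithkyrian\Tokenizers\Tokenizer.
$maxLength = $tokenizer->modelMaxLength;                  // ?int convenience accessor
$clean     = $tokenizer->normalizer->normalize('Héllo');  // run only the normalization step

// The components are readable but not replaceable:
// $tokenizer->model = $otherModel; // Error: Cannot modify readonly property
```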
2 changes: 1 addition & 1 deletion tests/Pest.php
@@ -9,7 +9,7 @@
*/
function modelTokenizationDataset(string $datasetClass, bool $withTextPair = false): Closure
{
- return function () use ($datasetClass, $withTextPair) {
+ return static function () use ($datasetClass, $withTextPair) {
if (!class_exists($datasetClass) || !method_exists($datasetClass, 'data')) {
return;
}