From 0253e045d0906f6bd22938c4b0446f0a0b5f4d7d Mon Sep 17 00:00:00 2001 From: Henrique Moody Date: Sat, 31 Jan 2026 01:46:11 +0100 Subject: [PATCH] Add TrimFormatter for configurable string edge trimming Allows precise control over trimming operations with support for left, right, or both sides and custom character masks, using PHP's multibyte-safe mb_trim, mb_ltrim, and mb_rtrim functions for proper international text handling. Mask characters are matched literally, so no escaping is required, and the formatter handles multi-byte characters including CJK spaces, emoji, and accented letters, which are essential for global applications. Includes comprehensive tests covering all trim modes, custom masks, Unicode characters (CJK, emoji), special characters, multi-byte strings, and edge cases like empty strings and strings shorter than the mask. Assisted-by: OpenCode (GLM-4.7) --- README.md | 1 + docs/TrimFormatter.md | 145 +++++++++++++++ src/Mixin/Builder.php | 11 +- src/Mixin/Chain.php | 11 +- src/TrimFormatter.php | 53 ++++++ tests/Unit/TrimFormatterTest.php | 300 +++++++++++++++++++++++++++++++ 6 files changed, 513 insertions(+), 8 deletions(-) create mode 100644 docs/TrimFormatter.md create mode 100644 src/TrimFormatter.php create mode 100644 tests/Unit/TrimFormatterTest.php diff --git a/README.md b/README.md index 9451966..812452f 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ See the [PlaceholderFormatter documentation](docs/PlaceholderFormatter.md) and [ | [PatternFormatter](docs/PatternFormatter.md) | Pattern-based string filtering with placeholders | | [PlaceholderFormatter](docs/PlaceholderFormatter.md) | Template interpolation with placeholder replacement | | [TimeFormatter](docs/TimeFormatter.md) | Time promotion (mil, c, dec, y, mo, w, d, h, min, s, ms, us, ns) | +| [TrimFormatter](docs/TrimFormatter.md) | Trim whitespace or custom characters from string edges | ## Contributing diff --git a/docs/TrimFormatter.md b/docs/TrimFormatter.md new file mode 100644 index 0000000..b6b22fc --- 
/dev/null +++ b/docs/TrimFormatter.md @@ -0,0 +1,145 @@ + + +# TrimFormatter + +The `TrimFormatter` removes characters from the edges of strings with configurable masking and side selection, fully supporting UTF-8 Unicode characters. + +## Usage + +### Basic Usage + +```php +use Respect\StringFormatter\TrimFormatter; + +$formatter = new TrimFormatter(); + +echo $formatter->format(' hello world '); +// Outputs: "hello world" +``` + +### Trim Specific Side + +```php +use Respect\StringFormatter\TrimFormatter; + +$formatter = new TrimFormatter('left'); + +echo $formatter->format(' hello '); +// Outputs: "hello " + +$formatterRight = new TrimFormatter('right'); + +echo $formatterRight->format(' hello '); +// Outputs: " hello" +``` + +### Custom Mask + +```php +use Respect\StringFormatter\TrimFormatter; + +$formatter = new TrimFormatter('both', '-._'); + +echo $formatter->format('---hello---'); +// Outputs: "hello" + +echo $formatter->format('._hello_._'); +// Outputs: "hello" +``` + +### Unicode Characters + +```php +use Respect\StringFormatter\TrimFormatter; + +// CJK full-width spaces (U+3000) are trimmed by default +$formatter = new TrimFormatter(); + +echo $formatter->format("\u{3000}hello世界\u{3000}"); +// Outputs: "hello世界" + +// Trim emoji with custom mask +$formatterEmoji = new TrimFormatter('both', '😊'); + +echo $formatterEmoji->format('😊hello😊'); +// Outputs: "hello" +``` + +## API + +### `TrimFormatter::__construct` + +- `__construct(string $side = "both", string|null $mask = null)` + +Creates a new trim formatter instance. + +**Parameters:** + +- `$side`: Which side(s) to trim: "left", "right", or "both" (default: "both") +- `$mask`: The characters to trim from the string edges, or `null` for default Unicode whitespace (default: `null`) + +**Throws:** `InvalidFormatterException` when `$side` is not "left", "right", or "both" + +### `format` + +- `format(string $input): string` + +Removes characters from the specified side(s) of the input string. 
+ +**Parameters:** + +- `$input`: The string to trim + +**Returns:** The trimmed string + +## Examples + +| Side | Mask | Input | Output | Description | +| --------- | -------------- | --------------- | ------------ | ----------------------------------- | +| `"both"` | `null` | `" hello "` | `"hello"` | Trim default whitespace both sides | +| `"left"` | `null` | `" hello "` | `"hello "` | Trim default whitespace left only | +| `"right"` | `null` | `" hello "` | `" hello"` | Trim default whitespace right only | +| `"both"` | `"-"` | `"---hello---"` | `"hello"` | Trim hyphens from both sides | +| `"both"` | `"-._"` | `"-._hello_.-"` | `"hello"` | Trim multiple custom characters | +| `"left"` | `":"` | `":::hello:::"` | `"hello:::"` | Trim colons from left only | +| `"both"` | `null` | `"\u{3000}hello"` | `"hello"` | CJK space (U+3000) trimmed by default | +| `"both"` | `"😊"` | `"😊hello😊"` | `"hello"` | Trim emoji with custom mask | + +## Notes + +- Uses PHP's `mb_trim`, `mb_ltrim`, and `mb_rtrim` functions (available since PHP 8.4) for multibyte-safe trimming +- Fully UTF-8 aware - handles all Unicode scripts including CJK, emoji, and complex characters +- Empty strings return empty strings +- If the mask is empty or matches no characters at the edges of the input, the string is returned unchanged +- Trimming operations are character-oriented, not byte-oriented + +### Default Mask + +When no mask is provided (`null`), the formatter uses `mb_trim`'s default which includes all Unicode whitespace characters: + +**ASCII whitespace:** +- ` ` (U+0020): Ordinary space +- `\t` (U+0009): Tab +- `\n` (U+000A): New line (line feed) +- `\r` (U+000D): Carriage return +- `\0` (U+0000): NUL-byte +- `\v` (U+000B): Vertical tab +- `\f` (U+000C): Form feed + +**Unicode whitespace:** +- U+00A0: No-break space +- U+1680: Ogham space mark +- U+2000–U+200A: Various width spaces (en quad, em quad, en space, em space, etc.) 
+- U+2028: Line separator +- U+2029: Paragraph separator +- U+202F: Narrow no-break space +- U+205F: Medium mathematical space +- U+3000: Ideographic space (CJK full-width space) +- U+0085: Next line (NEL) +- U+180E: Mongolian vowel separator + +See [mb_trim documentation](https://www.php.net/manual/en/function.mb-trim.php) for the complete list. diff --git a/src/Mixin/Builder.php b/src/Mixin/Builder.php index 539fd0d..3396c04 100644 --- a/src/Mixin/Builder.php +++ b/src/Mixin/Builder.php @@ -18,30 +18,33 @@ interface Builder { public static function area(string $unit): FormatterBuilder; + public static function date(string $format = 'Y-m-d H:i:s'): FormatterBuilder; + public static function imperialArea(string $unit): FormatterBuilder; public static function imperialLength(string $unit): FormatterBuilder; public static function imperialMass(string $unit): FormatterBuilder; - public static function date(string $format = 'Y-m-d H:i:s'): FormatterBuilder; - public static function mask(string $range, string $replacement = '*'): FormatterBuilder; public static function metric(string $unit): FormatterBuilder; + public static function metricMass(string $unit): FormatterBuilder; + public static function number( int $decimals = 0, string $decimalSeparator = '.', string $thousandsSeparator = ',', ): FormatterBuilder; - public static function metricMass(string $unit): FormatterBuilder; - public static function pattern(string $pattern): FormatterBuilder; /** @param array $parameters */ public static function placeholder(array $parameters): FormatterBuilder; public static function time(string $unit): FormatterBuilder; + + /** @param 'both'|'left'|'right' $side */ + public static function trim(string $side = 'both', string|null $mask = null): FormatterBuilder; } diff --git a/src/Mixin/Chain.php b/src/Mixin/Chain.php index 780ba0b..4fc20c8 100644 --- a/src/Mixin/Chain.php +++ b/src/Mixin/Chain.php @@ -18,30 +18,33 @@ interface Chain extends Formatter { public function 
area(string $unit): FormatterBuilder; + public function date(string $format = 'Y-m-d H:i:s'): FormatterBuilder; + public function imperialArea(string $unit): FormatterBuilder; public function imperialLength(string $unit): FormatterBuilder; public function imperialMass(string $unit): FormatterBuilder; - public function date(string $format = 'Y-m-d H:i:s'): FormatterBuilder; - public function mask(string $range, string $replacement = '*'): FormatterBuilder; public function metric(string $unit): FormatterBuilder; + public function metricMass(string $unit): FormatterBuilder; + public function number( int $decimals = 0, string $decimalSeparator = '.', string $thousandsSeparator = ',', ): FormatterBuilder; - public function metricMass(string $unit): FormatterBuilder; - public function pattern(string $pattern): FormatterBuilder; /** @param array $parameters */ public function placeholder(array $parameters): FormatterBuilder; public function time(string $unit): FormatterBuilder; + + /** @param 'both'|'left'|'right' $side */ + public function trim(string $side = 'both', string|null $mask = null): FormatterBuilder; } diff --git a/src/TrimFormatter.php b/src/TrimFormatter.php new file mode 100644 index 0000000..7d8470a --- /dev/null +++ b/src/TrimFormatter.php @@ -0,0 +1,53 @@ + + */ + +declare(strict_types=1); + +namespace Respect\StringFormatter; + +use function in_array; +use function mb_ltrim; +use function mb_rtrim; +use function mb_trim; +use function sprintf; + +/** + * Trims characters from strings using multibyte-safe functions. + * + * When no mask is provided, trims all Unicode whitespace characters including: + * regular space, tab, newline, carriage return, vertical tab, form feed, + * no-break space (U+00A0), em space (U+2003), ideographic space (U+3000), and others. 
+ * + * @see https://www.php.net/manual/en/function.mb-trim.php + */ +final readonly class TrimFormatter implements Formatter +{ + /** + * @param 'both'|'left'|'right' $side Which side(s) to trim; any other value throws InvalidFormatterException + * @param string|null $mask Set of individual characters to trim (not a substring), or null for the default Unicode whitespace + */ + public function __construct( + private string $side = 'both', + private string|null $mask = null, + ) { + if (!in_array($this->side, ['left', 'right', 'both'], true)) { + throw new InvalidFormatterException( + sprintf('Invalid side "%s". Must be "left", "right", or "both".', $this->side), + ); + } + } + + public function format(string $input): string + { + return match ($this->side) { + 'left' => mb_ltrim($input, $this->mask), + 'right' => mb_rtrim($input, $this->mask), + default => mb_trim($input, $this->mask), // only 'both' reaches here: the constructor rejects other values + }; + } +} diff --git a/tests/Unit/TrimFormatterTest.php b/tests/Unit/TrimFormatterTest.php new file mode 100644 index 0000000..e514928 --- /dev/null +++ b/tests/Unit/TrimFormatterTest.php @@ -0,0 +1,300 @@ + + */ + +declare(strict_types=1); + +namespace Respect\StringFormatter\Test\Unit; + +use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProvider; +use PHPUnit\Framework\Attributes\Test; +use PHPUnit\Framework\TestCase; +use Respect\StringFormatter\InvalidFormatterException; +use Respect\StringFormatter\TrimFormatter; + +#[CoversClass(TrimFormatter::class)] +final class TrimFormatterTest extends TestCase +{ + #[Test] + #[DataProvider('providerForValidFormattedString')] + public function testShouldTrimString( + string $input, + string $expected, + string $side = 'both', + string|null $mask = null, + ): void { + // @phpstan-ignore argument.type + $formatter = new TrimFormatter($side, $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForLeftTrim')] + public function testShouldTrimLeft(string $input, string $expected, string|null $mask = null): void + { + 
$formatter = new TrimFormatter('left', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForRightTrim')] + public function testShouldTrimRight(string $input, string $expected, string|null $mask = null): void + { + $formatter = new TrimFormatter('right', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForBothTrim')] + public function testShouldTrimBoth(string $input, string $expected, string|null $mask = null): void + { + $formatter = new TrimFormatter('both', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + public function testShouldHandleEmptyString(): void + { + $formatter = new TrimFormatter(); + + $actual = $formatter->format(''); + + self::assertSame('', $actual); + } + + #[Test] + public function testShouldThrowExceptionForInvalidSide(): void + { + $this->expectException(InvalidFormatterException::class); + $this->expectExceptionMessage('Invalid side "middle"'); + + // @phpstan-ignore argument.type + new TrimFormatter('middle'); + } + + #[Test] + #[DataProvider('providerForUnicode')] + public function testShouldHandleUnicodeCharacters(string $input, string $expected, string $mask): void + { + $formatter = new TrimFormatter('both', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForEmoji')] + public function testShouldHandleEmoji(string $input, string $expected, string $mask): void + { + $formatter = new TrimFormatter('both', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForCustomMask')] + public function testShouldHandleCustomMask(string $input, string $expected, string $mask): void + { + $formatter = new TrimFormatter('both', $mask); + + $actual = 
$formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForSpecialChars')] + public function testShouldHandleSpecialCharactersInMask(string $input, string $expected, string $mask): void + { + $formatter = new TrimFormatter('both', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForMultiByte')] + public function testShouldHandleMultiByteCharacters(string $input, string $expected, string|null $mask = null): void + { + $formatter = new TrimFormatter('both', $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + #[Test] + #[DataProvider('providerForEdgeCases')] + public function testShouldHandleEdgeCases(string $input, string $expected, string $side, string $mask): void + { + // @phpstan-ignore argument.type + $formatter = new TrimFormatter($side, $mask); + + $actual = $formatter->format($input); + + self::assertSame($expected, $actual); + } + + /** @return array */ + public static function providerForValidFormattedString(): array + { + return [ + 'whitespace both sides' => [' hello ', 'hello'], + 'tab both sides' => ["\thello\t", 'hello'], + 'newline both sides' => ["\nhello\n", 'hello'], + 'mixed whitespace' => [" \t\n hello \t\n", 'hello'], + 'already trimmed' => ['hello', 'hello'], + 'only spaces' => [' ', ''], + 'no characters in mask' => ['hello', 'hello', 'both', 'xyz'], + 'all characters to trim' => [' !!! 
', '!!!', 'both', ' '], + // Unicode whitespace (trimmed by default with mb_trim) + 'ideographic space' => ["\u{3000}hello\u{3000}", 'hello'], + 'em space' => ["\u{2003}hello\u{2003}", 'hello'], + 'no-break space' => ["\u{00A0}hello\u{00A0}", 'hello'], + 'thin space' => ["\u{2009}hello\u{2009}", 'hello'], + 'mixed unicode whitespace' => ["\u{3000}\u{2003} hello \u{00A0}\u{2009}", 'hello'], + ]; + } + + /** @return array */ + public static function providerForLeftTrim(): array + { + return [ + 'spaces left' => [' hello', 'hello'], + 'spaces right not trimmed' => ['hello ', 'hello '], + 'spaces left and right' => [' hello ', 'hello '], + 'tabs left' => ["\thello\t", "hello\t"], + 'mixed whitespace left' => ["\t\n hello world", 'hello world'], + ]; + } + + /** @return array */ + public static function providerForRightTrim(): array + { + return [ + 'spaces right' => ['hello ', 'hello'], + 'spaces left not trimmed' => [' hello', ' hello'], + 'spaces left and right' => [' hello ', ' hello'], + 'tabs right' => ["\thello\t", "\thello"], + 'mixed whitespace right' => ["hello world \t", 'hello world'], + ]; + } + + /** @return array */ + public static function providerForBothTrim(): array + { + return [ + 'spaces both' => [' hello ', 'hello'], + 'tabs both' => ["\thello\t", 'hello'], + 'newlines both' => ["\nhello\n", 'hello'], + 'mixed whitespace' => [" \t\n hello \t\n ", 'hello'], + 'single space' => [' hello ', 'hello'], + // Unicode whitespace (trimmed by default with mb_trim) + 'ideographic space both' => ["\u{3000}hello\u{3000}", 'hello'], + 'narrow no-break space' => ["\u{202F}hello \u{202F}", 'hello'], + ]; + } + + /** @return array */ + public static function providerForUnicode(): array + { + return [ + // Non-whitespace Unicode characters require explicit mask + 'latin accented chars' => ['éééhelloééé', 'hello', 'é'], + 'greek letters' => ['αααhelloααα', 'hello', 'α'], + 'cyrillic letters' => ['бббhelloббб', 'hello', 'б'], + 'arabic letters' => ['مرحبا', 'ا', 
'مرحب'], + 'chinese characters' => ['中中hello中中', 'hello', '中'], + 'japanese hiragana' => ['あああhelloあああ', 'hello', 'あ'], + ]; + } + + /** @return array<string, array{string, string, string}> */ + public static function providerForEmoji(): array + { + return [ + 'smiley faces' => ['😊😊hello😊😊', 'hello', '😊'], + 'mixed emoji' => ['👋👋hi👋👋', 'hi', '👋'], + 'hearts' => ['❤️❤️love❤️❤️', 'love', '❤️'], + ]; + } + + /** @return array<string, array{string, string, string}> */ + public static function providerForCustomMask(): array + { + return [ + 'custom characters' => ['---hello---', 'hello', '-'], + 'multiple custom chars' => ['-._hello-._', 'hello', '_.-'], + 'dots' => ['...hello...', 'hello', '.'], + 'underscores' => ['___hello___', 'hello', '_'], + 'mixed custom' => ['*-+hello+-*', 'hello', '+-*'], + ]; + } + + /** @return array<string, array{string, string, string}> */ + public static function providerForSpecialChars(): array + { + return [ + 'dash' => ['--hello--', 'hello', '-'], + 'asterisk' => ['**hello**', 'hello', '*'], + 'dot' => ['..hello..', 'hello', '.'], + 'dollar sign' => ['$$hello$$', 'hello', '$'], + 'caret' => ['^^hello^^', 'hello', '^'], + 'pipe' => ['||hello||', 'hello', '|'], + 'question mark' => ['??hello??', 'hello', '?'], + 'multiple special' => ['@#$hello$#@', 'hello', '@#$'], + ]; + } + + /** @return array<string, array{0: string, 1: string, 2?: string}> */ + public static function providerForMultiByte(): array + { + return [ + // Ideographic space (U+3000) is trimmed by default with mb_trim; written as an escape so it cannot be mistaken for U+0020 + 'chinese with ideographic space' => ["\u{3000}你好\u{3000}", '你好'], + 'japanese with ideographic space' => ["\u{3000}こんにちは\u{3000}", 'こんにちは'], + 'korean with ideographic space' => ["\u{3000}안녕하세요\u{3000}", '안녕하세요'], + // Custom mask for non-whitespace multibyte chars (U+FF41 is fullwidth "a") + 'fullwidth letters with custom mask' => ["\u{FF41}\u{FF41}hello\u{FF41}\u{FF41}", 'hello', "\u{FF41}"], + 'mixed cjk and ascii' => ["\u{3000}hello 你好\u{3000}", 'hello 你好'], + ]; + } + + /** @return array<string, array{string, string, string, string}> */ + public static function providerForEdgeCases(): array + { + return [ + 'empty string' => ['', '', 'both', ' '], + 'string shorter than mask' => ['a', '', 'both', 'abcdef'], + 'all characters trimmed' => ['--', '', 'both', '-'], + 'only one 
side trimmed left' => ['--a', 'a', 'left', '-'], + 'only one side trimmed right' => ['a--', 'a', 'right', '-'], + 'no characters to trim' => ['hello', 'hello', 'both', 'xyz'], + 'mask longer than string' => ['hello', 'hello', 'both', 'abcdefgzij'], + 'empty mask' => ['hello', 'hello', 'both', ''], + 'repeated characters' => ['aaaaahelloaaaaa', 'hello', 'both', 'a'], + 'interleaved characters' => ['ababhelloabab', 'hello', 'both', 'ab'], + ]; + } +}