From 72253eedb80a978c5442621049cc0f28b7182687 Mon Sep 17 00:00:00 2001 From: Daniel Karbach Date: Wed, 8 May 2024 21:47:28 +0200 Subject: [PATCH] multibyte handling in emote tokenizer --- app/TwitchBot/TokenizedMessage.php | 10 ++++------ tests/Unit/TwitchBot/TokenizedMessageTest.php | 3 ++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/app/TwitchBot/TokenizedMessage.php b/app/TwitchBot/TokenizedMessage.php index b32ba72..37836a9 100644 --- a/app/TwitchBot/TokenizedMessage.php +++ b/app/TwitchBot/TokenizedMessage.php @@ -22,15 +22,13 @@ class TokenizedMessage { $positions = explode(',', $set[1]); foreach ($positions as $position) { $coords = explode('-', $position); - $this->emotes[] = preg_replace('/\d+$/', '', strtolower(substr($this->text, $coords[0], $coords[1] - $coords[0] + 1))); - for ($i = intval($coords[0]); $i <= intval($coords[1]); ++$i) { - $this->emoteless[$i] = ' '; - } + $this->emotes[] = preg_replace('/\d+$/', '', strtolower(mb_substr($this->text, $coords[0], $coords[1] - $coords[0] + 1))); + $this->emoteless = mb_substr($this->emoteless, 0, $coords[0]).str_repeat(' ', $coords[1] - $coords[0] + 1).mb_substr($this->emoteless, $coords[1] + 1); } } - $this->emoteless = trim(preg_replace('/\s+/', ' ', $this->emoteless)); + $this->emoteless = trim(preg_replace('/\s+/u', ' ', $this->emoteless)); } - $this->emoteless_raw = strtolower(preg_replace('/[^\w]/', '', $this->emoteless)); + $this->emoteless_raw = strtolower(preg_replace('/[^\w]/u', '', $this->emoteless)); $this->emoteless_tokens = array_values(array_map('trim', array_filter(preg_split('/\b/u', strtolower($this->emoteless))))); } diff --git a/tests/Unit/TwitchBot/TokenizedMessageTest.php b/tests/Unit/TwitchBot/TokenizedMessageTest.php index 1af6bb2..832b970 100644 --- a/tests/Unit/TwitchBot/TokenizedMessageTest.php +++ b/tests/Unit/TwitchBot/TokenizedMessageTest.php @@ -34,6 +34,7 @@ class TokenizedMessageTest extends TestCase { $this->assertNotEquals('hi', TokenizedMessage::fromString('hier steht was')->classify()); $this->assertEquals('hype', TokenizedMessage::fromString('122 Hype!')->classify()); + $this->assertEquals('hype', TokenizedMessage::fromString('Sarühlalü KomodoHype', ['emotes' => '81273:10-19'])->classify()); $this->assertEquals('kappa', TokenizedMessage::fromString('Kappa', ['emotes' => 'blah:0-4'])->classify()); $this->assertEquals('kappa', TokenizedMessage::fromString('KappaClaus', ['emotes' => 'blah:0-9'])->classify()); @@ -72,7 +73,7 @@ class TokenizedMessageTest extends TestCase { $this->assertNotEquals('wtf', TokenizedMessage::fromString('ein waterwalk aufgesetzt')->classify()); $this->assertEquals('yes', TokenizedMessage::fromString('ja geht SeemsGood')->classify()); - $this->assertEquals('yes', TokenizedMessage::fromString('also ich würde sagen ja LUL', ['emotes' => 'blah:25-27'])->classify()); + $this->assertEquals('yes', TokenizedMessage::fromString('also ich würde sagen ja LUL', ['emotes' => 'blah:24-26'])->classify()); $this->assertNotEquals('yes', TokenizedMessage::fromString('find ich ja gut')->classify()); $this->assertEquals('unclassified', TokenizedMessage::fromString('')->classify()); -- 2.39.2