From: Daniel Karbach Date: Thu, 27 Jun 2024 17:01:36 +0000 (+0200) Subject: revamp chatlib tokenization X-Git-Url: https://git.localhorst.tv/?a=commitdiff_plain;h=1dea58cb6fa9cf28966e75c1e1af87f67e6c0fd1;p=alttp.git revamp chatlib tokenization --- diff --git a/app/Console/Commands/ChatlibDatabase.php b/app/Console/Commands/ChatlibDatabase.php index e492bb7..9bf143f 100644 --- a/app/Console/Commands/ChatlibDatabase.php +++ b/app/Console/Commands/ChatlibDatabase.php @@ -13,7 +13,7 @@ class ChatlibDatabase extends Command { * * @var string */ - protected $signature = 'chatlib:database {which=de} {size=7}'; + protected $signature = 'chatlib:database {which=de} {size=3}'; /** * The console command description. @@ -44,10 +44,13 @@ class ChatlibDatabase extends Command { $query->whereNull('detected_language'); $query->orWhere('detected_language', '=', $lang); }) - ->whereRaw('LENGTH(`text_content`) > 10') + ->orderBy('channel_id') + ->orderBy('created_at') ->chunk(5000, function ($msgs) use (&$count, $db) { + $previous = null; foreach ($msgs as $msg) { - $db->addMessage($msg); + $db->addMessage($msg, $previous); + $previous = $msg; ++$count; } $this->line($count); diff --git a/app/Console/Commands/ChatlibGenerate.php b/app/Console/Commands/ChatlibGenerate.php index 04c9e41..caa223d 100644 --- a/app/Console/Commands/ChatlibGenerate.php +++ b/app/Console/Commands/ChatlibGenerate.php @@ -12,7 +12,7 @@ class ChatlibGenerate extends Command { * * @var string */ - protected $signature = 'chatlib:generate {which=de} {amount=50}'; + protected $signature = 'chatlib:generate {which=de} {amount=50} {context?}'; /** * The console command description. @@ -39,7 +39,7 @@ class ChatlibGenerate extends Command { $amount = intval($this->argument('amount')); for ($i = 0; $i < $amount; ++$i) { - $this->line($db->generate()); + $this->line($db->generate($this->argument('context'))); } return 0; diff --git a/app/Console/Commands/ReevaluateChatCommand.php b/app/Console/Commands/ReevaluateChatCommand.php index b289bd0..d87179c 100644 --- a/app/Console/Commands/ReevaluateChatCommand.php +++ b/app/Console/Commands/ReevaluateChatCommand.php @@ -32,12 +32,14 @@ class ReevaluateChatCommand extends Command { ChatLog::whereIn('type', ['chat', 'error']) ->where('banned', false) ->orderBy('created_at') - ->chunk(10000, function ($logs) use (&$good, &$bad) { + ->chunk(5000, function ($logs) use (&$good, &$bad) { foreach ($logs as $line) { try { $line->evaluate(); - $line->evaluated_at = now(); - $line->save(); + if ($line->isDirty()) { + $line->evaluated_at = now(); + $line->save(); + } ++$good; } catch (\Exception $e) { ++$bad; diff --git a/app/Models/ChatLib.php b/app/Models/ChatLib.php index c7a19b6..d6a7ffc 100644 --- a/app/Models/ChatLib.php +++ b/app/Models/ChatLib.php @@ -6,79 +6,65 @@ use Illuminate\Support\Facades\Storage; class ChatLib { - public function __construct($size = 6) { + public function __construct($size = 3) { $this->size = $size; - $converted = []; foreach ($this->categories as $category => $patterns) { $converted_patterns = []; foreach ($patterns as $pattern) { $converted_patterns[] = '/\b'.$pattern.'\b/u'; } - $converted['%'.strtoupper($category).'%'] = $converted_patterns; + $converted[strtoupper($category)] = $converted_patterns; } $this->categories = $converted; } - public function addMessage(ChatLog $msg) { - $this->addText($msg->text_content); + public function addMessage(ChatLog $msg, ChatLog $previous = null) { + if ($msg->isReply()) { + $this->addText($msg->text_content, $msg->getReplyParent()); + } else if (!is_null($previous)) { + $this->addText($msg->text_content, $previous->text_content); + } else { + $this->addText($msg->text_content); + } } - public function addText($text) { + public function addText($text, $context = '') { $tokens = $this->tokenize($text); - if (empty($tokens)) return; - $tokens[] = ''; - foreach ($tokens as $num => $token) { - if ($num === 0) { - $this->addTransition([], $token); - } else { - $start = max(0, $num - $this->size - 1); - $end = $num; - for ($i = $start; $i < $end; ++$i) { - $this->addTransition(array_slice($tokens, $i, $end - $i), $token); - if ($end - $i < 5) break; - } + for ($i = 0; $i < count($tokens) - $this->size; ++$i) { + $this->addTransition(array_slice($tokens, $i, $this->size), $tokens[$i + $this->size]); + } + if (!empty($context)) { + $tokens = $this->tokenizeWithContext($text, $context); + $size = min($this->size - 1, count($tokens) - $this->size); + for ($i = 0; $i < $size; ++$i) { + $this->addTransition(array_slice($tokens, $i, $this->size), $tokens[$i + $this->size]); } - $this->addExample(array_slice($tokens, 0, $num), $token); } } public function compile() { foreach ($this->transitions as $key => $values) { - $this->transitions[$key] = $this->index($values, 2); - if (empty($this->transitions[$key])) { - unset($this->transitions[$key]); - } - } - foreach ($this->examples as $key => $values) { - if (in_array($key, ['', ' '])) { - unset($this->examples[$key]); - continue; - } - $this->examples[$key] = $this->index($values, 1); - if (empty($this->examples[$key]) || (count($this->examples[$key]) === 1 && $this->examples[$key][0][0] === $key)) { - unset($this->examples[$key]); - } + $this->transitions[$key] = $this->index($values); } } - public function generate($limit = 100) { - $tokens = ['']; - $generated = ''; - while (strlen($generated) < $limit) { - $next = $this->randomNext($tokens); - if ($next === '') break; - $tokens[] = $next; - $generated .= $next; + public function generate($context = null) { + if (!is_null($context)) { + $tokens = $this->tokenizeWithContext('', $context); + $generated = $this->loop($tokens); + if (!empty($generated)) { + return $generated; + } } - return $generated; + $tokens = $this->tokenize(''); + return $this->loop($tokens); } public function saveAs($name) { $data = [ 'size' => $this->size, 'transitions' => $this->transitions, - 'examples' => $this->examples, ]; Storage::disk('chatlib')->put($name.'.json', json_encode($data)); } @@ -87,14 +73,12 @@ class ChatLib { $data = json_decode(Storage::disk('chatlib')->get($name.'.json'), true); $this->size = $data['size']; $this->transitions = $data['transitions']; - $this->examples = $data['examples']; } - private function index($arr, $min_weight = 2) { + private function index($arr) { $result = []; $sum = 0; foreach ($arr as $key => $weight) { - if ($weight < $min_weight) continue; $lower = $sum; $sum += $weight; $result[] = [$key, $lower, $sum]; @@ -102,18 +86,20 @@ class ChatLib { return $result; } - private function randomNext($tokens) { - $cnt = count($tokens); - for ($size = min($this->size, $cnt); $size > 0; --$size) { - $cmb = $this->generalize(array_slice($tokens, -$size)); - if (isset($this->transitions[$cmb])) { - $pick = $this->pick($this->transitions[$cmb]); - if (!is_null($pick)) { - return $this->exampleOf($pick, $tokens); - } - } + private function loop($tokens) { + while (count($tokens) < 50) { + $next = $this->randomNext($tokens); + if ($next === ' ') break; + $tokens[] = $next; } - return ''; + return $this->untokenize($tokens); + } + + private function randomNext($tokens) { + $key = $this->makeKey($tokens); + if (!isset($this->transitions[$key])) return ' '; + $pick = $this->pick($this->transitions[$key]); + return $pick[0]; } private function pick($options) { @@ -142,84 +128,62 @@ class ChatLib { return $options[$min_index]; } - private function addTransition($state, $next) { - $ctx = $this->generalize($state); - $cmb = $this->generalize([$next]); - if (!isset($this->transitions[$ctx])) { - $this->transitions[$ctx] = []; + private function addTransition($tokens, $next) { + $key = $this->makeKey($tokens); + if (!isset($this->transitions[$key])) { + $this->transitions[$key] = []; } - if (!isset($this->transitions[$ctx][$cmb])) { - $this->transitions[$ctx][$cmb] = 1; + if (!isset($this->transitions[$key][$next])) { + $this->transitions[$key][$next] = 1; } else { - ++$this->transitions[$ctx][$cmb]; + ++$this->transitions[$key][$next]; } } - private function addExample($context, $token) { - $cmb = $this->generalize([$token]); - if (!isset($this->examples[$cmb])) { - $this->examples[$cmb] = []; - } - if (!isset($this->examples[$cmb][$token])) { - $this->examples[$cmb][$token] = 1; - } else { - ++$this->examples[$cmb][$token]; + private function splitText($text) { + if (trim($text) === '') return []; + return preg_split('/\s+/u', $text); + } + + private function makeKey($tokens) { + $key = $this->joinText(array_slice($tokens, $this->size * -1)); + $key = mb_strtolower($key); + $key = str_replace(['.', ',', ':', ';', '!', '?', '^', '+', '-', '"', "'", '(', ')', '[', ']'], '', $key); + $key = preg_replace('/\d+/u', '0', $key); + foreach ($this->categories as $category => $patterns) { + $key = preg_replace($patterns, $category, $key); } + return $key; } - private function tokenize($str) { - return array_values(array_filter(preg_split('/\b/u', $str), function($token) { - if ($token === '') return false; - if (preg_match('/cheer\d+/u', strtolower($token))) return false; - return true; - })); + private function joinText($tokens) { + return implode(' ', $tokens); } - private function generalize($tokens) { - $str = ''; - foreach ($tokens as $token) { - $replaced = preg_replace('/\d+/u', '0', $token); - $replaced = preg_replace('/\s+/u', ' ', $replaced); - $replaced = preg_replace('/(.)\1{2,}/u', '$1$1', $replaced); - $replaced = strtolower($replaced); - foreach ($this->aliases as $canonical => $variants) { - if (in_array($replaced, $variants)) { - $replaced = $canonical; - break; - } - if ($replaced === $canonical) { - break; - } - } - $str .= $replaced; - } - foreach ($this->categories as $category => $patterns) { - $str = preg_replace($patterns, $category, $str); - } - return $str; + private function untokenize($tokens) { + return $this->joinText(array_slice($tokens, $this->size)); } - private function exampleOf($pick, $context) { - if (!isset($this->examples[$pick[0]])) { - return $pick[0]; + private function tokenize($text) { + $tokens = $this->splitText($text); + $combined = array_merge(array_fill(0, $this->size, ' '), $tokens); + if (!empty($tokens)) { + $combined[] = ' '; } - if (isset($this->examples[$pick[0]])) { - $example = $this->pick($this->examples[$pick[0]]); - return $example[0]; + return $combined; + } + + private function tokenizeWithContext($text, $context) { + $combined = $this->tokenize($text); + $context_tokens = array_slice($this->splitText($context), $this->size * -1 + 1); + for ($i = 0; $i < count($context_tokens); ++$i) { + $combined[$this->size - $i - 2] = $context_tokens[count($context_tokens) - $i - 1]; } - return $pick[0]; + return $combined; } private $size; private $transitions = []; - private $examples = []; - - private $aliases = [ - 'chest' => ['kiste'], - 'einen' => ['n', 'nen'], - 'musik' => ['mukke'], - 'schade' => ['schad', 'schaade'], - ]; private $categories = [ 'fail' => [ diff --git a/app/Models/ChatLog.php b/app/Models/ChatLog.php index 2d43b2f..cad1bfe 100644 --- a/app/Models/ChatLog.php +++ b/app/Models/ChatLog.php @@ -25,8 +25,24 @@ class ChatLog extends Model { return TokenizedMessage::fromLog($this); } + public function isReply() { + return !empty($this->tags['reply-parent-msg-body']); + } + + public function getReplyParent() { + return str_replace('\\s', ' ', $this->tags['reply-parent-msg-body']); + } + + public function getReplyParentUser() { + return $this->tags['reply-parent-display-name']; + } + + public function getText() { + return $this->params[1]; + } + public function getTextWithoutEmotes() { - $text = $this->text_content; + $text = $this->params[1]; if (isset($this->tags['emotes']) && !empty($this->tags['emotes'])) { $emotes = explode('/', $this->tags['emotes']); foreach ($emotes as $emote) { @@ -41,6 +57,13 @@ class ChatLog extends Model { return trim(preg_replace('/\s+/', ' ', $text)); } + public function getTextWithoutReply() { + if ($this->isReply()) { + return mb_substr($this->params[1], mb_strlen($this->getReplyParentUser()) + 2); + } + return $this->params[1]; + } + public function evaluate() { $this->evaluateUser(); $this->evaluateChannel(); @@ -62,7 +85,7 @@ class ChatLog extends Model { } else { $this->type = 'dm'; } - $this->text_content = $this->params[1]; + $this->text_content = $this->getTextWithoutReply(); $this->detectLanguage(); $tokenized = $this->tokenize(); if ($tokenized->isSpammy()) { diff --git a/app/TwitchBot/TokenizedMessage.php b/app/TwitchBot/TokenizedMessage.php index b460020..72817c1 100644 --- a/app/TwitchBot/TokenizedMessage.php +++ b/app/TwitchBot/TokenizedMessage.php @@ -11,10 +11,13 @@ class TokenizedMessage { public function __construct($text, $tags = []) { $this->text = trim($text); $this->tags = $tags; + if (isset($tags['reply-parent-display-name'])) { + $this->text = mb_substr($text, mb_strlen($tags['reply-parent-display-name']) + 2); + } $this->raw = strtolower(preg_replace('/[^\w]/u', '', $this->text)); $this->tokens = array_values(array_map('trim', array_filter(preg_split('/\b/u', strtolower($this->text))))); - $this->emoteless = $this->text; + $this->emoteless = $text; if (isset($this->tags['emotes']) && !empty($this->tags['emotes'])) { $emotes = explode('/', $this->tags['emotes']); foreach ($emotes as $emote) { @@ -253,7 +256,7 @@ class TokenizedMessage { if ($this->contains(['€', '$', '@', '://'])) { return true; } - if ($this->containsRaw(['followers', 'promotion', 'viewers'])) { + if ($this->containsRaw(['follow', 'promotion', 'viewer'])) { return true; } if ($this->containsRaw('horsti')) {