git.localhorst.tv Git - alttp.git/commitdiff
revamp chatlib tokenization
author: Daniel Karbach <daniel.karbach@localhorst.tv>
Thu, 27 Jun 2024 17:01:36 +0000 (19:01 +0200)
committer: Daniel Karbach <daniel.karbach@localhorst.tv>
Thu, 27 Jun 2024 17:01:36 +0000 (19:01 +0200)
app/Console/Commands/ChatlibDatabase.php
app/Console/Commands/ChatlibGenerate.php
app/Console/Commands/ReevaluateChatCommand.php
app/Models/ChatLib.php
app/Models/ChatLog.php
app/TwitchBot/TokenizedMessage.php

index e492bb7c1169160fafea1c9a938aec7f5b0c8b86..9bf143f11e9956d188f81c4e241ba6a749e57876 100644 (file)
@@ -13,7 +13,7 @@ class ChatlibDatabase extends Command {
         *
         * @var string
         */
-       protected $signature = 'chatlib:database {which=de} {size=7}';
+       protected $signature = 'chatlib:database {which=de} {size=3}';
 
        /**
         * The console command description.
@@ -44,10 +44,13 @@ class ChatlibDatabase extends Command {
                                $query->whereNull('detected_language');
                                $query->orWhere('detected_language', '=', $lang);
                        })
-                       ->whereRaw('LENGTH(`text_content`) > 10')
+                       ->orderBy('channel_id')
+                       ->orderBy('created_at')
                        ->chunk(5000, function ($msgs) use (&$count, $db) {
+                               $previous = null;
                                foreach ($msgs as $msg) {
-                                       $db->addMessage($msg);
+                                       $db->addMessage($msg, $previous);
+                                       $previous = $msg;
                                        ++$count;
                                }
                                $this->line($count);
index 04c9e41b63a07887d4f7b78a753807995f2415d5..caa223d51495323e446246c871a3139d0bc825c6 100644 (file)
@@ -12,7 +12,7 @@ class ChatlibGenerate extends Command {
         *
         * @var string
         */
-       protected $signature = 'chatlib:generate {which=de} {amount=50}';
+       protected $signature = 'chatlib:generate {which=de} {amount=50} {context?}';
 
        /**
         * The console command description.
@@ -39,7 +39,7 @@ class ChatlibGenerate extends Command {
 
                $amount = intval($this->argument('amount'));
                for ($i = 0; $i < $amount; ++$i) {
-                       $this->line($db->generate());
+                       $this->line($db->generate($this->argument('context')));
                }
 
                return 0;
index b289bd0bde84b9a36ae92c9a75196f96cc547497..d87179cf665c256b1f16c285ff115a141f72452d 100644 (file)
@@ -32,12 +32,14 @@ class ReevaluateChatCommand extends Command {
                ChatLog::whereIn('type', ['chat', 'error'])
                        ->where('banned', false)
                        ->orderBy('created_at')
-                       ->chunk(10000, function ($logs) use (&$good, &$bad) {
+                       ->chunk(5000, function ($logs) use (&$good, &$bad) {
                                foreach ($logs as $line) {
                                        try {
                                                $line->evaluate();
-                                               $line->evaluated_at = now();
-                                               $line->save();
+                                               if ($line->isDirty()) {
+                                                       $line->evaluated_at = now();
+                                                       $line->save();
+                                               }
                                                ++$good;
                                        } catch (\Exception $e) {
                                                ++$bad;
index c7a19b6998d7acafc09e4fefb2b708ba8087f02f..d6a7ffcbbe7906bef5ae3e25b40cdf0e080cad71 100644 (file)
@@ -6,79 +6,65 @@ use Illuminate\Support\Facades\Storage;
 
 class ChatLib {
 
-       public function __construct($size = 6) {
+       public function __construct($size = 3) {
                $this->size = $size;
-
                $converted = [];
                foreach ($this->categories as $category => $patterns) {
                        $converted_patterns = [];
                        foreach ($patterns as $pattern) {
                                $converted_patterns[] = '/\b'.$pattern.'\b/u';
                        }
-                       $converted['%'.strtoupper($category).'%'] = $converted_patterns;
+                       $converted[strtoupper($category)] = $converted_patterns;
                }
                $this->categories = $converted;
        }
 
-       public function addMessage(ChatLog $msg) {
-               $this->addText($msg->text_content);
+       public function addMessage(ChatLog $msg, ChatLog $previous = null) {
+               if ($msg->isReply()) {
+                       $this->addText($msg->text_content, $msg->getReplyParent());
+               } else if (!is_null($previous)) {
+                       $this->addText($msg->text_content, $previous->text_content);
+               } else {
+                       $this->addText($msg->text_content);
+               }
        }
 
-       public function addText($text) {
+       public function addText($text, $context = '') {
                $tokens = $this->tokenize($text);
-               if (empty($tokens)) return;
-               $tokens[] = '';
-               foreach ($tokens as $num => $token) {
-                       if ($num === 0) {
-                               $this->addTransition([], $token);
-                       } else {
-                               $start = max(0, $num - $this->size - 1);
-                               $end = $num;
-                               for ($i = $start; $i < $end; ++$i) {
-                                       $this->addTransition(array_slice($tokens, $i, $end - $i), $token);
-                                       if ($end - $i < 5) break;
-                               }
+               for ($i = 0; $i < count($tokens) - $this->size; ++$i) {
+                       $this->addTransition(array_slice($tokens, $i, $this->size), $tokens[$i + $this->size]);
+               }
+               if (!empty($context)) {
+                       $tokens = $this->tokenizeWithContext($text, $context);
+                       $size = min($this->size - 1, count($tokens) - $this->size);
+                       for ($i = 0; $i < $size; ++$i) {
+                               $this->addTransition(array_slice($tokens, $i, $this->size), $tokens[$i + $this->size]);
                        }
-                       $this->addExample(array_slice($tokens, 0, $num), $token);
                }
        }
 
        public function compile() {
                foreach ($this->transitions as $key => $values) {
-                       $this->transitions[$key] = $this->index($values, 2);
-                       if (empty($this->transitions[$key])) {
-                               unset($this->transitions[$key]);
-                       }
-               }
-               foreach ($this->examples as $key => $values) {
-                       if (in_array($key, ['', ' '])) {
-                               unset($this->examples[$key]);
-                               continue;
-                       }
-                       $this->examples[$key] = $this->index($values, 1);
-                       if (empty($this->examples[$key]) || (count($this->examples[$key]) === 1 && $this->examples[$key][0][0] === $key)) {
-                               unset($this->examples[$key]);
-                       }
+                       $this->transitions[$key] = $this->index($values);
                }
        }
 
-       public function generate($limit = 100) {
-               $tokens = [''];
-               $generated = '';
-               while (strlen($generated) < $limit) {
-                       $next = $this->randomNext($tokens);
-                       if ($next === '') break;
-                       $tokens[] = $next;
-                       $generated .= $next;
+       public function generate($context = null) {
+               if (!is_null($context)) {
+                       $tokens = $this->tokenizeWithContext('', $context);
+                       $generated = $this->loop($tokens);
+                       if (!empty($generated)) {
+                               return $generated;
+                       }
                }
-               return $generated;
+               $tokens = $this->tokenize('');
+               return $this->loop($tokens);
        }
 
        public function saveAs($name) {
                $data = [
                        'size' => $this->size,
                        'transitions' => $this->transitions,
-                       'examples' => $this->examples,
                ];
                Storage::disk('chatlib')->put($name.'.json', json_encode($data));
        }
@@ -87,14 +73,12 @@ class ChatLib {
                $data = json_decode(Storage::disk('chatlib')->get($name.'.json'), true);
                $this->size = $data['size'];
                $this->transitions = $data['transitions'];
-               $this->examples = $data['examples'];
        }
 
-       private function index($arr, $min_weight = 2) {
+       private function index($arr) {
                $result = [];
                $sum = 0;
                foreach ($arr as $key => $weight) {
-                       if ($weight < $min_weight) continue;
                        $lower = $sum;
                        $sum += $weight;
                        $result[] = [$key, $lower, $sum];
@@ -102,18 +86,20 @@ class ChatLib {
                return $result;
        }
 
-       private function randomNext($tokens) {
-               $cnt = count($tokens);
-               for ($size = min($this->size, $cnt); $size > 0; --$size) {
-                       $cmb = $this->generalize(array_slice($tokens, -$size));
-                       if (isset($this->transitions[$cmb])) {
-                               $pick = $this->pick($this->transitions[$cmb]);
-                               if (!is_null($pick)) {
-                                       return $this->exampleOf($pick, $tokens);
-                               }
-                       }
+       private function loop($tokens) {
+               while (count($tokens) < 50) {
+                       $next = $this->randomNext($tokens);
+                       if ($next === ' ') break;
+                       $tokens[] = $next;
                }
-               return '';
+               return $this->untokenize($tokens);
+       }
+
+       private function randomNext($tokens) {
+               $key = $this->makeKey($tokens);
+               if (!isset($this->transitions[$key])) return ' ';
+               $pick = $this->pick($this->transitions[$key]);
+               return $pick[0];
        }
 
        private function pick($options) {
@@ -142,84 +128,62 @@ class ChatLib {
                return $options[$min_index];
        }
 
-       private function addTransition($state, $next) {
-               $ctx = $this->generalize($state);
-               $cmb = $this->generalize([$next]);
-               if (!isset($this->transitions[$ctx])) {
-                       $this->transitions[$ctx] = [];
+       private function addTransition($tokens, $next) {
+               $key = $this->makeKey($tokens);
+               if (!isset($this->transitions[$key])) {
+                       $this->transitions[$key] = [];
                }
-               if (!isset($this->transitions[$ctx][$cmb])) {
-                       $this->transitions[$ctx][$cmb] = 1;
+               if (!isset($this->transitions[$key][$next])) {
+                       $this->transitions[$key][$next] = 1;
                } else {
-                       ++$this->transitions[$ctx][$cmb];
+                       ++$this->transitions[$key][$next];
                }
        }
 
-       private function addExample($context, $token) {
-               $cmb = $this->generalize([$token]);
-               if (!isset($this->examples[$cmb])) {
-                       $this->examples[$cmb] = [];
-               }
-               if (!isset($this->examples[$cmb][$token])) {
-                       $this->examples[$cmb][$token] = 1;
-               } else {
-                       ++$this->examples[$cmb][$token];
+       private function splitText($text) {
+               if (trim($text) === '') return [];
+               return preg_split('/\s+/u', $text);
+       }
+
+       private function makeKey($tokens) {
+               $key = $this->joinText(array_slice($tokens, $this->size * -1));
+               $key = mb_strtolower($key);
+               $key = str_replace(['.', ',', ':', ';', '!', '?', '^', '+', '-', '"', "'", '(', ')', '[', ']'], '', $key);
+               $key = preg_replace('/\d+/u', '0', $key);
+               foreach ($this->categories as $category => $patterns) {
+                       $key = preg_replace($patterns, $category, $key);
                }
+               return $key;
        }
 
-       private function tokenize($str) {
-               return array_values(array_filter(preg_split('/\b/u', $str), function($token) {
-                       if ($token === '') return false;
-                       if (preg_match('/cheer\d+/u', strtolower($token))) return false;
-                       return true;
-               }));
+       private function joinText($tokens) {
+               return implode(' ', $tokens);
        }
 
-       private function generalize($tokens) {
-               $str = '';
-               foreach ($tokens as $token) {
-                       $replaced = preg_replace('/\d+/u', '0', $token);
-                       $replaced = preg_replace('/\s+/u', ' ', $replaced);
-                       $replaced = preg_replace('/(.)\1{2,}/u', '$1$1', $replaced);
-                       $replaced = strtolower($replaced);
-                       foreach ($this->aliases as $canonical => $variants) {
-                               if (in_array($replaced, $variants)) {
-                                       $replaced = $canonical;
-                                       break;
-                               }
-                               if ($replaced === $canonical) {
-                                       break;
-                               }
-                       }
-                       $str .= $replaced;
-               }
-               foreach ($this->categories as $category => $patterns) {
-                       $str = preg_replace($patterns, $category, $str);
-               }
-               return $str;
+       private function untokenize($tokens) {
+               return $this->joinText(array_slice($tokens, $this->size));
        }
 
-       private function exampleOf($pick, $context) {
-               if (!isset($this->examples[$pick[0]])) {
-                       return $pick[0];
+       private function tokenize($text) {
+               $tokens = $this->splitText($text);
+               $combined = array_merge(array_fill(0, $this->size, ' '), $tokens);
+               if (!empty($tokens)) {
+                       $combined[] = ' ';
                }
-               if (isset($this->examples[$pick[0]])) {
-                       $example = $this->pick($this->examples[$pick[0]]);
-                       return $example[0];
+               return $combined;
+       }
+
+       private function tokenizeWithContext($text, $context) {
+               $combined = $this->tokenize($text);
+               $context_tokens = array_slice($this->splitText($context), $this->size * -1 + 1);
+               for ($i = 0; $i < count($context_tokens); ++$i) {
+                       $combined[$this->size - $i - 2] = $context_tokens[count($context_tokens) - $i - 1];
                }
-               return $pick[0];
+               return $combined;
        }
 
        private $size;
        private $transitions = [];
-       private $examples = [];
-
-       private $aliases = [
-               'chest' => ['kiste'],
-               'einen' => ['n', 'nen'],
-               'musik' => ['mukke'],
-               'schade' => ['schad', 'schaade'],
-       ];
 
        private $categories = [
                'fail' => [
index 2d43b2f08b008e3684e0b9c0ed6fb0532f8194af..cad1bfead6668070b239e5b483f1d3fe199f5caf 100644 (file)
@@ -25,8 +25,24 @@ class ChatLog extends Model {
                return TokenizedMessage::fromLog($this);
        }
 
+       public function isReply() {
+               return !empty($this->tags['reply-parent-msg-body']);
+       }
+
+       public function getReplyParent() {
+               return str_replace('\\s', ' ', $this->tags['reply-parent-msg-body']);
+       }
+
+       public function getReplyParentUser() {
+               return $this->tags['reply-parent-display-name'];
+       }
+
+       public function getText() {
+               return $this->params[1];
+       }
+
        public function getTextWithoutEmotes() {
-               $text = $this->text_content;
+               $text = $this->params[1];
                if (isset($this->tags['emotes']) && !empty($this->tags['emotes'])) {
                        $emotes = explode('/', $this->tags['emotes']);
                        foreach ($emotes as $emote) {
@@ -41,6 +57,13 @@ class ChatLog extends Model {
                return trim(preg_replace('/\s+/', ' ', $text));
        }
 
+       public function getTextWithoutReply() {
+               if ($this->isReply()) {
+                       return mb_substr($this->params[1], mb_strlen($this->getReplyParentUser()) + 2);
+               }
+               return $this->params[1];
+       }
+
        public function evaluate() {
                $this->evaluateUser();
                $this->evaluateChannel();
@@ -62,7 +85,7 @@ class ChatLog extends Model {
                        } else {
                                $this->type = 'dm';
                        }
-                       $this->text_content = $this->params[1];
+                       $this->text_content = $this->getTextWithoutReply();
                        $this->detectLanguage();
                        $tokenized = $this->tokenize();
                        if ($tokenized->isSpammy()) {
index b460020a3e2e2a42a5683cf7ba78c41f1e01792b..72817c1dbc6066556f77e134684e74137d64f863 100644 (file)
@@ -11,10 +11,13 @@ class TokenizedMessage {
        public function __construct($text, $tags = []) {
                $this->text = trim($text);
                $this->tags = $tags;
+               if (isset($tags['reply-parent-display-name'])) {
+                       $this->text = mb_substr($text, mb_strlen($tags['reply-parent-display-name']) + 2);
+               }
                $this->raw = strtolower(preg_replace('/[^\w]/u', '', $this->text));
                $this->tokens = array_values(array_map('trim', array_filter(preg_split('/\b/u', strtolower($this->text)))));
 
-               $this->emoteless = $this->text;
+               $this->emoteless = $text;
                if (isset($this->tags['emotes']) && !empty($this->tags['emotes'])) {
                        $emotes = explode('/', $this->tags['emotes']);
                        foreach ($emotes as $emote) {
@@ -253,7 +256,7 @@ class TokenizedMessage {
                if ($this->contains(['€', '$', '@', '://'])) {
                        return true;
                }
-               if ($this->containsRaw(['followers', 'promotion', 'viewers'])) {
+               if ($this->containsRaw(['follow', 'promotion', 'viewer'])) {
                        return true;
                }
                if ($this->containsRaw('horsti')) {