From c2cc99020eee56f7790d1358abb44df078f2e655 Mon Sep 17 00:00:00 2001 From: Daniel Karbach Date: Wed, 8 May 2024 13:09:05 +0200 Subject: [PATCH] try to improve message generation --- app/Console/Commands/ChatlibDatabase.php | 24 ++- app/Models/ChatLib.php | 209 +++++------------------ 2 files changed, 57 insertions(+), 176 deletions(-) diff --git a/app/Console/Commands/ChatlibDatabase.php b/app/Console/Commands/ChatlibDatabase.php index cb7978f..bfe0135 100644 --- a/app/Console/Commands/ChatlibDatabase.php +++ b/app/Console/Commands/ChatlibDatabase.php @@ -28,21 +28,35 @@ class ChatlibDatabase extends Command { * @return int */ public function handle() { - $db = new ChatLib(); + $de = new ChatLib(); + $en = new ChatLib(); ChatLog::where('type', '=', 'chat') ->where('banned', '=', false) ->whereNotNull('evaluated_at') - ->chunk(500, function ($msgs) use ($db) { + ->chunk(5000, function ($msgs) use ($de, $en) { foreach ($msgs as $msg) { - $db->addMessage($msg); + if ($msg->detected_language === 'de') { + $de->addMessage($msg); + } else if ($msg->detected_language === 'en') { + $en->addMessage($msg); + } else if (is_null($msg->detected_language)) { + $de->addMessage($msg); + $en->addMessage($msg); + } } }); - $db->compile(); + $de->compile(); + $en->compile(); + $this->line(''); for ($i = 0; $i < 50; ++$i) { - $this->line($db->generate()); + $this->line($de->generate()); + } + $this->line(''); + for ($i = 0; $i < 50; ++$i) { + $this->line($en->generate()); } return 0; diff --git a/app/Models/ChatLib.php b/app/Models/ChatLib.php index 8134d13..417b18f 100644 --- a/app/Models/ChatLib.php +++ b/app/Models/ChatLib.php @@ -5,75 +5,41 @@ namespace App\Models; class ChatLib { public function addMessage($msg) { - $tokens = array_values(array_filter(preg_split('/\s+/u', $msg->text_content))); + $tokens = $this->tokenize($msg->text_content); if (empty($tokens)) return; - $tokens [] = ''; + $tokens[] = ''; foreach ($tokens as $num => $token) { if ($num === 0) { - 
$this->addStart($token); - } - if ($num > 0) { - $this->addOne($tokens[$num - 1], $token); - } - if ($num > 1) { - $this->addTwo($tokens[$num - 2], $tokens[$num - 1], $token); - } - if ($num > 2) { - $this->addThree($tokens[$num - 3], $tokens[$num - 2], $tokens[$num - 1], $token); - } - if ($num > 3) { - $this->addFour($tokens[$num - 4], $tokens[$num - 3], $tokens[$num - 2], $tokens[$num - 1], $token); - } - if ($num > 4) { - $this->addFive($tokens[$num - 5], $tokens[$num - 4], $tokens[$num - 3], $tokens[$num - 2], $tokens[$num - 1], $token); + $this->addTransition([], $token); + } else { + $start = max(0, $num - $this->size); + $end = $num; + for ($i = $start; $i < $end; ++$i) { + $this->addTransition(array_slice($tokens, $i, $end - $i), $token); + if ($end - $i < 3) break; + } } } } public function compile() { - $this->start = $this->index($this->start); - foreach ($this->one as $key => $value) { - $this->one[$key] = $this->index($this->one[$key]); - if (empty($this->one[$key])) { - unset($this->one[$key]); - } - } - foreach ($this->two as $key => $value) { - $this->two[$key] = $this->index($this->two[$key]); - if (empty($this->two[$key])) { - unset($this->two[$key]); - } - } - foreach ($this->three as $key => $value) { - $this->three[$key] = $this->index($this->three[$key]); - if (empty($this->three[$key])) { - unset($this->three[$key]); - } - } - foreach ($this->four as $key => $value) { - $this->four[$key] = $this->index($this->four[$key]); - if (empty($this->four[$key])) { - unset($this->four[$key]); - } - } - foreach ($this->five as $key => $value) { - $this->five[$key] = $this->index($this->five[$key]); - if (empty($this->five[$key])) { - unset($this->five[$key]); + foreach ($this->transitions as $key => $value) { + $this->transitions[$key] = $this->index($this->transitions[$key]); + if (empty($this->transitions[$key])) { + unset($this->transitions[$key]); } } + echo 'size: ', number_format(strlen(json_encode($this->transitions)), 0), PHP_EOL; } - public 
function generate($limit = 75) { + public function generate($limit = 100) { $tokens = []; - $start = $this->randomStart(); - $tokens[] = $start; - $generated = $start; + $generated = ''; while (strlen($generated) < $limit) { $next = $this->randomNext($tokens); if (empty($next)) break; $tokens[] = $next; - $generated .= ' '.$next; + $generated .= $next; } return $generated; } @@ -85,7 +51,7 @@ class ChatLib { $weight = $entry['count']; if ($weight == 1) continue; $lower = $sum; - $sum += intval(pow($weight, 1.4)); + $sum += $weight; $examples = []; if (is_array(end($entry['examples']))) { // already processed @@ -103,79 +69,18 @@ class ChatLib { return $result; } - private function randomStart() { - $pick = $this->pick($this->start); - if (is_null($pick)) return ''; - return $this->exampleOf($pick); - } - private function randomNext($tokens) { $cnt = count($tokens); - $picks = []; - if ($cnt >= 5) { - $cmb = $this->generalize(array_slice($tokens, $cnt - 5, 5)); - if (isset($this->five[$cmb])) { - $pick = $this->pick($this->five[$cmb]); - if (!is_null($pick)) { - $picks[$pick[0]] = [ - 'count' => 10, - 'examples' => $pick[3], - ]; - } - } - } - if ($cnt >= 4) { - $cmb = $this->generalize(array_slice($tokens, $cnt - 4, 4)); - if (isset($this->four[$cmb])) { - $pick = $this->pick($this->four[$cmb]); - if (!is_null($pick)) { - $picks[$pick[0]] = [ - 'count' => 12, - 'examples' => $pick[3], - ]; - } - } - } - if ($cnt >= 3) { - $cmb = $this->generalize(array_slice($tokens, $cnt - 3, 3)); - if (isset($this->three[$cmb])) { - $pick = $this->pick($this->three[$cmb]); + for ($size = min($this->size, $cnt); $size >= 0; --$size) { + $cmb = $this->generalize(array_slice($tokens, $cnt - $size, $size)); + if (isset($this->transitions[$cmb])) { + $pick = $this->pick($this->transitions[$cmb]); if (!is_null($pick)) { - $picks[$pick[0]] = [ - 'count' => 14, - 'examples' => $pick[3], - ]; + return $this->exampleOf($pick); } } } - if ($cnt >= 2) { - $cmb = 
$this->generalize(array_slice($tokens, $cnt - 2, 2)); - if (isset($this->two[$cmb])) { - $pick = $this->pick($this->two[$cmb]); - if (!is_null($pick)) { - $picks[$pick[0]] = [ - 'count' => 4, - 'examples' => $pick[3], - ]; - } - } - } - if ($cnt >= 1) { - $cmb = $this->generalize(array_slice($tokens, $cnt - 1, 1)); - if (isset($this->one[$cmb])) { - $pick = $this->pick($this->one[$cmb]); - if (!is_null($pick)) { - $picks[$pick[0]] = [ - 'count' => 2, - 'examples' => $pick[3], - ]; - } - } - } - if (empty($picks)) return ''; - $picks = $this->index($picks); - $pick = $this->pick($picks); - return $this->exampleOf($pick); + return ''; } private function pick($options) { @@ -200,49 +105,12 @@ class ChatLib { return $options[$min_index]; } - private function addStart($token) { - if (empty($token)) return; - $this->increment($this->start, $token); - } - - private function addOne($one, $token) { - $cmb = $this->generalize([$one]); - if (!isset($this->one[$cmb])) { - $this->one[$cmb] = []; - } - $this->increment($this->one[$cmb], $token); - } - - private function addTwo($one, $two, $token) { - $cmb = $this->generalize([$one, $two]); - if (!isset($this->two[$cmb])) { - $this->two[$cmb] = []; - } - $this->increment($this->two[$cmb], $token); - } - - private function addThree($one, $two, $three, $token) { - $cmb = $this->generalize([$one, $two, $three]); - if (!isset($this->three[$cmb])) { - $this->three[$cmb] = []; - } - $this->increment($this->three[$cmb], $token); - } - - private function addFour($one, $two, $three, $four, $token) { - $cmb = $this->generalize([$one, $two, $three, $four]); - if (!isset($this->four[$cmb])) { - $this->four[$cmb] = []; - } - $this->increment($this->four[$cmb], $token); - } - - private function addFive($one, $two, $three, $four, $five, $token) { - $cmb = $this->generalize([$one, $two, $three, $four, $five]); - if (!isset($this->five[$cmb])) { - $this->five[$cmb] = []; + private function addTransition($state, $next) { + $cmb = 
$this->generalize($state); + if (!isset($this->transitions[$cmb])) { + $this->transitions[$cmb] = []; } - $this->increment($this->five[$cmb], $token); + $this->increment($this->transitions[$cmb], $next); } private function increment(&$which, $token) { @@ -263,12 +131,15 @@ class ChatLib { } } + private function tokenize($str) { + return array_values(array_filter(preg_split('/\b/u', $str))); + } + private function generalize($tokens) { $str = ''; foreach ($tokens as $token) { - $replaced = preg_replace('/\W/u', '', $token); - $replaced = preg_replace('/\d+/', '0', $replaced); - $replaced = strtolower(trim($replaced)); + $replaced = preg_replace('/\d+/', '0', $token); + $replaced = strtolower($replaced); $str .= empty($replaced) ? $token : $replaced; } return $str; @@ -279,11 +150,7 @@ class ChatLib { return $example[0]; } - private $start = []; - private $one = []; - private $two = []; - private $three = []; - private $four = []; - private $five = []; + private $size = 5; + private $transitions = []; } -- 2.39.2