From 26d47ca368d8e7c2690cec49f6ae2ad509a0428d Mon Sep 17 00:00:00 2001 From: Daniel Karbach Date: Wed, 8 May 2024 19:03:07 +0200 Subject: [PATCH] further refine chat gen --- Review note: the whitespace and repeated-character normalisation steps added in ChatLib.php now read from $replaced instead of $token, so each step builds on the previous one rather than discarding it. app/Console/Commands/ChatlibDatabase.php | 2 ++ app/Models/ChatLib.php | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/app/Console/Commands/ChatlibDatabase.php b/app/Console/Commands/ChatlibDatabase.php index b95bb64..280d4a0 100644 --- a/app/Console/Commands/ChatlibDatabase.php +++ b/app/Console/Commands/ChatlibDatabase.php @@ -35,6 +35,8 @@ class ChatlibDatabase extends Command { ->where('banned', '=', false) ->whereNotNull('evaluated_at') ->where('created_at', '<', now()->sub(7, 'day')) + ->whereNotIn('classification', ['gg', 'gl', 'number', 'o7']) + ->whereRaw('LENGTH(`text_content`) > 12') ->chunk(5000, function ($msgs) use ($de, $en) { foreach ($msgs as $msg) { if ($msg->detected_language === 'de') { diff --git a/app/Models/ChatLib.php b/app/Models/ChatLib.php index c89ef94..3f1530b 100644 --- a/app/Models/ChatLib.php +++ b/app/Models/ChatLib.php @@ -22,7 +22,7 @@ class ChatLib { $end = $num; for ($i = $start; $i < $end; ++$i) { $this->addTransition(array_slice($tokens, $i, $end - $i), $token); - if ($end - $i < 4) break; + if ($end - $i < 5) break; } } } @@ -75,6 +75,8 @@ class ChatLib { if (is_array(end($entry['examples']))) { // already processed $examples = $entry['examples']; + } else if ($key === ' ') { + $examples = [[' ', 0, 1]]; } else { $subsum = 0; foreach ($entry['examples'] as $example => $subweight) { @@ -162,6 +164,8 @@ class ChatLib { $str = ''; foreach ($tokens as $token) { $replaced = preg_replace('/\d+/', '0', $token); + $replaced = preg_replace('/\s+/', ' ', $replaced); + $replaced = preg_replace('/(.)\1{2,}/', '$1', $replaced); $replaced = strtolower($replaced); $str .= $replaced; } @@ -173,7 +177,7 @@ class ChatLib { return $example[0]; } - private $size = 5; + private $size = 7; private $transitions = []; } -- 2.39.2