->where('banned', '=', false)
->whereNotNull('evaluated_at')
->where('created_at', '<', now()->sub(7, 'day'))
+ ->whereNotIn('classification', ['gg', 'gl', 'number', 'o7'])
+ ->whereRaw('LENGTH(`text_content`) > 12')
->chunk(5000, function ($msgs) use ($de, $en) {
foreach ($msgs as $msg) {
if ($msg->detected_language === 'de') {
$end = $num;
for ($i = $start; $i < $end; ++$i) {
$this->addTransition(array_slice($tokens, $i, $end - $i), $token);
- if ($end - $i < 4) break;
+ if ($end - $i < 5) break;
}
}
}
if (is_array(end($entry['examples']))) {
// already processed
$examples = $entry['examples'];
+ } else if ($key === ' ') {
+ $examples = [[' ', 0, 1]];
} else {
$subsum = 0;
foreach ($entry['examples'] as $example => $subweight) {
$str = '';
foreach ($tokens as $token) {
+ // Normalize each token step by step and append it to $str: digit runs -> '0',
+ // whitespace runs -> one space, 3+ repeats of a character -> one, then lowercase.
+ // Every step must read $replaced (the previous step's output); reading $token
+ // again would silently discard the earlier normalizations.
$replaced = preg_replace('/\d+/', '0', $token);
+ $replaced = preg_replace('/\s+/', ' ', $replaced);
+ $replaced = preg_replace('/(.)\1{2,}/', '$1', $replaced);
$replaced = strtolower($replaced);
$str .= $replaced;
}
return $example[0];
}
- private $size = 5;
+ private $size = 7;
private $transitions = [];
}