class ChatLib {
// Learns from one message: tokenizes $msg->text_content and records, for every
// token, the contexts (runs of preceding tokens) it was seen to follow.
// Returns early without recording anything when tokenization yields no tokens.
public function addMessage($msg) {
- $tokens = array_values(array_filter(preg_split('/\s+/u', $msg->text_content)));
+ $tokens = $this->tokenize($msg->text_content);
if (empty($tokens)) return;
- $tokens [] = '';
// An empty string is appended as a final token, so the last real token gets
// '' recorded as its successor.
+ $tokens[] = '';
foreach ($tokens as $num => $token) {
if ($num === 0) {
- $this->addStart($token);
- }
- if ($num > 0) {
- $this->addOne($tokens[$num - 1], $token);
- }
- if ($num > 1) {
- $this->addTwo($tokens[$num - 2], $tokens[$num - 1], $token);
- }
- if ($num > 2) {
- $this->addThree($tokens[$num - 3], $tokens[$num - 2], $tokens[$num - 1], $token);
- }
- if ($num > 3) {
- $this->addFour($tokens[$num - 4], $tokens[$num - 3], $tokens[$num - 2], $tokens[$num - 1], $token);
- }
- if ($num > 4) {
- $this->addFive($tokens[$num - 5], $tokens[$num - 4], $tokens[$num - 3], $tokens[$num - 2], $tokens[$num - 1], $token);
// The first token of a message is recorded against the empty context.
+ $this->addTransition([], $token);
+ } else {
// Contexts always end right before $token. The longest one (at most
// $this->size tokens) is recorded first, then it shrinks from the left.
+ $start = max(0, $num - $this->size);
+ $end = $num;
+ for ($i = $start; $i < $end; ++$i) {
+ $this->addTransition(array_slice($tokens, $i, $end - $i), $token);
// Stop once a context shorter than 3 tokens has been recorded; shorter
// contexts for this position are not recorded.
// NOTE(review): presumably this bounds table size - confirm intent.
+ if ($end - $i < 3) break;
+ }
}
}
}
// Post-processes the recorded transitions: every entry is replaced by the
// result of $this->index() and entries that come back empty are removed.
// NOTE(review): index() is not visible here; judging by its use it converts
// raw counts into the structure pick() consumes - confirm.
public function compile() {
- $this->start = $this->index($this->start);
- foreach ($this->one as $key => $value) {
- $this->one[$key] = $this->index($this->one[$key]);
- if (empty($this->one[$key])) {
- unset($this->one[$key]);
- }
- }
- foreach ($this->two as $key => $value) {
- $this->two[$key] = $this->index($this->two[$key]);
- if (empty($this->two[$key])) {
- unset($this->two[$key]);
- }
- }
- foreach ($this->three as $key => $value) {
- $this->three[$key] = $this->index($this->three[$key]);
- if (empty($this->three[$key])) {
- unset($this->three[$key]);
- }
- }
- foreach ($this->four as $key => $value) {
- $this->four[$key] = $this->index($this->four[$key]);
- if (empty($this->four[$key])) {
- unset($this->four[$key]);
- }
- }
- foreach ($this->five as $key => $value) {
- $this->five[$key] = $this->index($this->five[$key]);
- if (empty($this->five[$key])) {
- unset($this->five[$key]);
+ foreach ($this->transitions as $key => $value) {
+ $this->transitions[$key] = $this->index($this->transitions[$key]);
+ if (empty($this->transitions[$key])) {
+ unset($this->transitions[$key]);
}
}
// Side effect: prints the byte length of the JSON-encoded transition table
// (thousands-separated) to standard output.
+ echo 'size: ', number_format(strlen(json_encode($this->transitions)), 0), PHP_EOL;
}
- public function generate($limit = 75) {
+ /**
+  * Builds a message by repeatedly asking randomNext() for a token that may
+  * follow the tokens produced so far and concatenating the results. Tokens
+  * are joined without a separator.
+  *
+  * @param int $limit Generation stops once the output is at least this many
+  *                   bytes long (strlen() counts bytes), so the result may
+  *                   exceed the limit by up to one token.
+  * @return string The generated text; '' when no first token can be picked.
+  */
+ public function generate($limit = 100) {
$tokens = [];
- $start = $this->randomStart();
- $tokens[] = $start;
- $generated = $start;
+ $generated = '';
while (strlen($generated) < $limit) {
$next = $this->randomNext($tokens);
- if (empty($next)) break;
+ // Only a missing or empty token ends the message. empty() would also stop
+ // on the legitimate token "0" and truncate the output there.
+ if (!is_string($next) || $next === '') break;
$tokens[] = $next;
- $generated .= ' '.$next;
+ $generated .= $next;
}
return $generated;
}
$weight = $entry['count'];
if ($weight == 1) continue;
$lower = $sum;
- $sum += intval(pow($weight, 1.4));
+ $sum += $weight;
$examples = [];
if (is_array(end($entry['examples']))) {
// already processed
return $result;
}
- private function randomStart() {
- $pick = $this->pick($this->start);
- if (is_null($pick)) return '';
- return $this->exampleOf($pick);
- }
-
// Chooses a token to follow $tokens. Contexts are tried from the longest
// trailing run (at most $this->size tokens) down to the empty context; the
// first context that has a transitions entry and yields a non-null pick()
// determines the result via exampleOf(). Returns '' when no context matches.
private function randomNext($tokens) {
$cnt = count($tokens);
- $picks = [];
- if ($cnt >= 5) {
- $cmb = $this->generalize(array_slice($tokens, $cnt - 5, 5));
- if (isset($this->five[$cmb])) {
- $pick = $this->pick($this->five[$cmb]);
- if (!is_null($pick)) {
- $picks[$pick[0]] = [
- 'count' => 10,
- 'examples' => $pick[3],
- ];
- }
- }
- }
- if ($cnt >= 4) {
- $cmb = $this->generalize(array_slice($tokens, $cnt - 4, 4));
- if (isset($this->four[$cmb])) {
- $pick = $this->pick($this->four[$cmb]);
- if (!is_null($pick)) {
- $picks[$pick[0]] = [
- 'count' => 12,
- 'examples' => $pick[3],
- ];
- }
- }
- }
- if ($cnt >= 3) {
- $cmb = $this->generalize(array_slice($tokens, $cnt - 3, 3));
- if (isset($this->three[$cmb])) {
- $pick = $this->pick($this->three[$cmb]);
// $size counts down to 0; at 0 the slice is empty, so the lookup key is
// generalize([]), the same context addMessage() uses for a first token.
+ for ($size = min($this->size, $cnt); $size >= 0; --$size) {
+ $cmb = $this->generalize(array_slice($tokens, $cnt - $size, $size));
+ if (isset($this->transitions[$cmb])) {
+ $pick = $this->pick($this->transitions[$cmb]);
if (!is_null($pick)) {
- $picks[$pick[0]] = [
- 'count' => 14,
- 'examples' => $pick[3],
- ];
+ return $this->exampleOf($pick);
}
}
}
- if ($cnt >= 2) {
- $cmb = $this->generalize(array_slice($tokens, $cnt - 2, 2));
- if (isset($this->two[$cmb])) {
- $pick = $this->pick($this->two[$cmb]);
- if (!is_null($pick)) {
- $picks[$pick[0]] = [
- 'count' => 4,
- 'examples' => $pick[3],
- ];
- }
- }
- }
- if ($cnt >= 1) {
- $cmb = $this->generalize(array_slice($tokens, $cnt - 1, 1));
- if (isset($this->one[$cmb])) {
- $pick = $this->pick($this->one[$cmb]);
- if (!is_null($pick)) {
- $picks[$pick[0]] = [
- 'count' => 2,
- 'examples' => $pick[3],
- ];
- }
- }
- }
- if (empty($picks)) return '';
- $picks = $this->index($picks);
- $pick = $this->pick($picks);
- return $this->exampleOf($pick);
+ return '';
}
private function pick($options) {
return $options[$min_index];
}
- private function addStart($token) {
- if (empty($token)) return;
- $this->increment($this->start, $token);
- }
-
- private function addOne($one, $token) {
- $cmb = $this->generalize([$one]);
- if (!isset($this->one[$cmb])) {
- $this->one[$cmb] = [];
- }
- $this->increment($this->one[$cmb], $token);
- }
-
- private function addTwo($one, $two, $token) {
- $cmb = $this->generalize([$one, $two]);
- if (!isset($this->two[$cmb])) {
- $this->two[$cmb] = [];
- }
- $this->increment($this->two[$cmb], $token);
- }
-
- private function addThree($one, $two, $three, $token) {
- $cmb = $this->generalize([$one, $two, $three]);
- if (!isset($this->three[$cmb])) {
- $this->three[$cmb] = [];
- }
- $this->increment($this->three[$cmb], $token);
- }
-
- private function addFour($one, $two, $three, $four, $token) {
- $cmb = $this->generalize([$one, $two, $three, $four]);
- if (!isset($this->four[$cmb])) {
- $this->four[$cmb] = [];
- }
- $this->increment($this->four[$cmb], $token);
- }
-
- private function addFive($one, $two, $three, $four, $five, $token) {
- $cmb = $this->generalize([$one, $two, $three, $four, $five]);
- if (!isset($this->five[$cmb])) {
- $this->five[$cmb] = [];
// Records that token $next was seen after the context $state (an array of
// tokens). The context is reduced to a string key with generalize(); the
// bucket for that key is created on first use and handed to increment().
+ private function addTransition($state, $next) {
+ $cmb = $this->generalize($state);
+ if (!isset($this->transitions[$cmb])) {
+ $this->transitions[$cmb] = [];
}
- $this->increment($this->five[$cmb], $token);
+ $this->increment($this->transitions[$cmb], $next);
}
private function increment(&$which, $token) {
}
}
+ /**
+  * Splits text at every word boundary (\b), so runs of word characters and
+  * runs of non-word characters (spaces, punctuation) become separate tokens.
+  *
+  * Empty pieces are discarded by PREG_SPLIT_NO_EMPTY. A bare array_filter()
+  * would also discard the token "0", because it removes every falsy value.
+  *
+  * @param string $str Text to split.
+  * @return array Sequentially indexed list of non-empty tokens; empty when
+  *               the input is empty or cannot be split.
+  */
+ private function tokenize($str) {
+ $parts = preg_split('/\b/u', (string) $str, -1, PREG_SPLIT_NO_EMPTY);
+ // preg_split() returns false on failure, e.g. invalid UTF-8 under /u.
+ return $parts === false ? [] : $parts;
+ }
+
+ /**
+  * Reduces a list of tokens to the string key used for transition lookups:
+  * every run of digits becomes "0", the token is lower-cased with
+  * strtolower(), and the results are concatenated in order without a
+  * separator.
+  *
+  * @param array $tokens Context tokens; may be empty, which yields ''.
+  * @return string The lookup key.
+  */
private function generalize($tokens) {
$str = '';
foreach ($tokens as $token) {
- $replaced = preg_replace('/\W/u', '', $token);
- $replaced = preg_replace('/\d+/', '0', $replaced);
- $replaced = strtolower(trim($replaced));
+ $replaced = preg_replace('/\d+/', '0', $token);
+ $replaced = strtolower($replaced);
- $str .= empty($replaced) ? $token : $replaced;
+ // Compare against '' explicitly: empty("0") is true, so with empty() a
+ // pure-digit token such as "42" fell back to the raw token and numbers
+ // were never generalized.
+ $str .= ($replaced === '') ? $token : $replaced;
}
return $str;
return $example[0];
}
- private $start = [];
- private $one = [];
- private $two = [];
- private $three = [];
- private $four = [];
- private $five = [];
+ private $size = 5;
+ private $transitions = [];
}