tokenize($msg->text_content);
        if (empty($tokens)) {
            return;
        }

        // An empty token is appended as the end-of-message marker: generate()
        // stops as soon as it draws an empty token.
        $tokens[] = '';

        foreach ($tokens as $num => $token) {
            if ($num === 0) {
                // The first token is learned as a transition out of the empty state.
                $this->addTransition([], $token);
            } else {
                $start = max(0, $num - $this->size);
                $end = $num;
                // Record the token after progressively shorter contexts, starting
                // with the longest one allowed by $this->size.
                for ($i = $start; $i < $end; ++$i) {
                    $this->addTransition(array_slice($tokens, $i, $end - $i), $token);
                    // Stop once a context shorter than three tokens has been recorded.
                    if ($end - $i < 3) {
                        break;
                    }
                }
            }
        }
    }

    /**
     * Converts the raw transition counters into cumulative-weight tables that
     * pick() can sample from, dropping states that end up with no candidates.
     *
     * Prints the JSON-encoded size (in bytes) of the compiled table.
     *
     * @return void
     */
    public function compile()
    {
        foreach ($this->transitions as $key => $value) {
            $indexed = $this->index($value);
            if (empty($indexed)) {
                unset($this->transitions[$key]);
            } else {
                $this->transitions[$key] = $indexed;
            }
        }

        echo 'size: ', number_format(strlen(json_encode($this->transitions)), 0), PHP_EOL;
    }

    /**
     * Generates text by repeatedly sampling the next token from the compiled
     * transition table.
     *
     * Generation stops when the output is at least $limit bytes long or when
     * the end-of-message marker (an empty token) is drawn, so the result may
     * exceed $limit by up to one token.
     *
     * @param int $limit Target length in bytes (measured with strlen()).
     *
     * @return string
     */
    public function generate($limit = 100)
    {
        $tokens = [];
        $generated = '';

        while (strlen($generated) < $limit) {
            // Cast: tokens that look like integers come back from array keys as ints.
            $next = (string) $this->randomNext($tokens);
            // Only the empty string terminates; empty() would also stop on "0".
            if ($next === '') {
                break;
            }
            $tokens[] = $next;
            $generated .= $next;
        }

        return $generated;
    }

    /**
     * Builds a cumulative-weight table for one state.
     *
     * Each resulting row is [generalized token, lower bound, upper bound,
     * examples], where the bounds form consecutive half-open ranges
     * [lower, upper) whose width is the transition count. The examples are a
     * table of the same shape ([example, lower, upper]) whose weights are the
     * squared example counts.
     *
     * Transitions that were seen exactly once are skipped.
     *
     * @param array $arr Map of generalized token => ['count' => int, 'examples' => array].
     *
     * @return array List of rows as described above.
     */
    private function index($arr)
    {
        $result = [];
        $sum = 0;

        foreach ($arr as $key => $entry) {
            $weight = $entry['count'];
            if ($weight == 1) {
                continue;
            }

            $lower = $sum;
            $sum += $weight;

            if (is_array(end($entry['examples']))) {
                // Already processed: the examples are cumulative rows, keep them.
                $examples = $entry['examples'];
            } else {
                $examples = [];
                $subsum = 0;
                foreach ($entry['examples'] as $example => $subweight) {
                    $sublower = $subsum;
                    $subsum += $subweight * $subweight;
                    // PHP turns numeric-string keys into ints; restore the string token.
                    $examples[] = [(string) $example, $sublower, $subsum];
                }
            }

            $result[] = [$key, $lower, $sum, $examples];
        }

        return $result;
    }

    /**
     * Samples the next token for the given history.
     *
     * Tries the longest context first (at most $this->size trailing tokens)
     * and falls back to shorter ones, down to the empty context.
     *
     * @param array $tokens Tokens generated so far.
     *
     * @return string|int The sampled token, or '' when no state matches.
     */
    private function randomNext($tokens)
    {
        $cnt = count($tokens);

        for ($size = min($this->size, $cnt); $size >= 0; --$size) {
            $cmb = $this->generalize(array_slice($tokens, $cnt - $size, $size));
            if (!isset($this->transitions[$cmb])) {
                continue;
            }

            $pick = $this->pick($this->transitions[$cmb]);
            if (!is_null($pick)) {
                return $this->exampleOf($pick);
            }
        }

        return '';
    }

    /**
     * Draws one row from a cumulative-weight table, with probability
     * proportional to the width of its [lower, upper) range.
     *
     * @param array $options List of rows whose index 1 is the lower bound and
     *                       index 2 the upper bound, in ascending order.
     *
     * @return array|null The chosen row, or null when $options is empty.
     */
    private function pick($options)
    {
        if (empty($options)) {
            return null;
        }

        $last = end($options);
        $max = $last[2];
        if ($max < 1) {
            // No weight to distribute; random_int(0, -1) would throw.
            return $options[0];
        }

        // Ranges are half-open, so draw from [0, $max - 1]. Drawing up to $max
        // inclusive made boundary values belong to two rows and skewed the odds.
        $num = random_int(0, $max - 1);

        // Binary search for the first row whose upper bound is above $num.
        $min_index = 0;
        $max_index = count($options) - 1;
        while ($min_index < $max_index) {
            $cur_index = intdiv($min_index + $max_index, 2);
            if ($num >= $options[$cur_index][2]) {
                $min_index = $cur_index + 1;
            } else {
                $max_index = $cur_index;
            }
        }

        return $options[$min_index];
    }

    /**
     * Records that $next was observed after the context $state.
     *
     * @param array  $state Context tokens preceding $next (may be empty).
     * @param string $next  The observed token.
     *
     * @return void
     */
    private function addTransition($state, $next)
    {
        $cmb = $this->generalize($state);
        if (!isset($this->transitions[$cmb])) {
            $this->transitions[$cmb] = [];
        }

        $this->increment($this->transitions[$cmb], $next);
    }

    /**
     * Bumps the counter of $token inside one state's transition map.
     *
     * The map is keyed by the generalized form of the token; the concrete
     * spelling is counted separately under 'examples'.
     *
     * @param array  $which Transition map of a single state (modified in place).
     * @param string $token The observed token.
     *
     * @return void
     */
    private function increment(&$which, $token)
    {
        $generalized = $this->generalize([$token]);

        if (!isset($which[$generalized])) {
            $which[$generalized] = [
                'count' => 0,
                'examples' => [],
            ];
        }

        ++$which[$generalized]['count'];
        $which[$generalized]['examples'][$token] = ($which[$generalized]['examples'][$token] ?? 0) + 1;
    }

    /**
     * Splits text at word boundaries, keeping every non-empty piece
     * (words as well as the whitespace/punctuation between them).
     *
     * @param string $str Text to split.
     *
     * @return array List of tokens; empty when the text is empty or cannot be split.
     */
    private function tokenize($str)
    {
        // PREG_SPLIT_NO_EMPTY drops only empty pieces. A bare array_filter()
        // would also discard the token "0".
        $parts = preg_split('/\b/u', (string) $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure (e.g. invalid UTF-8 with /u).
        return $parts === false ? [] : $parts;
    }

    /**
     * Builds the lookup key for a token sequence: every run of digits becomes
     * "0", the result is lower-cased with strtolower(), and the pieces are
     * concatenated.
     *
     * @param array $tokens Tokens to combine.
     *
     * @return string
     */
    private function generalize($tokens)
    {
        $str = '';

        foreach ($tokens as $token) {
            $token = (string) $token;
            $replaced = preg_replace('/\d+/', '0', $token);

            // Fall back to the raw token only when the replacement failed (null)
            // or is empty. empty() would also reject the valid result "0", which
            // left purely numeric tokens ungeneralized.
            if ($replaced === null || $replaced === '') {
                $str .= $token;
                continue;
            }

            $str .= strtolower($replaced);
        }

        return $str;
    }

    /**
     * Draws a concrete spelling for a row chosen by pick().
     *
     * @param array $pick Row whose index 3 holds the examples table.
     *
     * @return string The sampled example, or '' when the row has no examples.
     */
    private function exampleOf($pick)
    {
        $example = $this->pick($pick[3]);
        if ($example === null) {
            return '';
        }

        return (string) $example[0];
    }

    /** Maximum number of preceding tokens used as context. */
    private $size = 5;

    /**
     * Transition table keyed by generalized context. Values are raw counters
     * (see increment()) until compile() replaces them with cumulative-weight
     * rows (see index()).
     */
    private $transitions = [];
}