/**
 * Feeds one message into the transition table.
 *
 * The message's text_content property is tokenized; when that yields no
 * tokens the method returns without recording anything. For every token a
 * transition from the empty state is recorded, followed by transitions from
 * slices of the tokens that precede it.
 *
 * NOTE(review): the statement that assigns $end and the closing lines of this
 * method are not visible in this excerpt; confirm them against the full file.
 *
 * @param object $msg Message exposing a text_content property.
 */
7 public function addMessage($msg) {
8 $tokens = $this->tokenize($msg->text_content);
9 if (empty($tokens)) return;
11 foreach ($tokens as $num => $token) {
// Empty state: lets $token be chosen when no history is available.
13 $this->addTransition([], $token);
// Look back at most $this->size positions, clamped at the first token.
15 $start = max(0, $num - $this->size);
17 for ($i = $start; $i < $end; ++$i) {
// The state is the slice of ($end - $i) tokens starting at index $i.
18 $this->addTransition(array_slice($tokens, $i, $end - $i), $token);
// Once a state shorter than 3 tokens has been recorded, stop shrinking.
19 if ($end - $i < 3) break;
/**
 * Converts every collected transition bucket into its indexed form.
 *
 * Each state's counters are passed through index(); states whose indexed
 * result is empty are removed from the table. Afterwards the byte length of
 * the JSON-encoded table is printed as "size: <n>".
 */
public function compile() {
    foreach (array_keys($this->transitions) as $state) {
        $indexed = $this->index($this->transitions[$state]);
        if (empty($indexed)) {
            // Nothing usable left for this state.
            unset($this->transitions[$state]);
            continue;
        }
        $this->transitions[$state] = $indexed;
    }

    $bytes = strlen(json_encode($this->transitions));
    echo 'size: ', number_format($bytes, 0), PHP_EOL;
}
/**
 * Generates text by repeatedly asking randomNext() for a following token.
 *
 * NOTE(review): the initialisation of $generated and $tokens, the statements
 * that append $next, and the return statement are not visible in this
 * excerpt; confirm them against the full file.
 *
 * @param int $limit The loop continues while strlen($generated) is below
 *                   this value (default 100).
 */
35 public function generate($limit = 100) {
// strlen() counts bytes, so $limit is a byte budget rather than a character count.
38 while (strlen($generated) < $limit) {
39 $next = $this->randomNext($tokens);
// Stop when no continuation was found.
// NOTE(review): empty() is also true for the string '0', so a literal "0"
// token would end generation here; confirm whether that is intended.
40 if (empty($next)) break;
/**
 * Turns a map of raw counters into a list of entries shaped
 * [$key, $lower, $sum, $examples], where each example is
 * [$example, $sublower, $subsum].
 *
 * NOTE(review): the assignments of $result, $sum, $lower and $sublower, the
 * else branch header and the return statement are not visible in this
 * excerpt; presumably $lower/$sum (and $sublower/$subsum) delimit cumulative
 * weight ranges. Confirm against the full file.
 *
 * @param array $arr Map of key => ['count' => int, 'examples' => array].
 */
47 private function index($arr) {
50 foreach ($arr as $key => $entry) {
51 $weight = $entry['count'];
// Entries whose count is exactly 1 are skipped.
52 if ($weight == 1) continue;
// If the last example is already an array, the examples are reused unchanged
// (presumably they were indexed by an earlier call; confirm).
56 if (is_array(end($entry['examples']))) {
58 $examples = $entry['examples'];
61 foreach ($entry['examples'] as $example => $subweight) {
// The running total grows by the square of the example's count.
63 $subsum += $subweight * $subweight;
64 $examples[] = [$example, $sublower, $subsum];
67 $result[] = [$key, $lower, $sum, $examples];
/**
 * Chooses a following token for the given token history.
 *
 * States are tried from the longest suffix (at most $this->size tokens, and
 * never more than the history holds) down to the empty state. The first
 * state that exists in the transition table and yields a non-null pick
 * decides the result, which is produced by exampleOf().
 *
 * NOTE(review): what is returned when no state matches is not visible in
 * this excerpt; confirm against the full file.
 *
 * @param array $tokens Tokens generated so far.
 */
72 private function randomNext($tokens) {
73 $cnt = count($tokens);
74 for ($size = min($this->size, $cnt); $size >= 0; --$size) {
// Key built from the last $size tokens; $size 0 gives the empty state.
75 $cmb = $this->generalize(array_slice($tokens, $cnt - $size, $size));
76 if (isset($this->transitions[$cmb])) {
77 $pick = $this->pick($this->transitions[$cmb]);
// pick() returns null for an empty option list; fall back to a shorter state.
78 if (!is_null($pick)) {
79 return $this->exampleOf($pick);
/**
 * Selects one entry from a list of options at random.
 *
 * Every option carries a lower bound at index 1 and an upper bound at
 * index 2. A random integer between 0 and the last option's upper bound
 * (inclusive) is drawn, and a binary search locates an option whose bounds
 * enclose it.
 *
 * @param array $options List of options; may be empty.
 * @return array|null The selected option, or null when $options is empty.
 */
private function pick($options) {
    if (empty($options)) {
        return null;
    }

    $target = random_int(0, end($options)[2]);

    $first = 0;
    $last = count($options) - 1;
    while ($first < $last) {
        $middle = (int) (($first + $last) / 2);
        $candidate = $options[$middle];

        if ($candidate[1] > $target) {
            // Target lies below this option: keep the lower half.
            $last = $middle;
            continue;
        }
        if ($candidate[2] < $target) {
            // Target lies above this option: keep the upper half.
            $first = $middle + 1;
            continue;
        }

        // Bounds enclose the target: this is the option to return.
        $first = $middle;
        break;
    }

    return $options[$first];
}
/**
 * Records that $next was observed after the token sequence $state.
 *
 * @param array  $state Preceding tokens; collapsed into a table key by generalize().
 * @param string $next  Token that followed the state.
 */
private function addTransition($state, $next) {
    $stateKey = $this->generalize($state);

    // Binding a reference creates the slot (holding null) when the state is new.
    $counters = &$this->transitions[$stateKey];
    if (!isset($counters)) {
        $counters = [];
    }

    $this->increment($counters, $next);
}
/**
 * Counts one occurrence of $token inside the counter map $which.
 *
 * Counters are grouped under the generalized form of the token. Within a
 * group, 'examples' maps each literal token to its own counter, and 'count'
 * is incremented whenever the group already exists.
 *
 * NOTE(review): the body of the initial array literal (including the starting
 * value of 'count') and the else branch headers are not visible in this
 * excerpt; confirm against the full file.
 *
 * @param array  $which Counter map, modified in place (passed by reference).
 * @param string $token Literal token to count.
 */
116 private function increment(&$which, $token) {
117 $generalized = $this->generalize([$token]);
// First time this generalized form is seen: create its group.
118 if (!isset($which[$generalized])) {
119 $which[$generalized] = [
123 $which[$generalized]['examples'][$token] = 1;
// Existing group: bump the group total, then the literal token's own counter.
125 ++$which[$generalized]['count'];
126 if (!isset($which[$generalized]['examples'][$token])) {
127 $which[$generalized]['examples'][$token] = 1;
129 ++$which[$generalized]['examples'][$token];
/**
 * Splits text into tokens at word boundaries.
 *
 * Words and the runs of non-word characters between them (spaces,
 * punctuation) each become one token, in their original order.
 *
 * @param string $str Text to split; expected to be UTF-8.
 * @return array List of non-empty tokens. Empty when $str is empty or is not
 *               valid UTF-8.
 */
private function tokenize($str) {
    // PREG_SPLIT_NO_EMPTY drops only the empty pieces. A bare array_filter()
    // would also discard the token '0', because '0' is falsy in PHP.
    $pieces = preg_split('/\b/u', (string) $str, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure (e.g. malformed UTF-8 with the
    // /u modifier); report that as "no tokens" instead of passing false on.
    return $pieces === false ? [] : $pieces;
}
/**
 * Collapses a token sequence into the string used as a transition-table key.
 *
 * Every run of digits in a token is replaced by a single '0' and the token
 * is lower-cased (strtolower(), i.e. byte-wise); the results are concatenated.
 *
 * @param array $tokens Tokens to generalize; may be empty.
 * @return string The generalized key; '' for an empty token list.
 */
private function generalize($tokens) {
    $str = '';
    foreach ($tokens as $token) {
        $replaced = preg_replace('/\d+/', '0', $token);

        // Fall back to the raw token only when the replacement failed (null)
        // or produced nothing. This must not use empty(): empty('0') is true,
        // which would leave purely numeric tokens such as '123' ungeneralized.
        if ($replaced === null || $replaced === '') {
            $str .= $token;
            continue;
        }

        $str .= strtolower($replaced);
    }

    return $str;
}
/**
 * Chooses one concrete example for a picked entry by running pick() over the
 * example list stored at index 3 of $pick.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the chosen example's token is returned. Confirm against the
 * full file.
 *
 * @param array $pick Entry previously returned by pick().
 */
148 private function exampleOf($pick) {
149 $example = $this->pick($pick[3]);
// Transition table, keyed by the generalized state string built by generalize().
// While messages are being added, each value is a counter map filled by
// increment(); compile() replaces each value with the result of index().
154 private $transitions = [];