Classifier.php 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. <?php
  2. namespace App\Util\Lexer;
  3. use Brick\Math\BigDecimal;
  4. use Illuminate\Support\Collection;
  5. use Illuminate\Support\Str;
  /**
   * Small naive-Bayes-style text classifier.
   *
   * Statements are split into tokens, per-type token counts are accumulated by
   * learn(), and guess() scores a new statement against every learned type by
   * multiplying add-one-smoothed token frequencies (optionally weighted by how
   * many documents of each type were learned, see uneven()).
   */
  class Classifier
  {
      /**
       * Optional custom tokenizer; when unset, tokenize() falls back to
       * lower-casing the input and extracting runs of alphabetic characters.
       *
       * @var ?callable(string): array<int, string>
       */
      private $tokenizer;

      /**
       * Token counts, keyed by type and then by token.
       *
       * @var array<string, array<string, int>>
       */
      private array $words = [];

      /**
       * Number of learned statements per type.
       *
       * @var array<string, int>
       */
      private array $documents = [];

      /**
       * Whether guess() weights each type by its share of learned documents.
       */
      private bool $uneven = false;

      /**
       * Replace the default tokenizer.
       *
       * @param callable(string): array<int, string> $tokenizer
       */
      public function setTokenizer(callable $tokenizer): void
      {
          $this->tokenizer = $tokenizer;
      }

      /**
       * Split a statement into tokens.
       *
       * @return Collection<int, string>
       */
      public function tokenize(string $string): Collection
      {
          if ($this->tokenizer !== null) {
              /** @var array<int, string> */
              $tokens = ($this->tokenizer)($string);

              return collect($tokens);
          }

          return Str::of($string)
              ->lower()
              ->matchAll('/[[:alpha:]]+/u');
      }

      /**
       * Record one statement as an example of the given type.
       *
       * @return $this
       */
      public function learn(string $statement, string $type): self
      {
          // The ignore list only applies to the 'spam' type; resolve it once
          // per statement instead of once per token.
          $ignored = $type === 'spam' ? $this->ignoredTokens() : [];

          foreach ($this->tokenize($statement) as $word) {
              $this->incrementWord($type, $word, $ignored);
          }

          $this->incrementType($type);

          return $this;
      }

      /**
       * Score a statement against every learned type.
       *
       * The score is the product (plain float arithmetic) of pTotal() for the
       * type and p() for each token, rendered as a decimal string. Results are
       * keyed by type and sorted from highest to lowest score.
       *
       * @return Collection<string, string>
       */
      public function guess(string $statement): Collection
      {
          $words = $this->tokenize($statement);

          return collect($this->documents)
              ->map(function ($count, string $type) use ($words) {
                  $likelihood = $this->pTotal($type);

                  foreach ($words as $word) {
                      $likelihood *= $this->p($word, $type);
                  }

                  return (string) BigDecimal::of($likelihood);
              })
              ->sortDesc();
      }

      /**
       * Return the best-scoring type for a statement (first key of guess()).
       *
       * NOTE(review): when nothing has been learned there is no key to return,
       * so the declared string return type cannot be satisfied; callers should
       * learn/import data first.
       */
      public function most(string $statement): string
      {
          /** @var string */
          return $this->guess($statement)->keys()->first();
      }

      /**
       * Enable or disable weighting by the number of documents per type.
       *
       * Note the default argument is false, so calling uneven() without an
       * argument switches the weighting off.
       *
       * @return self
       */
      public function uneven(bool $enabled = false): self
      {
          $this->uneven = $enabled;

          return $this;
      }

      /**
       * Increment the document count for the type
       */
      private function incrementType(string $type): void
      {
          $this->documents[$type] = ($this->documents[$type] ?? 0) + 1;
      }

      /**
       * Increment the word count for the given type
       *
       * Tokens on the ignore list are not counted for the 'spam' type; every
       * other type counts all tokens.
       *
       * @param array<int, string>|null $ignored pre-resolved ignore list, or
       *                                         null to resolve it here
       */
      private function incrementWord(string $type, string $word, ?array $ignored = null): void
      {
          if ($type === 'spam') {
              $ignored = $ignored ?? $this->ignoredTokens();

              // Strict comparison so numeric-looking tokens are not conflated.
              if (in_array($word, $ignored, true)) {
                  return;
              }
          }

          $this->words[$type][$word] = ($this->words[$type][$word] ?? 0) + 1;
      }

      /**
       * Resolve the list of tokens that are skipped when learning spam.
       *
       * Reads the 'autospam.ignored_tokens' config value (a comma separated
       * string, or an array) and falls back to a small built-in list when the
       * value is empty. Entries are trimmed so "the, a" matches "a".
       *
       * @return array<int, string>
       */
      private function ignoredTokens(): array
      {
          $configured = config('autospam.ignored_tokens');

          if (! $configured) {
              return ['the', 'a', 'of', 'and'];
          }

          $tokens = is_array($configured)
              ? array_values($configured)
              : explode(',', (string) $configured);

          return array_map(
              static fn ($token): string => trim((string) $token),
              $tokens
          );
      }

      /**
       * Smoothed frequency of a token within a type:
       * (count of token + 1) / (total tokens counted for the type + 1).
       *
       * @return float|int
       */
      private function p(string $word, string $type)
      {
          // A type may have documents but no recorded tokens (empty statement,
          // or a spam statement made only of ignored tokens).
          $typeWords = $this->words[$type] ?? [];
          $count = $typeWords[$word] ?? 0;

          return ($count + 1) / (array_sum($typeWords) + 1);
      }

      /**
       * Weight of a type: 1 unless uneven() is enabled, in which case it is
       * (documents of type + 1) / (all documents + 1).
       *
       * @return float|int
       */
      private function pTotal(string $type)
      {
          if (! $this->uneven) {
              return 1;
          }

          return ($this->documents[$type] + 1) / (array_sum($this->documents) + 1);
      }

      /**
       * Serialize the learned state as pretty-printed JSON.
       *
       * Each type's tokens are ordered by descending count.
       *
       * @return string|false the JSON document, or false if encoding fails
       */
      public function export()
      {
          $words = array_map(
              static function (array $counts): array {
                  arsort($counts);

                  return $counts;
              },
              $this->words
          );

          return json_encode([
              '_ns' => 'https://pixelfed.org/ns/nlp',
              '_v' => '1.0',
              'documents' => $this->documents,
              'words' => $words,
          ], JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
      }

      /**
       * Replace the learned state with previously exported data.
       *
       * No validation is performed; the values are stored as given.
       *
       * @param array<string, int> $documents
       * @param array<string, array<string, int>> $words
       */
      public function import($documents, $words)
      {
          $this->documents = $documents;
          $this->words = $words;
      }
  }