Extractor.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. <?php
  2. /**
  3. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  4. * @author Nick Pope <nick@nickpope.me.uk>
  5. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  6. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  7. */
  8. namespace App\Util\Lexer;
  9. use Illuminate\Support\Str;
  10. use App\Status;
  11. use App\Services\AutolinkService;
  12. use App\Services\TrendingHashtagService;
  13. /**
  14. * Twitter Extractor Class.
  15. *
  16. * Parses tweets and extracts URLs, usernames, username/list pairs and
  17. * hashtags.
  18. *
  19. * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
  20. * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
  21. * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
  22. *
  23. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  24. * @author Nick Pope <nick@nickpope.me.uk>
  25. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  26. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  27. */
  28. class Extractor extends Regex
  29. {
    /**
     * Whether URL matches that carry no protocol are extracted. When false,
     * extractURLsWithIndices() skips every match whose protocol part is empty.
     *
     * @var bool
     */
    protected $extractURLWithoutProtocol = true;

    /**
     * When truthy, mention extraction keeps only screen names for which
     * AutolinkService::mentionedUsernameExists() returns a truthy value.
     * Changed through setActiveUsersOnly().
     *
     * @var bool
     */
    protected $activeUsersOnly = false;
  35. /**
  36. * Provides fluent method chaining.
  37. *
  38. * @param string $tweet The tweet to be converted.
  39. *
  40. * @see __construct()
  41. *
  42. * @return Extractor
  43. */
  44. public static function create($tweet = null)
  45. {
  46. return new self($tweet);
  47. }
  48. public function setActiveUsersOnly($active)
  49. {
  50. $this->activeUsersOnly = $active;
  51. return $this;
  52. }
    /**
     * Creates an extractor for the given text.
     *
     * The argument is handed unchanged to the parent (Regex) constructor; the
     * extraction methods of this class read the text back from $this->tweet
     * whenever they are called without an explicit argument.
     *
     * @param string $tweet The tweet to extract.
     */
    public function __construct($tweet = null)
    {
        parent::__construct($tweet);
    }
  64. /**
  65. * Extracts all parts of a tweet and returns an associative array containing
  66. * the extracted elements.
  67. *
  68. * @param string $tweet The tweet to extract.
  69. *
  70. * @return array The elements in the tweet.
  71. */
  72. public function extract($tweet = null)
  73. {
  74. if (is_null($tweet)) {
  75. $tweet = $this->tweet;
  76. }
  77. return [
  78. 'hashtags' => $this->extractHashtags($tweet),
  79. 'urls' => $this->extractURLs($tweet),
  80. 'mentions' => $this->extractMentionedUsernames($tweet),
  81. 'replyto' => $this->extractRepliedUsernames($tweet),
  82. 'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
  83. 'urls_with_indices' => $this->extractURLsWithIndices($tweet),
  84. 'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices($tweet),
  85. ];
  86. }
  87. /**
  88. * Extract URLs, @mentions, lists and #hashtag from a given text/tweet.
  89. *
  90. * @param string $tweet The tweet to extract.
  91. *
  92. * @return array list of extracted entities
  93. */
  94. public function extractEntitiesWithIndices($tweet = null)
  95. {
  96. if (is_null($tweet)) {
  97. $tweet = $this->tweet;
  98. }
  99. $entities = [];
  100. $entities = array_merge($entities, $this->extractURLsWithIndices($tweet));
  101. $entities = array_merge($entities, $this->extractHashtagsWithIndices($tweet, false));
  102. $entities = array_merge($entities, $this->extractMentionsOrListsWithIndices($tweet));
  103. $entities = $this->removeOverlappingEntities($entities);
  104. return $entities;
  105. }
  106. /**
  107. * Extracts all the hashtags from the tweet.
  108. *
  109. * @param string $tweet The tweet to extract.
  110. *
  111. * @return array The hashtag elements in the tweet.
  112. */
  113. public function extractHashtags($tweet = null)
  114. {
  115. $hashtagsOnly = [];
  116. $hashtagsWithIndices = $this->extractHashtagsWithIndices($tweet);
  117. foreach ($hashtagsWithIndices as $hashtagWithIndex) {
  118. $hashtagsOnly[] = $hashtagWithIndex['hashtag'];
  119. }
  120. return array_slice($hashtagsOnly, 0, Status::MAX_HASHTAGS);
  121. }
  122. /**
  123. * Extracts all the cashtags from the tweet.
  124. *
  125. * @param string $tweet The tweet to extract.
  126. *
  127. * @return array The cashtag elements in the tweet.
  128. */
  129. public function extractCashtags($tweet = null)
  130. {
  131. $cashtagsOnly = [];
  132. return $cashtagsOnly;
  133. }
  134. /**
  135. * Extracts all the URLs from the tweet.
  136. *
  137. * @param string $tweet The tweet to extract.
  138. *
  139. * @return array The URL elements in the tweet.
  140. */
  141. public function extractURLs($tweet = null)
  142. {
  143. $urlsOnly = [];
  144. $urlsWithIndices = $this->extractURLsWithIndices($tweet);
  145. foreach ($urlsWithIndices as $urlWithIndex) {
  146. $urlsOnly[] = $urlWithIndex['url'];
  147. }
  148. return array_slice($urlsOnly, 0, Status::MAX_LINKS);
  149. }
  150. /**
  151. * Extract all the usernames from the tweet.
  152. *
  153. * A mention is an occurrence of a username anywhere in a tweet.
  154. *
  155. * @param string $tweet The tweet to extract.
  156. *
  157. * @return array The usernames elements in the tweet.
  158. */
  159. public function extractMentionedScreennames($tweet = null)
  160. {
  161. $usernamesOnly = [];
  162. $mentionsWithIndices = $this->extractMentionsOrListsWithIndices($tweet);
  163. foreach ($mentionsWithIndices as $mentionWithIndex) {
  164. if($this->activeUsersOnly == true) {
  165. if(!AutolinkService::mentionedUsernameExists($mentionWithIndex['screen_name'])) {
  166. continue;
  167. }
  168. }
  169. $screen_name = mb_strtolower($mentionWithIndex['screen_name']);
  170. if (empty($screen_name) or in_array($screen_name, $usernamesOnly)) {
  171. continue;
  172. }
  173. $usernamesOnly[] = $screen_name;
  174. }
  175. return $usernamesOnly;
  176. }
  177. /**
  178. * Extract all the usernames from the tweet.
  179. *
  180. * A mention is an occurrence of a username anywhere in a tweet.
  181. *
  182. * @return array The usernames elements in the tweet.
  183. *
  184. * @deprecated since version 1.1.0
  185. */
  186. public function extractMentionedUsernames($tweet)
  187. {
  188. $this->tweet = $tweet;
  189. return $this->extractMentionedScreennames($tweet);
  190. }
  191. /**
  192. * Extract all the usernames replied to from the tweet.
  193. *
  194. * A reply is an occurrence of a username at the beginning of a tweet.
  195. *
  196. * @param string $tweet The tweet to extract.
  197. *
  198. * @return array The usernames replied to in a tweet.
  199. */
  200. public function extractReplyScreenname($tweet = null)
  201. {
  202. if (is_null($tweet)) {
  203. $tweet = $this->tweet;
  204. }
  205. $matched = preg_match(self::$patterns['valid_reply'], $tweet, $matches);
  206. // Check username ending in
  207. if ($matched && preg_match(self::$patterns['end_mention_match'], $matches[2])) {
  208. $matched = false;
  209. }
  210. return $matched ? $matches[1] : null;
  211. }
  212. /**
  213. * Extract all the usernames replied to from the tweet.
  214. *
  215. * A reply is an occurrence of a username at the beginning of a tweet.
  216. *
  217. * @return array The usernames replied to in a tweet.
  218. *
  219. * @deprecated since version 1.1.0
  220. */
  221. public function extractRepliedUsernames()
  222. {
  223. return $this->extractReplyScreenname();
  224. }
  225. /**
  226. * Extracts all the hashtags and the indices they occur at from the tweet.
  227. *
  228. * @param string $tweet The tweet to extract.
  229. * @param bool $checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove overlapping ones
  230. *
  231. * @return array The hashtag elements in the tweet.
  232. */
  233. public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
  234. {
  235. if (is_null($tweet)) {
  236. $tweet = $this->tweet;
  237. }
  238. if (!preg_match('/[##]/iu', $tweet)) {
  239. return [];
  240. }
  241. $bannedTags = config('app.env') === 'production' ? TrendingHashtagService::getBannedHashtagNames() : [];
  242. preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
  243. $tags = [];
  244. foreach ($matches as $match) {
  245. list($all, $before, $hash, $hashtag, $outer) = array_pad($match, 3, ['', 0]);
  246. $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
  247. $end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]);
  248. if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
  249. continue;
  250. }
  251. if (count($bannedTags)) {
  252. if(in_array(strtolower($hashtag[0]), array_map('strtolower', $bannedTags))) {
  253. continue;
  254. }
  255. }
  256. if (mb_strlen($hashtag[0]) > 124) {
  257. continue;
  258. }
  259. $tags[] = [
  260. 'hashtag' => $hashtag[0],
  261. 'indices' => [$start_position, $end_position],
  262. ];
  263. }
  264. if (!$checkUrlOverlap) {
  265. return array_slice($tags, 0, Status::MAX_HASHTAGS);
  266. }
  267. // check url overlap
  268. $urls = $this->extractURLsWithIndices($tweet);
  269. $entities = $this->removeOverlappingEntities(array_merge($tags, $urls));
  270. $validTags = [];
  271. foreach ($entities as $entity) {
  272. if (empty($entity['hashtag'])) {
  273. continue;
  274. }
  275. $validTags[] = $entity;
  276. }
  277. return array_slice($validTags, 0, Status::MAX_HASHTAGS);
  278. }
  279. /**
  280. * Extracts all the cashtags and the indices they occur at from the tweet.
  281. *
  282. * @param string $tweet The tweet to extract.
  283. *
  284. * @return array The cashtag elements in the tweet.
  285. */
  286. public function extractCashtagsWithIndices($tweet = null)
  287. {
  288. }
    /**
     * Extracts all the URLs and the indices they occur at from the tweet.
     *
     * Each result is an array with the keys 'url' and 'indices'
     * ([start, end], computed with StringUtils::strlen()). Matches without a
     * protocol are considered only while $extractURLWithoutProtocol is
     * enabled; matches with a protocol are shortened to the 'valid_tco_url'
     * part when that pattern matches.
     *
     * @param string $tweet The tweet to extract; defaults to the text held by this instance.
     *
     * @return array The URLs elements in the tweet, at most Status::MAX_LINKS of them.
     */
    public function extractURLsWithIndices($tweet = null)
    {
        if (is_null($tweet)) {
            $tweet = $this->tweet;
        }
        // Cheap pre-check: look for '.' when protocol-less URLs are allowed,
        // otherwise for ':'; without that character nothing is extracted.
        $needle = $this->extractURLWithoutProtocol() ? '.' : ':';
        if (strpos($tweet, $needle) === false) {
            return [];
        }
        $urls = [];
        preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        foreach ($matches as $match) {
            // With PREG_OFFSET_CAPTURE every group is a [text, byte offset] pair.
            list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, ['']);
            // Convert the byte offset of the URL into a StringUtils::strlen()
            // based position by measuring the text in front of it.
            $start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1];
            $end_position = $start_position + StringUtils::strlen($url[0]);
            // From here on only the matched text of each group is needed.
            $all = $all[0];
            $before = $before[0];
            $url = $url[0];
            $protocol = $protocol[0];
            $domain = $domain[0];
            $port = $port[0];
            $path = $path[0];
            $query = $query[0];
            // If protocol is missing and domain contains non-ASCII characters,
            // extract ASCII-only domains.
            if (empty($protocol)) {
                if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
                    continue;
                }
                $last_url = null;
                $ascii_end_position = 0;
                if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
                    // Substitute the ASCII domain for the full domain inside
                    // the matched URL text.
                    $asciiDomain[0] = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url);
                    $ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position);
                    $ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]);
                    $last_url = [
                        'url' => $asciiDomain[0],
                        'indices' => [$start_position + $ascii_start_position, $start_position + $ascii_end_position],
                    ];
                    // Keep the URL when it has a path, when it matches the
                    // special short-domain pattern, or when it does not match
                    // the invalid short-domain pattern.
                    if (!empty($path)
                        || preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0])
                        || !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) {
                        $urls[] = $last_url;
                    }
                }
                // no ASCII-only domain found. Skip the entire URL
                if (empty($last_url)) {
                    continue;
                }
                // $last_url only contains domain. Need to add path and query if they exist.
                // NOTE(review): $last_url was appended to $urls by value above,
                // and nothing reads it after this block, so the two
                // assignments below never reach $urls - confirm whether the
                // stored entry was meant to be updated.
                if (!empty($path)) {
                    // last_url was not added. Add it to urls here.
                    $last_url['url'] = preg_replace('/'.preg_quote($domain, '/').'/u', $last_url['url'], $url);
                    $last_url['indices'][1] = $end_position;
                }
            } else {
                // In the case of t.co URLs, don't allow additional path characters
                if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
                    $url = $tcoUrlMatches[0];
                    $end_position = $start_position + StringUtils::strlen($url);
                }
                $urls[] = [
                    'url' => $url,
                    'indices' => [$start_position, $end_position],
                ];
            }
        }
        return array_slice($urls, 0, Status::MAX_LINKS);
    }
  365. /**
  366. * Extracts all the usernames and the indices they occur at from the tweet.
  367. *
  368. * @param string $tweet The tweet to extract.
  369. *
  370. * @return array The username elements in the tweet.
  371. */
  372. public function extractMentionedScreennamesWithIndices($tweet = null)
  373. {
  374. if (is_null($tweet)) {
  375. $tweet = $this->tweet;
  376. }
  377. $usernamesOnly = [];
  378. $mentions = $this->extractMentionsOrListsWithIndices($tweet);
  379. foreach ($mentions as $mention) {
  380. if (isset($mention['list_slug'])) {
  381. unset($mention['list_slug']);
  382. }
  383. $usernamesOnly[] = $mention;
  384. }
  385. return array_slice($usernamesOnly, 0, Status::MAX_MENTIONS);
  386. }
  387. /**
  388. * Extracts all the usernames and the indices they occur at from the tweet.
  389. *
  390. * @return array The username elements in the tweet.
  391. *
  392. * @deprecated since version 1.1.0
  393. */
  394. public function extractMentionedUsernamesWithIndices()
  395. {
  396. return $this->extractMentionedScreennamesWithIndices();
  397. }
  398. /**
  399. * Extracts all the usernames and the indices they occur at from the tweet.
  400. *
  401. * @param string $tweet The tweet to extract.
  402. *
  403. * @return array The username elements in the tweet.
  404. */
  405. public function extractMentionsOrListsWithIndices($tweet = null)
  406. {
  407. if (is_null($tweet)) {
  408. $tweet = $this->tweet;
  409. }
  410. if (!preg_match('/[@@]/iu', $tweet)) {
  411. return [];
  412. }
  413. preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
  414. $results = [];
  415. foreach ($matches as $match) {
  416. list($all, $before, $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]);
  417. $start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1];
  418. $end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);
  419. $screenname = trim($all[0]) == '@'.$username[0] ? $username[0] : trim($all[0]);
  420. if($this->activeUsersOnly == true) {
  421. if(!AutolinkService::mentionedUsernameExists($screenname)) {
  422. continue;
  423. }
  424. }
  425. $entity = [
  426. 'screen_name' => $screenname,
  427. 'list_slug' => $list_slug[0],
  428. 'indices' => [$start_position, $end_position],
  429. ];
  430. if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
  431. continue;
  432. }
  433. if (!empty($list_slug[0])) {
  434. $entity['indices'][1] = $end_position + StringUtils::strlen($list_slug[0]);
  435. }
  436. $results[] = $entity;
  437. }
  438. return array_slice($results, 0, Status::MAX_MENTIONS);
  439. }
  440. /**
  441. * Extracts all the usernames and the indices they occur at from the tweet.
  442. *
  443. * @return array The username elements in the tweet.
  444. *
  445. * @deprecated since version 1.1.0
  446. */
  447. public function extractMentionedUsernamesOrListsWithIndices()
  448. {
  449. return $this->extractMentionsOrListsWithIndices();
  450. }
  451. /**
  452. * setter/getter for extractURLWithoutProtocol.
  453. *
  454. * @param bool $flag
  455. *
  456. * @return Extractor
  457. */
  458. public function extractURLWithoutProtocol($flag = null)
  459. {
  460. if (is_null($flag)) {
  461. return $this->extractURLWithoutProtocol;
  462. }
  463. $this->extractURLWithoutProtocol = (bool) $flag;
  464. return $this;
  465. }
  466. /**
  467. * Remove overlapping entities.
  468. * This returns a new array with no overlapping entities.
  469. *
  470. * @param array $entities
  471. *
  472. * @return array
  473. */
  474. public function removeOverlappingEntities($entities)
  475. {
  476. $result = [];
  477. usort($entities, [$this, 'sortEntites']);
  478. $prev = null;
  479. foreach ($entities as $entity) {
  480. if (isset($prev) && $entity['indices'][0] < $prev['indices'][1]) {
  481. continue;
  482. }
  483. $prev = $entity;
  484. $result[] = $entity;
  485. }
  486. return $result;
  487. }
  488. /**
  489. * sort by entity start index.
  490. *
  491. * @param array $a
  492. * @param array $b
  493. *
  494. * @return int
  495. */
  496. protected function sortEntites($a, $b)
  497. {
  498. if ($a['indices'][0] == $b['indices'][0]) {
  499. return 0;
  500. }
  501. return ($a['indices'][0] < $b['indices'][0]) ? -1 : 1;
  502. }
  503. }