Extractor.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. <?php
  2. /**
  3. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  4. * @author Nick Pope <nick@nickpope.me.uk>
  5. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  6. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  7. */
  8. namespace App\Util\Lexer;
  9. use Illuminate\Support\Str;
  10. use App\Status;
  11. use App\Services\AutolinkService;
  12. /**
  13. * Twitter Extractor Class.
  14. *
  15. * Parses tweets and extracts URLs, usernames, username/list pairs and
  16. * hashtags.
  17. *
  18. * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
  19. * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
  20. * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
  21. *
  22. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  23. * @author Nick Pope <nick@nickpope.me.uk>
  24. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  25. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  26. */
  27. class Extractor extends Regex
  28. {
  29. /**
  30. * @var bool
  31. */
  32. protected $extractURLWithoutProtocol = true;
  33. protected $activeUsersOnly = false;
  /**
   * Static factory that allows fluent method chaining.
   *
   * Builds the instance exactly as the constructor would with the same
   * argument.
   *
   * @param string|null $tweet The tweet text handed to the constructor.
   *
   * @see __construct()
   *
   * @return Extractor
   */
  public static function create($tweet = null)
  {
      $extractor = new self($tweet);

      return $extractor;
  }
  /**
   * Toggles filtering of mentions through
   * AutolinkService::mentionedUsernameExists().
   *
   * The value is stored as given and consulted by the mention extraction
   * methods of this class.
   *
   * @param bool $active Whether mentions must pass the existence check.
   *
   * @return Extractor This instance, for chaining.
   */
  public function setActiveUsersOnly($active)
  {
      $this->activeUsersOnly = $active;

      return $this;
  }
  /**
   * Creates an extractor for the given tweet.
   *
   * All construction work is delegated to the parent constructor; no
   * extraction happens here.
   *
   * @param string|null $tweet The tweet to extract from.
   */
  public function __construct($tweet = null)
  {
      parent::__construct($tweet);
  }
  /**
   * Extracts every supported entity type from a tweet in one call.
   *
   * The returned array has the keys `hashtags`, `urls`, `mentions`,
   * `replyto`, `hashtags_with_indices`, `urls_with_indices` and
   * `mentions_with_indices`.
   *
   * @param string|null $tweet The tweet to extract; falls back to the tweet
   *                           stored on the instance when null.
   *
   * @return array The elements in the tweet.
   */
  public function extract($tweet = null)
  {
      if (is_null($tweet)) {
          $tweet = $this->tweet;
      }

      // Earlier versions stored the tweet on the instance as a side effect of
      // calling the deprecated extractMentionedUsernames() wrapper. Do it
      // explicitly so the result no longer hinges on evaluation order.
      $this->tweet = $tweet;

      // Call the non-deprecated methods directly and always pass the tweet,
      // instead of handing arguments to zero-parameter wrappers.
      return [
          'hashtags' => $this->extractHashtags($tweet),
          'urls' => $this->extractURLs($tweet),
          'mentions' => $this->extractMentionedScreennames($tweet),
          'replyto' => $this->extractReplyScreenname($tweet),
          'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
          'urls_with_indices' => $this->extractURLsWithIndices($tweet),
          'mentions_with_indices' => $this->extractMentionedScreennamesWithIndices($tweet),
      ];
  }
  /**
   * Collects URLs, hashtags and mentions/lists from a tweet into one list and
   * drops entities that overlap an earlier one.
   *
   * Hashtags are extracted without their own URL-overlap check because the
   * combined list is de-overlapped once at the end.
   *
   * @param string|null $tweet The tweet to extract; falls back to the tweet
   *                           stored on the instance when null.
   *
   * @return array list of extracted entities
   */
  public function extractEntitiesWithIndices($tweet = null)
  {
      $text = is_null($tweet) ? $this->tweet : $tweet;

      $urls = $this->extractURLsWithIndices($text);
      $hashtags = $this->extractHashtagsWithIndices($text, false);
      $mentions = $this->extractMentionsOrListsWithIndices($text);

      return $this->removeOverlappingEntities(array_merge($urls, $hashtags, $mentions));
  }
  /**
   * Returns the hashtag texts found in the tweet, without position data.
   *
   * At most Status::MAX_HASHTAGS entries are returned.
   *
   * @param string|null $tweet The tweet to extract.
   *
   * @return array The hashtag elements in the tweet.
   */
  public function extractHashtags($tweet = null)
  {
      $names = array_map(function ($entity) {
          return $entity['hashtag'];
      }, $this->extractHashtagsWithIndices($tweet));

      return array_slice($names, 0, Status::MAX_HASHTAGS);
  }
  /**
   * Cashtag extraction is not implemented: the argument is ignored and the
   * result is always an empty array.
   *
   * @param string|null $tweet The tweet to extract (unused).
   *
   * @return array Always an empty array.
   */
  public function extractCashtags($tweet = null)
  {
      return [];
  }
  /**
   * Returns the URL strings found in the tweet, without position data.
   *
   * At most Status::MAX_LINKS entries are returned.
   *
   * @param string|null $tweet The tweet to extract.
   *
   * @return array The URL elements in the tweet.
   */
  public function extractURLs($tweet = null)
  {
      $links = array_map(function ($entity) {
          return $entity['url'];
      }, $this->extractURLsWithIndices($tweet));

      return array_slice($links, 0, Status::MAX_LINKS);
  }
  /**
   * Extracts the distinct, lower-cased screen names mentioned in the tweet.
   *
   * A mention is an occurrence of a username anywhere in a tweet. When the
   * active-users-only flag is set, names rejected by
   * AutolinkService::mentionedUsernameExists() are left out.
   *
   * @param string|null $tweet The tweet to extract.
   *
   * @return array The usernames elements in the tweet.
   */
  public function extractMentionedScreennames($tweet = null)
  {
      $usernamesOnly = [];

      foreach ($this->extractMentionsOrListsWithIndices($tweet) as $mentionWithIndex) {
          if ($this->activeUsersOnly
              && !AutolinkService::mentionedUsernameExists($mentionWithIndex['screen_name'])) {
              continue;
          }

          $screen_name = mb_strtolower($mentionWithIndex['screen_name']);

          // Compare strictly: empty() would discard the username "0", and a
          // loose in_array() treats numeric strings such as "123" and "0123"
          // as equal, wrongly dropping the second one as a duplicate.
          if ($screen_name === '' || in_array($screen_name, $usernamesOnly, true)) {
              continue;
          }

          $usernamesOnly[] = $screen_name;
      }

      return $usernamesOnly;
  }
  /**
   * Deprecated alias of extractMentionedScreennames().
   *
   * Unlike the replacement, this wrapper also stores the given tweet on the
   * instance before extracting.
   *
   * @param string $tweet The tweet to extract.
   *
   * @return array The usernames elements in the tweet.
   *
   * @deprecated since version 1.1.0
   */
  public function extractMentionedUsernames($tweet)
  {
      $this->tweet = $tweet;
      $screennames = $this->extractMentionedScreennames($tweet);

      return $screennames;
  }
  /**
   * Extracts the username the tweet replies to.
   *
   * The tweet is matched against the `valid_reply` pattern; the match is
   * rejected when its second capture group matches `end_mention_match`.
   *
   * @param string|null $tweet The tweet to extract; falls back to the tweet
   *                           stored on the instance when null.
   *
   * @return string|null The first capture group of the reply match, or null
   *                     when the tweet is not a valid reply.
   */
  public function extractReplyScreenname($tweet = null)
  {
      if (is_null($tweet)) {
          $tweet = $this->tweet;
      }

      if (!preg_match(self::$patterns['valid_reply'], $tweet, $groups)) {
          return null;
      }

      // Reject the candidate when what follows it matches end_mention_match.
      if (preg_match(self::$patterns['end_mention_match'], $groups[2])) {
          return null;
      }

      return $groups[1];
  }
  /**
   * Deprecated alias of extractReplyScreenname().
   *
   * Always operates on the tweet stored on the instance.
   *
   * @return string|null Whatever extractReplyScreenname() returns.
   *
   * @deprecated since version 1.1.0
   */
  public function extractRepliedUsernames()
  {
      $replyTo = $this->extractReplyScreenname();

      return $replyTo;
  }
  /**
   * Extracts all the hashtags and the indices they occur at from the tweet.
   *
   * Each result is an array with the keys `hashtag` (text without the hash
   * sign) and `indices` ([start, end] character positions, the start being
   * the position of the hash sign). Hashtags longer than 124 characters are
   * skipped, and at most Status::MAX_HASHTAGS results are returned.
   *
   * @param string|null $tweet           The tweet to extract; falls back to
   *                                     the tweet stored on the instance.
   * @param bool        $checkUrlOverlap if true, check if extracted hashtags
   *                                     overlap URLs and remove overlapping ones
   *
   * @return array The hashtag elements in the tweet.
   */
  public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
  {
      if (is_null($tweet)) {
          $tweet = $this->tweet;
      }
      if ($tweet === null || $tweet === '') {
          return [];
      }

      // Cheap pre-check before running the full pattern: the tweet must hold
      // an ASCII "#" or the fullwidth number sign U+FF03.
      if (!preg_match('/[#\x{FF03}]/u', $tweet)) {
          return [];
      }

      preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

      $tags = [];
      foreach ($matches as $match) {
          // Pad to the five groups destructured below so a short match can
          // never leave $hashtag or $outer undefined.
          list(, , $hash, $hashtag, $outer) = array_pad($match, 5, ['', 0]);

          // PREG_OFFSET_CAPTURE reports byte offsets; convert to characters.
          $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
          $end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]);

          if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
              continue;
          }
          if (mb_strlen($hashtag[0]) > 124) {
              continue;
          }

          $tags[] = [
              'hashtag' => $hashtag[0],
              'indices' => [$start_position, $end_position],
          ];
      }

      if (!$checkUrlOverlap) {
          return array_slice($tags, 0, Status::MAX_HASHTAGS);
      }

      // check url overlap
      $urls = $this->extractURLsWithIndices($tweet);
      $entities = $this->removeOverlappingEntities(array_merge($tags, $urls));

      $validTags = [];
      foreach ($entities as $entity) {
          // URL entities carry no 'hashtag' key. Test for the key explicitly
          // rather than with empty(), which would also drop a hashtag "0".
          if (!isset($entity['hashtag']) || $entity['hashtag'] === '') {
              continue;
          }
          $validTags[] = $entity;
      }

      return array_slice($validTags, 0, Status::MAX_HASHTAGS);
  }
  /**
   * Cashtag extraction is not implemented: the argument is ignored.
   *
   * Returns an empty array rather than falling off the end of the method
   * (which yields null), so callers can iterate the result safely and the
   * return type agrees with extractCashtags().
   *
   * @param string|null $tweet The tweet to extract (unused).
   *
   * @return array Always an empty array.
   */
  public function extractCashtagsWithIndices($tweet = null)
  {
      return [];
  }
  /**
   * Extracts all the URLs and the indices they occur at from the tweet.
   *
   * Each result is an array with the keys `url` and `indices` ([start, end]
   * character positions). URLs without a protocol are only considered when
   * the extractURLWithoutProtocol flag is on, and only their ASCII domain
   * part is used. At most Status::MAX_LINKS results are returned.
   *
   * @param string|null $tweet The tweet to extract; falls back to the tweet
   *                           stored on the instance when null.
   *
   * @return array The URLs elements in the tweet.
   */
  public function extractURLsWithIndices($tweet = null)
  {
      if (is_null($tweet)) {
          $tweet = $this->tweet;
      }
      // Nothing to scan; also avoids handing null to strpos().
      if ($tweet === null || $tweet === '') {
          return [];
      }

      // Cheap pre-check: a protocol-less URL needs a dot, any other a colon.
      $needle = $this->extractURLWithoutProtocol() ? '.' : ':';
      if (strpos($tweet, $needle) === false) {
          return [];
      }

      $urls = [];
      preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

      foreach ($matches as $match) {
          // Only the groups that are actually used are destructured.
          list(, $before, $url, $protocol, $domain, , $path) = array_pad($match, 7, ['']);

          // PREG_OFFSET_CAPTURE reports byte offsets; convert to characters.
          $start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1];
          $end_position = $start_position + StringUtils::strlen($url[0]);

          $before = $before[0];
          $url = $url[0];
          $protocol = $protocol[0];
          $domain = $domain[0];
          $path = $path[0];

          if (!empty($protocol)) {
              // In the case of t.co URLs, don't allow additional path characters
              if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
                  $url = $tcoUrlMatches[0];
                  $end_position = $start_position + StringUtils::strlen($url);
              }
              $urls[] = [
                  'url' => $url,
                  'indices' => [$start_position, $end_position],
              ];
              continue;
          }

          // Protocol is missing: honour the flag and the preceding-character
          // rule, then fall back to the ASCII-only part of the domain.
          if (!$this->extractURLWithoutProtocol
              || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
              continue;
          }

          // no ASCII-only domain found. Skip the entire URL
          if (!preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
              continue;
          }

          // Swap the matched domain inside the URL for its ASCII-only part.
          $asciiUrl = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url);

          // NOTE(review): the needle here is the rewritten URL (domain plus
          // any path), not the bare ASCII domain - kept as found; confirm
          // this is intended.
          $ascii_start_position = StringUtils::strpos($domain, $asciiUrl, 0);
          $ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiUrl);

          if (!empty($path)
              || preg_match(self::$patterns['valid_special_short_domain'], $asciiUrl)
              || !preg_match(self::$patterns['invalid_short_domain'], $asciiUrl)) {
              $urls[] = [
                  'url' => $asciiUrl,
                  'indices' => [$start_position + $ascii_start_position, $start_position + $ascii_end_position],
              ];
          }
          // The former trailing "add path and query" step only changed a
          // local copy after it had been appended, so it never affected the
          // result and has been removed.
      }

      return array_slice($urls, 0, Status::MAX_LINKS);
  }
  /**
   * Extracts mentions with their positions, with any `list_slug` entry
   * removed from each result.
   *
   * At most Status::MAX_MENTIONS entries are returned.
   *
   * @param string|null $tweet The tweet to extract; falls back to the tweet
   *                           stored on the instance when null.
   *
   * @return array The username elements in the tweet.
   */
  public function extractMentionedScreennamesWithIndices($tweet = null)
  {
      $text = is_null($tweet) ? $this->tweet : $tweet;

      $withoutSlugs = array_map(function ($mention) {
          unset($mention['list_slug']);

          return $mention;
      }, $this->extractMentionsOrListsWithIndices($text));

      return array_slice($withoutSlugs, 0, Status::MAX_MENTIONS);
  }
  /**
   * Deprecated alias of extractMentionedScreennamesWithIndices().
   *
   * Always operates on the tweet stored on the instance.
   *
   * @return array The username elements in the tweet.
   *
   * @deprecated since version 1.1.0
   */
  public function extractMentionedUsernamesWithIndices()
  {
      $mentions = $this->extractMentionedScreennamesWithIndices();

      return $mentions;
  }
  /**
   * Extracts all mentions and list references with the indices they occur at.
   *
   * Each result is an array with the keys `screen_name`, `list_slug` and
   * `indices` ([start, end] character positions; the end includes the list
   * slug when one is present). When the active-users-only flag is set, names
   * rejected by AutolinkService::mentionedUsernameExists() are skipped. At
   * most Status::MAX_MENTIONS results are returned.
   *
   * @param string|null $tweet The tweet to extract; falls back to the tweet
   *                           stored on the instance when null.
   *
   * @return array The username elements in the tweet.
   */
  public function extractMentionsOrListsWithIndices($tweet = null)
  {
      if (is_null($tweet)) {
          $tweet = $this->tweet;
      }
      if ($tweet === null || $tweet === '') {
          return [];
      }

      // Cheap pre-check before running the full pattern: the tweet must hold
      // an ASCII "@" or the fullwidth commercial at U+FF20.
      if (!preg_match('/[@\x{FF20}]/u', $tweet)) {
          return [];
      }

      preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

      $results = [];
      foreach ($matches as $match) {
          list($all, , $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]);

          // Reject on the cheap pattern test first so the username lookup
          // below is never made for a candidate that is discarded anyway.
          if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
              continue;
          }

          // PREG_OFFSET_CAPTURE reports byte offsets; convert to characters.
          $start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1];
          $end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);

          // Use the bare username when the whole match is just "@username";
          // otherwise keep the trimmed full match.
          $mentionText = trim($all[0]);
          $screenname = $mentionText === '@'.$username[0] ? $username[0] : $mentionText;

          if ($this->activeUsersOnly && !AutolinkService::mentionedUsernameExists($screenname)) {
              continue;
          }

          if (!empty($list_slug[0])) {
              $end_position += StringUtils::strlen($list_slug[0]);
          }

          $results[] = [
              'screen_name' => $screenname,
              'list_slug' => $list_slug[0],
              'indices' => [$start_position, $end_position],
          ];
      }

      return array_slice($results, 0, Status::MAX_MENTIONS);
  }
  /**
   * Deprecated alias of extractMentionsOrListsWithIndices().
   *
   * Always operates on the tweet stored on the instance.
   *
   * @return array The username elements in the tweet.
   *
   * @deprecated since version 1.1.0
   */
  public function extractMentionedUsernamesOrListsWithIndices()
  {
      $mentionsOrLists = $this->extractMentionsOrListsWithIndices();

      return $mentionsOrLists;
  }
  /**
   * Combined getter/setter for the extractURLWithoutProtocol flag.
   *
   * Called without an argument (or with null) it returns the current flag.
   * Called with a value it stores that value cast to bool and returns the
   * instance for chaining.
   *
   * @param bool|null $flag New flag value, or null to read the current one.
   *
   * @return bool|Extractor The current flag when reading, this instance when
   *                        writing.
   */
  public function extractURLWithoutProtocol($flag = null)
  {
      if (!is_null($flag)) {
          $this->extractURLWithoutProtocol = (bool) $flag;

          return $this;
      }

      return $this->extractURLWithoutProtocol;
  }
  /**
   * Returns a new list of entities in which none overlaps another.
   *
   * Entities are ordered by start index, then each one that starts before
   * the end index of the most recently kept entity is dropped.
   *
   * @param array $entities Entities, each with an `indices` [start, end] pair.
   *
   * @return array
   */
  public function removeOverlappingEntities($entities)
  {
      usort($entities, [$this, 'sortEntites']);

      $kept = [];
      $lastKept = null;
      foreach ($entities as $candidate) {
          $overlaps = !is_null($lastKept) && $candidate['indices'][0] < $lastKept['indices'][1];
          if ($overlaps) {
              continue;
          }
          $kept[] = $candidate;
          $lastKept = $candidate;
      }

      return $kept;
  }
  /**
   * Comparison callback that orders entities by their start index.
   *
   * @param array $a First entity, with an `indices` [start, end] pair.
   * @param array $b Second entity, with an `indices` [start, end] pair.
   *
   * @return int -1, 0 or 1 when $a starts before, at, or after $b.
   */
  protected function sortEntites($a, $b)
  {
      $left = $a['indices'][0];
      $right = $b['indices'][0];

      if ($left == $right) {
          return 0;
      }
      if ($left < $right) {
          return -1;
      }

      return 1;
  }
  496. }