Extractor.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548
  1. <?php
  2. /**
  3. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  4. * @author Nick Pope <nick@nickpope.me.uk>
  5. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  6. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  7. */
  8. namespace App\Util\Lexer;
  9. use Illuminate\Support\Str;
  10. use App\Status;
  11. /**
  12. * Twitter Extractor Class.
  13. *
  14. * Parses tweets and extracts URLs, usernames, username/list pairs and
  15. * hashtags.
  16. *
  17. * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
  18. * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
  19. * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
  20. *
  21. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  22. * @author Nick Pope <nick@nickpope.me.uk>
  23. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  24. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  25. */
  26. class Extractor extends Regex
  27. {
  28. /**
  29. * @var bool
  30. */
  31. protected $extractURLWithoutProtocol = true;
  /**
   * Static factory so an extractor can be built and used in a single chained expression.
   *
   * @param string $tweet The text to analyse; handed to the constructor unchanged.
   *
   * @see __construct()
   *
   * @return Extractor
   */
  public static function create($tweet = null)
  {
    $extractor = new self($tweet);

    return $extractor;
  }
  /**
   * Builds an extractor for the given text.
   *
   * All work is delegated to the parent constructor; this class adds no
   * construction logic of its own.
   *
   * @param string $tweet The text to analyse (optional).
   */
  public function __construct($tweet = null)
  {
    parent::__construct($tweet);
  }
  /**
   * Runs every extractor over a text and returns the results keyed by entity type.
   *
   * The primary extractors are called directly (not their deprecated
   * "Usernames" aliases) and each one receives the resolved text explicitly,
   * so the result never depends on, and never modifies, the text stored on
   * the instance.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array Associative array with the keys hashtags, urls, mentions,
   *               replyto, hashtags_with_indices, urls_with_indices and
   *               mentions_with_indices.
   */
  public function extract($tweet = null)
  {
    if (is_null($tweet)) {
      $tweet = $this->tweet;
    }

    return [
      'hashtags' => $this->extractHashtags($tweet),
      'urls' => $this->extractURLs($tweet),
      'mentions' => $this->extractMentionedScreennames($tweet),
      'replyto' => $this->extractReplyScreenname($tweet),
      'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
      'urls_with_indices' => $this->extractURLsWithIndices($tweet),
      'mentions_with_indices' => $this->extractMentionedScreennamesWithIndices($tweet),
    ];
  }
  /**
   * Collects URLs, hashtags and mentions/lists from a text into one list and
   * drops entities that overlap an earlier one.
   *
   * Hashtags are extracted with their own URL-overlap check switched off,
   * because overlap is resolved once for the combined list here.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array list of extracted entities
   */
  public function extractEntitiesWithIndices($tweet = null)
  {
    $text = is_null($tweet) ? $this->tweet : $tweet;

    $candidates = array_merge(
      $this->extractURLsWithIndices($text),
      $this->extractHashtagsWithIndices($text, false),
      $this->extractMentionsOrListsWithIndices($text)
    );

    return $this->removeOverlappingEntities($candidates);
  }
  /**
   * Returns just the hashtag texts found in a text, without their positions.
   *
   * @param string $tweet The text to analyse (passed through to extractHashtagsWithIndices()).
   *
   * @return array The hashtag strings, at most Status::MAX_HASHTAGS of them.
   */
  public function extractHashtags($tweet = null)
  {
    $names = array_map(
      function ($entry) {
        return $entry['hashtag'];
      },
      $this->extractHashtagsWithIndices($tweet)
    );

    return array_slice($names, 0, Status::MAX_HASHTAGS);
  }
  /**
   * Cashtag extraction placeholder.
   *
   * No cashtag matching is performed: the argument is ignored and the result
   * is always an empty array.
   *
   * @param string $tweet The text to analyse (unused).
   *
   * @return array Always an empty array.
   */
  public function extractCashtags($tweet = null)
  {
    return [];
  }
  /**
   * Returns just the URL strings found in a text, without their positions.
   *
   * @param string $tweet The text to analyse (passed through to extractURLsWithIndices()).
   *
   * @return array The URL strings, at most Status::MAX_LINKS of them.
   */
  public function extractURLs($tweet = null)
  {
    $links = array_map(
      function ($entry) {
        return $entry['url'];
      },
      $this->extractURLsWithIndices($tweet)
    );

    return array_slice($links, 0, Status::MAX_LINKS);
  }
  /**
   * Returns the distinct screen names mentioned in a text, lower-cased, in
   * order of first appearance.
   *
   * A mention is an occurrence of a username anywhere in the text.
   *
   * @param string $tweet The text to analyse (passed through to extractMentionsOrListsWithIndices()).
   *
   * @return array The lower-cased screen names, without duplicates.
   */
  public function extractMentionedScreennames($tweet = null)
  {
    $usernamesOnly = [];

    foreach ($this->extractMentionsOrListsWithIndices($tweet) as $mentionWithIndex) {
      $screen_name = mb_strtolower($mentionWithIndex['screen_name']);

      // Compare strictly. A loose in_array() treats numeric-looking names
      // such as "007" and "7" as equal and would drop the second one, and
      // empty() would discard the name "0".
      if ($screen_name === '' || in_array($screen_name, $usernamesOnly, true)) {
        continue;
      }

      $usernamesOnly[] = $screen_name;
    }

    return $usernamesOnly;
  }
  /**
   * Deprecated alias of extractMentionedScreennames().
   *
   * Side effect: a non-null $tweet replaces the text stored on the instance.
   * A null argument leaves the stored text untouched, so the extraction falls
   * back to it instead of running on null.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array The usernames elements in the tweet.
   *
   * @deprecated since version 1.1.0
   */
  public function extractMentionedUsernames($tweet = null)
  {
    if (!is_null($tweet)) {
      $this->tweet = $tweet;
    }

    return $this->extractMentionedScreennames($tweet);
  }
  /**
   * Returns the screen name a text replies to, if any.
   *
   * The text is matched against the valid_reply pattern; the first capture
   * group is the screen name. The match is discarded when the second capture
   * group matches the end_mention_match pattern.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return string|null The replied-to screen name, or null when there is none.
   */
  public function extractReplyScreenname($tweet = null)
  {
    $text = is_null($tweet) ? $this->tweet : $tweet;

    if (!preg_match(self::$patterns['valid_reply'], $text, $found)) {
      return null;
    }

    // Reject the candidate when what follows the username rules it out.
    if (preg_match(self::$patterns['end_mention_match'], $found[2])) {
      return null;
    }

    return $found[1];
  }
  /**
   * Deprecated alias of extractReplyScreenname().
   *
   * Accepts the text as an optional argument and forwards it, so a caller
   * that passes a text gets that text analysed rather than the one stored on
   * the instance.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return string|null The replied-to screen name, or null when there is none.
   *
   * @deprecated since version 1.1.0
   */
  public function extractRepliedUsernames($tweet = null)
  {
    return $this->extractReplyScreenname($tweet);
  }
  /**
   * Extracts all the hashtags and the indices they occur at from the tweet.
   *
   * Each result is ['hashtag' => text without the hash sign, 'indices' =>
   * [start, end]]. Candidates are dropped when the text after them matches
   * end_hashtag_match or when the tag is longer than 124 characters. At most
   * Status::MAX_HASHTAGS entries are returned.
   *
   * @param string $tweet           The text to analyse; when null the text stored on the instance is used.
   * @param bool   $checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove overlapping ones
   *
   * @return array The hashtag elements in the tweet.
   */
  public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
  {
    if (is_null($tweet)) {
      $tweet = $this->tweet;
    }

    // Cheap pre-check so the full pattern only runs when a hash sign exists.
    // NOTE(review): the character class lists "#" twice; presumably one of
    // them was meant to be the full-width sign. Confirm against upstream.
    if (!preg_match('/[##]/iu', $tweet)) {
      return [];
    }

    preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $tags = [];
    foreach ($matches as $match) {
      // Pad to all five destructured slots. A match row may hold fewer
      // entries (trailing groups that did not participate are left out), and
      // $outer must still be a [text, offset] pair for the check below.
      list(, , $hash, $hashtag, $outer) = array_pad($match, 5, ['', 0]);

      // The captured offset is a byte offset; measuring the prefix converts
      // it to the unit StringUtils::strlen() counts in.
      $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
      $end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]);

      if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
        continue;
      }

      if (mb_strlen($hashtag[0]) > 124) {
        continue;
      }

      $tags[] = [
        'hashtag' => $hashtag[0],
        'indices' => [$start_position, $end_position],
      ];
    }

    if (!$checkUrlOverlap) {
      return array_slice($tags, 0, Status::MAX_HASHTAGS);
    }

    // Merge with the URLs, resolve overlaps, then keep only the hashtag
    // entities that survived.
    $urls = $this->extractURLsWithIndices($tweet);
    $entities = $this->removeOverlappingEntities(array_merge($tags, $urls));

    $validTags = [];
    foreach ($entities as $entity) {
      if (empty($entity['hashtag'])) {
        continue;
      }
      $validTags[] = $entity;
    }

    return array_slice($validTags, 0, Status::MAX_HASHTAGS);
  }
  /**
   * Cashtag extraction placeholder.
   *
   * No cashtag matching is performed. An empty array is returned so the
   * result honours the documented array return type and agrees with
   * extractCashtags().
   *
   * @param string $tweet The text to analyse (unused).
   *
   * @return array Always an empty array.
   */
  public function extractCashtagsWithIndices($tweet = null)
  {
    return [];
  }
  /**
   * Extracts all the URLs and the indices they occur at from the tweet.
   *
   * Each result is ['url' => string, 'indices' => [start, end]]. URLs with a
   * protocol are taken as matched (t.co links are trimmed to what
   * valid_tco_url matches). URLs without a protocol are only considered when
   * extractURLWithoutProtocol is enabled, the preceding text is acceptable
   * and an ASCII domain can be found. At most Status::MAX_LINKS entries are
   * returned.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array The URLs elements in the tweet.
   */
  public function extractURLsWithIndices($tweet = null)
  {
    if (is_null($tweet)) {
      $tweet = $this->tweet;
    }

    // Cheap pre-check: a protocol-less URL needs a dot, any other a colon.
    $marker = $this->extractURLWithoutProtocol() ? '.' : ':';
    if (strpos($tweet, $marker) === false) {
      return [];
    }

    preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $found = [];
    foreach ($matches as $match) {
      // Groups: 0 all, 1 before, 2 url, 3 protocol, 4 domain, 5 port,
      // 6 path, 7 query. Only the ones used below are unpacked.
      $groups = array_pad($match, 8, ['']);
      $before = $groups[1][0];
      $url = $groups[2][0];
      $byteOffset = $groups[2][1];
      $protocol = $groups[3][0];
      $domain = $groups[4][0];
      $path = $groups[6][0];

      // The captured offset is a byte offset; measuring the prefix converts
      // it to the unit StringUtils::strlen() counts in.
      $start = $byteOffset > 0 ? StringUtils::strlen(substr($tweet, 0, $byteOffset)) : $byteOffset;
      $end = $start + StringUtils::strlen($url);

      if (!empty($protocol)) {
        // In the case of t.co URLs, don't allow additional path characters.
        if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
          $url = $tcoUrlMatches[0];
          $end = $start + StringUtils::strlen($url);
        }

        $found[] = [
          'url' => $url,
          'indices' => [$start, $end],
        ];
        continue;
      }

      // Protocol is missing from here on.
      if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
        continue;
      }

      // No ASCII-only domain found: skip the entire URL.
      if (!preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
        continue;
      }

      // The matched URL with its domain part swapped for the ASCII domain.
      $candidate = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url);
      $asciiStart = StringUtils::strpos($domain, $candidate, 0);
      $asciiEnd = $asciiStart + StringUtils::strlen($candidate);

      // Bare short domains are only kept when they carry a path or are on
      // the special short-domain list.
      if (!empty($path)
        || preg_match(self::$patterns['valid_special_short_domain'], $candidate)
        || !preg_match(self::$patterns['invalid_short_domain'], $candidate)) {
        $found[] = [
          'url' => $candidate,
          'indices' => [$start + $asciiStart, $start + $asciiEnd],
        ];
      }
    }

    return array_slice($found, 0, Status::MAX_LINKS);
  }
  /**
   * Returns the mentions found in a text with their positions, with the
   * list_slug field stripped from every entry.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array The mention entities, at most Status::MAX_MENTIONS of them.
   */
  public function extractMentionedScreennamesWithIndices($tweet = null)
  {
    $text = is_null($tweet) ? $this->tweet : $tweet;

    $withoutSlugs = array_map(
      function ($mention) {
        unset($mention['list_slug']);

        return $mention;
      },
      $this->extractMentionsOrListsWithIndices($text)
    );

    return array_slice($withoutSlugs, 0, Status::MAX_MENTIONS);
  }
  /**
   * Deprecated alias of extractMentionedScreennamesWithIndices().
   *
   * Accepts the text as an optional argument and forwards it, so a caller
   * that passes a text gets that text analysed rather than the one stored on
   * the instance.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array The username elements in the tweet.
   *
   * @deprecated since version 1.1.0
   */
  public function extractMentionedUsernamesWithIndices($tweet = null)
  {
    return $this->extractMentionedScreennamesWithIndices($tweet);
  }
  /**
   * Extracts all the mentions (and list references) and the indices they
   * occur at from the tweet.
   *
   * Each result is ['screen_name' => string, 'list_slug' => string,
   * 'indices' => [start, end]]. A candidate is dropped when the text after it
   * matches end_mention_match. When the app.env config value is "production"
   * it is also dropped unless a Profile with that username exists. At most
   * Status::MAX_MENTIONS entries are returned.
   *
   * @param string $tweet The text to analyse; when null the text stored on the instance is used.
   *
   * @return array The username elements in the tweet.
   */
  public function extractMentionsOrListsWithIndices($tweet = null)
  {
    if (is_null($tweet)) {
      $tweet = $this->tweet;
    }

    // Cheap pre-check so the full pattern only runs when an at sign exists.
    // NOTE(review): the character class lists "@" twice; presumably one of
    // them was meant to be the full-width sign. Confirm against upstream.
    if (!preg_match('/[@@]/iu', $tweet)) {
      return [];
    }

    preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    // The environment does not change between matches, so read it once.
    $requireExistingProfile = config('app.env') === 'production';

    $results = [];
    foreach ($matches as $match) {
      list($all, , $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]);

      // Run the regex rejection first: it is cheap, and it spares a
      // database lookup for every candidate that is discarded anyway.
      if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
        continue;
      }

      // A plain "@name" yields just the name; anything else keeps the whole
      // trimmed match as the screen name.
      $whole = trim($all[0]);
      $screenname = $whole === '@'.$username[0] ? $username[0] : $whole;

      if ($requireExistingProfile && !\App\Profile::whereUsername($screenname)->exists()) {
        continue;
      }

      // The captured offset is a byte offset; measuring the prefix converts
      // it to the unit StringUtils::strlen() counts in.
      $start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1];
      $end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);

      // A list reference extends the entity past the username.
      if (!empty($list_slug[0])) {
        $end_position += StringUtils::strlen($list_slug[0]);
      }

      $results[] = [
        'screen_name' => $screenname,
        'list_slug' => $list_slug[0],
        'indices' => [$start_position, $end_position],
      ];
    }

    return array_slice($results, 0, Status::MAX_MENTIONS);
  }
  /**
   * Deprecated alias of extractMentionsOrListsWithIndices().
   *
   * Takes no argument, so the text stored on the instance is analysed.
   *
   * @return array The username elements in the tweet.
   *
   * @deprecated since version 1.1.0
   */
  public function extractMentionedUsernamesOrListsWithIndices()
  {
    $mentions = $this->extractMentionsOrListsWithIndices();

    return $mentions;
  }
  /**
   * Combined getter/setter for the extractURLWithoutProtocol flag.
   *
   * Called without an argument (or with null) it reports the current flag.
   * Called with a value it stores that value cast to bool and returns the
   * extractor for chaining.
   *
   * @param bool $flag New value for the flag; omit to read the current one.
   *
   * @return bool|Extractor The current flag when reading, this instance when writing.
   */
  public function extractURLWithoutProtocol($flag = null)
  {
    if (!is_null($flag)) {
      $this->extractURLWithoutProtocol = (bool) $flag;

      return $this;
    }

    return $this->extractURLWithoutProtocol;
  }
  /**
   * Remove overlapping entities.
   *
   * The entities are ordered by start index with sortEntites(); an entity is
   * then kept only if it does not start before the end index of the entity
   * kept just before it. The input array is not modified; a new list is
   * returned.
   *
   * @param array $entities Entities, each carrying 'indices' => [start, end].
   *
   * @return array The surviving entities in start-index order.
   */
  public function removeOverlappingEntities($entities)
  {
    usort($entities, [$this, 'sortEntites']);

    $kept = [];
    $lastEnd = null;

    foreach ($entities as $candidate) {
      // Starts inside the previously kept entity: drop it.
      if ($lastEnd !== null && $candidate['indices'][0] < $lastEnd) {
        continue;
      }

      $kept[] = $candidate;
      $lastEnd = $candidate['indices'][1];
    }

    return $kept;
  }
  /**
   * usort() comparison callback ordering entities by ascending start index.
   *
   * @param array $a Entity carrying 'indices' => [start, end].
   * @param array $b Entity carrying 'indices' => [start, end].
   *
   * @return int 0 when both start at the same index, -1 when $a starts first, 1 otherwise.
   */
  protected function sortEntites($a, $b)
  {
    $startA = $a['indices'][0];
    $startB = $b['indices'][0];

    if ($startA == $startB) {
      return 0;
    }

    if ($startA < $startB) {
      return -1;
    }

    return 1;
  }
  482. }