123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566 |
- <?php
- /**
- * @author Mike Cochrane <mikec@mikenz.geek.nz>
- * @author Nick Pope <nick@nickpope.me.uk>
- * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
- * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
- */
- namespace App\Util\Lexer;
- use Illuminate\Support\Str;
- use App\Status;
- use App\Services\AutolinkService;
- /**
- * Twitter Extractor Class.
- *
- * Parses tweets and extracts URLs, usernames, username/list pairs and
- * hashtags.
- *
- * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
- * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
- * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
- *
- * @author Mike Cochrane <mikec@mikenz.geek.nz>
- * @author Nick Pope <nick@nickpope.me.uk>
- * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
- * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
- */
- class Extractor extends Regex
- {
- /**
- * @var bool
- */
- protected $extractURLWithoutProtocol = true;
- protected $activeUsersOnly = false;
- /**
- * Provides fluent method chaining.
- *
- * @param string $tweet The tweet to be converted.
- *
- * @see __construct()
- *
- * @return Extractor
- */
- public static function create($tweet = null)
- {
- return new self($tweet);
- }
- public function setActiveUsersOnly($active)
- {
- $this->activeUsersOnly = $active;
- return $this;
- }
- /**
- * Reads in a tweet to be parsed and extracts elements from it.
- *
- * Extracts various parts of a tweet including URLs, usernames, hashtags...
- *
- * @param string $tweet The tweet to extract.
- */
- public function __construct($tweet = null)
- {
- parent::__construct($tweet);
- }
- /**
- * Extracts all parts of a tweet and returns an associative array containing
- * the extracted elements.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The elements in the tweet.
- */
- public function extract($tweet = null)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- return [
- 'hashtags' => $this->extractHashtags($tweet),
- 'urls' => $this->extractURLs($tweet),
- 'mentions' => $this->extractMentionedUsernames($tweet),
- 'replyto' => $this->extractRepliedUsernames($tweet),
- 'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
- 'urls_with_indices' => $this->extractURLsWithIndices($tweet),
- 'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices($tweet),
- ];
- }
- /**
- * Extract URLs, @mentions, lists and #hashtag from a given text/tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array list of extracted entities
- */
- public function extractEntitiesWithIndices($tweet = null)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- $entities = [];
- $entities = array_merge($entities, $this->extractURLsWithIndices($tweet));
- $entities = array_merge($entities, $this->extractHashtagsWithIndices($tweet, false));
- $entities = array_merge($entities, $this->extractMentionsOrListsWithIndices($tweet));
- $entities = $this->removeOverlappingEntities($entities);
- return $entities;
- }
- /**
- * Extracts all the hashtags from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The hashtag elements in the tweet.
- */
- public function extractHashtags($tweet = null)
- {
- $hashtagsOnly = [];
- $hashtagsWithIndices = $this->extractHashtagsWithIndices($tweet);
- foreach ($hashtagsWithIndices as $hashtagWithIndex) {
- $hashtagsOnly[] = $hashtagWithIndex['hashtag'];
- }
- return array_slice($hashtagsOnly, 0, Status::MAX_HASHTAGS);
- }
- /**
- * Extracts all the cashtags from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The cashtag elements in the tweet.
- */
- public function extractCashtags($tweet = null)
- {
- $cashtagsOnly = [];
- return $cashtagsOnly;
- }
- /**
- * Extracts all the URLs from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The URL elements in the tweet.
- */
- public function extractURLs($tweet = null)
- {
- $urlsOnly = [];
- $urlsWithIndices = $this->extractURLsWithIndices($tweet);
- foreach ($urlsWithIndices as $urlWithIndex) {
- $urlsOnly[] = $urlWithIndex['url'];
- }
- return array_slice($urlsOnly, 0, Status::MAX_LINKS);
- }
- /**
- * Extract all the usernames from the tweet.
- *
- * A mention is an occurrence of a username anywhere in a tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The usernames elements in the tweet.
- */
- public function extractMentionedScreennames($tweet = null)
- {
- $usernamesOnly = [];
- $mentionsWithIndices = $this->extractMentionsOrListsWithIndices($tweet);
- foreach ($mentionsWithIndices as $mentionWithIndex) {
- if($this->activeUsersOnly == true) {
- if(!AutolinkService::mentionedUsernameExists($mentionWithIndex['screen_name'])) {
- continue;
- }
- }
- $screen_name = mb_strtolower($mentionWithIndex['screen_name']);
- if (empty($screen_name) or in_array($screen_name, $usernamesOnly)) {
- continue;
- }
- $usernamesOnly[] = $screen_name;
- }
- return $usernamesOnly;
- }
- /**
- * Extract all the usernames from the tweet.
- *
- * A mention is an occurrence of a username anywhere in a tweet.
- *
- * @return array The usernames elements in the tweet.
- *
- * @deprecated since version 1.1.0
- */
- public function extractMentionedUsernames($tweet)
- {
- $this->tweet = $tweet;
- return $this->extractMentionedScreennames($tweet);
- }
- /**
- * Extract all the usernames replied to from the tweet.
- *
- * A reply is an occurrence of a username at the beginning of a tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The usernames replied to in a tweet.
- */
- public function extractReplyScreenname($tweet = null)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- $matched = preg_match(self::$patterns['valid_reply'], $tweet, $matches);
- // Check username ending in
- if ($matched && preg_match(self::$patterns['end_mention_match'], $matches[2])) {
- $matched = false;
- }
- return $matched ? $matches[1] : null;
- }
- /**
- * Extract all the usernames replied to from the tweet.
- *
- * A reply is an occurrence of a username at the beginning of a tweet.
- *
- * @return array The usernames replied to in a tweet.
- *
- * @deprecated since version 1.1.0
- */
- public function extractRepliedUsernames()
- {
- return $this->extractReplyScreenname();
- }
- /**
- * Extracts all the hashtags and the indices they occur at from the tweet.
- *
- * @param string $tweet The tweet to extract.
- * @param bool $checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove overlapping ones
- *
- * @return array The hashtag elements in the tweet.
- */
- public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- if (!preg_match('/[##]/iu', $tweet)) {
- return [];
- }
- preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
- $tags = [];
- foreach ($matches as $match) {
- list($all, $before, $hash, $hashtag, $outer) = array_pad($match, 3, ['', 0]);
- $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
- $end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]);
- if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
- continue;
- }
- if(mb_strlen($hashtag[0]) > 124) {
- continue;
- }
- $tags[] = [
- 'hashtag' => $hashtag[0],
- 'indices' => [$start_position, $end_position],
- ];
- }
- if (!$checkUrlOverlap) {
- return array_slice($tags, 0, Status::MAX_HASHTAGS);
- }
- // check url overlap
- $urls = $this->extractURLsWithIndices($tweet);
- $entities = $this->removeOverlappingEntities(array_merge($tags, $urls));
- $validTags = [];
- foreach ($entities as $entity) {
- if (empty($entity['hashtag'])) {
- continue;
- }
- $validTags[] = $entity;
- }
- return array_slice($validTags, 0, Status::MAX_HASHTAGS);
- }
- /**
- * Extracts all the cashtags and the indices they occur at from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The cashtag elements in the tweet.
- */
- public function extractCashtagsWithIndices($tweet = null)
- {
- }
- /**
- * Extracts all the URLs and the indices they occur at from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The URLs elements in the tweet.
- */
- public function extractURLsWithIndices($tweet = null)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- $needle = $this->extractURLWithoutProtocol() ? '.' : ':';
- if (strpos($tweet, $needle) === false) {
- return [];
- }
- $urls = [];
- preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
- foreach ($matches as $match) {
- list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, ['']);
- $start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1];
- $end_position = $start_position + StringUtils::strlen($url[0]);
- $all = $all[0];
- $before = $before[0];
- $url = $url[0];
- $protocol = $protocol[0];
- $domain = $domain[0];
- $port = $port[0];
- $path = $path[0];
- $query = $query[0];
- // If protocol is missing and domain contains non-ASCII characters,
- // extract ASCII-only domains.
- if (empty($protocol)) {
- if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
- continue;
- }
- $last_url = null;
- $ascii_end_position = 0;
- if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
- $asciiDomain[0] = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url);
- $ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position);
- $ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]);
- $last_url = [
- 'url' => $asciiDomain[0],
- 'indices' => [$start_position + $ascii_start_position, $start_position + $ascii_end_position],
- ];
- if (!empty($path)
- || preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0])
- || !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) {
- $urls[] = $last_url;
- }
- }
- // no ASCII-only domain found. Skip the entire URL
- if (empty($last_url)) {
- continue;
- }
- // $last_url only contains domain. Need to add path and query if they exist.
- if (!empty($path)) {
- // last_url was not added. Add it to urls here.
- $last_url['url'] = preg_replace('/'.preg_quote($domain, '/').'/u', $last_url['url'], $url);
- $last_url['indices'][1] = $end_position;
- }
- } else {
- // In the case of t.co URLs, don't allow additional path characters
- if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
- $url = $tcoUrlMatches[0];
- $end_position = $start_position + StringUtils::strlen($url);
- }
- $urls[] = [
- 'url' => $url,
- 'indices' => [$start_position, $end_position],
- ];
- }
- }
- return array_slice($urls, 0, Status::MAX_LINKS);
- }
- /**
- * Extracts all the usernames and the indices they occur at from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The username elements in the tweet.
- */
- public function extractMentionedScreennamesWithIndices($tweet = null)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- $usernamesOnly = [];
- $mentions = $this->extractMentionsOrListsWithIndices($tweet);
- foreach ($mentions as $mention) {
- if (isset($mention['list_slug'])) {
- unset($mention['list_slug']);
- }
- $usernamesOnly[] = $mention;
- }
- return array_slice($usernamesOnly, 0, Status::MAX_MENTIONS);
- }
- /**
- * Extracts all the usernames and the indices they occur at from the tweet.
- *
- * @return array The username elements in the tweet.
- *
- * @deprecated since version 1.1.0
- */
- public function extractMentionedUsernamesWithIndices()
- {
- return $this->extractMentionedScreennamesWithIndices();
- }
- /**
- * Extracts all the usernames and the indices they occur at from the tweet.
- *
- * @param string $tweet The tweet to extract.
- *
- * @return array The username elements in the tweet.
- */
- public function extractMentionsOrListsWithIndices($tweet = null)
- {
- if (is_null($tweet)) {
- $tweet = $this->tweet;
- }
- if (!preg_match('/[@@]/iu', $tweet)) {
- return [];
- }
- preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
- $results = [];
- foreach ($matches as $match) {
- list($all, $before, $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]);
- $start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1];
- $end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);
- $screenname = trim($all[0]) == '@'.$username[0] ? $username[0] : trim($all[0]);
- if($this->activeUsersOnly == true) {
- if(!AutolinkService::mentionedUsernameExists($screenname)) {
- continue;
- }
- }
- $entity = [
- 'screen_name' => $screenname,
- 'list_slug' => $list_slug[0],
- 'indices' => [$start_position, $end_position],
- ];
- if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
- continue;
- }
- if (!empty($list_slug[0])) {
- $entity['indices'][1] = $end_position + StringUtils::strlen($list_slug[0]);
- }
- $results[] = $entity;
- }
- return array_slice($results, 0, Status::MAX_MENTIONS);
- }
- /**
- * Extracts all the usernames and the indices they occur at from the tweet.
- *
- * @return array The username elements in the tweet.
- *
- * @deprecated since version 1.1.0
- */
- public function extractMentionedUsernamesOrListsWithIndices()
- {
- return $this->extractMentionsOrListsWithIndices();
- }
- /**
- * setter/getter for extractURLWithoutProtocol.
- *
- * @param bool $flag
- *
- * @return Extractor
- */
- public function extractURLWithoutProtocol($flag = null)
- {
- if (is_null($flag)) {
- return $this->extractURLWithoutProtocol;
- }
- $this->extractURLWithoutProtocol = (bool) $flag;
- return $this;
- }
- /**
- * Remove overlapping entities.
- * This returns a new array with no overlapping entities.
- *
- * @param array $entities
- *
- * @return array
- */
- public function removeOverlappingEntities($entities)
- {
- $result = [];
- usort($entities, [$this, 'sortEntites']);
- $prev = null;
- foreach ($entities as $entity) {
- if (isset($prev) && $entity['indices'][0] < $prev['indices'][1]) {
- continue;
- }
- $prev = $entity;
- $result[] = $entity;
- }
- return $result;
- }
- /**
- * sort by entity start index.
- *
- * @param array $a
- * @param array $b
- *
- * @return int
- */
- protected function sortEntites($a, $b)
- {
- if ($a['indices'][0] == $b['indices'][0]) {
- return 0;
- }
- return ($a['indices'][0] < $b['indices'][0]) ? -1 : 1;
- }
- }
|