123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447 |
- <?php
- date_default_timezone_set('Europe/Moscow');
- require_once 'config/config.php';
- require_once 'parser.php';
// Per-byte scoring weights used by getEncoding(): a byte that falls in a
// charset's uppercase letter range adds UPPERCASE to that charset's score,
// a byte in its lowercase range adds LOWERCASE.
define('UPPERCASE', 1);
define('LOWERCASE', 3);
/**
 * Read a request parameter from $_REQUEST.
 *
 * @param string $param        Name of the parameter to look up.
 * @param mixed  $defaultValue Value returned when the parameter is not set.
 * @return mixed The submitted value, or $defaultValue when it is absent.
 */
function getParam($param, $defaultValue = '')
{
    if (!isset($_REQUEST[$param])) {
        return $defaultValue;
    }

    return $_REQUEST[$param];
}
/**
 * Guess the encoding of a (presumably Russian) byte string.
 *
 * Every byte >= 128 is scored against the letter ranges of five single-byte
 * Cyrillic code pages; the code page with the highest score wins. A string
 * that is valid UTF-8 is always reported as UTF-8.
 *
 * Return codes:
 *   'u' UTF-8, 'w' Windows-1251, 'k' KOI8-R, 'd' CP866,
 *   'i' ISO-8859-5, 'm' Mac Cyrillic.
 *
 * @param string $str       Raw bytes to inspect.
 * @param bool   $check_utf Internal flag: TRUE on the recursive call that
 *                          inspects the UTF-8 -> cp1251 conversion result;
 *                          it prevents a further recursion.
 * @return string One of the single-letter codes listed above.
 */
function getEncoding($str, $check_utf = FALSE)
{
    // A string that validates as UTF-8 always ends up as 'u', so decide that
    // first instead of converting and scanning the whole input beforehand.
    if (preg_match('//u', $str)) {
        return 'u';
    }

    if (!$check_utf) {
        // Not strictly valid UTF-8: convert it as if it were UTF-8 anyway and
        // see whether the result scores as Windows-1251 text. If so, treat
        // the input as UTF-8.
        $result = getEncoding(mb_convert_encoding($str, 'cp1251', 'UTF-8'), TRUE);
        if ($result == 'w') {
            return 'u';
        }
    }

    $charsets = array(
        'k' => 0,
        'w' => 0,
        'd' => 0,
        'i' => 0,
        'm' => 0,
    );

    $length = strlen($str);
    // Long inputs are sampled: once more than $block_size high bytes have
    // been scored, the scan position jumps ahead (see the end of the loop).
    $block_size = ($length > 5 * 3000) ? 3000 : $length;
    $counter = 0;

    for ($i = 0; $i < $length; $i++) {
        $char = ord($str[$i]);

        // ASCII bytes are the same in every candidate code page.
        if ($char < 128) {
            continue;
        }

        // CP866
        if (($char >= 160 && $char <= 175) || ($char >= 224 && $char <= 241)) {
            $charsets['d'] += LOWERCASE;
        } elseif ($char >= 128 && $char <= 159) {
            $charsets['d'] += UPPERCASE;
        }

        // KOI8-R: lowercase letters occupy 0xC0-0xDF, uppercase 0xE0-0xFF.
        if ($char >= 192 && $char <= 223) {
            $charsets['k'] += LOWERCASE;
        } elseif ($char >= 224) {
            $charsets['k'] += UPPERCASE;
        }

        // WIN-1251
        if ($char >= 224) {
            $charsets['w'] += LOWERCASE;
        } elseif ($char >= 192) {
            $charsets['w'] += UPPERCASE;
        }

        // MAC
        if ($char >= 222 && $char <= 254) {
            $charsets['m'] += LOWERCASE;
        } elseif ($char >= 128 && $char <= 159) {
            $charsets['m'] += UPPERCASE;
        }

        // ISO-8859-5
        if ($char >= 208 && $char <= 239) {
            $charsets['i'] += LOWERCASE;
        } elseif ($char >= 176 && $char <= 207) {
            $charsets['i'] += UPPERCASE;
        }

        $counter++;
        if ($counter > $block_size) {
            $counter = 0;
            $i += (int)($length / 2 - 2 * $block_size);
        }
    }

    // Highest score first; its key is the detected code page.
    arsort($charsets);

    return key($charsets);
}
/**
 * Return the trimmed text between the first <tagName> and the next
 * </tagName> that follows it.
 *
 * Only the exact, attribute-less, case-sensitive forms "<tagName>" and
 * "</tagName>" are recognised.
 *
 * @param string $tagName Tag name without angle brackets.
 * @param string $book    Document text to search.
 * @return string The trimmed inner text, or '' when the opening tag or a
 *                closing tag after it is missing.
 */
function getTag($tagName, $book)
{
    $from_tag = '<' . $tagName . '>';
    $to_tag = '</' . $tagName . '>';

    $from = strpos($book, $from_tag);
    if ($from === FALSE) {
        return '';
    }
    $from += strlen($from_tag);

    // Look for the closing tag only after the opening one, so a stray
    // closing tag earlier in the document cannot produce a negative length.
    $to = strpos($book, $to_tag, $from);
    if ($to === FALSE) {
        return '';
    }

    return trim(substr($book, $from, $to - $from));
}
/**
 * Extract author/title metadata from a downloaded document and, for samlib
 * pages, cut the document down to the book text.
 *
 * @param string $book      Document text. The marker literals below are
 *                          presumably Windows-1251 text - TODO confirm the
 *                          file's encoding.
 * @param array  $meta_info Receives 'author' and 'title' (by reference).
 * @return string $book unchanged, or for samlib pages the author/title
 *                header followed by the text between the two markers.
 */
function getMetaInfoAndFilter($book, &$meta_info)
{
    $meta_info['author'] = '';
    $meta_info['title'] = getTag('title', $book);
    $out = $book;

    // fb2-like title: <title><p>author</p><p>title</p></title>
    if (strpos($meta_info['title'], '<p>') !== FALSE) {
        $s = str_replace('</p>', '', $meta_info['title']);
        $a = explode('<p>', $s);
        // A title with a single <p> has no third part; pass NULL explicitly
        // instead of reading an undefined offset (a warning on PHP 8).
        $meta_info['author'] = parseHtml(isset($a[1]) ? $a[1] : NULL, TRUE);
        $meta_info['title'] = parseHtml(isset($a[2]) ? $a[2] : NULL, TRUE);
        if ($meta_info['title'] === NULL || $meta_info['title'] === '') {
            // Only one paragraph: treat the whole thing as the title.
            $s = parseHtml($s, TRUE);
            $meta_info['author'] = '';
            $meta_info['title'] = $s;
        }
    }

    // samlib: the book text sits between two HTML comment markers.
    $samlib_start_sign = '<!----------- Ñîáñòâåííî ïðîèçâåäåíèå --------------->';
    $samlib_book_idx = strpos($book, $samlib_start_sign);
    if ($samlib_book_idx !== FALSE) {
        $samlib_author = getTag('h3', $book);
        $author_end = strpos($samlib_author, ': <small>');
        // Without the ": <small>" suffix the author is left empty.
        $meta_info['author'] = ($author_end === FALSE) ? '' : substr($samlib_author, 0, $author_end);
        $meta_info['title'] = getTag('h2', $book);

        $samlib_book_idx += strlen($samlib_start_sign);
        $samlib_book_end_idx = strpos($book, '<!---- Áëîê îïèñàíèÿ ïðîèçâåäåíèÿ (ñëåâà âíèçó) ----------------------->');
        if ($samlib_book_end_idx === FALSE) {
            $samlib_book_end_idx = strlen($book);
        }

        $out = '<dd>' . $meta_info['author'] . '<dd>' . $meta_info['title'] . '<empty-line/>' .
            substr($book, $samlib_book_idx, $samlib_book_end_idx - $samlib_book_idx);
        // Paragraph lines that contain only indentation become blank lines.
        $out = preg_replace("/<dd>  [;]*\s*[\r\n]/", '<empty-line/>', $out);
    }

    return $out;
}
/**
 * Mark paragraph starts in plain text, prepend the metadata header and
 * optionally gzip the result.
 *
 * When $txtin contains no '<P>', the most common non-zero line indent is
 * determined and '<p>' is inserted before the first non-space character of
 * every line indented at least that much (every line when the text has no
 * indentation at all). Text that already contains '<P>' is passed through.
 *
 * The payload is "no_file|author|title<<<marker>>>text". It is gzip-encoded,
 * and a Content-Encoding header is sent, when the global $use_gzip is truthy,
 * the client accepts gzip and neither the 'meta' nor the 'curl' request
 * parameter is given.
 *
 * @param array  $meta_info Must contain 'author' and 'title'.
 * @param string $txtin     Text to process.
 * @return string The payload, possibly gzip-encoded.
 */
function filterTextAndGzip($meta_info, $txtin)
{
    global $use_gzip;

    if (strpos($txtin, '<P>') === FALSE) {
        $len = strlen($txtin);

        // Pass 1: histogram of leading-space counts. A line's indent is
        // recorded when the newline that ends it is reached; index 0 is also
        // bumped for every indented line, so it ends up as the line total.
        $counts = array();
        $flag = 0; // 1 while still inside the leading spaces of a line
        $c = 0;    // leading spaces seen on the current line
        for ($i = 0; $i < $len; $i++) {
            if ($txtin[$i] == "\n" || $i == 0) {
                $counts[$c] = (isset($counts[$c]) ? $counts[$c] : 0) + 1;
                if ($c > 0) {
                    $counts[0] = (isset($counts[0]) ? $counts[0] : 0) + 1;
                }
                $c = 0;
                $flag = 1;
            } elseif ($txtin[$i] != ' ') {
                $flag = 0;
            } elseif ($flag) {
                $c++;
            }
        }

        // The second most frequent entry is the paragraph indent.
        arsort($counts);
        $key = 0;
        if (count($counts) > 1) {
            $indents = array_keys($counts);
            $key = $indents[1];
        }

        // Pass 2: copy the text, inserting '<p>' at qualifying line starts.
        $txtout = '';
        $flag = 0;
        $c = 0;
        for ($i = 0; $i < $len; $i++) {
            if ($txtin[$i] == "\n" || $i == 0) {
                $c = 0;
                $flag = 1;
            } elseif ($txtin[$i] != ' ') {
                if ($c >= $key && $flag) {
                    $txtout .= '<p>';
                }
                $flag = 0;
            } elseif ($flag) {
                $c++;
            }
            $txtout .= $txtin[$i];
        }
    } else {
        $txtout = $txtin;
    }

    $txtout = 'no_file' . '|' . $meta_info['author'] . '|' . $meta_info['title'] .
        '<<<bpr5A432688AB0467AA396E5A144830248Abpr>>>' . $txtout;

    // The header is absent for CLI runs and for some clients.
    $supportsGzip = isset($_SERVER['HTTP_ACCEPT_ENCODING'])
        && strpos($_SERVER['HTTP_ACCEPT_ENCODING'], 'gzip') !== false;
    if ($use_gzip && $supportsGzip && getParam('meta') == '' && getParam('curl') == '') {
        $txtout = gzencode($txtout, 9);
        header('Content-Encoding: gzip');
    }

    return $txtout;
}
/**
 * Error handler that converts PHP errors into exceptions.
 *
 * Errors masked out by error_reporting() and notices are ignored; every
 * other error is rethrown as an Exception carrying only the message.
 *
 * @param int    $errno   Error level.
 * @param string $errstr  Error message.
 * @param string $errfile File in which the error was raised (unused).
 * @param int    $errline Line at which the error was raised (unused).
 * @return void Returns nothing for ignored errors, otherwise throws.
 * @throws Exception For every error that is not ignored.
 */
function myErrorHandler($errno, $errstr, $errfile, $errline)
{
    // This error code is not included in error_reporting.
    $enabled = error_reporting() & $errno;
    if ($enabled == 0) {
        return;
    }

    // Notices are tolerated.
    if ($errno == E_NOTICE) {
        return;
    }

    // Throwing also keeps PHP's internal error handler from running.
    throw new Exception((string) $errstr);
}
/**
 * Return the uncompressed contents of the largest entry in a zip archive.
 *
 * The largest entry is located first and extracted once; on equal sizes
 * the first such entry wins.
 *
 * @param string $filein Path to the zip file.
 * @return string Contents of the largest entry, or '' for an archive
 *                without entries.
 * @throws Exception When the archive or the chosen entry cannot be opened.
 */
function unzip($filein)
{
    $zip = new ZipArchive;
    if ($zip->open($filein) !== TRUE) {
        throw new Exception("zip->open failed");
    }

    // Pass 1: find the name of the largest entry.
    $filename = NULL;
    $max_size = -1;
    for ($i = 0; $i < $zip->numFiles; $i++) {
        $stat = $zip->statIndex($i);
        if ($stat === FALSE) {
            continue;
        }
        if ($stat['size'] > $max_size) {
            $max_size = $stat['size'];
            $filename = $zip->getNameIndex($i);
        }
    }

    // Pass 2: extract only that entry.
    $result = '';
    if ($filename !== NULL) {
        $fp = $zip->getStream($filename);
        if (!$fp) {
            // Do not leave the archive open on the error path.
            $zip->close();
            throw new Exception("zip->getStream failed");
        }
        $result = stream_get_contents($fp);
        fclose($fp);
    }

    $zip->close();

    return $result;
}
/**
 * Build a random 32-character uppercase hexadecimal identifier.
 *
 * The value is a RIPEMD-128 hash over a uniqid()-based random part and an
 * MD5 of the namespace plus whatever request details $_SERVER provides.
 *
 * @param string $namespace Optional text mixed into the hash.
 * @return string 32 uppercase hex characters.
 */
function create_guid($namespace = '')
{
    $uid = md5(uniqid("", true));

    $data = $namespace;
    // Not every key exists in every environment (LOCAL_ADDR and LOCAL_PORT
    // in particular), so append only what is actually present.
    $server_keys = array(
        'REQUEST_TIME',
        'HTTP_USER_AGENT',
        'LOCAL_ADDR',
        'LOCAL_PORT',
        'REMOTE_ADDR',
        'REMOTE_PORT',
    );
    foreach ($server_keys as $server_key) {
        if (isset($_SERVER[$server_key])) {
            $data .= $_SERVER[$server_key];
        }
    }

    return strtoupper(hash('ripemd128', $uid . md5($data)));
}
/**
 * Current Unix timestamp in seconds, including the fractional part.
 *
 * @return float Seconds since the Unix epoch.
 */
function microtime_float()
{
    // microtime(true) returns the float directly; no string parsing needed.
    return microtime(true);
}
/**
 * Perform an HTTP request with cURL, following redirects manually.
 *
 * Up to 10 redirects (301, 302, 303, 307, 308) are followed; each hop uses
 * a fresh handle configured from the same arguments. When the 'curl' request
 * parameter is non-empty, the transfer info and response headers of the
 * first response are thrown as an Exception (debug aid).
 *
 * @param array|string $curlOptions Options for curl_setopt_array(); '' for none.
 * @param array|string $curlHeaders Request header lines; '' for none.
 * @param array|string $postFields  POST fields; a non-empty value makes the
 *                                  request a POST.
 * @return string Response body of the final 200 response.
 * @throws Exception "ERROR <status>" (plus the cURL error text, if any) when
 *                   the final response is not a 200, and for the debug dump.
 */
function curlExec(/* Array */$curlOptions='', /* Array */$curlHeaders='', /* Array */$postFields='')
{
    $newUrl = '';
    $maxRedirection = 10;
    do {
        if ($maxRedirection < 1) {
            die('Error: reached the limit of redirections');
        }

        $ch = curl_init();
        // NOTE(review): TLS host and peer verification are switched off, so
        // HTTPS responses are not authenticated. Left as is because callers
        // may rely on it - confirm whether this is intended.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        if (!empty($curlOptions)) {
            curl_setopt_array($ch, $curlOptions);
        }
        if (!empty($curlHeaders)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $curlHeaders);
        }
        if (!empty($postFields)) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        }
        if (!empty($newUrl)) {
            curl_setopt($ch, CURLOPT_URL, $newUrl); // redirect needed
        }
        // Headers are returned in front of the body and split off below.
        curl_setopt($ch, CURLOPT_HEADER, 1);

        $response = curl_exec($ch);
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $header = substr($response, 0, $header_size);
        $curlResult = substr($response, $header_size);
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $info = curl_getinfo($ch);

        if (getParam('curl') != '') {
            curl_close($ch);
            throw new Exception("<br>" . str_replace("[", "<br>[", print_r($info, TRUE)) . "<br>$header<br>END");
        }

        if ($code == 301 || $code == 302 || $code == 303 || $code == 307 || $code == 308) {
            if (array_key_exists('redirect_url', $info) && !empty($info['redirect_url'])) {
                $newUrl = trim($info['redirect_url']);
            } elseif (preg_match('/Location:(.*?)\n/i', $header, $matches)) {
                // Header names are case-insensitive (HTTP/2 sends lowercase).
                $newUrl = trim($matches[1]);
            } else {
                // No target found: the next iteration repeats the request
                // with the URL from $curlOptions.
                $newUrl = '';
            }
            curl_close($ch);
            $maxRedirection--;
            continue;
        }

        // no more redirection
        if ($curlResult === FALSE || $info['http_code'] != 200) {
            $curlResult = "ERROR " . $info['http_code'];
            $curlError = curl_error($ch);
            curl_close($ch);
            if ($curlError) {
                $curlResult .= "<br>" . $curlError;
            }
            throw new Exception($curlResult);
        }

        $code = 0; // OK, leaves the loop
        curl_close($ch);
    } while ($code);

    return $curlResult;
}
// Entry point: download the document named by the 'url' request parameter,
// unpack it if it is a zip archive, convert it to Windows-1251, extract the
// metadata, run it through parseHtml() and print the result.
//
// Request parameters read here: url, encoding (one-letter code as returned
// by getEncoding(); auto-detected when empty), meta (non-empty: print
// timings and metadata instead of the text).
//
// NOTE(review): the server fetches any URL the client supplies; consider
// restricting the allowed hosts.
{
    set_error_handler("myErrorHandler");
    // set_time_limit(300);
    $url = getParam('url');
    try {
        $body = '';
        if ($url == '')
            throw new Exception("íå çàäàí àäðåñ êíèãè");

        $meta_info = array();
        $time_start = $time = microtime_float();
        $pid = create_guid();
        $dir = 'txt/';
        $encoding = getParam('encoding');

        // Default to http:// and neutralise quotes and square brackets.
        if (strpos($url, 'http://') !== 0 && strpos($url, 'https://') !== 0)
            $url = 'http://' . $url;
        $url = str_replace('"', '', $url);
        $url = str_replace('\'', '', $url);
        $url = str_replace(']', '%5D', str_replace('[', '%5B', $url));

        $options = array(
            CURLOPT_RETURNTRANSFER => TRUE,
            CURLOPT_TIMEOUT => 300,
            CURLOPT_URL => $url,
            CURLOPT_BUFFERSIZE => 1024*128,
            CURLOPT_NOPROGRESS => FALSE,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
            CURLOPT_PROGRESSFUNCTION => function () {
                // PHP >= 5.5 passes the cURL handle as an extra first
                // argument; drop it so index 1 is always "bytes downloaded".
                $args = func_get_args();
                if (count($args) > 4) {
                    array_shift($args);
                }
                $downloaded = isset($args[1]) ? $args[1] : 0;
                // Returning non-zero aborts the transfer (50 MB cap).
                return ($downloaded > (50 * 1024 * 1024)) ? 1 : 0;
            }
        );
        $out = curlExec($options);
        $meta_info['time_curl'] = microtime_float() - $time;
        $time = microtime_float();

        //zip ("PK\x03\x04" signature); also safe for bodies shorter than 4 bytes
        if (is_string($out) && strncmp($out, "PK\x03\x04", 4) === 0) {
            // $tmp_dir is not defined in this file (presumably it comes from
            // config.php - verify); fall back to the system temp directory.
            $zip_dir = (isset($tmp_dir) && $tmp_dir != '') ? $tmp_dir : sys_get_temp_dir();
            $zipped_file = $zip_dir . "/{$pid}-temp.zip";
            file_put_contents($zipped_file, $out);
            try {
                $out = unzip($zipped_file);
            } catch (Exception $zip_error) {
                // Do not leave the temporary archive behind on failure.
                if (file_exists($zipped_file)) unlink($zipped_file);
                throw $zip_error;
            }
            if (file_exists($zipped_file)) unlink($zipped_file);
        }
        //pdf
        /* if ($out[0] == chr(0x25) && $out[1] == chr(0x50) && $out[2] == chr(0x44) && $out[3] == chr(0x46)) {
            $a = new PDF2Text();
            $a->reset();
            $a->decodePDF($out);
            $out = $a->output();
            file_put_contents('/tmp/1', $out);
        }*/
        $meta_info['time_unzip'] = microtime_float() - $time;
        $time = microtime_float();

        //decoding and parsing
        if ($out !== FALSE) {
            if ($encoding == '')
                $encoding = getEncoding($out);
            // Everything downstream works on Windows-1251 text.
            switch ($encoding) {
                case 'k':
                    $out = mb_convert_encoding($out, 'cp1251', 'KOI8-R');
                    break;
                case 'w':
                    break;
                case 'd':
                    $out = mb_convert_encoding($out, 'cp1251', 'cp866');
                    break;
                case 'i':
                    $out = mb_convert_encoding($out, 'cp1251', 'ISO-8859-5');
                    break;
                case 'm':
                    $out = mb_convert_encoding($out, 'cp1251', 'MACINTOSH');
                    break;
                case 'u':
                    $out = mb_convert_encoding($out, 'cp1251', 'UTF-8');
                    break;
            }
            //$out = $encoding . '===' . $out;
            //file_put_contents('/tmp/bpr1', $out);
            $meta_info['time_decodepage'] = microtime_float() - $time;
            $time = microtime_float();

            $out = getMetaInfoAndFilter($out, $meta_info);
            $meta_info['time_metainfo'] = microtime_float() - $time;
            $time = microtime_float();

            $out = parseHtml($out);
            $meta_info['time_parsehtml'] = microtime_float() - $time;
            $time = microtime_float();

            $out = filterTextAndGzip($meta_info, $out);
            $meta_info['time_filter_gzip'] = microtime_float() - $time;
            $meta_info['time_total'] = microtime_float() - $time_start;

            $meta = getParam('meta');
            if ($meta != '') {
                // Debug mode: the report travels to the catch block below as
                // an exception message and is printed as HTML.
                $info = '';
                foreach ($meta_info as $key => $value) {
                    if (strpos($key, 'time') !== FALSE)
                        $info .= sprintf("%06.3f", $value) . " $key <br>";
                    else
                        $info .= "$key: $value<br>";
                }
                throw new Exception("<br>" . $info);
            }

            header('Content-Type: text/plain; charset=windows-1251');
            echo $out;
            //file_put_contents('/tmp/bpr2', $out);
            return;
        } else
            throw new Exception("îøèáêà çàãðóçêè ôàéëà. Ïîïðîáóéòå åùå ðàç.");
    } catch (Exception $e) {
        header('Content-Type: text/html; charset=windows-1251');
        $err = $e->getMessage();
        if (strpos($err, 'ERROR 404') !== FALSE)
            $err = 'ñòðàíèöà íå íàéäåíà';
        // The URL is client-supplied and this response is HTML: escape it.
        // $err is left as is because the debug reports contain markup.
        $safe_url = htmlspecialchars($url, ENT_QUOTES, 'cp1251');
        $body = "Îøèáêà çàãðóçêè êíèãè: " . ($url == '' ? '' : "($safe_url) ") . $err;
    }
    echo $body;
}
- ?>
|