0, 'w' => 0, 'd' => 0, 'i' => 0, 'm' => 0 ); $length = strlen($str); $block_size = ($length > 5*3000) ? 3000 : $length; $counter = 0; for ( $i = 0; $i < $length; $i++ ) { $char = ord($str[$i]); //non-russian characters if ($char < 128 || $char > 256) continue; //CP866 if (($char > 159 && $char < 176) || ($char > 223 && $char < 242)) $charsets['d']+=LOWERCASE; if (($char > 127 && $char < 160)) $charsets['d']+=UPPERCASE; //KOI8-R if (($char > 191 && $char < 223)) $charsets['k']+=LOWERCASE; if (($char > 222 && $char < 256)) $charsets['k']+=UPPERCASE; //WIN-1251 if ($char > 223 && $char < 256) $charsets['w']+=LOWERCASE; if ($char > 191 && $char < 224) $charsets['w']+=UPPERCASE; //MAC if ($char > 221 && $char < 255) $charsets['m']+=LOWERCASE; if ($char > 127 && $char < 160) $charsets['m']+=UPPERCASE; //ISO-8859-5 if ($char > 207 && $char < 240) $charsets['i']+=LOWERCASE; if ($char > 175 && $char < 208) $charsets['i']+=UPPERCASE; $counter++; if ($counter > $block_size) { $counter = 0; $i += (int)($length/2 - 2*$block_size); } } arsort($charsets); if (preg_match('//u', $str)) return 'u'; else return key($charsets); } function getTag($tagName, $book) { $from_tag = '<' . $tagName . '>'; $to_tag = '' . $tagName . '>'; $from = strpos($book, $from_tag); $to = strpos($book, $to_tag); if ($from === FALSE || $to === FALSE) return ''; $from += strlen($from_tag); return trim(substr($book, $from, $to - $from)); } function getMetaInfoAndFilter($book, &$meta_info) { $meta_info['author'] = ''; $meta_info['title'] = getTag('title', $book); $out = $book; //fb2 ??? --------------------- if (strpos($meta_info['title'], '
') !== FALSE) { $s = str_replace('
', '', $meta_info['title']); $a = explode('', $s);
$meta_info['author'] = parseHtml($a[1], TRUE);
$meta_info['title'] = parseHtml($a[2], TRUE);
if ($meta_info['title'] === NULL || $meta_info['title'] === '') {
$s = parseHtml($s, TRUE);
$meta_info['author'] = '';
$meta_info['title'] = $s;
}
}
//samlib ----------------------
$samlib_start_sign = '';
$samlib_book_idx = strpos($book, $samlib_start_sign);
if ($samlib_book_idx !== FALSE) {
$samlib_author = getTag('h3', $book);
$meta_info['author'] = substr($samlib_author, 0, strpos($samlib_author, ': '));
$meta_info['title'] = getTag('h2', $book);;
$samlib_book_idx += strlen($samlib_start_sign);
$samlib_book_end_idx = strpos($book, '');
$samlib_book_end_idx = ($samlib_book_end_idx === FALSE ? strlen($book) : $samlib_book_end_idx);
$out = ' ') === FALSE) {
$len = strlen($txtin);
$counts = array();
$flag = 0;
$c = 0;
for ($i = 0; $i < $len; $i++) {
if ($txtin[$i] == chr(10) || $i == 0) {
$counts[$c]++;
if ($c > 0)
$counts[0]++;
$c = 0;
$flag = 1;
} else
if ($txtin[$i] != ' ')
$flag = 0;
else
if ($flag)
$c++;
}
arsort($counts);
$key = 0;
if (count($counts) > 1) {
next($counts);
$key = key($counts);
}
//$txtout .= print_r($counts, TRUE);
//$txtout .= $key;
$txtout = '';
$flag = 0;
$c = 0;
for ($i = 0; $i < $len; $i++) {
if ($txtin[$i] == chr(10) || $i == 0) {
$c = 0;
$flag = 1;
} else
if ($txtin[$i] != ' ') {
if ($c >= $key && $flag)
$txtout .= ' ';
$flag = 0;
}
else
if ($flag)
$c++;
$txtout .= $txtin[$i];
}
} else
$txtout = $txtin;
$txtout = 'no_file' . '|' . $meta_info['author'] . '|' . $meta_info['title'] .
'<<
" . str_replace("[", "
[", print_r($info, TRUE)) . "
$header
END");
}
if ($code == 301 || $code == 302 || $code == 303 || $code == 307)
{
if (array_key_exists('redirect_url', $info) && !empty($info['redirect_url'])) {
$newUrl = trim($info['redirect_url']);
} else {
preg_match('/Location:(.*?)\n/', $header, $matches);
$newUrl = trim(array_pop($matches));
}
curl_close($ch);
$maxRedirection--;
continue;
}
else // no more redirection
{
if ($curlResult === FALSE || $info['http_code'] != 200) {
$curlResult = "ERROR ". $info['http_code'];
if (curl_error($ch))
$curlResult .= "
". curl_error($ch);
throw new Exception($curlResult);
} else {
$code = 0; //OK
curl_close($ch);
}
}
}
while($code);
return $curlResult;
}
{
    // Request entry point.
    //
    // Reads the `url`, `encoding` and `meta` request parameters via getParam(),
    // downloads the document with curlExec(), unpacks it with unzip() when it
    // starts with a ZIP signature, converts it to cp1251, then runs it through
    // getMetaInfoAndFilter(), parseHtml() and filterTextAndGzip() and echoes
    // the result as text/plain (windows-1251).
    //
    // Any Exception raised on the way is rendered as a short text/html error
    // message instead. When `meta` is non-empty the collected timings and
    // meta fields are deliberately delivered through that same error path.
    set_error_handler("myErrorHandler");
    // set_time_limit(300);
    $url = getParam('url');
    try {
        $body = '';
        if ($url == '')
            throw new Exception("не задан адрес книги");
        $meta_info = array();
        $time_start = $time = microtime_float();
        $pid = create_guid();
        $dir = 'txt/';
        $encoding = getParam('encoding');

        // URI schemes are case-insensitive, so "HTTP://host" must not get a
        // second "http://" prefix.
        if (stripos($url, 'http://') !== 0 && stripos($url, 'https://') !== 0)
            $url = 'http://' . $url;
        $url = str_replace('"', '', $url);
        $url = str_replace('\'', '', $url);
        $url = str_replace(']', '%5D', str_replace('[', '%5B', $url));

        $options = array(
            CURLOPT_RETURNTRANSFER => TRUE,
            CURLOPT_TIMEOUT => 300,
            CURLOPT_URL => $url,
            CURLOPT_BUFFERSIZE => 1024*128,
            CURLOPT_NOPROGRESS => FALSE,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
            CURLOPT_PROGRESSFUNCTION => function() {
                // PHP >= 5.5 calls this with (handle, dlTotal, dlNow, ulTotal, ulNow);
                // older versions omit the handle. Normalise to the 4-argument
                // form so that index 1 is always "bytes downloaded so far".
                $args = func_get_args();
                if (count($args) >= 5)
                    array_shift($args);
                $downloaded = isset($args[1]) ? $args[1] : 0;
                // Returning non-zero aborts the transfer: hard cap at 50 MiB.
                return ($downloaded > (50 * 1024 * 1024)) ? 1 : 0;
            }
        );
        $out = curlExec($options);
        $meta_info['time_curl'] = microtime_float() - $time;
        $time = microtime_float();

        //zip
        // strncmp() is safe on bodies shorter than four bytes, unlike $out[0..3].
        if (strncmp($out, "PK\x03\x04", 4) === 0) {
            // $tmp_dir is not assigned anywhere in this block; use it when some
            // other part of the script provides it, else the system temp dir.
            $zip_dir = isset($tmp_dir) ? $tmp_dir : sys_get_temp_dir();
            $zipped_file = $zip_dir . "/{$pid}-temp.zip";
            file_put_contents($zipped_file, $out);
            try {
                $out = unzip($zipped_file);
            } catch (Exception $unzipError) {
                // Do not leave the temporary archive behind on failure.
                if (file_exists($zipped_file)) unlink($zipped_file);
                throw $unzipError;
            }
            if (file_exists($zipped_file)) unlink($zipped_file);
        }
        //pdf
        /* if ($out[0] == chr(0x25) && $out[1] == chr(0x50) && $out[2] == chr(0x44) && $out[3] == chr(0x46)) {
        $a = new PDF2Text();
        $a->reset();
        $a->decodePDF($out);
        $out = $a->output();
        file_put_contents('/tmp/1', $out);
        }*/
        $meta_info['time_unzip'] = microtime_float() - $time;
        $time = microtime_float();

        //decoding and parsing
        if ($out !== FALSE) {
            // An explicit `encoding` parameter wins; otherwise detect it.
            if ($encoding == '')
                $encoding = getEncoding($out);
            // Everything downstream works on cp1251, so 'w' needs no conversion.
            switch ($encoding) {
                case 'k':
                    $out = mb_convert_encoding($out, 'cp1251', 'KOI8-R');
                    break;
                case 'w':
                    break;
                case 'd':
                    $out = mb_convert_encoding($out, 'cp1251', 'cp866');
                    break;
                case 'i':
                    $out = mb_convert_encoding($out, 'cp1251', 'ISO-8859-5');
                    break;
                case 'm':
                    // NOTE(review): confirm that mbstring on the target build
                    // accepts 'MACINTOSH' as a source encoding name.
                    $out = mb_convert_encoding($out, 'cp1251', 'MACINTOSH');
                    break;
                case 'u':
                    $out = mb_convert_encoding($out, 'cp1251', 'UTF-8');
                    break;
            }
            //$out = $encoding . '===' . $out;
            //file_put_contents('/tmp/bpr1', $out);
            $meta_info['time_decodepage'] = microtime_float() - $time;
            $time = microtime_float();

            $out = getMetaInfoAndFilter($out, $meta_info);
            $meta_info['time_metainfo'] = microtime_float() - $time;
            $time = microtime_float();

            $out = parseHtml($out);
            $meta_info['time_parsehtml'] = microtime_float() - $time;
            $time = microtime_float();

            $out = filterTextAndGzip($meta_info, $out);
            $meta_info['time_filter_gzip'] = microtime_float() - $time;
            $meta_info['time_total'] = microtime_float() - $time_start;

            // Diagnostic mode: report timings and meta fields instead of the book.
            $meta = getParam('meta');
            if ($meta != '') {
                $info = '';
                foreach ($meta_info as $key => $value) {
                    if (strpos($key, 'time') !== FALSE)
                        $info .= sprintf("%06.3f", $value) . " $key
";
                    else
                        $info .= "$key: $value
";
                }
                throw new Exception("
" . $info);
            }
            header('Content-Type: text/plain; charset=windows-1251');
            echo $out;
            //file_put_contents('/tmp/bpr2', $out);
            return;
        } else
            throw new Exception("ошибка загрузки файла. Попробуйте еще раз.");
    } catch (Exception $e) {
        header('Content-Type: text/html; charset=windows-1251');
        $err = $e->getMessage();
        if (strpos($err, 'ERROR 404') !== FALSE)
            $err = 'страница не найдена';
        // $url is request input and this response is HTML, so it must be
        // escaped. cp1251 matches the declared response charset and, being
        // single-byte, can never make htmlspecialchars() return an empty string.
        // NOTE(review): $err may also carry text taken from the remote
        // response; escape it too if messages are not meant to contain markup.
        $safe_url = ($url == '') ? '' : '(' . htmlspecialchars($url, ENT_QUOTES, 'cp1251') . ') ';
        $body = "Ошибка загрузки книги: " . $safe_url . $err;
    }
    echo $body;
}
?>