소스 검색

Улучшил поддержку текстовых файлов

Book Pauk 6 년 전
부모
커밋
9c20df510d
3개의 변경된 파일, 98개의 추가 및 76개의 삭제
  1. 0 68
      server/core/BookConverter/getEncoding.js
  2. 16 8
      server/core/BookConverter/index.js
  3. 82 0
      server/core/BookConverter/textUtils.js

+ 0 - 68
server/core/BookConverter/getEncoding.js

@@ -1,68 +0,0 @@
-function getEncoding(buf) {
-        const lowerCase = 3;
-        const upperCase = 1;
-
-        const codePage = {
-            'k': 'koi8-r',
-            'w': 'Windows-1251',
-            'd': 'cp866',
-            'i': 'ISO-8859-5',
-            'm': 'maccyrillic',
-        };
-
-        let charsets = {
-            'k': 0,
-            'w': 0,
-            'd': 0,
-            'i': 0,
-            'm': 0
-        };
-
-        const len = buf.length;
-        const blockSize = (len > 5*3000 ? 3000 : len);
-        let counter = 0;
-        let i = 0;
-        while (i < len) {
-            const char = buf[i];
-            i++;
-            //non-russian characters
-            if (char < 128 || char > 256)
-                continue;
-            //CP866
-            if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
-            if ((char > 127 && char < 160)) charsets['d'] += upperCase;
-
-            //KOI8-R
-            if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
-            if ((char > 222 && char < 256)) charsets['k'] += upperCase;
-
-            //WIN-1251
-            if (char > 223 && char < 256) charsets['w'] += lowerCase;
-            if (char > 191 && char < 224) charsets['w'] += upperCase;
-
-            //MAC
-            if (char > 221 && char < 255) charsets['m'] += lowerCase;
-            if (char > 127 && char < 160) charsets['m'] += upperCase;
-
-            //ISO-8859-5
-            if (char > 207 && char < 240) charsets['i'] += lowerCase;
-            if (char > 175 && char < 208) charsets['i'] += upperCase;
-
-            counter++;
-
-            if (counter > blockSize) {
-                counter = 0;
-                i += Math.round(len/2 - 2*blockSize);
-            }
-        }
-
-        let sorted = Object.keys(charsets).map(function(key) {
-            return { codePage: codePage[key], c: charsets[key] };
-        });
-
-        sorted.sort((a, b) => b.c - a.c);
-
-        return sorted[0].codePage;
-    }
-
-module.exports = getEncoding;

+ 16 - 8
server/core/BookConverter/index.js

@@ -4,7 +4,7 @@ const iconv = require('iconv-lite');
 const chardet = require('chardet');
 const _ = require('lodash');
 const sax = require('./sax');
-const getEncoding = require('./getEncoding');
+const textUtils = require('./textUtils');
 
 const FileDetector = require('../FileDetector');
 
@@ -18,9 +18,10 @@ class BookConverter {
     async convertToFb2(inputFile, outputFile, url, callback) {
         const fileType = await this.detector.detectFile(inputFile);
         
-        if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) {
-            const data = await fs.readFile(inputFile);
+        const data = await fs.readFile(inputFile);
+        callback(100);
 
+        if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) {
             if (data.toString().indexOf('<FictionBook') >= 0) {            
                 await fs.writeFile(outputFile, data);
                 return;
@@ -34,12 +35,19 @@ class BookConverter {
             }
 
             await fs.writeFile(outputFile, this.convertHtml(data));
-            callback(100);
+            return;
         } else {
             if (fileType)
-                throw new Error(`unknown file format: ${fileType.mime}`);
-            else
-                throw new Error(`unsupported file format: ${url}`);
+                throw new Error(`Этот формат файла не поддерживается: ${fileType.mime}`);
+            else {
+                //может это чистый текст?
+                if (textUtils.checkIfText(data)) {
+                    await fs.writeFile(outputFile, this.convertHtml(data));
+                    return;
+                }
+
+                throw new Error(`Не удалось определить формат файла: ${url}`);
+            }
         }
     }
 
@@ -55,7 +63,7 @@ class BookConverter {
         }
 
         if (selected == 'ISO-8859-5') {
-            selected = getEncoding(data);
+            selected = textUtils.getEncoding(data);
         }
 
         return iconv.decode(data, selected);

+ 82 - 0
server/core/BookConverter/textUtils.js

@@ -0,0 +1,82 @@
+function getEncoding(buf) { // Scores five candidate code pages over the bytes of buf and returns the name of the best-scoring one.
+    const lowerCase = 3; // score weight for a byte in a range treated as lowercase (presumably lowercase Cyrillic letters — confirm against the code-page tables)
+    const upperCase = 1; // smaller score weight for a byte in a range treated as uppercase
+
+    const codePage = { // short key -> encoding name that is returned to the caller
+        'k': 'koi8-r',
+        'w': 'Windows-1251',
+        'd': 'cp866',
+        'i': 'ISO-8859-5',
+        'm': 'maccyrillic',
+    };
+
+    let charsets = { // running score per candidate, keyed the same way as codePage
+        'k': 0,
+        'w': 0,
+        'd': 0,
+        'i': 0,
+        'm': 0
+    };
+
+    const len = buf.length;
+    const blockSize = (len > 5*3000 ? 3000 : len); // inputs over 15000 bytes are sampled in runs of 3000 scored bytes; smaller inputs are scanned in full
+    let counter = 0; // scored (>= 128) bytes seen since the last jump
+    let i = 0;
+    while (i < len) {
+        const char = buf[i]; // NOTE(review): assumes buf yields numeric byte values (e.g. a Buffer) — confirm with callers
+        i++;
+        // bytes below 128 match none of the ranges below and are not scored (original note: non-russian characters)
+        if (char < 128 || char > 256)
+            continue;
+        //CP866
+        if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
+        if ((char > 127 && char < 160)) charsets['d'] += upperCase;
+
+        //KOI8-R
+        if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
+        if ((char > 222 && char < 256)) charsets['k'] += upperCase;
+
+        //WIN-1251
+        if (char > 223 && char < 256) charsets['w'] += lowerCase;
+        if (char > 191 && char < 224) charsets['w'] += upperCase;
+
+        //MAC
+        if (char > 221 && char < 255) charsets['m'] += lowerCase;
+        if (char > 127 && char < 160) charsets['m'] += upperCase;
+
+        //ISO-8859-5
+        if (char > 207 && char < 240) charsets['i'] += lowerCase;
+        if (char > 175 && char < 208) charsets['i'] += upperCase;
+
+        counter++;
+
+        if (counter > blockSize) { // a full run has been scored: reset and jump ahead to sample another region
+            counter = 0;
+            i += Math.round(len/2 - 2*blockSize);
+        }
+    }
+
+    let sorted = Object.keys(charsets).map(function(key) { // pair each encoding name with its accumulated score
+        return { codePage: codePage[key], c: charsets[key] };
+    });
+
+    sorted.sort((a, b) => b.c - a.c); // highest score first
+
+    return sorted[0].codePage;
+}
+
+function checkIfText(buf) { // Heuristic: reports true when the share of space bytes in buf, spaces / (length + 1), exceeds 0.1.
+    let spaceCount = 0;
+    for (let i = 0; i < buf.length; i++) {
+        if (buf[i] == 32) // 32 is the ASCII code of the space character
+            spaceCount++;
+    }
+    const freq = spaceCount/(buf.length + 1); // the +1 keeps the denominator non-zero, so an empty buf gives 0
+
+    return (freq > 0.1);
+}
+
+module.exports = {
+    getEncoding,
+    checkIfText,
+}