浏览代码

Улучшил определение codePage

Book Pauk 6 年之前
父节点
当前提交
1f6dd9a00f
共有 2 个文件被更改,包括 77 次插入和 2 次删除
  1. 70 0
      server/core/BookConverter/getEncoding.js
  2. 7 2
      server/core/BookConverter/index.js

+ 70 - 0
server/core/BookConverter/getEncoding.js

@@ -0,0 +1,70 @@
/**
 * Guesses which single-byte Cyrillic code page a piece of text is encoded in.
 *
 * Every byte in the 128..255 range is matched against the byte ranges that
 * each candidate code page uses for letters. A hit in a "lowercase" range
 * adds 3 points to that code page, a hit in an "uppercase" range adds 1
 * (presumably because lowercase letters dominate running text). The code
 * page with the highest total wins.
 *
 * Inputs longer than 15000 units are sampled rather than fully scanned:
 * after each run of more than 3000 counted high bytes the scan jumps ahead
 * by round(len/2 - 2*3000) positions.
 *
 * @param {Buffer|Uint8Array|number[]|string} str - Raw bytes to inspect, or a
 *     string whose char codes are the raw byte values.
 * @returns {string} One of 'koi8-r', 'Windows-1251', 'cp866', 'ISO-8859-5',
 *     'maccyrillic'. Ties resolve to the first candidate in that order, so
 *     empty or ASCII-only input yields 'koi8-r'.
 */
function getEncoding(str) {
    const lowerCase = 3;
    const upperCase = 1;

    const codePage = {
        'k': 'koi8-r',
        'w': 'Windows-1251',
        'd': 'cp866',
        'i': 'ISO-8859-5',
        'm': 'maccyrillic',
    };

    const charsets = {
        'k': 0,
        'w': 0,
        'd': 0,
        'i': 0,
        'm': 0,
    };

    // Strings expose bytes via charCodeAt, Buffers/typed arrays via indexing.
    const isString = (typeof str === 'string');
    const len = (str ? str.length : 0);
    const blockSize = (len > 5*3000 ? 3000 : len);
    let counter = 0;
    let i = 0;
    while (i < len) {
        const char = (isString ? str.charCodeAt(i) : str[i]);
        // Advance before any `continue`, otherwise the loop never terminates
        // on the first non-Cyrillic byte.
        i++;

        //non-russian characters (also rejects undefined/NaN)
        if (!(char >= 128 && char <= 255))
            continue;

        //CP866
        if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
        if ((char > 127 && char < 160)) charsets['d'] += upperCase;

        //KOI8-R
        if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
        if ((char > 222 && char < 256)) charsets['k'] += upperCase;

        //WIN-1251
        if (char > 223 && char < 256) charsets['w'] += lowerCase;
        if (char > 191 && char < 224) charsets['w'] += upperCase;

        //MAC
        if (char > 221 && char < 255) charsets['m'] += lowerCase;
        if (char > 127 && char < 160) charsets['m'] += upperCase;

        //ISO-8859-5
        if (char > 207 && char < 240) charsets['i'] += lowerCase;
        if (char > 175 && char < 208) charsets['i'] += upperCase;

        counter++;

        // Only reachable for large inputs (blockSize === 3000): skip ahead so
        // that a few blocks spread over the data are sampled.
        if (counter > blockSize) {
            counter = 0;
            i += Math.round(len/2 - 2*blockSize);
        }
    }

    // Pick the highest score; strict '>' keeps the earliest key on ties.
    let best = 'k';
    for (const key of Object.keys(charsets)) {
        if (charsets[key] > charsets[best])
            best = key;
    }

    return codePage[best];
}
+
+module.exports = getEncoding;

+ 7 - 2
server/core/BookConverter/index.js

@@ -4,6 +4,7 @@ const iconv = require('iconv-lite');
 const chardet = require('chardet');
 const _ = require('lodash');
 const sax = require('./sax');
+const getEncoding = require('./getEncoding');
 
 const FileDetector = require('../FileDetector');
 
@@ -43,9 +44,9 @@ class BookConverter {
     }
 
     decode(data) {
-        const charsetAll = chardet.detectAll(data.slice(0, 10000));
+        const charsetAll = chardet.detectAll(data.slice(0, 20000));
 
-        let selected = 'ISO-8859-1';
+        let selected = 'ISO-8859-5';
         for (const charset of charsetAll) {
             if (charset.name.indexOf('ISO-8859') < 0) {
                 selected = charset.name;
@@ -53,6 +54,10 @@ class BookConverter {
             }
         }
 
+        if (selected == 'ISO-8859-5') {
+            selected = getEncoding(data);
+        }
+
         return iconv.decode(data, selected);
     }