Explorar o código

Улучшено определение кодировки и текстового файла

Book Pauk hai 6 anos
pai
achega
aeadb5aeb8
Modificáronse 2 ficheiros con 43 adicións e 24 borrados
  1. 4 1
      server/core/BookConverter/index.js
  2. 39 23
      server/core/BookConverter/textUtils.js

+ 4 - 1
server/core/BookConverter/index.js

@@ -66,7 +66,10 @@ class BookConverter {
             }
         }
 
-        return iconv.decode(data, selected);
+        if (selected.toLowerCase() != 'utf-8')
+            return iconv.decode(data, selected);
+        else
+            return data;
     }
 
     checkEncoding(data) {

+ 39 - 23
server/core/BookConverter/textUtils.js

@@ -1,4 +1,4 @@
-function getEncoding(buf) {
+function getEncoding(buf, returnAll) {
     const lowerCase = 3;
     const upperCase = 1;
 
@@ -8,6 +8,7 @@ function getEncoding(buf) {
         'd': 'cp866',
         'i': 'ISO-8859-5',
         'm': 'maccyrillic',
+        'u': 'utf-8',
     };
 
     let charsets = {
@@ -15,38 +16,47 @@ function getEncoding(buf) {
         'w': 0,
         'd': 0,
         'i': 0,
-        'm': 0
+        'm': 0,
+        'u': 0,
     };
 
     const len = buf.length;
     const blockSize = (len > 5*3000 ? 3000 : len);
     let counter = 0;
     let i = 0;
+    let totalChecked = 0;
     while (i < len) {
         const char = buf[i];
+        const nextChar = (i < len - 1 ? buf[i + 1] : 0);
+        totalChecked++;
         i++;
         //non-russian characters
         if (char < 128 || char > 256)
             continue;
-        //CP866
-        if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
-        if ((char > 127 && char < 160)) charsets['d'] += upperCase;
-
-        //KOI8-R
-        if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
-        if ((char > 222 && char < 256)) charsets['k'] += upperCase;
-
-        //WIN-1251
-        if (char > 223 && char < 256) charsets['w'] += lowerCase;
-        if (char > 191 && char < 224) charsets['w'] += upperCase;
-
-        //MAC
-        if (char > 221 && char < 255) charsets['m'] += lowerCase;
-        if (char > 127 && char < 160) charsets['m'] += upperCase;
-
-        //ISO-8859-5
-        if (char > 207 && char < 240) charsets['i'] += lowerCase;
-        if (char > 175 && char < 208) charsets['i'] += upperCase;
+        //UTF-8
+        if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190)
+            charsets['u'] += lowerCase;
+        else {
+            //CP866
+            if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
+            if ((char > 127 && char < 160)) charsets['d'] += upperCase;
+
+            //KOI8-R
+            if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
+            if ((char > 222 && char < 256)) charsets['k'] += upperCase;
+
+            //WIN-1251
+            if (char > 223 && char < 256) charsets['w'] += lowerCase;
+            if (char > 191 && char < 224) charsets['w'] += upperCase;
+
+            //MAC
+            if (char > 221 && char < 255) charsets['m'] += lowerCase;
+            if (char > 127 && char < 160) charsets['m'] += upperCase;
+
+            //ISO-8859-5
+            if (char > 207 && char < 240) charsets['i'] += lowerCase;
+            if (char > 175 && char < 208) charsets['i'] += upperCase;
+        }
 
         counter++;
 
@@ -57,18 +67,24 @@ function getEncoding(buf) {
     }
 
     let sorted = Object.keys(charsets).map(function(key) {
-        return { codePage: codePage[key], c: charsets[key] };
+        return { codePage: codePage[key], c: charsets[key], totalChecked };
     });
 
     sorted.sort((a, b) => b.c - a.c);
 
-    if (sorted[0].c > 0)
+    if (returnAll)
+        return sorted;
+    else if (sorted[0].c > 0)
         return sorted[0].codePage;
     else
         return 'ISO-8859-5';
 }
 
 function checkIfText(buf) {
+    const enc = getEncoding(buf, true);
+    if (enc[0].c > enc[0].totalChecked*0.9)
+        return true;
+
     let spaceCount = 0;
     let crCount = 0;
     let lfCount = 0;