// textUtils.js — Cyrillic encoding detection and a text/binary heuristic.
  1. function getEncoding(buf, returnAll) {
  2. const lowerCase = 3;
  3. const upperCase = 1;
  4. const codePage = {
  5. 'k': 'koi8-r',
  6. 'w': 'Windows-1251',
  7. 'd': 'cp866',
  8. 'i': 'ISO-8859-5',
  9. 'm': 'maccyrillic',
  10. 'u': 'utf-8',
  11. };
  12. let charsets = {
  13. 'k': 0,
  14. 'w': 0,
  15. 'd': 0,
  16. 'i': 0,
  17. 'm': 0,
  18. 'u': 0,
  19. };
  20. const len = buf.length;
  21. const blockSize = (len > 5*3000 ? 3000 : len);
  22. let counter = 0;
  23. let i = 0;
  24. let totalChecked = 0;
  25. while (i < len) {
  26. const char = buf[i];
  27. const nextChar = (i < len - 1 ? buf[i + 1] : 0);
  28. totalChecked++;
  29. i++;
  30. //non-russian characters
  31. if (char < 128 || char > 256)
  32. continue;
  33. //UTF-8
  34. if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190)
  35. charsets['u'] += lowerCase;
  36. else {
  37. //CP866
  38. if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
  39. if ((char > 127 && char < 160)) charsets['d'] += upperCase;
  40. //KOI8-R
  41. if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
  42. if ((char > 222 && char < 256)) charsets['k'] += upperCase;
  43. //WIN-1251
  44. if (char > 223 && char < 256) charsets['w'] += lowerCase;
  45. if (char > 191 && char < 224) charsets['w'] += upperCase;
  46. //MAC
  47. if (char > 221 && char < 255) charsets['m'] += lowerCase;
  48. if (char > 127 && char < 160) charsets['m'] += upperCase;
  49. //ISO-8859-5
  50. if (char > 207 && char < 240) charsets['i'] += lowerCase;
  51. if (char > 175 && char < 208) charsets['i'] += upperCase;
  52. }
  53. counter++;
  54. if (counter > blockSize) {
  55. counter = 0;
  56. i += Math.round(len/2 - 2*blockSize);
  57. }
  58. }
  59. let sorted = Object.keys(charsets).map(function(key) {
  60. return { codePage: codePage[key], c: charsets[key], totalChecked };
  61. });
  62. sorted.sort((a, b) => b.c - a.c);
  63. if (returnAll)
  64. return sorted;
  65. else if (sorted[0].c > 0)
  66. return sorted[0].codePage;
  67. else
  68. return 'ISO-8859-5';
  69. }
  70. function checkIfText(buf) {
  71. const enc = getEncoding(buf, true);
  72. if (enc[0].c > enc[0].totalChecked*0.9)
  73. return true;
  74. let spaceCount = 0;
  75. let crCount = 0;
  76. let lfCount = 0;
  77. for (let i = 0; i < buf.length; i++) {
  78. if (buf[i] == 32)
  79. spaceCount++;
  80. if (buf[i] == 13)
  81. crCount++;
  82. if (buf[i] == 10)
  83. lfCount++;
  84. }
  85. const spaceFreq = spaceCount/(buf.length + 1);
  86. const crFreq = crCount/(buf.length + 1);
  87. const lfFreq = lfCount/(buf.length + 1);
  88. return (buf.length < 1000 || spaceFreq > 0.1 || crFreq > 0.03 || lfFreq > 0.03);
  89. }
  90. module.exports = {
  91. getEncoding,
  92. checkIfText,
  93. }