textUtils.js 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. const chardet = require('chardet');
  /**
   * Detects the text encoding of a buffer.
   *
   * The fast heuristic in getEncodingLite() is tried first. 'ISO-8859-5' is
   * also the value that heuristic falls back to when no candidate is
   * convincing, so in that case chardet is consulted on the first 20000 bytes
   * and the first detected charset whose name does not contain 'ISO-8859'
   * is used instead.
   *
   * @param {Buffer} buf - raw bytes to inspect
   * @returns {string} name of the selected encoding
   */
  function getEncoding(buf) {
      const quickGuess = getEncodingLite(buf);
      if (quickGuess !== 'ISO-8859-5') {
          return quickGuess;
      }

      const candidates = chardet.detectAll(buf.slice(0, 20000));
      const alternative = candidates.find(
          (candidate) => candidate.name.indexOf('ISO-8859') < 0
      );

      // Keep the heuristic's answer when chardet offers nothing outside ISO-8859.
      return (alternative ? alternative.name : quickGuess);
  }
  /**
   * Cheap byte-frequency heuristic that guesses which Cyrillic encoding
   * (KOI8-R, Windows-1251, CP866, ISO-8859-5, MacCyrillic) or UTF-8 a buffer
   * is written in.
   *
   * Every non-ASCII byte adds a weight to each code page whose letter ranges
   * contain it (3 for the range scored as lowercase, 1 for the range scored
   * as uppercase). A 0xD0/0xD1 lead byte followed by a UTF-8 continuation
   * byte is credited to UTF-8 only.
   *
   * Buffers longer than 15000 bytes are sampled: once more than 3000
   * non-ASCII bytes have been scored, the scan jumps forward by
   * round(len/2 - 6000) bytes before continuing.
   *
   * @param {Buffer|Uint8Array|number[]} buf - raw bytes to inspect
   * @param {boolean} [returnAll] - when truthy, return the full ranking
   * @returns {string|Array<{codePage: string, c: number, totalChecked: number}>}
   *   With returnAll: all candidates sorted by descending score `c`, each
   *   carrying the number of bytes visited (`totalChecked`). Otherwise the
   *   name of the best candidate if its score is positive and greater than
   *   totalChecked/2, else 'ISO-8859-5'.
   */
  function getEncodingLite(buf, returnAll) {
      const lowerCase = 3;
      const upperCase = 1;

      const codePage = {
          'k': 'koi8-r',
          'w': 'Windows-1251',
          'd': 'cp866',
          'i': 'ISO-8859-5',
          'm': 'maccyrillic',
          'u': 'utf-8',
      };

      const charsets = {
          'k': 0,
          'w': 0,
          'd': 0,
          'i': 0,
          'm': 0,
          'u': 0,
      };

      const len = buf.length;
      const blockSize = (len > 5*3000 ? 3000 : len);
      let counter = 0;
      let i = 0;
      let totalChecked = 0;
      while (i < len) {
          const char = buf[i];
          const nextChar = (i < len - 1 ? buf[i + 1] : 0);
          totalChecked++;
          i++;

          // Non-Russian characters: ASCII, or values outside the byte range
          // (the latter can only occur for non-byte array input).
          if (char < 128 || char > 256)
              continue;

          // UTF-8: lead byte 0xD0/0xD1 followed by a continuation byte.
          // Continuation bytes span 0x80..0xBF (128..191) inclusive; 0xBF must
          // be accepted, otherwise D0 BF ("п") is mis-scored as single-byte.
          if ((char === 208 || char === 209) && nextChar >= 128 && nextChar <= 191) {
              charsets['u'] += lowerCase;
          } else {
              // CP866
              if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
              if (char > 127 && char < 160) charsets['d'] += upperCase;
              // KOI8-R
              if (char > 191 && char < 223) charsets['k'] += lowerCase;
              if (char > 222 && char < 256) charsets['k'] += upperCase;
              // WIN-1251
              if (char > 223 && char < 256) charsets['w'] += lowerCase;
              if (char > 191 && char < 224) charsets['w'] += upperCase;
              // MAC
              if (char > 221 && char < 255) charsets['m'] += lowerCase;
              if (char > 127 && char < 160) charsets['m'] += upperCase;
              // ISO-8859-5
              if (char > 207 && char < 240) charsets['i'] += lowerCase;
              if (char > 175 && char < 208) charsets['i'] += upperCase;
          }

          // Only non-ASCII bytes count towards the sampling block.
          counter++;
          if (counter > blockSize) {
              counter = 0;
              i += Math.round(len/2 - 2*blockSize);
          }
      }

      const sorted = Object.keys(charsets).map((key) => {
          return { codePage: codePage[key], c: charsets[key], totalChecked };
      });
      sorted.sort((a, b) => b.c - a.c);

      if (returnAll)
          return sorted;

      // Accept the winner only if it scored on a convincing share of the input.
      if (sorted[0].c > 0 && sorted[0].c > sorted[0].totalChecked/2)
          return sorted[0].codePage;

      return 'ISO-8859-5';
  }
  /**
   * Heuristically decides whether a buffer looks like text.
   *
   * A buffer is treated as text when any of the following holds:
   *  - the best candidate from getEncodingLite() scores above 90% of the
   *    number of bytes it visited;
   *  - the buffer is shorter than 1000 bytes;
   *  - spaces make up more than 10% of the bytes, or CR or LF bytes make up
   *    more than 3% each.
   *
   * @param {Buffer} buf - raw bytes to inspect
   * @returns {boolean} true when the buffer is considered text
   */
  function checkIfText(buf) {
      // The full ranking is required here. getEncoding() ignores a second
      // argument and returns a plain string, so the score check could never
      // succeed through it; getEncodingLite(buf, true) returns the ranking.
      const ranked = getEncodingLite(buf, true);
      if (ranked[0].c > ranked[0].totalChecked*0.9)
          return true;

      let spaceCount = 0;
      let crCount = 0;
      let lfCount = 0;
      for (let i = 0; i < buf.length; i++) {
          const byte = buf[i];
          if (byte === 32)
              spaceCount++;
          if (byte === 13)
              crCount++;
          if (byte === 10)
              lfCount++;
      }

      // The +1 keeps the division defined for an empty buffer.
      const total = buf.length + 1;
      const spaceFreq = spaceCount/total;
      const crFreq = crCount/total;
      const lfFreq = lfCount/total;

      return (buf.length < 1000 || spaceFreq > 0.1 || crFreq > 0.03 || lfFreq > 0.03);
  }
  104. module.exports = {
  105. getEncoding,
  106. getEncodingLite,
  107. checkIfText,
  108. }