// textUtils.js
  /**
   * Guesses which single-byte Cyrillic code page a byte sequence is encoded in.
   *
   * Every element in the high half of the byte range is scored against the
   * letter ranges of five candidate code pages. A hit in a lowercase range
   * adds 3, a hit in an uppercase range adds 1. The candidate with the highest
   * score wins; on a tie the candidate listed first (k, w, d, i, m) wins.
   *
   * Inputs of up to 15000 elements are scanned completely. For longer inputs,
   * each time more than 3000 high bytes have been scored the cursor jumps
   * forward by round(len / 2 - 6000), so only a sample is examined.
   *
   * @param {Buffer|Uint8Array|number[]} buf - Indexable sequence of byte values.
   * @returns {string} One of 'koi8-r', 'Windows-1251', 'cp866', 'ISO-8859-5',
   *   'maccyrillic'. 'ISO-8859-5' is also the fallback when nothing scored.
   */
  function getEncoding(buf) {
    // Score weights for a byte landing in a lowercase / uppercase letter range.
    const lowerCase = 3;
    const upperCase = 1;

    const codePage = {
      k: 'koi8-r',
      w: 'Windows-1251',
      d: 'cp866',
      i: 'ISO-8859-5',
      m: 'maccyrillic',
    };
    const charsets = { k: 0, w: 0, d: 0, i: 0, m: 0 };

    const len = buf.length;
    const blockSize = len > 5 * 3000 ? 3000 : len;
    let counter = 0;
    let i = 0;

    while (i < len) {
      const char = buf[i];
      i++;

      // Values outside the high half of the byte range carry no signal.
      if (char < 128 || char > 256) {
        continue;
      }

      // CP866
      if ((char >= 160 && char <= 175) || (char >= 224 && char <= 241)) {
        charsets.d += lowerCase;
      }
      if (char >= 128 && char <= 159) {
        charsets.d += upperCase;
      }

      // KOI8-R: lowercase is 0xC0-0xDF, uppercase is 0xE0-0xFF.
      // 0xDF (223) is lowercase 'ъ' and used to be counted as uppercase.
      if (char >= 192 && char <= 223) {
        charsets.k += lowerCase;
      }
      if (char >= 224 && char <= 255) {
        charsets.k += upperCase;
      }

      // Windows-1251
      if (char >= 224 && char <= 255) {
        charsets.w += lowerCase;
      }
      if (char >= 192 && char <= 223) {
        charsets.w += upperCase;
      }

      // MacCyrillic
      if (char >= 222 && char <= 254) {
        charsets.m += lowerCase;
      }
      if (char >= 128 && char <= 159) {
        charsets.m += upperCase;
      }

      // ISO-8859-5
      if (char >= 208 && char <= 239) {
        charsets.i += lowerCase;
      }
      if (char >= 176 && char <= 207) {
        charsets.i += upperCase;
      }

      // Sampling: once a block of high bytes has been scored, skip ahead.
      // Only reachable for len > 15000, where the jump is always positive.
      counter++;
      if (counter > blockSize) {
        counter = 0;
        i += Math.round(len / 2 - 2 * blockSize);
      }
    }

    // Pick the first key with the maximum score (same tie order as a stable
    // descending sort over the keys in declaration order).
    let best = null;
    for (const key of Object.keys(charsets)) {
      if (best === null || charsets[key] > charsets[best]) {
        best = key;
      }
    }

    return charsets[best] > 0 ? codePage[best] : 'ISO-8859-5';
  }
  /**
   * Heuristically decides whether a byte sequence looks like text.
   *
   * Anything shorter than 1000 elements is accepted outright. Longer input is
   * accepted when spaces (32) make up more than 10% of it, or when carriage
   * returns (13) or line feeds (10) each make up more than 3%. Frequencies are
   * computed against length + 1.
   *
   * @param {Buffer|Uint8Array|number[]} buf - Indexable sequence of byte values.
   * @returns {boolean} true when the input passes any of the checks above.
   */
  function checkIfText(buf) {
    const total = buf.length;

    // Short inputs are always treated as text.
    if (total < 1000) {
      return true;
    }

    const tally = { space: 0, cr: 0, lf: 0 };
    let pos = 0;
    while (pos < total) {
      const value = buf[pos];
      pos += 1;
      // Loose equality is kept on purpose so element comparison is unchanged.
      if (value == 32) {
        tally.space += 1;
      } else if (value == 13) {
        tally.cr += 1;
      } else if (value == 10) {
        tally.lf += 1;
      }
    }

    const denominator = total + 1;
    if (tally.space / denominator > 0.1) {
      return true;
    }
    if (tally.cr / denominator > 0.03) {
      return true;
    }
    return tally.lf / denominator > 0.03;
  }
  // Public API of this module.
  module.exports = { getEncoding, checkIfText };