// textUtils.js (3.2 KB)

  1. const chardet = require('chardet');
  /**
   * Detects the encoding of a byte buffer.
   *
   * The quick heuristic in getEncodingLite() is tried first. Only when it
   * answers 'ISO-8859-5' (which is also its fallback answer) and the buffer
   * is longer than 10 bytes, chardet is asked to analyse the first 100000
   * bytes; the first candidate whose name does not contain 'ISO-8859' wins.
   *
   * @param {Buffer} buf - raw bytes to inspect
   * @returns {string} encoding name
   */
  function getEncoding(buf) {
      const quickGuess = getEncodingLite(buf);

      // Anything other than the ISO-8859-5 answer is trusted as is.
      if (quickGuess !== 'ISO-8859-5' || !(buf.length > 10)) {
          return quickGuess;
      }

      const candidates = chardet.analyse(buf.slice(0, 100000));
      const preferred = candidates.find((entry) => !entry.name.includes('ISO-8859'));

      return (preferred !== undefined ? preferred.name : quickGuess);
  }
  /**
   * Lightweight heuristic detector for Cyrillic encodings.
   *
   * Scans at most the first 100000 bytes and gives every candidate encoding
   * points for each byte >= 128 that falls into one of the ranges tested
   * below (3 points for the "lowerCase" ranges, 1 point for the "upperCase"
   * ranges). A two-byte UTF-8 sequence (lead byte 0xD0/0xD1 followed by a
   * continuation byte 0x80..0xBF) scores 3 points for utf-8 and is consumed
   * as a whole, so its continuation byte is not scored again as a
   * single-byte character.
   *
   * @param {Buffer|Uint8Array|number[]} buf - bytes to inspect
   * @param {boolean} [returnAll] - when truthy, return the full ranking
   * @returns {string|Array<{codePage: string, c: number, totalChecked: number}>}
   *   With returnAll: all candidates sorted by descending score `c`, each
   *   carrying `totalChecked` (number of bytes scanned). Otherwise: the name
   *   of the best candidate if its score is positive and greater than half
   *   of the scanned byte count, else 'ISO-8859-5'.
   */
  function getEncodingLite(buf, returnAll) {
      const lowerCase = 3;
      const upperCase = 1;

      const codePage = {
          'k': 'koi8-r',
          'w': 'Windows-1251',
          'd': 'cp866',
          'i': 'ISO-8859-5',
          'm': 'maccyrillic',
          'u': 'utf-8',
      };

      const charsets = {
          'k': 0,
          'w': 0,
          'd': 0,
          'i': 0,
          'm': 0,
          'u': 0,
      };

      const len = (buf.length > 100000 ? 100000 : buf.length);
      let i = 0;
      let totalChecked = 0;
      while (i < len) {
          const char = buf[i];
          const nextChar = (i < len - 1 ? buf[i + 1] : 0);

          totalChecked++;
          i++;

          //non-russian characters
          if (char < 128 || char > 255)
              continue;

          //UTF-8: 0xD0/0xD1 lead byte + continuation byte in 0x80..0xBF
          if ((char === 208 || char === 209) && nextChar >= 128 && nextChar <= 191) {
              charsets['u'] += lowerCase;
              // Consume the continuation byte as part of this character;
              // it still counts towards the number of bytes scanned.
              totalChecked++;
              i++;
              continue;
          }

          //CP866
          if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
          if (char > 127 && char < 160) charsets['d'] += upperCase;

          //KOI8-R
          if (char > 191 && char < 223) charsets['k'] += lowerCase;
          if (char > 222 && char < 256) charsets['k'] += upperCase;

          //WIN-1251
          if (char > 223 && char < 256) charsets['w'] += lowerCase;
          if (char > 191 && char < 224) charsets['w'] += upperCase;

          //MAC
          if (char > 221 && char < 255) charsets['m'] += lowerCase;
          if (char > 127 && char < 160) charsets['m'] += upperCase;

          //ISO-8859-5
          if (char > 207 && char < 240) charsets['i'] += lowerCase;
          if (char > 175 && char < 208) charsets['i'] += upperCase;
      }

      const sorted = Object.keys(charsets).map((key) => (
          { codePage: codePage[key], c: charsets[key], totalChecked }
      ));

      // Highest score first.
      sorted.sort((a, b) => b.c - a.c);

      if (returnAll)
          return sorted;

      const best = sorted[0];
      if (best.c > 0 && best.c > best.totalChecked/2)
          return best.codePage;

      return 'ISO-8859-5';
  }
  /**
   * Heuristically decides whether a buffer looks like text.
   *
   * Returns true right away when the top-ranked candidate from
   * getEncodingLite(buf, true) has a score above 90% of the scanned byte
   * count. Otherwise the buffer counts as text when it is shorter than 1000
   * bytes, or when spaces make up more than 10% of it, or when CR or LF
   * bytes each make up more than 3% of it (frequencies are computed over
   * buf.length + 1).
   *
   * @param {Buffer} buf - bytes to inspect
   * @returns {boolean} true if the buffer is considered text
   */
  function checkIfText(buf) {
      const ranking = getEncodingLite(buf, true);
      const best = ranking[0];
      if (best.c > best.totalChecked*0.9) {
          return true;
      }

      let spaces = 0;
      let carriageReturns = 0;
      let lineFeeds = 0;
      for (let pos = 0; pos < buf.length; pos++) {
          const byte = buf[pos];
          if (byte === 32) {
              spaces++;
          } else if (byte === 13) {
              carriageReturns++;
          } else if (byte === 10) {
              lineFeeds++;
          }
      }

      // "+ 1" keeps the division defined for an empty buffer.
      const denominator = buf.length + 1;

      if (buf.length < 1000) {
          return true;
      }
      if (spaces/denominator > 0.1) {
          return true;
      }
      if (carriageReturns/denominator > 0.03) {
          return true;
      }
      return lineFeeds/denominator > 0.03;
  }
  // Public API of this module.
  module.exports = { getEncoding, getEncodingLite, checkIfText };