// ConvertPdf.js (4.1 KB)
  1. const fs = require('fs-extra');
  2. const sax = require('./sax');
  3. const utils = require('../utils');
  4. const ConvertHtml = require('./ConvertHtml');
  // Text nodes are collected into the title while at most this many lines exist.
  const TITLE_LINE_COUNT = 2;
  // Two text boxes whose `top` differs by no more than this are treated as one visual line.
  const LINE_TOP_TOLERANCE = 3;

  /**
   * Walks the converter's XML output and groups `<text>` nodes into visual lines.
   *
   * @param {string} xml - decoded XML text to feed to `sax.parseSync`
   * @returns {{lines: Array<{text: string, top: number, left: number, width: number, height: number}>, title: string}}
   *   the collected lines (geometry fields are NaN when the attribute is missing)
   *   and the concatenated text of the first lines, used as the title
   */
  function parsePdfXmlLines(xml) {
      const lines = [];
      let title = '';
      let inText = false;
      let prevTop = 0;

      // Attributes come from sax.getAttrsSync as {name: {value}}; a missing one becomes NaN.
      const intAttr = (attr) => (attr && attr.value ? parseInt(attr.value, 10) : NaN);

      const onStartNode = (tag, tail, singleTag, cutCounter) => {
          if (cutCounter || tag !== 'text' || inText)
              return;

          const attrs = sax.getAttrsSync(tail);
          const line = {
              text: '',
              top: intAttr(attrs.top),
              left: intAttr(attrs.left),
              width: intAttr(attrs.width),
              height: intAttr(attrs.height),
          };

          // Skip boxes that have neither width nor height. The values are numbers here,
          // so they must be compared with numeric 0 (a comparison with '0' never matches).
          if (line.width === 0 && line.height === 0)
              return;

          inText = true;

          // Open a new line when nothing has been collected yet (otherwise the first text
          // node would have no line to append to), when the vertical position is unknown,
          // or when it moved further than the tolerance.
          const startsNewLine = lines.length === 0
              || Number.isNaN(line.top)
              || Number.isNaN(prevTop)
              || Math.abs(prevTop - line.top) > LINE_TOP_TOLERANCE;
          if (startsNewLine)
              lines.push(line);

          prevTop = line.top;
      };

      const onTextNode = (text, cutCounter) => {
          if (cutCounter || !inText)
              return;

          lines[lines.length - 1].text += `${text} `;
          if (lines.length <= TITLE_LINE_COUNT)
              title += `${text} `;
      };

      const onEndNode = (tag) => {
          if (tag === 'text')
              inText = false;
      };

      sax.parseSync(xml, {onStartNode, onEndNode, onTextNode});

      return {lines, title};
  }

  /**
   * Maps every distinct `left` offset found in the lines to an indent width.
   *
   * @param {Array<{left: number}>} lines - lines produced by parsePdfXmlLines
   * @returns {number[]} sparse array indexed by `left`; the value is the 1-based rank of
   *   that offset among the distinct non-negative offsets, except index 0 which is always 0
   */
  function buildIndentLevels(lines) {
      const indents = [];
      for (const line of lines) {
          if (!Number.isNaN(line.left))
              indents[line.left] = 1;
      }

      let level = 0;
      for (let left = 0; left < indents.length; left++) {
          if (indents[left])
              indents[left] = ++level;
      }

      // Offset 0 (also used for lines without a known offset) gets no indent.
      indents[0] = 0;

      return indents;
  }

  /**
   * Builds the plain text that is handed over to the HTML converter.
   *
   * @param {Array<{text: string, left: number}>} lines - lines produced by parsePdfXmlLines
   * @param {string} title - text placed inside the leading `<title>` element
   * @returns {string} title element followed by one "\n"-terminated row per output line
   */
  function composeText(lines, title) {
      const indents = buildIndentLevels(lines);

      let text = `<title>${title}</title>`;
      let pending = '';
      let pad = '';
      for (const line of lines) {
          // The indent is taken from the line that starts the row, not from continuations.
          if (pending === '')
              pad = ' '.repeat(indents[line.left || 0]);

          const t = line.text.trim();
          if (t.endsWith('-')) {
              // Trailing hyphen: drop it and glue the next line directly to this one.
              pending += t.slice(0, -1);
          } else {
              text += `${pad}${pending}${t}\n`;
              pending = '';
          }
      }
      if (pending)
          text += `${pad}${pending}\n`;

      return text;
  }

  /**
   * Converter for PDF sources: runs the external converter configured in
   * `this.pdfToHtmlPath` to get XML, rebuilds text lines from it and passes
   * the resulting text to ConvertHtml.
   */
  class ConvertPdf extends ConvertHtml {
      /**
       * Tells whether this converter should handle the input.
       *
       * @param {*} data - not used
       * @param {object} opts - must contain `inputFiles`
       * @returns {*} truthy when external converters are enabled in the config
       *   and the detected source file extension is 'pdf'
       */
      check(data, opts) {
          const {inputFiles} = opts;

          return this.config.useExternalBookConverter &&
              inputFiles.sourceFileType && inputFiles.sourceFileType.ext === 'pdf';
      }

      /**
       * Converts the PDF referenced by `opts.inputFiles.sourceFile`.
       *
       * @param {*} notUsed - only forwarded to check()
       * @param {object} opts - `inputFiles` (sourceFile, filesDir, sourceFileType)
       *   and `callback`, which receives progress values
       * @returns {Promise<*>} false when check() rejects the input, otherwise the
       *   result of ConvertHtml's run() for the generated text
       */
      async run(notUsed, opts) {
          if (!this.check(notUsed, opts))
              return false;
          await this.checkExternalConverterPresent();

          const {inputFiles, callback} = opts;
          const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;

          // Convert to xml. Each notification from execConverter moves the reported
          // progress up by 10 until it reaches 80, then it drops back to 40 and climbs again.
          let perc = 0;
          await this.execConverter(this.pdfToHtmlPath, ['-c', '-s', '-xml', inputFiles.sourceFile, outFile], () => {
              perc = (perc < 80 ? perc + 10 : 40);
              callback(perc);
          });
          callback(80);

          const data = await fs.readFile(outFile);
          callback(90);

          // Parse the xml, then find paragraphs/indents and build the text.
          const {lines, title} = parsePdfXmlLines(this.decode(data).toString());
          const text = composeText(lines, title);

          return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
      }
  }
  107. module.exports = ConvertPdf;