// ConvertPdf.js (3.5 KB)
  1. const fs = require('fs-extra');
  2. const sax = require('./sax');
  3. const utils = require('../utils');
  4. const ConvertHtml = require('./ConvertHtml');
  /**
   * Returns the `value` of attribute `name` from the map produced by
   * sax.getAttrsSync(), or null when the attribute is missing or its value is falsy.
   */
  function attrValue(attrs, name) {
      const attr = attrs[name];
      return (attr && attr.value ? attr.value : null);
  }

  /**
   * Walks the XML with the project's sax parser and collects one record per
   * <text> element: its accumulated text plus the raw `top`, `left`, `width`
   * and `height` attribute strings (null when absent).
   *
   * @param {string} xml - decoded XML document
   * @returns {{lines: Array<object>, title: string}} the collected lines and a
   *   title made of the text found in the first two collected lines
   */
  function parsePdfXml(xml) {
      const lines = [];
      let title = '';
      // Line currently being filled; null while the parser is outside a <text> element.
      let current = null;

      const onStartNode = (tag, tail, singleTag, cutCounter) => {
          if (cutCounter || tag !== 'text' || current)
              return;

          const attrs = sax.getAttrsSync(tail);
          const line = {
              text: '',
              top: attrValue(attrs, 'top'),
              left: attrValue(attrs, 'left'),
              width: attrValue(attrs, 'width'),
              height: attrValue(attrs, 'height'),
          };

          // Elements whose width AND height are both '0' are skipped entirely.
          if (line.width !== '0' || line.height !== '0') {
              current = line;
              lines.push(line);
          }
      };

      const onTextNode = (text, cutCounter) => {
          if (cutCounter || !current)
              return;

          current.text += text + ' ';
          // Only the first two collected lines contribute to the title.
          if (lines.length <= 2)
              title += text + ' ';
      };

      const onEndNode = (tag) => {
          if (tag === 'text')
              current = null;
      };

      sax.parseSync(xml, {onStartNode, onEndNode, onTextNode});

      return {lines, title};
  }

  /**
   * Converts `top`/`left` of every line to integers in place (values that do
   * not parse are left untouched) and ranks the distinct `left` offsets.
   *
   * @param {Array<object>} lines - records produced by parsePdfXml(); mutated
   * @returns {function(*): number} maps a line's `left` value to the number of
   *   leading spaces to emit for it
   */
  function buildIndentLookup(lines) {
      const offsets = new Set();
      for (const line of lines) {
          const top = Number.parseInt(line.top, 10);
          if (!Number.isNaN(top))
              line.top = top;

          const left = Number.parseInt(line.left, 10);
          if (!Number.isNaN(left)) {
              line.left = left;
              offsets.add(left);
          }
      }

      // Rank the non-negative offsets 1..k in ascending order.
      const ranks = new Map();
      [...offsets]
          .filter((offset) => offset >= 0)
          .sort((a, b) => a - b)
          .forEach((offset, idx) => ranks.set(offset, idx + 1));
      // Offset 0 (also used for lines without a `left` value) never gets indentation.
      ranks.set(0, 0);

      return (left) => {
          // A `left` that could not be parsed stays a string and gets no indentation.
          if (typeof left !== 'number')
              return 0;
          // Negative offsets are not ranked; they get a single space.
          if (left < 0)
              return 1;
          return ranks.get(left);
      };
  }

  /**
   * Converter for PDF sources: runs the external converter configured in
   * `this.pdfToHtmlPath` to get an XML dump, rebuilds indented plain text from
   * it and hands that text to ConvertHtml.run().
   */
  class ConvertPdf extends ConvertHtml {
      /**
       * Tells whether this converter should handle the input.
       *
       * @param {*} data - not used
       * @param {object} opts - must contain `inputFiles`
       * @returns {*} truthy when `config.useExternalBookConverter` is set and
       *   `inputFiles.sourceFileType.ext` is 'pdf'; a falsy value otherwise
       */
      check(data, opts) {
          const {inputFiles} = opts;

          return this.config.useExternalBookConverter &&
              inputFiles.sourceFileType && inputFiles.sourceFileType.ext === 'pdf';
      }

      /**
       * Converts the PDF named by `opts.inputFiles.sourceFile`.
       *
       * @param {*} notUsed - only forwarded to check()
       * @param {object} opts - `inputFiles` (reads `sourceFile`, `fileListDir`,
       *   `sourceFileType`) and `callback`, which is called with 50 and 60
       *   (presumably progress percentages - confirm against the caller)
       * @returns {Promise<*>} false when check() rejects the input, otherwise
       *   the result of ConvertHtml.run()
       */
      async run(notUsed, opts) {
          if (!this.check(notUsed, opts))
              return false;
          await this.checkExternalConverterPresent();

          const {inputFiles, callback} = opts;
          const outFile = `${inputFiles.fileListDir}/${utils.randomHexString(10)}.xml`;

          // Convert the PDF to XML with the external tool
          // (presumably poppler's pdftohtml - confirm).
          await this.execConverter(this.pdfToHtmlPath, ['-c', '-s', '-xml', inputFiles.sourceFile, outFile]);
          callback(50);

          const data = await fs.readFile(outFile);
          callback(60);

          // Parse the XML into positioned lines.
          const {lines, title} = parsePdfXml(this.decode(data).toString());

          // Find the distinct left offsets so each line can be indented by its rank.
          const indentOf = buildIndentLookup(lines);

          // Build the text: the title followed by one indented line per <text> element.
          const body = lines
              .map((line) => ' '.repeat(indentOf(line.left || 0)) + line.text + '\n')
              .join('');
          const text = `<title>${title}</title>${body}`;

          return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
      }
  }

  module.exports = ConvertPdf;