// ConvertPdf.js
  1. const fs = require('fs-extra');
  2. const path = require('path');
  3. const sax = require('./sax');
  4. const utils = require('../utils');
  5. const ConvertHtml = require('./ConvertHtml');
  6. class ConvertPdf extends ConvertHtml {
  7. check(data, opts) {
  8. const {inputFiles} = opts;
  9. return this.config.useExternalBookConverter &&
  10. inputFiles.sourceFileType && inputFiles.sourceFileType.ext == 'pdf';
  11. }
  12. async run(notUsed, opts) {
  13. if (!this.check(notUsed, opts))
  14. return false;
  15. await this.checkExternalConverterPresent();
  16. const {inputFiles, callback} = opts;
  17. const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;
  18. //конвертируем в xml
  19. let perc = 0;
  20. await this.execConverter(this.pdfToHtmlPath, ['-c', '-s', '-xml', inputFiles.sourceFile, outFile], () => {
  21. perc = (perc < 80 ? perc + 10 : 40);
  22. callback(perc);
  23. });
  24. callback(80);
  25. const data = await fs.readFile(outFile);
  26. callback(90);
  27. //парсим xml
  28. let lines = [];
  29. let images = [];
  30. let loading = [];
  31. let inText = false;
  32. let title = '';
  33. let prevTop = 0;
  34. let i = -1;
  35. const loadImage = async(image) => {
  36. const src = path.parse(image.src);
  37. let type = 'unknown';
  38. switch (src.ext) {
  39. case '.jpg': type = 'image/jpeg'; break;
  40. case '.png': type = 'image/png'; break;
  41. }
  42. if (type != 'unknown') {
  43. image.data = (await fs.readFile(image.src)).toString('base64');
  44. image.type = type;
  45. image.name = src.base;
  46. }
  47. }
  48. const putImage = (curTop) => {
  49. if (!isNaN(curTop) && images.length) {
  50. while (images.length && images[0].top < curTop) {
  51. i++;
  52. lines[i] = images[0];
  53. images.shift();
  54. }
  55. }
  56. }
  57. const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  58. if (!cutCounter && inText) {
  59. lines[i].text += text + ' ';
  60. if (i < 2)
  61. title += text + ' ';
  62. }
  63. };
  64. const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  65. if (!cutCounter) {
  66. if (tag == 'text' && !inText) {
  67. let attrs = sax.getAttrsSync(tail);
  68. const line = {
  69. text: '',
  70. top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
  71. left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
  72. width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
  73. height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
  74. };
  75. if (line.width !== '0' || line.height !== '0') {
  76. inText = true;
  77. if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
  78. putImage(line.top);
  79. i++;
  80. lines[i] = line;
  81. }
  82. prevTop = line.top;
  83. }
  84. }
  85. if (tag == 'image') {
  86. let attrs = sax.getAttrsSync(tail);
  87. const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
  88. if (src) {
  89. const image = {
  90. isImage: true,
  91. src,
  92. data: '',
  93. type: '',
  94. top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
  95. };
  96. loading.push(loadImage(image));
  97. images.push(image);
  98. images.sort((a, b) => a.top - b.top)
  99. }
  100. }
  101. if (tag == 'page') {
  102. putImage(100000);
  103. }
  104. }
  105. };
  106. const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  107. if (tag == 'text')
  108. inText = false;
  109. };
  110. let buf = this.decode(data).toString();
  111. sax.parseSync(buf, {
  112. onStartNode, onEndNode, onTextNode
  113. });
  114. putImage(100000);
  115. await Promise.all(loading);
  116. //найдем параграфы и отступы
  117. const indents = [];
  118. for (const line of lines) {
  119. if (line.isImage)
  120. continue;
  121. if (!isNaN(line.left)) {
  122. indents[line.left] = 1;
  123. }
  124. }
  125. let j = 0;
  126. for (let i = 0; i < indents.length; i++) {
  127. if (indents[i]) {
  128. j++;
  129. indents[i] = j;
  130. }
  131. }
  132. indents[0] = 0;
  133. //формируем текст
  134. let text = `<title>${title}</title>`;
  135. let concat = '';
  136. let sp = '';
  137. for (const line of lines) {
  138. if (line.isImage) {
  139. text += `<fb2-image type="${line.type}" name="${line.name}">${line.data}</fb2-image>`;
  140. continue;
  141. }
  142. if (concat == '') {
  143. const left = line.left || 0;
  144. sp = ' '.repeat(indents[left]);
  145. }
  146. let t = line.text.trim();
  147. if (t.substr(-1) == '-') {
  148. t = t.substr(0, t.length - 1);
  149. concat += t;
  150. } else {
  151. text += sp + concat + t + "\n";
  152. concat = '';
  153. }
  154. }
  155. if (concat)
  156. text += sp + concat + "\n";
  157. return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
  158. }
  159. }
  160. module.exports = ConvertPdf;