ConvertPdf.js 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. const fs = require('fs-extra');
  2. const path = require('path');
  3. const sax = require('../../sax');
  4. const utils = require('../../utils');
  5. const ConvertHtml = require('./ConvertHtml');
  6. class ConvertPdf extends ConvertHtml {
  7. check(data, opts) {
  8. const {inputFiles} = opts;
  9. return this.config.useExternalBookConverter &&
  10. inputFiles.sourceFileType && inputFiles.sourceFileType.ext == 'pdf';
  11. }
  12. async run(notUsed, opts) {
  13. if (!opts.skipCheck) {
  14. if (!this.check(notUsed, opts))
  15. return false;
  16. }
  17. await this.checkExternalConverterPresent();
  18. const {inputFiles, callback, abort, uploadFileName} = opts;
  19. const inpFile = (opts.pdfFile ? opts.pdfFile : inputFiles.sourceFile);
  20. const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;
  21. //конвертируем в xml
  22. let perc = 0;
  23. await this.execConverter(this.pdfToHtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
  24. perc = (perc < 80 ? perc + 10 : 40);
  25. callback(perc);
  26. }, abort);
  27. callback(80);
  28. const data = await fs.readFile(outFile);
  29. callback(90);
  30. //парсим xml
  31. let lines = [];
  32. let images = [];
  33. let loading = [];
  34. let inText = false;
  35. let bold = false;
  36. let italic = false;
  37. let title = '';
  38. let prevTop = 0;
  39. let i = -1;
  40. let titleCount = 0;
  41. const loadImage = async(image) => {
  42. const src = path.parse(image.src);
  43. let type = 'unknown';
  44. switch (src.ext) {
  45. case '.jpg': type = 'image/jpeg'; break;
  46. case '.png': type = 'image/png'; break;
  47. }
  48. if (type != 'unknown') {
  49. image.data = (await fs.readFile(image.src)).toString('base64');
  50. image.type = type;
  51. image.name = src.base;
  52. }
  53. }
  54. const putImage = (curTop) => {
  55. if (!isNaN(curTop) && images.length) {
  56. while (images.length && images[0].top < curTop) {
  57. i++;
  58. lines[i] = images[0];
  59. images.shift();
  60. }
  61. }
  62. }
  63. const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  64. if (!cutCounter && inText) {
  65. let tOpen = (bold ? '<b>' : '');
  66. tOpen += (italic ? '<i>' : '');
  67. let tClose = (italic ? '</i>' : '');
  68. tClose += (bold ? '</b>' : '');
  69. lines[i].text += `${tOpen}${text}${tClose} `;
  70. if (titleCount < 2 && text.trim() != '') {
  71. title += text + (titleCount ? '' : ' - ');
  72. titleCount++;
  73. }
  74. }
  75. };
  76. const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  77. if (!cutCounter) {
  78. if (inText) {
  79. switch (tag) {
  80. case 'i':
  81. italic = true;
  82. break;
  83. case 'b':
  84. bold = true;
  85. break;
  86. }
  87. }
  88. if (tag == 'text' && !inText) {
  89. let attrs = sax.getAttrsSync(tail);
  90. const line = {
  91. text: '',
  92. top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
  93. left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
  94. width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
  95. height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
  96. };
  97. if (line.width != 0 || line.height != 0) {
  98. inText = true;
  99. if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
  100. putImage(line.top);
  101. i++;
  102. lines[i] = line;
  103. }
  104. prevTop = line.top;
  105. }
  106. }
  107. if (tag == 'image') {
  108. const attrs = sax.getAttrsSync(tail);
  109. const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
  110. if (src) {
  111. const image = {
  112. isImage: true,
  113. src,
  114. data: '',
  115. type: '',
  116. top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
  117. };
  118. loading.push(loadImage(image));
  119. images.push(image);
  120. images.sort((a, b) => a.top - b.top)
  121. }
  122. }
  123. if (tag == 'page') {
  124. putImage(100000);
  125. }
  126. }
  127. };
  128. const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  129. if (inText) {
  130. switch (tag) {
  131. case 'i':
  132. italic = false;
  133. break;
  134. case 'b':
  135. bold = false;
  136. break;
  137. }
  138. }
  139. if (tag == 'text')
  140. inText = false;
  141. };
  142. let buf = this.decode(data).toString();
  143. sax.parseSync(buf, {
  144. onStartNode, onEndNode, onTextNode
  145. });
  146. putImage(100000);
  147. await Promise.all(loading);
  148. //найдем параграфы и отступы
  149. const indents = [];
  150. for (const line of lines) {
  151. if (line.isImage)
  152. continue;
  153. if (!isNaN(line.left)) {
  154. indents[line.left] = 1;
  155. }
  156. }
  157. let j = 0;
  158. for (let i = 0; i < indents.length; i++) {
  159. if (indents[i]) {
  160. j++;
  161. indents[i] = j;
  162. }
  163. }
  164. indents[0] = 0;
  165. //формируем текст
  166. if (!title && uploadFileName)
  167. title = uploadFileName;
  168. let text = `<title>${title}</title>`;
  169. let concat = '';
  170. let sp = '';
  171. for (const line of lines) {
  172. if (line.isImage) {
  173. text += `<fb2-image type="${line.type}" name="${line.name}">${line.data}</fb2-image>`;
  174. continue;
  175. }
  176. if (concat == '') {
  177. const left = line.left || 0;
  178. sp = ' '.repeat(indents[left]);
  179. }
  180. let t = line.text.trim();
  181. if (t.substr(-1) == '-') {
  182. t = t.substr(0, t.length - 1);
  183. concat += t;
  184. } else {
  185. text += sp + concat + t + "\n";
  186. concat = '';
  187. }
  188. }
  189. if (concat)
  190. text += sp + concat + "\n";
  191. return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
  192. }
  193. }
  194. module.exports = ConvertPdf;