ConvertPdf.js 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. const fs = require('fs-extra');
  2. const path = require('path');
  3. const sax = require('../../sax');
  4. const utils = require('../../utils');
  5. const ConvertHtml = require('./ConvertHtml');
  /**
   * Maximum difference between the `top` attributes of two consecutive <text>
   * nodes for them to be treated as fragments of the same output line.
   */
  const SAME_LINE_TOLERANCE = 3;

  /** Lower-cased image file extension -> MIME type written to the fb2-image tag. */
  const IMAGE_MIME_TYPES = new Map([
      ['.jpg', 'image/jpeg'],
      ['.jpeg', 'image/jpeg'],
      ['.png', 'image/png'],
  ]);

  /**
   * Reads an integer attribute from the object returned by sax.getAttrsSync().
   *
   * @param {Object} attrs - attribute map; each entry is read through its `.value` field
   * @param {string} name - attribute name
   * @returns {number} the parsed integer, or NaN when the attribute is absent, empty or not numeric
   */
  const intAttr = (attrs, name) => {
      const attr = attrs[name];
      return (attr && attr.value ? parseInt(attr.value, 10) : NaN);
  };

  /**
   * Loads the file referenced by `image.src` and stores it base64-encoded on the image.
   * Images whose extension is not a known type are left with empty `data`/`type`.
   *
   * @param {Object} image - image record created by parsePdfXml(); mutated in place
   * @returns {Promise<void>} rejects if the file cannot be read
   */
  const loadImage = async(image) => {
      const src = path.parse(image.src);
      const type = IMAGE_MIME_TYPES.get(src.ext.toLowerCase());

      if (type) {
          image.data = (await fs.readFile(image.src)).toString('base64');
          image.type = type;
          image.name = src.base;
      }
  };

  /**
   * Parses the XML produced by the external converter into a flat, ordered list
   * of text lines and images.
   *
   * @param {string} xml - decoded XML document
   * @returns {Promise<{lines: Object[], title: string}>} `lines` holds text records
   *     ({text, top, left, width, height}) interleaved with image records
   *     ({isImage, src, data, type, name, top}); `title` is built from the first
   *     two non-blank text nodes
   */
  const parsePdfXml = async(xml) => {
      const lines = [];
      const pendingImages = [];// images seen but not yet placed, sorted by top
      const loading = [];

      let inText = false;
      let bold = false;
      let italic = false;
      let title = '';
      let titleCount = 0;
      // NaN means "no previous text line to merge with", so the very first
      // <text> node always opens a new line, whatever its top value is
      let prevTop = NaN;
      let current = null;// text line currently receiving fragments

      // moves every pending image located above curTop into the output
      const flushImages = (curTop) => {
          if (Number.isNaN(curTop))
              return;

          while (pendingImages.length && pendingImages[0].top < curTop)
              lines.push(pendingImages.shift());
      };

      const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
          if (cutCounter || !inText)
              return;

          const tOpen = (bold ? '<b>' : '') + (italic ? '<i>' : '');
          const tClose = (italic ? '</i>' : '') + (bold ? '</b>' : '');
          current.text += `${tOpen}${text}${tClose} `;

          if (titleCount < 2 && text.trim() !== '') {
              title += text + (titleCount ? '' : ' - ');
              titleCount++;
          }
      };

      const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
          if (cutCounter)
              return;

          if (inText) {
              if (tag === 'i')
                  italic = true;
              else if (tag === 'b')
                  bold = true;
          }

          if (tag === 'text' && !inText) {
              const attrs = sax.getAttrsSync(tail);
              const line = {
                  text: '',
                  top: intAttr(attrs, 'top'),
                  left: intAttr(attrs, 'left'),
                  width: intAttr(attrs, 'width'),
                  height: intAttr(attrs, 'height'),
              };

              // nodes with both width and height equal to zero are ignored
              if (line.width !== 0 || line.height !== 0) {
                  inText = true;

                  // A new line is also forced when the last output item is not
                  // the current text line (e.g. images were flushed after it),
                  // so text is never appended to an image or a missing record.
                  const startsNewLine = current === null
                      || lines[lines.length - 1] !== current
                      || Number.isNaN(line.top)
                      || Number.isNaN(prevTop)
                      || Math.abs(prevTop - line.top) > SAME_LINE_TOLERANCE;

                  if (startsNewLine) {
                      flushImages(line.top);
                      lines.push(line);
                      current = line;
                  }
                  prevTop = line.top;
              }
          }

          if (tag === 'image') {
              const attrs = sax.getAttrsSync(tail);
              const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
              if (src) {
                  const image = {
                      isImage: true,
                      src,
                      data: '',
                      type: '',
                      top: intAttr(attrs, 'top') || 0,
                  };
                  loading.push(loadImage(image));
                  pendingImages.push(image);
                  pendingImages.sort((a, b) => a.top - b.top);
              }
          }

          if (tag === 'page') {
              // images left over from the previous page go before the new page's text
              flushImages(Infinity);
              // presumably `top` is page-relative, so a line on the new page
              // must not be merged with the last line of the previous one
              prevTop = NaN;
          }
      };

      const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
          if (inText) {
              if (tag === 'i')
                  italic = false;
              else if (tag === 'b')
                  bold = false;
          }

          if (tag === 'text')
              inText = false;
      };

      sax.parseSync(xml, {
          onStartNode, onEndNode, onTextNode
      });

      flushImages(Infinity);
      await Promise.all(loading);

      return {lines, title};
  };

  /**
   * Assigns an indent level to every distinct `left` coordinate of the text lines.
   *
   * @param {Object[]} lines - records returned by parsePdfXml()
   * @returns {number[]} sparse array indexed by `left`; values are 1-based ranks
   *     in ascending order of `left`, and index 0 is always level 0
   */
  const buildIndentLevels = (lines) => {
      const indents = [];
      for (const line of lines) {
          if (!line.isImage && !Number.isNaN(line.left))
              indents[line.left] = 1;
      }

      let level = 0;
      for (let k = 0; k < indents.length; k++) {
          if (indents[k]) {
              level++;
              indents[k] = level;
          }
      }
      indents[0] = 0;

      return indents;
  };

  /**
   * Builds the intermediate text passed on to the HTML/text converter.
   *
   * @param {Object[]} lines - records returned by parsePdfXml()
   * @param {number[]} indents - indent levels from buildIndentLevels()
   * @param {string} title - document title placed into the <title> tag
   * @param {number} limitSize - maximum allowed length of the resulting text
   * @returns {string} the composed text
   * @throws {Error} when the text grows beyond limitSize
   */
  const composeText = (lines, indents, title, limitSize) => {
      let text = `<title>${title}</title>`;
      let concat = '';// accumulated pieces of a line split by a trailing hyphen
      let sp = '';

      for (const line of lines) {
          if (text.length > limitSize) {
              throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
          }

          if (line.isImage) {
              // images of an unsupported type were never loaded; emitting them
              // would produce an empty tag with name="undefined"
              if (line.data)
                  text += `<fb2-image type="${line.type}" name="${line.name}">${line.data}</fb2-image>`;
              continue;
          }

          if (concat === '') {
              const left = line.left || 0;
              sp = ' '.repeat(indents[left]);
          }

          const t = line.text.trim();
          if (t.endsWith('-')) {
              // trailing hyphen: glue this line to the next one
              concat += t.slice(0, -1);
          } else {
              text += sp + concat + t + "\n";
              concat = '';
          }
      }

      if (concat)
          text += sp + concat + "\n";

      return text;
  };

  /**
   * Converter for PDF files: runs the external converter configured in
   * `this.pdfToHtmlPath` to get an XML dump of the document, rebuilds text lines
   * and images from it, and hands the result to ConvertHtml.run() as text.
   */
  class ConvertPdf extends ConvertHtml {
      /**
       * Tells whether this converter should handle the input.
       *
       * @param {*} data - not used
       * @param {Object} opts - conversion options; `opts.inputFiles.sourceFileType.ext` is inspected
       * @returns {*} truthy when external converters are enabled in the config and the source file type is 'pdf'
       */
      check(data, opts) {
          const {inputFiles} = opts;

          return this.config.useExternalBookConverter &&
              inputFiles.sourceFileType && inputFiles.sourceFileType.ext === 'pdf';
      }

      /**
       * Converts the PDF referenced by `opts.inputFiles.sourceFile`.
       *
       * @param {*} notUsed - only passed through to check()
       * @param {Object} opts - uses `inputFiles`, `callback` (called with numbers
       *     up to 90, presumably a progress percentage), `abort` and `uploadFileName`
       * @returns {Promise<*>} false when check() rejects the input, otherwise the result of ConvertHtml.run()
       * @throws {Error} when the composed text exceeds 2 * config.maxUploadFileSize
       */
      async run(notUsed, opts) {
          if (!this.check(notUsed, opts))
              return false;

          await this.checkExternalConverterPresent();

          const {inputFiles, callback, abort, uploadFileName} = opts;

          const inpFile = inputFiles.sourceFile;
          const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;

          // convert to xml
          let perc = 0;
          await this.execConverter(this.pdfToHtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
              // the real progress is unknown: step up by 10 and fall back to 40 once 80 is reached
              perc = (perc < 80 ? perc + 10 : 40);
              callback(perc);
          }, abort);
          callback(80);

          const data = await fs.readFile(outFile);
          callback(90);

          // parse xml
          const parsed = await parsePdfXml(this.decode(data).toString());
          const lines = parsed.lines;

          // find paragraphs and indents
          const indents = buildIndentLevels(lines);

          // build the text
          const limitSize = 2*this.config.maxUploadFileSize;
          let title = parsed.title;
          if (!title && uploadFileName)
              title = uploadFileName;

          const text = composeText(lines, indents, title, limitSize);

          return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
      }
  }

  module.exports = ConvertPdf;