ConvertPdf.js 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. const fs = require('fs-extra');
  2. const path = require('path');
  3. const sax = require('../../sax');
  4. const utils = require('../../utils');
  5. const ConvertHtml = require('./ConvertHtml');
  6. class ConvertPdf extends ConvertHtml {
  7. check(data, opts) {
  8. const {inputFiles} = opts;
  9. return this.config.useExternalBookConverter &&
  10. inputFiles.sourceFileType && inputFiles.sourceFileType.ext == 'pdf';
  11. }
  12. async run(notUsed, opts) {
  13. if (!this.check(notUsed, opts))
  14. return false;
  15. await this.checkExternalConverterPresent();
  16. const {inputFiles, callback, abort, uploadFileName} = opts;
  17. const inpFile = inputFiles.sourceFile;
  18. const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;
  19. const pdfaltoPath = `${this.config.dataDir}/pdfalto/pdfalto`;
  20. if (!await fs.pathExists(pdfaltoPath))
  21. throw new Error('Внешний конвертер pdfalto не найден');
  22. //конвертируем в xml
  23. let perc = 0;
  24. await this.execConverter(pdfaltoPath, [inpFile, outFile], () => {
  25. perc = (perc < 80 ? perc + 10 : 40);
  26. callback(perc);
  27. }, abort);
  28. callback(80);
  29. const data = await fs.readFile(outFile);
  30. callback(90);
  31. //парсим xml
  32. let lines = [];
  33. let images = [];
  34. let loading = [];
  35. let title = '';
  36. let prevTop = 0;
  37. let i = -1;
  38. const loadImage = async(image) => {
  39. const src = path.parse(image.src);
  40. let type = 'unknown';
  41. switch (src.ext) {
  42. case '.jpg': type = 'image/jpeg'; break;
  43. case '.png': type = 'image/png'; break;
  44. }
  45. if (type != 'unknown') {
  46. image.data = (await fs.readFile(image.src)).toString('base64');
  47. image.type = type;
  48. image.name = src.base;
  49. }
  50. }
  51. const putImage = (curTop) => {
  52. if (!isNaN(curTop) && images.length) {
  53. while (images.length && images[0].top < curTop) {
  54. i++;
  55. lines[i] = images[0];
  56. images.shift();
  57. }
  58. }
  59. }
  60. /* let tOpen = (bold ? '<b>' : '');
  61. tOpen += (italic ? '<i>' : '');
  62. let tClose = (italic ? '</i>' : '');
  63. tClose += (bold ? '</b>' : '');
  64. lines[i].text += `${tOpen}${text}${tClose} `;*/
  65. const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  66. if (tag == 'page')
  67. putImage(100000);
  68. if (tag == 'textline') {
  69. const attrs = sax.getAttrsSync(tail);
  70. const line = {
  71. text: '',
  72. top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
  73. left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
  74. width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
  75. height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
  76. };
  77. if (line.width != 0 || line.height != 0) {
  78. if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
  79. putImage(line.top);
  80. i++;
  81. lines[i] = line;
  82. }
  83. prevTop = line.top;
  84. }
  85. }
  86. if (tag == 'string') {
  87. const attrs = sax.getAttrsSync(tail);
  88. if (attrs.content && attrs.content.value) {
  89. lines[i].text += `${attrs.content.value} `;
  90. }
  91. }
  92. if (tag == 'illustration') {
  93. const attrs = sax.getAttrsSync(tail);
  94. if (attrs.type && attrs.type.value == 'image') {
  95. let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
  96. if (src) {
  97. const image = {
  98. isImage: true,
  99. src,
  100. data: '',
  101. type: '',
  102. top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
  103. };
  104. loading.push(loadImage(image));
  105. images.push(image);
  106. images.sort((a, b) => a.top - b.top)
  107. }
  108. }
  109. }
  110. };
  111. /*
  112. const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  113. };
  114. */
  115. let buf = this.decode(data).toString();
  116. sax.parseSync(buf, {
  117. onStartNode
  118. });
  119. putImage(100000);
  120. await Promise.all(loading);
  121. //найдем параграфы и отступы
  122. const indents = [];
  123. for (const line of lines) {
  124. if (line.isImage)
  125. continue;
  126. if (!isNaN(line.left)) {
  127. indents[line.left] = 1;
  128. }
  129. }
  130. let j = 0;
  131. for (let i = 0; i < indents.length; i++) {
  132. if (indents[i]) {
  133. j++;
  134. indents[i] = j;
  135. }
  136. }
  137. indents[0] = 0;
  138. //формируем текст
  139. const limitSize = 2*this.config.maxUploadFileSize;
  140. if (!title && uploadFileName)
  141. title = uploadFileName;
  142. let text = `<title>${title}</title>`;
  143. let concat = '';
  144. let sp = '';
  145. for (const line of lines) {
  146. if (text.length > limitSize) {
  147. throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
  148. }
  149. if (line.isImage) {
  150. text += `<fb2-image type="${line.type}" name="${line.name}">${line.data}</fb2-image>`;
  151. continue;
  152. }
  153. if (concat == '') {
  154. const left = line.left || 0;
  155. sp = ' '.repeat(indents[left]);
  156. }
  157. let t = line.text.trim();
  158. if (t.substr(-1) == '-') {
  159. t = t.substr(0, t.length - 1);
  160. concat += t;
  161. } else {
  162. text += sp + concat + t + "\n";
  163. concat = '';
  164. }
  165. }
  166. if (concat)
  167. text += sp + concat + "\n";
  168. return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
  169. }
  170. }
  171. module.exports = ConvertPdf;