ConvertPdf.js 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. const fs = require('fs-extra');
  2. const path = require('path');
  3. const sax = require('../../sax');
  4. const utils = require('../../utils');
  5. const ConvertHtml = require('./ConvertHtml');
  /**
   * Converter for PDF sources.
   *
   * Runs the external `pdfalto` tool to turn the PDF into an XML layout
   * description, rebuilds text lines and embedded images from that XML, and
   * passes the resulting text to `ConvertHtml.run()`.
   */
  class ConvertPdf extends ConvertHtml {
    /**
     * Tells whether this converter applies to the given input.
     *
     * @param {*} data - Not used by this check.
     * @param {object} opts - Conversion options; only
     *   `opts.inputFiles.sourceFileType` is read.
     * @returns {*} A truthy value when `config.useExternalBookConverter` is set
     *   and the detected source type extension is `pdf`; a falsy value otherwise.
     */
    check(data, opts) {
      const {inputFiles} = opts;

      return this.config.useExternalBookConverter &&
        inputFiles.sourceFileType && inputFiles.sourceFileType.ext === 'pdf';
    }

    /**
     * Converts the PDF named in `opts.inputFiles.sourceFile`.
     *
     * @param {*} notUsed - Ignored; only forwarded to `check()`.
     * @param {object} opts - Conversion options. Fields read here:
     *   `inputFiles.sourceFile`, `inputFiles.filesDir`,
     *   `inputFiles.sourceFileType`, `callback` (progress reporter, called with
     *   a number), `abort` (forwarded to `execConverter`) and `uploadFileName`
     *   (used as the document title when present).
     * @returns {Promise<*>} `false` when `check()` rejects the input, otherwise
     *   whatever `ConvertHtml.run()` returns for the generated text.
     * @throws {Error} When the pdfalto binary is missing, or when the generated
     *   text grows beyond `2 * config.maxUploadFileSize`.
     */
    async run(notUsed, opts) {
      if (!this.check(notUsed, opts))
        return false;

      await this.checkExternalConverterPresent();

      const {inputFiles, callback, abort, uploadFileName} = opts;

      const inpFile = inputFiles.sourceFile;
      const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;

      const pdfaltoPath = `${this.config.dataDir}/pdfalto/pdfalto`;
      if (!await fs.pathExists(pdfaltoPath))
        throw new Error('Внешний конвертер pdfalto не найден');

      // Convert the PDF to XML. The reported progress climbs in steps of 10 up
      // to 80 and then wraps back to 40 until the converter finishes.
      let perc = 0;
      await this.execConverter(pdfaltoPath, [inpFile, outFile], () => {
        perc = (perc < 80 ? perc + 10 : 40);
        callback(perc);
      }, abort);
      callback(80);

      const data = await fs.readFile(outFile);
      callback(90);

      // Parse the XML.

      // Passed to putImage() to flush every pending image whose top is below
      // this value; presumably larger than any real page coordinate.
      const FLUSH_ALL_TOP = 100000;
      // Two lines whose tops differ by at most this much share one row.
      const SAME_ROW_DELTA = 3;
      // Supported image extensions (lower case) and their MIME types.
      const imageTypes = new Map([
        ['.jpg', 'image/jpeg'],
        ['.jpeg', 'image/jpeg'],
        ['.png', 'image/png'],
      ]);

      const lines = [];   // final sequence of text lines and images
      const images = [];  // images not yet placed into `lines`, sorted by top
      const loading = []; // pending image reads
      let pagelines = []; // text lines of the page being parsed
      let curLine = {text: ''};
      let prevTop = 0;

      // Reads an attribute as a base-10 integer; NaN when absent or empty.
      const intAttr = (attrs, name) => {
        const attr = attrs[name];
        return (attr && attr.value ? Number.parseInt(attr.value, 10) : Number.NaN);
      };

      // Fills in data/type/name of an image record when its extension is
      // supported; other records are left with an empty `type`.
      const loadImage = async(image) => {
        const src = path.parse(image.src);
        const type = imageTypes.get(src.ext.toLowerCase());
        if (type) {
          image.data = (await fs.readFile(image.src)).toString('base64');
          image.type = type;
          image.name = src.base;
        }
      };

      // Moves every pending image located above `curTop` into `lines`.
      const putImage = (curTop) => {
        if (Number.isNaN(curTop))
          return;

        while (images.length && images[0].top < curTop)
          lines.push(images.shift());
      };

      // Flushes the collected lines of the current page into `lines`.
      const putPageLines = () => {
        // Order by top first and by left only on equal tops, so a NaN left
        // cannot disturb the vertical ordering.
        pagelines.sort((a, b) => (a.top - b.top) || (a.left - b.left));

        // Merge lines of (almost) equal height into a single line.
        const merged = [];
        let lastTop = 0;
        for (const pageLine of pagelines) {
          if (merged.length === 0 || Math.abs(lastTop - pageLine.top) > SAME_ROW_DELTA) {
            merged.push(pageLine);
          } else {
            merged[merged.length - 1].text += pageLine.text;
          }
          lastTop = pageLine.top;
        }

        // Fill `lines`, placing pending images before the first line below them.
        for (const pageLine of merged) {
          putImage(pageLine.top);
          lines.push(pageLine);
        }

        pagelines = [];
      };

      const onStartNode = (tag, tail) => {
        if (tag === 'page') {
          // A new page starts: flush everything collected for the previous one.
          putPageLines();
          putImage(FLUSH_ALL_TOP);
        }

        if (tag === 'textline') {
          const attrs = sax.getAttrsSync(tail);
          curLine = {
            text: '',
            top: intAttr(attrs, 'vpos'),
            left: intAttr(attrs, 'hpos'),
            width: intAttr(attrs, 'width'),
            height: intAttr(attrs, 'height'),
          };

          if (curLine.width !== 0 || curLine.height !== 0) {
            // NOTE(review): a textline that fails this test is not stored, so
            // the strings that follow it are appended to an object nothing
            // references - confirm that dropping them is intended.
            if (Math.abs(prevTop - curLine.top) > SAME_ROW_DELTA) {
              pagelines.push(curLine);
            }
            prevTop = curLine.top;
          }
        }

        if (tag === 'string') {
          const attrs = sax.getAttrsSync(tail);
          if (attrs.content && attrs.content.value) {
            curLine.text += `${attrs.content.value} `;
          }
        }

        if (tag === 'illustration') {
          const attrs = sax.getAttrsSync(tail);
          const src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
          if (attrs.type && attrs.type.value === 'image' && src) {
            const image = {
              isImage: true,
              src,
              data: '',
              type: '',
              top: intAttr(attrs, 'vpos') || 0,
            };
            loading.push(loadImage(image));
            images.push(image);
            images.sort((a, b) => a.top - b.top);
          }
        }
      };

      const buf = this.decode(data).toString();
      sax.parseSync(buf, {
        onStartNode
      });

      // Flush the last page.
      putPageLines();
      putImage(FLUSH_ALL_TOP);

      await Promise.all(loading);

      // Find paragraphs and indents: mark every distinct left offset, then
      // replace the marks with their 1-based rank in ascending order.
      const indents = [];
      for (const line of lines) {
        if (line.isImage)
          continue;

        if (!Number.isNaN(line.left)) {
          indents[line.left] = 1;
        }
      }

      let rank = 0;
      for (let k = 0; k < indents.length; k++) {
        if (indents[k]) {
          rank++;
          indents[k] = rank;
        }
      }
      indents[0] = 0;

      // Build the text.
      const limitSize = 2*this.config.maxUploadFileSize;
      const title = uploadFileName || '';
      let text = `<title>${title}</title>`;
      let concat = ''; // accumulated pieces of a word split by a trailing hyphen
      let sp = '';     // indent of the output line currently being assembled
      for (const line of lines) {
        if (text.length > limitSize) {
          throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
        }

        if (line.isImage) {
          // Images of an unsupported type were never loaded; skip them rather
          // than emitting a tag with an empty type and name="undefined".
          if (line.type) {
            text += `<fb2-image type="${line.type}" name="${line.name}">${line.data}</fb2-image>`;
          }
          continue;
        }

        if (concat === '') {
          const left = line.left || 0;
          sp = ' '.repeat(indents[left]);
        }

        const t = line.text.trim();
        if (t.endsWith('-')) {
          // Hyphenated line break: drop the hyphen and glue to the next line.
          concat += t.slice(0, -1);
        } else {
          text += `${sp}${concat}${t}\n`;
          concat = '';
        }
      }

      if (concat)
        text += `${sp}${concat}\n`;

      return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
    }
  }

  module.exports = ConvertPdf;