ConvertPdf.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. //const _ = require('lodash');
  2. const fs = require('fs-extra');
  3. const path = require('path');
  4. const sax = require('../../sax');
  5. const utils = require('../../utils');
  6. const ConvertHtml = require('./ConvertHtml');
  /**
   * Converter that turns a PDF book into the intermediate text markup
   * (`<fb2-title>`, `<fb2-author>`, `<fb2-image>`, `<fb2-section-title>`,
   * `<fb2-subtitle>` plus plain text lines) and hands it to ConvertHtml.
   *
   * The PDF is first converted to XML by the external `pdftohtml` utility;
   * the XML is then parsed with the project's sax parser to recover lines,
   * centered bold headings, paragraph indents and embedded images.
   */
  class ConvertPdf extends ConvertHtml {
      /**
       * Tells whether this converter is applicable to the input.
       *
       * @param {*} data - not used by this check
       * @param {object} opts - conversion options; `opts.inputFiles.sourceFileType.ext` is inspected
       * @returns {*} truthy when external converters are enabled in the config
       *     and the detected source file extension is 'pdf', falsy otherwise
       */
      check(data, opts) {
          const {inputFiles} = opts;

          return this.config.useExternalBookConverter &&
              inputFiles.sourceFileType && inputFiles.sourceFileType.ext === 'pdf';
      }

      /**
       * Converts the PDF referenced by `opts.inputFiles.sourceFile`.
       *
       * @param {*} notUsed - ignored; only forwarded to check()
       * @param {object} opts - reads `pdfAsText`, `inputFiles` (`sourceFile`, `filesDir`,
       *     `sourceFileType`), `callback` (progress reporter), `abort` and `uploadFileName`
       * @returns {Promise<*>} `false` when `opts.pdfAsText` is not set or check() fails,
       *     otherwise whatever ConvertHtml#run returns for the generated text
       * @throws {Error} when `/usr/bin/pdftohtml` or `/usr/bin/pdfinfo` is missing,
       *     or when the generated text exceeds `2 * config.maxUploadFileSize`
       */
      async run(notUsed, opts) {
          if (!opts.pdfAsText || !this.check(notUsed, opts))
              return false;

          await this.checkExternalConverterPresent();

          const {inputFiles, callback, abort, uploadFileName} = opts;

          const inpFile = inputFiles.sourceFile;
          const outBasename = `${inputFiles.filesDir}/${utils.randomHexString(10)}`;
          const outFile = `${outBasename}.xml`;

          const pdftohtmlPath = '/usr/bin/pdftohtml';
          if (!await fs.pathExists(pdftohtmlPath))
              throw new Error('Внешний конвертер pdftohtml не найден');

          // Convert to XML. The third argument is a callback used for progress
          // reporting (presumably invoked repeatedly while the converter runs —
          // confirm against execConverter); the percentage cycles below 80.
          let perc = 0;
          await this.execConverter(pdftohtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
              perc = (perc < 80 ? perc + 10 : 40);
              callback(perc);
          }, abort);
          callback(80);

          const data = await fs.readFile(outFile);
          callback(90);

          // NOTE(review): short pause kept from the original; presumably lets other
          // pending work run between heavy synchronous steps — confirm.
          await utils.sleep(100);

          // ---- Parse the XML ----
          const lines = [];       // final ordered list of text lines, '<br>' markers and images
          let pagelines = [];     // text lines of the page currently being parsed
          let curLine = {text: ''};
          let page = {};
          const fonts = {};       // fontspec id -> {fontSize}
          let sectionTitleFound = false;
          const images = [];      // images waiting to be placed, kept sorted by position
          const loading = [];     // promises of image files being read
          let inText = false;
          let bold = false;
          let italic = false;
          let i = -1;             // index of the last filled element of `lines`

          // Attribute value as produced by sax.getAttrsSync, or '' when absent/empty.
          const attrValue = (attrs, name) => (attrs[name] && attrs[name].value ? attrs[name].value : '');
          // Integer attribute; NaN when the attribute is absent or not numeric.
          const attrInt = (attrs, name) => parseInt(attrValue(attrs, name), 10);

          const markupRe = /<b>|<\/b>|<i>|<\/i>/g;

          // Reads the image file and fills in data (base64), type and name.
          // Files with an unsupported extension are left without data and are
          // skipped later when the output text is generated.
          const loadImage = async(image) => {
              const src = path.parse(image.src);
              let type = 'unknown';
              switch (src.ext.toLowerCase()) {
                  case '.jpg':
                  case '.jpeg':
                      type = 'image/jpeg';
                      break;
                  case '.png':
                      type = 'image/png';
                      break;
              }

              if (type !== 'unknown') {
                  image.data = (await fs.readFile(image.src)).toString('base64');
                  image.type = type;
                  image.name = src.base;
              }
          };

          // Moves every pending image located above `curTop` into `lines`.
          const putImage = (curTop) => {
              if (Number.isNaN(curTop))
                  return;

              while (images.length && images[0].top < curTop) {
                  i++;
                  lines[i] = images.shift();
              }
          };

          // True when the whole text is a single <b>…</b> run without nested b/i tags.
          const isTextBold = (text) => {
              const m = text.trim().match(/^<b>(.*)<\/b>$/);
              return m && !m[1].match(markupRe);
          };

          // True when nothing but b/i tags and whitespace is present.
          const isTextEmpty = (text) => {
              return text.replace(markupRe, '').trim() === '';
          };

          // Flushes the lines collected for the current page into `lines`.
          const putPageLines = () => {
              // Order by row first (tops within 3 units count as the same row), then by left edge.
              pagelines.sort((a, b) => (Math.abs(a.top - b.top) > 3 ? a.top - b.top : 0)*10000 + (a.left - b.left));

              // Merge fragments that lie on the same row into one line.
              const pl = [];
              let pt = 0;
              let j = -1;
              pagelines.forEach(pageLine => {
                  if (isTextEmpty(pageLine.text))
                      return;

                  // A bold line centered on the page (within 10 units) is treated as a heading:
                  // the first one found is the section title, the rest are subtitles.
                  if (pageLine.fontId && pageLine.pageWidth) {
                      const centerLeft = (pageLine.pageWidth - pageLine.width)/2;
                      if (isTextBold(pageLine.text) && Math.abs(centerLeft - pageLine.left) < 10) {
                          if (!sectionTitleFound) {
                              pageLine.isSectionTitle = true;
                              sectionTitleFound = true;
                          } else {
                              pageLine.isSubtitle = true;
                          }
                      }
                  }

                  // Merge with the previous fragment when on the same row.
                  if (pt === 0 || Math.abs(pt - pageLine.top) > 3) {
                      j++;
                      pl[j] = pageLine;
                  } else {
                      pl[j].text += ` ${pageLine.text}`;
                  }
                  pt = pageLine.top;
              });

              // Fill `lines`.
              const lastIndex = i;
              pl.forEach(pageLine => {
                  putImage(pageLine.top);

                  // Insert an empty line when the vertical gap to the previous line
                  // exceeds 1.8 font sizes. For the first line of the page the
                  // previous line is a placeholder at top = 0.
                  const prevLine = (i > lastIndex ? lines[i] : {top: 0});
                  if (prevLine && !prevLine.isImage) {
                      const f = (prevLine.fontId ? fonts[prevLine.fontId] : (pageLine.fontId ? fonts[pageLine.fontId] : null));
                      if (f && f.fontSize && !pageLine.isImage && pageLine.top - prevLine.top > f.fontSize * 1.8) {
                          i++;
                          lines[i] = {text: '<br>'};
                      }
                  }

                  i++;
                  lines[i] = pageLine;
              });
              pagelines = [];
              // Flush the images that remain below the last text line.
              putImage(100000);
          };

          const onTextNode = (text, cutCounter) => {
              if (!cutCounter && inText) {
                  const tOpen = (bold ? '<b>' : '') + (italic ? '<i>' : '');
                  const tClose = (italic ? '</i>' : '') + (bold ? '</b>' : '');
                  curLine.text += ` ${tOpen}${text}${tClose}`;
              }
          };

          const onStartNode = (tag, tail) => {
              if (inText) {
                  switch (tag) {
                      case 'i':
                          italic = true;
                          break;
                      case 'b':
                          bold = true;
                          break;
                  }
              }

              if (tag === 'page') {
                  const attrs = sax.getAttrsSync(tail);
                  page = {
                      width: attrInt(attrs, 'width'),
                  };

                  // A new page starts: flush what was collected for the previous one.
                  putPageLines();
              }

              if (tag === 'fontspec') {
                  const attrs = sax.getAttrsSync(tail);
                  const fontId = attrValue(attrs, 'id');
                  const fontSize = attrValue(attrs, 'size');

                  if (fontId) {
                      fonts[fontId] = {fontSize};
                  }
              }

              if (tag === 'text' && !inText) {
                  const attrs = sax.getAttrsSync(tail);
                  curLine = {
                      text: '',
                      top: attrInt(attrs, 'top'),
                      left: attrInt(attrs, 'left'),
                      width: attrInt(attrs, 'width'),
                      height: attrInt(attrs, 'height'),
                      isSectionTitle: false,
                      isSubtitle: false,
                      pageWidth: page.width,
                      fontId: attrValue(attrs, 'font'),
                  };

                  // Text elements with both zero width and zero height are ignored.
                  if (curLine.width !== 0 || curLine.height !== 0) {
                      inText = true;
                      pagelines.push(curLine);
                  }
              }

              if (tag === 'image') {
                  const attrs = sax.getAttrsSync(tail);
                  const src = attrValue(attrs, 'src');
                  if (src) {
                      const image = {
                          isImage: true,
                          src,
                          data: '',
                          type: '',
                          top: attrInt(attrs, 'top') || 0,
                          left: attrInt(attrs, 'left') || 0,
                          width: attrInt(attrs, 'width') || 0,
                          height: attrInt(attrs, 'height') || 0,
                      };
                      // Start reading the file now; all reads are awaited after parsing.
                      loading.push(loadImage(image));
                      images.push(image);
                      images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
                  }
              }
          };

          const onEndNode = (tag) => {
              if (inText) {
                  switch (tag) {
                      case 'i':
                          italic = false;
                          break;
                      case 'b':
                          bold = false;
                          break;
                  }
              }

              if (tag === 'text')
                  inText = false;
          };

          const buf = this.decode(data).toString();
          sax.parseSync(buf, {
              onStartNode, onEndNode, onTextNode
          });

          // Flush the last page.
          putPageLines();

          await Promise.all(loading);
          await utils.sleep(100);

          // ---- Find paragraphs and indents ----
          // Every distinct left offset gets a rank (1, 2, 3, … in increasing offset
          // order); the rank is used as the number of leading spaces of a text line.
          const indents = [];
          for (const textLine of lines) {
              if (!textLine.isImage && Number.isFinite(textLine.left))
                  indents[textLine.left] = 1;
          }

          let rank = 0;
          for (let k = 0; k < indents.length; k++) {
              if (indents[k]) {
                  rank++;
                  indents[k] = rank;
              }
          }
          indents[0] = 0;

          // ---- Author & title ----
          const meta = await this.getPdfTitleAndAuthor(inpFile);
          const author = meta.author;
          // Fall back to the uploaded file name when the PDF has no title.
          const title = meta.title || uploadFileName;

          // ---- Build the text ----
          const limitSize = 2*this.config.maxUploadFileSize;
          let text = '';
          if (title)
              text += `<fb2-title>${title}</fb2-title>`;

          if (author)
              text += `<fb2-author>${author}</fb2-author>`;

          let concat = '';    // accumulated parts of a word/line split by a trailing hyphen
          let sp = '';        // indent of the line currently being assembled
          let firstLine = true;
          for (const textLine of lines) {
              if (text.length > limitSize) {
                  throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
              }

              if (textLine.isImage) {
                  // Images that were not loaded (unsupported extension) have no payload;
                  // emitting them would produce a tag with an empty type and name="undefined".
                  if (textLine.data)
                      text += `<fb2-image type="${textLine.type}" name="${textLine.name}">${textLine.data}</fb2-image>`;
                  continue;
              }

              if (textLine.isSectionTitle) {
                  // Only a title that precedes all ordinary text becomes the section title.
                  if (firstLine)
                      text += `<fb2-section-title>${textLine.text.trim()}</fb2-section-title>`;
                  else
                      text += `<fb2-subtitle>${textLine.text.trim()}</fb2-subtitle>`;
                  continue;
              }

              firstLine = false;

              if (textLine.isSubtitle) {
                  text += `<br><fb2-subtitle>${textLine.text.trim()}</fb2-subtitle>`;
                  continue;
              }

              // The indent is taken from the line that starts the (possibly hyphen-joined) output line.
              if (concat === '') {
                  const left = textLine.left || 0;
                  sp = ' '.repeat(indents[left] || 0);
              }

              const t = textLine.text.trim();
              if (t.endsWith('-')) {
                  // Hyphenated line break: drop the hyphen and glue with the next line.
                  concat += t.slice(0, -1);
              } else {
                  text += `${sp}${concat}${t}\n`;
                  concat = '';
              }
          }

          if (concat)
              text += `${sp}${concat}\n`;

          await utils.sleep(100);
          return await super.run(Buffer.from(text), {skipHtmlCheck: true, isText: true});
      }

      /**
       * Extracts the title and author of a PDF file by running `/usr/bin/pdfinfo`
       * and reading the lines of its stdout that start with `Title:` and `Author:`.
       *
       * @param {string} pdfFile - path of the PDF file passed to pdfinfo
       * @returns {Promise<{author: string, title: string}>} found values, '' when absent
       * @throws {Error} when `/usr/bin/pdfinfo` does not exist
       */
      async getPdfTitleAndAuthor(pdfFile) {
          const result = {author: '', title: ''};

          const pdfinfoPath = '/usr/bin/pdfinfo';
          if (!await fs.pathExists(pdfinfoPath))
              throw new Error('Внешний конвертер pdfinfo не найден');

          const execResult = await this.execConverter(pdfinfoPath, [pdfFile]);

          const titlePrefix = 'Title:';
          const authorPrefix = 'Author:';

          for (const line of execResult.stdout.split("\n")) {
              if (line.startsWith(titlePrefix))
                  result.title = line.substring(titlePrefix.length).trim();

              if (line.startsWith(authorPrefix))
                  result.author = line.substring(authorPrefix.length).trim();
          }

          return result;
      }
  }
  308. module.exports = ConvertPdf;