|
@@ -1,9 +1,11 @@
|
|
|
|
+//const _ = require('lodash');
|
|
const fs = require('fs-extra');
|
|
const fs = require('fs-extra');
|
|
const path = require('path');
|
|
const path = require('path');
|
|
|
|
|
|
const sax = require('../../sax');
|
|
const sax = require('../../sax');
|
|
const utils = require('../../utils');
|
|
const utils = require('../../utils');
|
|
const ConvertHtml = require('./ConvertHtml');
|
|
const ConvertHtml = require('./ConvertHtml');
|
|
|
|
+const xmlParser = require('../../xmlParser');
|
|
|
|
|
|
class ConvertPdf extends ConvertHtml {
|
|
class ConvertPdf extends ConvertHtml {
|
|
check(data, opts) {
|
|
check(data, opts) {
|
|
@@ -22,11 +24,18 @@ class ConvertPdf extends ConvertHtml {
|
|
const {inputFiles, callback, abort, uploadFileName} = opts;
|
|
const {inputFiles, callback, abort, uploadFileName} = opts;
|
|
|
|
|
|
const inpFile = inputFiles.sourceFile;
|
|
const inpFile = inputFiles.sourceFile;
|
|
- const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;
|
|
|
|
|
|
+ const outBasename = `${inputFiles.filesDir}/${utils.randomHexString(10)}`;
|
|
|
|
+ const outFile = `${outBasename}.xml`;
|
|
|
|
+ const metaFile = `${outBasename}_metadata.xml`;
|
|
|
|
+
|
|
|
|
+ const pdfaltoPath = `${this.config.dataDir}/pdfalto/pdfalto`;
|
|
|
|
+
|
|
|
|
+ if (!await fs.pathExists(pdfaltoPath))
|
|
|
|
+ throw new Error('Внешний конвертер pdfalto не найден');
|
|
|
|
|
|
//конвертируем в xml
|
|
//конвертируем в xml
|
|
let perc = 0;
|
|
let perc = 0;
|
|
- await this.execConverter(this.pdfToHtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
|
|
|
|
|
|
+ await this.execConverter(pdfaltoPath, [inpFile, outFile], () => {
|
|
perc = (perc < 80 ? perc + 10 : 40);
|
|
perc = (perc < 80 ? perc + 10 : 40);
|
|
callback(perc);
|
|
callback(perc);
|
|
}, abort);
|
|
}, abort);
|
|
@@ -35,17 +44,22 @@ class ConvertPdf extends ConvertHtml {
|
|
const data = await fs.readFile(outFile);
|
|
const data = await fs.readFile(outFile);
|
|
callback(90);
|
|
callback(90);
|
|
|
|
|
|
|
|
+ await utils.sleep(100);
|
|
|
|
+
|
|
//парсим xml
|
|
//парсим xml
|
|
let lines = [];
|
|
let lines = [];
|
|
|
|
+ let pagelines = [];
|
|
|
|
+ let line = {text: ''};
|
|
|
|
+ let page = {};
|
|
|
|
+ let fonts = {};
|
|
|
|
+ let sectionTitleFound = false;
|
|
|
|
+
|
|
let images = [];
|
|
let images = [];
|
|
let loading = [];
|
|
let loading = [];
|
|
- let inText = false;
|
|
|
|
- let bold = false;
|
|
|
|
- let italic = false;
|
|
|
|
|
|
+
|
|
let title = '';
|
|
let title = '';
|
|
- let prevTop = 0;
|
|
|
|
|
|
+ let author = '';
|
|
let i = -1;
|
|
let i = -1;
|
|
- let titleCount = 0;
|
|
|
|
|
|
|
|
const loadImage = async(image) => {
|
|
const loadImage = async(image) => {
|
|
const src = path.parse(image.src);
|
|
const src = path.parse(image.src);
|
|
@@ -59,7 +73,7 @@ class ConvertPdf extends ConvertHtml {
|
|
image.type = type;
|
|
image.type = type;
|
|
image.name = src.base;
|
|
image.name = src.base;
|
|
}
|
|
}
|
|
- }
|
|
|
|
|
|
+ };
|
|
|
|
|
|
const putImage = (curTop) => {
|
|
const putImage = (curTop) => {
|
|
if (!isNaN(curTop) && images.length) {
|
|
if (!isNaN(curTop) && images.length) {
|
|
@@ -69,104 +83,180 @@ class ConvertPdf extends ConvertHtml {
|
|
images.shift();
|
|
images.shift();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- }
|
|
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ const putPageLines = () => {
|
|
|
|
+ pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left))
|
|
|
|
+
|
|
|
|
+ //объединяем в одну строку равные по высоте
|
|
|
|
+ const pl = [];
|
|
|
|
+ let pt = 0;
|
|
|
|
+ let j = -1;
|
|
|
|
+ pagelines.forEach(line => {
|
|
|
|
+ //добавим закрывающий тег стиля
|
|
|
|
+ line.text += line.tClose;
|
|
|
|
|
|
- const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
|
|
- if (!cutCounter && inText) {
|
|
|
|
- let tOpen = (bold ? '<b>' : '');
|
|
|
|
- tOpen += (italic ? '<i>' : '');
|
|
|
|
- let tClose = (italic ? '</i>' : '');
|
|
|
|
- tClose += (bold ? '</b>' : '');
|
|
|
|
-
|
|
|
|
- lines[i].text += `${tOpen}${text}${tClose} `;
|
|
|
|
- if (titleCount < 2 && text.trim() != '') {
|
|
|
|
- title += text + (titleCount ? '' : ' - ');
|
|
|
|
- titleCount++;
|
|
|
|
|
|
+ //проверим, возможно это заголовок
|
|
|
|
+ if (line.fonts.length == 1 && line.pageWidth) {
|
|
|
|
+ const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
|
|
|
|
+ const centerLeft = (line.pageWidth - line.width)/2;
|
|
|
|
+ if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
|
|
|
|
+ if (!sectionTitleFound) {
|
|
|
|
+ line.isSectionTitle = true;
|
|
|
|
+ sectionTitleFound = true;
|
|
|
|
+ } else {
|
|
|
|
+ line.isSubtitle = true;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
}
|
|
}
|
|
- }
|
|
|
|
|
|
+
|
|
|
|
+ //объединяем
|
|
|
|
+ if (pt == 0 || Math.abs(pt - line.top) > 3) {
|
|
|
|
+ j++;
|
|
|
|
+ pl[j] = line;
|
|
|
|
+ } else {
|
|
|
|
+ pl[j].text += ` ${line.text}`;
|
|
|
|
+ }
|
|
|
|
+ pt = line.top;
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ //заполняем lines
|
|
|
|
+ const lastIndex = i;
|
|
|
|
+ pl.forEach(line => {
|
|
|
|
+ putImage(line.top);
|
|
|
|
+
|
|
|
|
+ //добавим пустую строку, если надо
|
|
|
|
+ const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
|
|
|
|
+ if (prevLine && !prevLine.isImage) {
|
|
|
|
+ const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null));
|
|
|
|
+ if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) {
|
|
|
|
+ i++;
|
|
|
|
+ lines[i] = {text: '<br>'};
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ i++;
|
|
|
|
+ lines[i] = line;
|
|
|
|
+ });
|
|
|
|
+ pagelines = [];
|
|
|
|
+ putImage(100000);
|
|
};
|
|
};
|
|
|
|
|
|
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
- if (!cutCounter) {
|
|
|
|
- if (inText) {
|
|
|
|
- switch (tag) {
|
|
|
|
- case 'i':
|
|
|
|
- italic = true;
|
|
|
|
- break;
|
|
|
|
- case 'b':
|
|
|
|
- bold = true;
|
|
|
|
- break;
|
|
|
|
|
|
+ if (tag == 'textstyle') {
|
|
|
|
+ const attrs = sax.getAttrsSync(tail);
|
|
|
|
+ const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
|
|
|
|
+ const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
|
|
|
|
+ const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : '');
|
|
|
|
+
|
|
|
|
+ if (fontId) {
|
|
|
|
+ const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
|
|
|
|
+ const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
|
|
|
|
+
|
|
|
|
+ if (fontStyle) {
|
|
|
|
+ const styles = fontStyle.split(' ');
|
|
|
|
+ styles.forEach(style => {
|
|
|
|
+ const s = styleTags[style];
|
|
|
|
+ if (s) {
|
|
|
|
+ f.tOpen += `<${s}>`;
|
|
|
|
+ f.tClose = `</${s}>${f.tClose}`;
|
|
|
|
+ if (s == 'b')
|
|
|
|
+ f.isBold = true;
|
|
|
|
+ }
|
|
|
|
+ });
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ }
|
|
|
|
|
|
- if (tag == 'text' && !inText) {
|
|
|
|
- let attrs = sax.getAttrsSync(tail);
|
|
|
|
- const line = {
|
|
|
|
- text: '',
|
|
|
|
- top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
|
|
|
|
- left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
|
|
|
|
- width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
|
|
|
|
- height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
|
|
|
|
- };
|
|
|
|
-
|
|
|
|
- if (line.width != 0 || line.height != 0) {
|
|
|
|
- inText = true;
|
|
|
|
- if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
|
|
|
|
- putImage(line.top);
|
|
|
|
- i++;
|
|
|
|
- lines[i] = line;
|
|
|
|
- }
|
|
|
|
- prevTop = line.top;
|
|
|
|
|
|
+ if (tag == 'page') {
|
|
|
|
+ const attrs = sax.getAttrsSync(tail);
|
|
|
|
+ page = {
|
|
|
|
+ width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ putPageLines();
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (tag == 'textline') {
|
|
|
|
+ const attrs = sax.getAttrsSync(tail);
|
|
|
|
+ line = {
|
|
|
|
+ text: '',
|
|
|
|
+ top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
|
|
|
|
+ left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
|
|
|
|
+ width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
|
|
|
|
+ height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
|
|
|
|
+ tOpen: '',
|
|
|
|
+ tClose: '',
|
|
|
|
+ isSectionTitle: false,
|
|
|
|
+ isSubtitle: false,
|
|
|
|
+ pageWidth: page.width,
|
|
|
|
+ fonts: [],
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ if (line.width != 0 || line.height != 0) {
|
|
|
|
+ pagelines.push(line);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (tag == 'string') {
|
|
|
|
+ const attrs = sax.getAttrsSync(tail);
|
|
|
|
+ if (attrs.content && attrs.content.value) {
|
|
|
|
+
|
|
|
|
+ let tOpen = '';
|
|
|
|
+ let tClose = '';
|
|
|
|
+ const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
|
|
|
|
+ if (fontId && fonts[fontId]) {
|
|
|
|
+ tOpen = fonts[fontId].tOpen;
|
|
|
|
+ tClose = fonts[fontId].tClose;
|
|
|
|
+ if (!line.fonts.length || line.fonts[0] != fontId)
|
|
|
|
+ line.fonts.push(fontId);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (line.tOpen != tOpen) {
|
|
|
|
+ line.text += line.tClose + tOpen;
|
|
|
|
+ line.tOpen = tOpen;
|
|
|
|
+ line.tClose = tClose;
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
|
|
}
|
|
}
|
|
|
|
+ }
|
|
|
|
|
|
- if (tag == 'image') {
|
|
|
|
- const attrs = sax.getAttrsSync(tail);
|
|
|
|
- const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
|
|
|
|
|
|
+ if (tag == 'illustration') {
|
|
|
|
+ const attrs = sax.getAttrsSync(tail);
|
|
|
|
+ if (attrs.type && attrs.type.value == 'image') {
|
|
|
|
+ let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
|
|
if (src) {
|
|
if (src) {
|
|
const image = {
|
|
const image = {
|
|
isImage: true,
|
|
isImage: true,
|
|
src,
|
|
src,
|
|
data: '',
|
|
data: '',
|
|
type: '',
|
|
type: '',
|
|
- top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
|
|
|
|
|
|
+ top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
|
|
|
|
+ left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
|
|
|
|
+ width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
|
|
|
|
+ height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
|
|
};
|
|
};
|
|
- loading.push(loadImage(image));
|
|
|
|
- images.push(image);
|
|
|
|
- images.sort((a, b) => a.top - b.top)
|
|
|
|
|
|
+ const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
|
|
|
|
+ if (!exists.length) {
|
|
|
|
+ loading.push(loadImage(image));
|
|
|
|
+ images.push(image);
|
|
|
|
+ images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
|
|
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-
|
|
|
|
- if (tag == 'page') {
|
|
|
|
- putImage(100000);
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
};
|
|
};
|
|
|
|
|
|
- const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
|
|
- if (inText) {
|
|
|
|
- switch (tag) {
|
|
|
|
- case 'i':
|
|
|
|
- italic = false;
|
|
|
|
- break;
|
|
|
|
- case 'b':
|
|
|
|
- bold = false;
|
|
|
|
- break;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- if (tag == 'text')
|
|
|
|
- inText = false;
|
|
|
|
- };
|
|
|
|
-
|
|
|
|
let buf = this.decode(data).toString();
|
|
let buf = this.decode(data).toString();
|
|
sax.parseSync(buf, {
|
|
sax.parseSync(buf, {
|
|
- onStartNode, onEndNode, onTextNode
|
|
|
|
|
|
+ onStartNode
|
|
});
|
|
});
|
|
|
|
|
|
- putImage(100000);
|
|
|
|
|
|
+ putPageLines();
|
|
|
|
|
|
await Promise.all(loading);
|
|
await Promise.all(loading);
|
|
|
|
+ await utils.sleep(100);
|
|
|
|
|
|
//найдем параграфы и отступы
|
|
//найдем параграфы и отступы
|
|
const indents = [];
|
|
const indents = [];
|
|
@@ -187,11 +277,29 @@ class ConvertPdf extends ConvertHtml {
|
|
}
|
|
}
|
|
indents[0] = 0;
|
|
indents[0] = 0;
|
|
|
|
|
|
- //формируем текст
|
|
|
|
- const limitSize = 2*this.config.maxUploadFileSize;
|
|
|
|
|
|
+ //title
|
|
|
|
+ if (fs.pathExists(metaFile)) {
|
|
|
|
+ const metaXmlString = (await fs.readFile(metaFile)).toString();
|
|
|
|
+ let metaXmlParsed = xmlParser.parseXml(metaXmlString);
|
|
|
|
+ metaXmlParsed = xmlParser.simplifyXmlParsed(metaXmlParsed);
|
|
|
|
+ if (metaXmlParsed.metadata) {
|
|
|
|
+ title = (metaXmlParsed.metadata.title ? metaXmlParsed.metadata.title._t : '');
|
|
|
|
+ author = (metaXmlParsed.metadata.author ? metaXmlParsed.metadata.author._t : '');
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
if (!title && uploadFileName)
|
|
if (!title && uploadFileName)
|
|
title = uploadFileName;
|
|
title = uploadFileName;
|
|
- let text = `<title>${title}</title>`;
|
|
|
|
|
|
+
|
|
|
|
+ //console.log(JSON.stringify(lines, null, 2));
|
|
|
|
+ //формируем текст
|
|
|
|
+ const limitSize = 2*this.config.maxUploadFileSize;
|
|
|
|
+ let text = '';
|
|
|
|
+ if (title)
|
|
|
|
+ text += `<fb2-title>${title}</fb2-title>`;
|
|
|
|
+ if (author)
|
|
|
|
+ text += `<fb2-author>${author}</fb2-author>`;
|
|
|
|
+
|
|
let concat = '';
|
|
let concat = '';
|
|
let sp = '';
|
|
let sp = '';
|
|
for (const line of lines) {
|
|
for (const line of lines) {
|
|
@@ -204,6 +312,16 @@ class ConvertPdf extends ConvertHtml {
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ if (line.isSectionTitle) {
|
|
|
|
+ text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (line.isSubtitle) {
|
|
|
|
+ text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
if (concat == '') {
|
|
if (concat == '') {
|
|
const left = line.left || 0;
|
|
const left = line.left || 0;
|
|
sp = ' '.repeat(indents[left]);
|
|
sp = ' '.repeat(indents[left]);
|
|
@@ -221,7 +339,9 @@ class ConvertPdf extends ConvertHtml {
|
|
if (concat)
|
|
if (concat)
|
|
text += sp + concat + "\n";
|
|
text += sp + concat + "\n";
|
|
|
|
|
|
- return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
|
|
|
|
|
|
+ //console.log(text);
|
|
|
|
+ await utils.sleep(100);
|
|
|
|
+ return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|