|
@@ -5,7 +5,6 @@ const path = require('path');
|
|
const sax = require('../../sax');
|
|
const sax = require('../../sax');
|
|
const utils = require('../../utils');
|
|
const utils = require('../../utils');
|
|
const ConvertHtml = require('./ConvertHtml');
|
|
const ConvertHtml = require('./ConvertHtml');
|
|
-const xmlParser = require('../../xmlParser');
|
|
|
|
|
|
|
|
class ConvertPdf extends ConvertHtml {
|
|
class ConvertPdf extends ConvertHtml {
|
|
check(data, opts) {
|
|
check(data, opts) {
|
|
@@ -26,16 +25,15 @@ class ConvertPdf extends ConvertHtml {
|
|
const inpFile = inputFiles.sourceFile;
|
|
const inpFile = inputFiles.sourceFile;
|
|
const outBasename = `${inputFiles.filesDir}/${utils.randomHexString(10)}`;
|
|
const outBasename = `${inputFiles.filesDir}/${utils.randomHexString(10)}`;
|
|
const outFile = `${outBasename}.xml`;
|
|
const outFile = `${outBasename}.xml`;
|
|
- const metaFile = `${outBasename}_metadata.xml`;
|
|
|
|
|
|
|
|
- const pdfaltoPath = `${this.config.dataDir}/pdfalto/pdfalto`;
|
|
|
|
|
|
+ const pdftohtmlPath = '/usr/bin/pdftohtml';
|
|
|
|
|
|
- if (!await fs.pathExists(pdfaltoPath))
|
|
|
|
- throw new Error('Внешний конвертер pdfalto не найден');
|
|
|
|
|
|
+ if (!await fs.pathExists(pdftohtmlPath))
|
|
|
|
+ throw new Error('Внешний конвертер pdftohtml не найден');
|
|
|
|
|
|
//конвертируем в xml
|
|
//конвертируем в xml
|
|
let perc = 0;
|
|
let perc = 0;
|
|
- await this.execConverter(pdfaltoPath, [inpFile, outFile], () => {
|
|
|
|
|
|
+ await this.execConverter(pdftohtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
|
|
perc = (perc < 80 ? perc + 10 : 40);
|
|
perc = (perc < 80 ? perc + 10 : 40);
|
|
callback(perc);
|
|
callback(perc);
|
|
}, abort);
|
|
}, abort);
|
|
@@ -57,8 +55,10 @@ class ConvertPdf extends ConvertHtml {
|
|
let images = [];
|
|
let images = [];
|
|
let loading = [];
|
|
let loading = [];
|
|
|
|
|
|
- let title = '';
|
|
|
|
- let author = '';
|
|
|
|
|
|
+ let inText = false;
|
|
|
|
+ let bold = false;
|
|
|
|
+ let italic = false;
|
|
|
|
+
|
|
let i = -1;
|
|
let i = -1;
|
|
|
|
|
|
const loadImage = async(image) => {
|
|
const loadImage = async(image) => {
|
|
@@ -85,22 +85,30 @@ class ConvertPdf extends ConvertHtml {
|
|
}
|
|
}
|
|
};
|
|
};
|
|
|
|
|
|
|
|
+ const isTextBold = (text) => {
|
|
|
|
+ const m = text.trim().match(/^<b>(.*)<\/b>$/);
|
|
|
|
+ return m && !m[1].match(/<b>|<\/b>|<i>|<\/i>/g);
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ const isTextEmpty = (text) => {
|
|
|
|
+ return text.replace(/<b>|<\/b>|<i>|<\/i>/g, '').trim() == '';
|
|
|
|
+ };
|
|
|
|
+
|
|
const putPageLines = () => {
|
|
const putPageLines = () => {
|
|
- pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left))
|
|
|
|
|
|
+ pagelines.sort((a, b) => (Math.abs(a.top - b.top) > 3 ? a.top - b.top : 0)*10000 + (a.left - b.left))
|
|
|
|
|
|
//объединяем в одну строку равные по высоте
|
|
//объединяем в одну строку равные по высоте
|
|
const pl = [];
|
|
const pl = [];
|
|
let pt = 0;
|
|
let pt = 0;
|
|
let j = -1;
|
|
let j = -1;
|
|
pagelines.forEach(line => {
|
|
pagelines.forEach(line => {
|
|
- //добавим закрывающий тег стиля
|
|
|
|
- line.text += line.tClose;
|
|
|
|
|
|
+ if (isTextEmpty(line.text))
|
|
|
|
+ return;
|
|
|
|
|
|
//проверим, возможно это заголовок
|
|
//проверим, возможно это заголовок
|
|
- if (line.fonts.length == 1 && line.pageWidth) {
|
|
|
|
- const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
|
|
|
|
|
|
+ if (line.fontId && line.pageWidth) {
|
|
const centerLeft = (line.pageWidth - line.width)/2;
|
|
const centerLeft = (line.pageWidth - line.width)/2;
|
|
- if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
|
|
|
|
|
|
+ if (isTextBold(line.text) && Math.abs(centerLeft - line.left) < 10) {
|
|
if (!sectionTitleFound) {
|
|
if (!sectionTitleFound) {
|
|
line.isSectionTitle = true;
|
|
line.isSectionTitle = true;
|
|
sectionTitleFound = true;
|
|
sectionTitleFound = true;
|
|
@@ -128,8 +136,8 @@ class ConvertPdf extends ConvertHtml {
|
|
//добавим пустую строку, если надо
|
|
//добавим пустую строку, если надо
|
|
const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
|
|
const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
|
|
if (prevLine && !prevLine.isImage) {
|
|
if (prevLine && !prevLine.isImage) {
|
|
- const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null));
|
|
|
|
- if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) {
|
|
|
|
|
|
+ const f = (prevLine.fontId ? fonts[prevLine.fontId] : (line.fontId ? fonts[line.fontId] : null));
|
|
|
|
+ if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize * 1.8) {
|
|
i++;
|
|
i++;
|
|
lines[i] = {text: '<br>'};
|
|
lines[i] = {text: '<br>'};
|
|
}
|
|
}
|
|
@@ -142,29 +150,26 @@ class ConvertPdf extends ConvertHtml {
|
|
putImage(100000);
|
|
putImage(100000);
|
|
};
|
|
};
|
|
|
|
|
|
- const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
|
|
- if (tag == 'textstyle') {
|
|
|
|
- const attrs = sax.getAttrsSync(tail);
|
|
|
|
- const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
|
|
|
|
- const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
|
|
|
|
- const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : '');
|
|
|
|
|
|
+ const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
|
|
+ if (!cutCounter && inText) {
|
|
|
|
+ let tOpen = (bold ? '<b>' : '');
|
|
|
|
+ tOpen += (italic ? '<i>' : '');
|
|
|
|
+ let tClose = (italic ? '</i>' : '');
|
|
|
|
+ tClose += (bold ? '</b>' : '');
|
|
|
|
|
|
- if (fontId) {
|
|
|
|
- const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
|
|
|
|
- const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
|
|
|
|
-
|
|
|
|
- if (fontStyle) {
|
|
|
|
- const styles = fontStyle.split(' ');
|
|
|
|
- styles.forEach(style => {
|
|
|
|
- const s = styleTags[style];
|
|
|
|
- if (s) {
|
|
|
|
- f.tOpen += `<${s}>`;
|
|
|
|
- f.tClose = `</${s}>${f.tClose}`;
|
|
|
|
- if (s == 'b')
|
|
|
|
- f.isBold = true;
|
|
|
|
- }
|
|
|
|
- });
|
|
|
|
- }
|
|
|
|
|
|
+ line.text += ` ${tOpen}${text}${tClose}`;
|
|
|
|
+ }
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
|
|
+ if (inText) {
|
|
|
|
+ switch (tag) {
|
|
|
|
+ case 'i':
|
|
|
|
+ italic = true;
|
|
|
|
+ break;
|
|
|
|
+ case 'b':
|
|
|
|
+ bold = true;
|
|
|
|
+ break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
@@ -177,80 +182,78 @@ class ConvertPdf extends ConvertHtml {
|
|
putPageLines();
|
|
putPageLines();
|
|
}
|
|
}
|
|
|
|
|
|
- if (tag == 'textline') {
|
|
|
|
|
|
+ if (tag == 'fontspec') {
|
|
|
|
+ const attrs = sax.getAttrsSync(tail);
|
|
|
|
+ const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
|
|
|
|
+ const fontSize = (attrs.size && attrs.size.value ? attrs.size.value : '');
|
|
|
|
+
|
|
|
|
+ if (fontId) {
|
|
|
|
+ fonts[fontId] = {fontSize};
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (tag == 'text' && !inText) {
|
|
const attrs = sax.getAttrsSync(tail);
|
|
const attrs = sax.getAttrsSync(tail);
|
|
line = {
|
|
line = {
|
|
text: '',
|
|
text: '',
|
|
- top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
|
|
|
|
- left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
|
|
|
|
|
|
+ top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
|
|
|
|
+ left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
|
|
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
|
|
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
|
|
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
|
|
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
|
|
- tOpen: '',
|
|
|
|
- tClose: '',
|
|
|
|
isSectionTitle: false,
|
|
isSectionTitle: false,
|
|
isSubtitle: false,
|
|
isSubtitle: false,
|
|
pageWidth: page.width,
|
|
pageWidth: page.width,
|
|
- fonts: [],
|
|
|
|
|
|
+ fontId: (attrs.font && attrs.font.value ? attrs.font.value : ''),
|
|
};
|
|
};
|
|
|
|
|
|
if (line.width != 0 || line.height != 0) {
|
|
if (line.width != 0 || line.height != 0) {
|
|
|
|
+ inText = true;
|
|
pagelines.push(line);
|
|
pagelines.push(line);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
- if (tag == 'string') {
|
|
|
|
|
|
+ if (tag == 'image') {
|
|
const attrs = sax.getAttrsSync(tail);
|
|
const attrs = sax.getAttrsSync(tail);
|
|
- if (attrs.content && attrs.content.value) {
|
|
|
|
-
|
|
|
|
- let tOpen = '';
|
|
|
|
- let tClose = '';
|
|
|
|
- const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
|
|
|
|
- if (fontId && fonts[fontId]) {
|
|
|
|
- tOpen = fonts[fontId].tOpen;
|
|
|
|
- tClose = fonts[fontId].tClose;
|
|
|
|
- if (!line.fonts.length || line.fonts[0] != fontId)
|
|
|
|
- line.fonts.push(fontId);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- if (line.tOpen != tOpen) {
|
|
|
|
- line.text += line.tClose + tOpen;
|
|
|
|
- line.tOpen = tOpen;
|
|
|
|
- line.tClose = tClose;
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
|
|
|
|
|
|
+ let src = (attrs.src && attrs.src.value ? attrs.src.value : '');
|
|
|
|
+ if (src) {
|
|
|
|
+ const image = {
|
|
|
|
+ isImage: true,
|
|
|
|
+ src,
|
|
|
|
+ data: '',
|
|
|
|
+ type: '',
|
|
|
|
+ top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
|
|
|
|
+ left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10) || 0,
|
|
|
|
+ width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
|
|
|
|
+ height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ loading.push(loadImage(image));
|
|
|
|
+ images.push(image);
|
|
|
|
+ images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ };
|
|
|
|
|
|
- if (tag == 'illustration') {
|
|
|
|
- const attrs = sax.getAttrsSync(tail);
|
|
|
|
- if (attrs.type && attrs.type.value == 'image') {
|
|
|
|
- let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
|
|
|
|
- if (src) {
|
|
|
|
- const image = {
|
|
|
|
- isImage: true,
|
|
|
|
- src,
|
|
|
|
- data: '',
|
|
|
|
- type: '',
|
|
|
|
- top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
|
|
|
|
- left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
|
|
|
|
- width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
|
|
|
|
- height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
|
|
|
|
- };
|
|
|
|
- const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
|
|
|
|
- if (!exists.length) {
|
|
|
|
- loading.push(loadImage(image));
|
|
|
|
- images.push(image);
|
|
|
|
- images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
|
|
|
+ if (inText) {
|
|
|
|
+ switch (tag) {
|
|
|
|
+ case 'i':
|
|
|
|
+ italic = false;
|
|
|
|
+ break;
|
|
|
|
+ case 'b':
|
|
|
|
+ bold = false;
|
|
|
|
+ break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ if (tag == 'text')
|
|
|
|
+ inText = false;
|
|
};
|
|
};
|
|
|
|
|
|
let buf = this.decode(data).toString();
|
|
let buf = this.decode(data).toString();
|
|
sax.parseSync(buf, {
|
|
sax.parseSync(buf, {
|
|
- onStartNode
|
|
|
|
|
|
+ onStartNode, onEndNode, onTextNode
|
|
});
|
|
});
|
|
|
|
|
|
putPageLines();
|
|
putPageLines();
|
|
@@ -277,16 +280,8 @@ class ConvertPdf extends ConvertHtml {
|
|
}
|
|
}
|
|
indents[0] = 0;
|
|
indents[0] = 0;
|
|
|
|
|
|
- //title
|
|
|
|
- if (fs.pathExists(metaFile)) {
|
|
|
|
- const metaXmlString = (await fs.readFile(metaFile)).toString();
|
|
|
|
- let metaXmlParsed = xmlParser.parseXml(metaXmlString);
|
|
|
|
- metaXmlParsed = xmlParser.simplifyXmlParsed(metaXmlParsed);
|
|
|
|
- if (metaXmlParsed.metadata) {
|
|
|
|
- title = (metaXmlParsed.metadata.title ? metaXmlParsed.metadata.title._t : '');
|
|
|
|
- author = (metaXmlParsed.metadata.author ? metaXmlParsed.metadata.author._t : '');
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ //author & title
|
|
|
|
+ let {author, title} = await this.getPdfTitleAndAuthor(inpFile);
|
|
|
|
|
|
if (!title && uploadFileName)
|
|
if (!title && uploadFileName)
|
|
title = uploadFileName;
|
|
title = uploadFileName;
|
|
@@ -302,6 +297,7 @@ class ConvertPdf extends ConvertHtml {
|
|
|
|
|
|
let concat = '';
|
|
let concat = '';
|
|
let sp = '';
|
|
let sp = '';
|
|
|
|
+ let firstLine = true;
|
|
for (const line of lines) {
|
|
for (const line of lines) {
|
|
if (text.length > limitSize) {
|
|
if (text.length > limitSize) {
|
|
throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
|
|
throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
|
|
@@ -313,10 +309,15 @@ class ConvertPdf extends ConvertHtml {
|
|
}
|
|
}
|
|
|
|
|
|
if (line.isSectionTitle) {
|
|
if (line.isSectionTitle) {
|
|
- text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
|
|
|
|
|
|
+ if (firstLine)
|
|
|
|
+ text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
|
|
|
|
+ else
|
|
|
|
+ text += `<fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ firstLine = false;
|
|
|
|
+
|
|
if (line.isSubtitle) {
|
|
if (line.isSubtitle) {
|
|
text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
|
|
text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
|
|
continue;
|
|
continue;
|
|
@@ -343,6 +344,32 @@ class ConvertPdf extends ConvertHtml {
|
|
await utils.sleep(100);
|
|
await utils.sleep(100);
|
|
return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
|
|
return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ async getPdfTitleAndAuthor(pdfFile) {
|
|
|
|
+ const result = {author: '', title: ''};
|
|
|
|
+
|
|
|
|
+ const pdfinfoPath = '/usr/bin/pdfinfo';
|
|
|
|
+
|
|
|
|
+ if (!await fs.pathExists(pdfinfoPath))
|
|
|
|
+ throw new Error('Внешний конвертер pdfinfo не найден');
|
|
|
|
+
|
|
|
|
+ const execResult = await this.execConverter(pdfinfoPath, [pdfFile]);
|
|
|
|
+
|
|
|
|
+ const titlePrefix = 'Title:';
|
|
|
|
+ const authorPrefix = 'Author:';
|
|
|
|
+
|
|
|
|
+ const stdout = execResult.stdout.split("\n");
|
|
|
|
+ stdout.forEach(line => {
|
|
|
|
+ if (line.indexOf(titlePrefix) == 0)
|
|
|
|
+ result.title = line.substring(titlePrefix.length).trim();
|
|
|
|
+
|
|
|
|
+ if (line.indexOf(authorPrefix) == 0)
|
|
|
|
+ result.author = line.substring(authorPrefix.length).trim();
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ return result;
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+
|
|
module.exports = ConvertPdf;
|
|
module.exports = ConvertPdf;
|