Browse Source

Merge branch 'release/0.9.11'

Book Pauk 4 years ago
parent
commit
7fa891b4fc

+ 8 - 2
client/components/Reader/TextPage/DrawHelper.js

@@ -77,9 +77,15 @@ export default class DrawHelper {
             let j = 0;
             //формируем строку
             for (const part of line.parts) {
-                let tOpen = (part.style.bold ? '<b>' : '');
+                let tOpen = '';
+                tOpen += (part.style.bold ? '<b>' : '');
                 tOpen += (part.style.italic ? '<i>' : '');
-                let tClose = (part.style.italic ? '</i>' : '');
+                tOpen += (part.style.sup ? '<span style="vertical-align: baseline; position: relative; line-height: 0; top: -0.3em">' : '');
+                tOpen += (part.style.sub ? '<span style="vertical-align: baseline; position: relative; line-height: 0; top: 0.3em">' : '');
+                let tClose = '';
+                tClose += (part.style.sub ? '</span>' : '');
+                tClose += (part.style.sup ? '</span>' : '');
+                tClose += (part.style.italic ? '</i>' : '');
                 tClose += (part.style.bold ? '</b>' : '');
 
                 let text = '';

+ 15 - 3
client/components/Reader/share/BookParser.js

@@ -285,7 +285,7 @@ export default class BookParser {
                     sectionLevel++;
                 }
 
-                if (tag == 'emphasis' || tag == 'strong') {
+                if (tag == 'emphasis' || tag == 'strong' || tag == 'sup' || tag == 'sub') {
                     growParagraph(`<${tag}>`, 0);
                 }
 
@@ -343,7 +343,7 @@ export default class BookParser {
                         sectionLevel--;
                     }
 
-                    if (tag == 'emphasis' || tag == 'strong') {
+                    if (tag == 'emphasis' || tag == 'strong' || tag == 'sup' || tag == 'sub') {
                         growParagraph(`</${tag}>`, 0);
                     }
 
@@ -507,7 +507,7 @@ export default class BookParser {
 
     splitToStyle(s) {
         let result = [];/*array of {
-            style: {bold: Boolean, italic: Boolean, center: Boolean, space: Number},
+            style: {bold: Boolean, italic: Boolean, sup: Boolean, sub: Boolean, center: Boolean, space: Number},
             image: {local: Boolean, inline: Boolean, id: String},
             text: String,
         }*/
@@ -530,6 +530,12 @@ export default class BookParser {
                 case 'emphasis':
                     style.italic = true;
                     break;
+                case 'sup': 
+                    style.sup = true;
+                    break;
+                case 'sub':
+                    style.sub = true;
+                    break;
                 case 'center':
                     style.center = true;
                     break;
@@ -580,6 +586,12 @@ export default class BookParser {
                 case 'emphasis':
                     style.italic = false;
                     break;
+                case 'sup': 
+                    style.sup = false;
+                    break;
+                case 'sub':
+                    style.sub = false;
+                    break;
                 case 'center':
                     style.center = false;
                     break;

+ 2 - 2
client/components/Reader/share/bookManager.js

@@ -169,7 +169,7 @@ class BookManager {
     }
 
     async deflateWithProgress(data, callback) {
-        const chunkSize = 128*1024;
+        const chunkSize = 512*1024;
         const deflator = new utils.pako.Deflate({level: 5});
 
         let chunkTotal = 1 + Math.floor(data.length/chunkSize);
@@ -203,7 +203,7 @@ class BookManager {
     }
 
     async inflateWithProgress(data, callback) {
-        const chunkSize = 64*1024;
+        const chunkSize = 512*1024;
         const inflator = new utils.pako.Inflate({to: 'string'});
 
         let chunkTotal = 1 + Math.floor(data.length/chunkSize);

+ 11 - 0
client/components/Reader/versionHistory.js

@@ -1,4 +1,15 @@
 export const versionHistory = [
+{
+    showUntil: '2020-12-08',
+    header: '0.9.11 (2020-12-09)',
+    content:
+`
+<ul>
+    <li>оптимизации, улучшения работы конвертеров</li>
+</ul>
+`
+},
+
 {
     showUntil: '2020-12-10',
     header: '0.9.10 (2020-12-03)',

+ 13 - 1
docs/omnireader.ru/README.md

@@ -32,11 +32,23 @@ sudo -u www-data mkdir -p /home/liberama/data/calibre
 sudo -u www-data tar xvf calibre-5.5.0-x86_64.txz -C /home/liberama/data/calibre
 ```
 
+### external converter `pdfalto`, github https://github.com/kermitt2/pdfalto
+```
+git clone https://github.com/kermitt2/pdfalto
+cd pdfalto
+git submodule update --init --recursive
+cmake ./
+# добавить в начало CMakeLists.txt строчку: set(CMAKE_EXE_LINKER_FLAGS "-no-pie")
+make
+
+sudo -u www-data mkdir -p /home/liberama/data/pdfalto
+sudo -u www-data cp pdfalto /home/liberama/data/pdfalto
+```
+
 ### external converters
 ```
 sudo apt install rar
 sudo apt install libreoffice
-sudo apt install poppler-utils
 sudo apt install djvulibre-bin
 sudo apt install libtiff-tools
 sudo apt install graphicsmagick-imagemagick-compat

+ 1 - 1
package-lock.json

@@ -1,6 +1,6 @@
 {
   "name": "Liberama",
-  "version": "0.9.10",
+  "version": "0.9.11",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {

+ 2 - 2
package.json

@@ -1,6 +1,6 @@
 {
   "name": "Liberama",
-  "version": "0.9.10",
+  "version": "0.9.11",
   "author": "Book Pauk <bookpauk@gmail.com>",
   "license": "CC0-1.0",
   "repository": "bookpauk/liberama",
@@ -8,7 +8,7 @@
     "node": ">=10.0.0"
   },
   "scripts": {
-    "dev": "nodemon --inspect --ignore server/public --ignore server/data --exec 'node server'",
+    "dev": "nodemon --inspect --ignore server/public --ignore server/data --ignore client --exec 'node server'",
     "build:client": "webpack --config build/webpack.prod.config.js",
     "build:linux": "npm run build:client && node build/linux && pkg -t latest-linux-x64 -o dist/linux/liberama .",
     "build:win": "npm run build:client && node build/win && pkg -t latest-win-x64 -o dist/win/liberama .",

+ 7 - 57
server/core/Reader/BookConverter/ConvertBase.js

@@ -5,6 +5,7 @@ const he = require('he');
 const LimitedQueue = require('../../LimitedQueue');
 const textUtils = require('./textUtils');
 const utils = require('../../utils');
+const xmlParser = require('../../xmlParser');
 
 const queue = new LimitedQueue(3, 20, 2*60*1000);//2 минуты ожидание подвижек
 
@@ -14,7 +15,6 @@ class ConvertBase {
 
         this.calibrePath = `${config.dataDir}/calibre/ebook-convert`;
         this.sofficePath = '/usr/bin/soffice';
-        this.pdfToHtmlPath = '/usr/bin/pdftohtml';
     }
 
     async run(data, opts) {// eslint-disable-line no-unused-vars
@@ -27,9 +27,6 @@ class ConvertBase {
 
         if (!await fs.pathExists(this.sofficePath))
             throw new Error('Внешний конвертер LibreOffice не найден');
-
-        if (!await fs.pathExists(this.pdfToHtmlPath))
-            throw new Error('Внешний конвертер pdftohtml не найден');
     }
 
     async execConverter(path, args, onData, abort) {
@@ -106,61 +103,14 @@ class ConvertBase {
     }
 
     formatFb2(fb2) {
-        let out = '<?xml version="1.0" encoding="utf-8"?>';
-        out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';
-        out += this.formatFb2Node(fb2);
-        out += '</FictionBook>';
-        return out;
-    }
-
-    formatFb2Node(node, name) {
-        let out = '';
-
-        if (Array.isArray(node)) {
-            for (const n of node) {
-                out += this.formatFb2Node(n);
-            }
-        } else if (typeof node == 'string') {
-            if (name)
-                out += `<${name}>${this.repSpaces(node)}</${name}>`;
-            else
-                out += this.repSpaces(node);
-        } else {
-            if (node._n)
-                name = node._n;
-
-            let attrs = '';
-            if (node._attrs) {
-                for (let attrName in node._attrs) {
-                    attrs += ` ${attrName}="${node._attrs[attrName]}"`;
-                }
+        const out = xmlParser.formatXml({
+            FictionBook: {
+                _attrs: {xmlns: 'http://www.gribuser.ru/xml/fictionbook/2.0', 'xmlns:l': 'http://www.w3.org/1999/xlink'},
+                _a: [fb2],
             }
+        }, 'utf-8', this.repSpaces);
 
-            let tOpen = '';
-            let tBody = '';
-            let tClose = '';
-            if (name)
-                tOpen += `<${name}${attrs}>`;
-            if (node.hasOwnProperty('_t'))
-                tBody += this.repSpaces(node._t);
-
-            for (let nodeName in node) {
-                if (nodeName && nodeName[0] == '_' && nodeName != '_a')
-                    continue;
-
-                const n = node[nodeName];
-                tBody += this.formatFb2Node(n, nodeName);
-            }
-            
-            if (name)
-                tClose += `</${name}>`;
-
-            if (attrs == '' && name == 'p' && tBody.trim() == '')
-                out += '<empty-line/>'
-            else
-                out += `${tOpen}${tBody}${tClose}`;
-        }
-        return out;
+        return out.replace(/<p>\s*?<\/p>/g, '<empty-line/>');
     }
 }
 

+ 27 - 10
server/core/Reader/BookConverter/ConvertDjvu.js

@@ -2,9 +2,9 @@ const fs = require('fs-extra');
 const path = require('path');
 const utils = require('../../utils');
 
-const ConvertHtml = require('./ConvertHtml');
+const ConvertBase = require('./ConvertBase');
 
-class ConvertDjvu extends ConvertHtml {
+class ConvertDjvu extends ConvertBase {
     check(data, opts) {
         const {inputFiles} = opts;
 
@@ -59,9 +59,17 @@ class ConvertDjvu extends ConvertHtml {
         }, abort);
 
         //читаем изображения
+        limitSize = 2*this.config.maxUploadFileSize;
+        let imagesSize = 0;
+
         const loadImage = async(image) => {
             image.data = (await fs.readFile(image.file)).toString('base64');
             image.name = path.basename(image.file);
+
+            imagesSize += image.data.length;
+            if (imagesSize > limitSize) {
+                throw new Error(`Файл для конвертирования слишком большой|FORLOG| imagesSize: ${imagesSize} > ${limitSize}`);
+            }
         }
 
         let files = [];
@@ -82,20 +90,29 @@ class ConvertDjvu extends ConvertHtml {
 
         await Promise.all(loading);
 
-        //формируем текст
-        limitSize = 2*this.config.maxUploadFileSize;
+        //формируем fb2
+        let titleInfo = {};
+        let desc = {_n: 'description', 'title-info': titleInfo};
+        let pars = [];
+        let body = {_n: 'body', section: {_a: [pars]}};
+        let binary = [];
+        let fb2 = [desc, body, binary];
+
         let title = '';
         if (uploadFileName)
             title = uploadFileName;
-        let text = `<title>${title}</title>`;
+
+        titleInfo['book-title'] = title;
+
         for (const image of images) {
-            text += `<fb2-image type="image/jpeg" name="${image.name}">${image.data}</fb2-image>`;
+            const img = {_n: 'binary', _attrs: {id: image.name, 'content-type': 'image/jpeg'}, _t: image.data};
+            binary.push(img);
 
-            if (text.length > limitSize) {
-                throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
-            }
+            pars.push({_n: 'p', _t: ''});
+            pars.push({_n: 'image', _attrs: {'l:href': `#${image.name}`}});
         }
-        return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
+
+        return this.formatFb2(fb2);
     }
 }
 

+ 6 - 5
server/core/Reader/BookConverter/ConvertFb3.js

@@ -2,7 +2,7 @@ const fs = require('fs-extra');
 
 const ConvertHtml = require('./ConvertHtml');
 
-class ConvertDocX extends ConvertHtml {
+class ConvertFb3 extends ConvertHtml {
     async check(data, opts) {
         const {inputFiles} = opts;
         if (this.config.useExternalBookConverter && 
@@ -39,13 +39,14 @@ class ConvertDocX extends ConvertHtml {
         const title = this.getTitle(text)
             .replace(/<\/?p>/g, '')
         ;
-        text = `<title>${title}</title>` + text
+        text = `<fb2-title>${title}</fb2-title>` + text
             .replace(/<title>/g, '<br><b>')
             .replace(/<\/title>/g, '</b><br>')
-            .replace(/<subtitle>/g, '<br><br><subtitle>')
+            .replace(/<subtitle>/g, '<br><br><fb2-subtitle>')
+            .replace(/<\/subtitle>/g, '</fb2-subtitle>')
         ;
-        return await super.run(Buffer.from(text), {skipCheck: true, cutTitle: true});
+        return await super.run(Buffer.from(text), {skipCheck: true});
     }
 }
 
-module.exports = ConvertDocX;
+module.exports = ConvertFb3;

+ 92 - 36
server/core/Reader/BookConverter/ConvertHtml.js

@@ -34,7 +34,6 @@ class ConvertHtml extends ConvertBase {
         } else {
             isText = opts.isText;
         }
-        let {cutTitle} = opts;
 
         let titleInfo = {};
         let desc = {_n: 'description', 'title-info': titleInfo};
@@ -44,12 +43,17 @@ class ConvertHtml extends ConvertBase {
         let fb2 = [desc, body, binary];
 
         let title = '';
+        let author = '';
         let inTitle = false;
+        let inSectionTitle = false;
+        let inAuthor = false;
         let inSubTitle = false;
         let inImage = false;
         let image = {};
         let bold = false;
         let italic = false;
+        let superscript = false;
+        let subscript = false;
         let begining = true;
 
         let spaceCounter = [];
@@ -62,7 +66,7 @@ class ConvertHtml extends ConvertBase {
         };
 
         const growParagraph = (text) => {
-            if (!pars.length)
+            if (!pars.length || pars[pars.length - 1]._n != 'p')
                 newParagraph();
 
             const l = pars.length;
@@ -94,12 +98,16 @@ class ConvertHtml extends ConvertBase {
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             text = this.escapeEntities(text);
 
-            if (!cutCounter && !(cutTitle && inTitle)) {
+            if (!(cutCounter || inTitle || inSectionTitle || inSubTitle)) {
                 let tOpen = '';
                 tOpen += (inSubTitle ? '<subtitle>' : '');
                 tOpen += (bold ? '<strong>' : '');
                 tOpen += (italic ? '<emphasis>' : '');
+                tOpen += (superscript ? '<sup>' : '');
+                tOpen += (subscript ? '<sub>' : '');
                 let tClose = ''
+                tClose += (subscript ? '</sub>' : '');
+                tClose += (superscript ? '</sup>' : '');
                 tClose +=  (italic ? '</emphasis>' : '');
                 tClose += (bold ? '</strong>' : '');
                 tClose += (inSubTitle ? '</subtitle>' : '');
@@ -110,12 +118,22 @@ class ConvertHtml extends ConvertBase {
             if (inTitle && !title)
                 title = text;
 
+            if (inAuthor && !author)
+                author = text;
+
+            if (inSectionTitle) {
+                pars.unshift({_n: 'title', _t: text});
+            }
+
+            if (inSubTitle) {
+                pars.push({_n: 'subtitle', _t: text});
+            }
+
             if (inImage) {
                 image._t = text;
                 binary.push(image);
 
                 pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
-                newParagraph();
             }
 
         };
@@ -140,15 +158,27 @@ class ConvertHtml extends ConvertBase {
                         bold = true;
                         break;
                 }
+
+                if (tag == 'sup')
+                    superscript = true;
+        
+                if (tag == 'sub')
+                    subscript = true;
             }
 
-            if (tag == 'title' || tag == 'cut-title') {
+            if (tag == 'title' || tag == 'fb2-title') {
                 inTitle = true;
-                if (tag == 'cut-title')
-                    cutTitle = true;
             }
 
-            if (tag == 'subtitle') {
+            if (tag == 'fb2-author') {
+                inAuthor = true;
+            }
+
+            if (tag == 'fb2-section-title') {
+                inSectionTitle = true;
+            }
+
+            if (tag == 'fb2-subtitle') {
                 inSubTitle = true;
             }
 
@@ -156,7 +186,7 @@ class ConvertHtml extends ConvertBase {
                 inImage = true;
                 const attrs = sax.getAttrsSync(tail);
                 image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
-            }
+            }            
         };
 
         const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
@@ -179,12 +209,26 @@ class ConvertHtml extends ConvertBase {
                         bold = false;
                         break;
                 }
+
+                if (tag == 'sup')
+                    superscript = false;
+        
+                if (tag == 'sub')
+                    subscript = false;
             }
 
-            if (tag == 'title' || tag == 'cut-title')
+            if (tag == 'title' || tag == 'fb2-title')
                 inTitle = false;
 
-            if (tag == 'subtitle')
+            if (tag == 'fb2-author') {
+                inAuthor = false;
+            }
+
+            if (tag == 'fb2-section-title') {
+                inSectionTitle = false;
+            }
+
+            if (tag == 'fb2-subtitle')
                 inSubTitle = false;
 
             if (tag == 'fb2-image')
@@ -195,10 +239,17 @@ class ConvertHtml extends ConvertBase {
 
         sax.parseSync(buf, {
             onStartNode, onEndNode, onTextNode,
-            innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
+            innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image', 'fb2-title', 'fb2-author'])
         });
 
         titleInfo['book-title'] = title;
+        if (author)
+            titleInfo.author = {'last-name': author};
+
+        body.section._a[0] = pars;
+
+        //console.log(JSON.stringify(fb2, null, 2));
+
         //подозрение на чистый текст, надо разбить на параграфы
         if (isText || (buf.length > 30*1024 && pars.length < buf.length/2000)) {
             let total = 0;
@@ -228,56 +279,49 @@ class ConvertHtml extends ConvertBase {
             if (parIndent > 2) parIndent--;
 
             let newPars = [];
+            let curPar = {};
             const newPar = () => {
-                newPars.push({_n: 'p', _t: ''});
+                curPar = {_n: 'p', _t: ''};
+                newPars.push(curPar);
             };
 
-            const growPar = (text) => {
-                if (!newPars.length)
-                    newPar();
-
-                const l = newPars.length;
-                newPars[l - 1]._t += text;
-            }
-
-            i = 0;
             for (const par of pars) {
                 if (par._n != 'p') {
                     newPars.push(par);
                     continue;
                 }
 
-                if (i > 0)
-                    newPar();
-                i++;
-
-                let j = 0;
+                newPar();
+                
                 const lines = par._t.split('\n');
-                for (let line of lines) {
-                    line = repCrLfTab(line);
+                for (let j = 0; j < lines.length; j++) {
+                    const line = repCrLfTab(lines[j]);
 
                     let l = 0;
                     while (l < line.length && line[l] == ' ') {
                         l++;
                     }
 
-                    if (l >= parIndent || line == '') {
-                        if (j > 0)
-                            newPar();
-                        j++;
+                    if (j > 0 &&
+                        (l >= parIndent ||
+                            (j < lines.length - 1 && line == '')
+                        )
+                    ) {
+                        newPar();
                     }
-                    growPar(line.trim() + ' ');
+
+                    curPar._t += line.trim() + ' ';
                 }
             }
 
             body.section._a[0] = newPars;
-        } else {
-            body.section._a[0] = pars;
         }
 
         //убираем лишнее, делаем валидный fb2, т.к. в рез-те разбиения на параграфы бьются теги
         bold = false;
         italic = false;
+        superscript = false;
+        subscript = false;
         inSubTitle = false;
         pars = body.section._a[0];
         for (let i = 0; i < pars.length; i++) {
@@ -297,7 +341,11 @@ class ConvertHtml extends ConvertBase {
                     tOpen += (inSubTitle ? '<subtitle>' : '');
                     tOpen += (bold ? '<strong>' : '');
                     tOpen += (italic ? '<emphasis>' : '');
+                    tOpen += (superscript ? '<sup>' : '');
+                    tOpen += (subscript ? '<sub>' : '');
                     let tClose = ''
+                    tClose += (subscript ? '</sub>' : '');
+                    tClose += (superscript ? '</sup>' : '');
                     tClose +=  (italic ? '</emphasis>' : '');
                     tClose += (bold ? '</strong>' : '');
                     tClose += (inSubTitle ? '</subtitle>' : '');
@@ -313,6 +361,10 @@ class ConvertHtml extends ConvertBase {
                         bold = true;
                     if (tag == 'emphasis')
                         italic = true;
+                    if (tag == 'sup')
+                        superscript = true;
+                    if (tag == 'sub')
+                        subscript = true;
                     if (tag == 'subtitle')
                         inSubTitle = true;
                 }
@@ -322,6 +374,10 @@ class ConvertHtml extends ConvertBase {
                         bold = false;
                     if (tag == 'emphasis')
                         italic = false;
+                    if (tag == 'sup')
+                        superscript = false;
+                    if (tag == 'sub')
+                        subscript = false;
                     if (tag == 'subtitle')
                         inSubTitle = false;
                 }

+ 201 - 81
server/core/Reader/BookConverter/ConvertPdf.js

@@ -1,9 +1,11 @@
+//const _ = require('lodash');
 const fs = require('fs-extra');
 const path = require('path');
 
 const sax = require('../../sax');
 const utils = require('../../utils');
 const ConvertHtml = require('./ConvertHtml');
+const xmlParser = require('../../xmlParser');
 
 class ConvertPdf extends ConvertHtml {
     check(data, opts) {
@@ -22,11 +24,18 @@ class ConvertPdf extends ConvertHtml {
         const {inputFiles, callback, abort, uploadFileName} = opts;
 
         const inpFile = inputFiles.sourceFile;
-        const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;
+        const outBasename = `${inputFiles.filesDir}/${utils.randomHexString(10)}`;
+        const outFile = `${outBasename}.xml`;
+        const metaFile = `${outBasename}_metadata.xml`;
+
+        const pdfaltoPath = `${this.config.dataDir}/pdfalto/pdfalto`;
+
+        if (!await fs.pathExists(pdfaltoPath))
+            throw new Error('Внешний конвертер pdfalto не найден');
 
         //конвертируем в xml
         let perc = 0;
-        await this.execConverter(this.pdfToHtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
+        await this.execConverter(pdfaltoPath, [inpFile, outFile], () => {
             perc = (perc < 80 ? perc + 10 : 40);
             callback(perc);
         }, abort);
@@ -35,17 +44,22 @@ class ConvertPdf extends ConvertHtml {
         const data = await fs.readFile(outFile);
         callback(90);
 
+        await utils.sleep(100);
+
         //парсим xml
         let lines = [];
+        let pagelines = [];
+        let line = {text: ''};
+        let page = {};
+        let fonts = {};
+        let sectionTitleFound = false;
+
         let images = [];
         let loading = [];
-        let inText = false;
-        let bold = false;
-        let italic = false;
+
         let title = '';
-        let prevTop = 0;
+        let author = '';
         let i = -1;
-        let titleCount = 0;
 
         const loadImage = async(image) => {
             const src = path.parse(image.src);
@@ -59,7 +73,7 @@ class ConvertPdf extends ConvertHtml {
                 image.type = type;
                 image.name = src.base;
             }
-        }
+        };
 
         const putImage = (curTop) => {
             if (!isNaN(curTop) && images.length) {
@@ -69,104 +83,180 @@ class ConvertPdf extends ConvertHtml {
                     images.shift();
                 }
             }
-        }
+        };
+
+        const putPageLines = () => {
+            pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left))
+            
+            //объединяем в одну строку равные по высоте
+            const pl = [];
+            let pt = 0;
+            let j = -1;
+            pagelines.forEach(line => {
+                //добавим закрывающий тег стиля
+                line.text += line.tClose;
 
-        const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
-            if (!cutCounter && inText) {
-                let tOpen = (bold ? '<b>' : '');
-                tOpen += (italic ? '<i>' : '');
-                let tClose = (italic ? '</i>' : '');
-                tClose += (bold ? '</b>' : '');
-
-                lines[i].text += `${tOpen}${text}${tClose} `;
-                if (titleCount < 2 && text.trim() != '') {
-                    title += text + (titleCount ? '' : ' - ');
-                    titleCount++;
+                //проверим, возможно это заголовок
+                if (line.fonts.length == 1 && line.pageWidth) {
+                    const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
+                    const centerLeft = (line.pageWidth - line.width)/2;
+                    if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
+                        if (!sectionTitleFound) {
+                            line.isSectionTitle = true;
+                            sectionTitleFound = true;
+                        } else {
+                            line.isSubtitle = true;
+                        }
+                    }
                 }
-            }
+
+                //объединяем
+                if (pt == 0 || Math.abs(pt - line.top) > 3) {
+                    j++;
+                    pl[j] = line;
+                } else {
+                    pl[j].text += ` ${line.text}`;
+                }
+                pt = line.top;
+            });
+
+            //заполняем lines
+            const lastIndex = i;
+            pl.forEach(line => {
+                putImage(line.top);
+
+                //добавим пустую строку, если надо
+                const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
+                if (prevLine && !prevLine.isImage) {
+                    const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null));
+                    if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) {
+                        i++;
+                        lines[i] = {text: '<br>'};
+                    }
+                }
+
+                i++;
+                lines[i] = line;
+            });
+            pagelines = [];
+            putImage(100000);
         };
 
         const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
-            if (!cutCounter) {
-                if (inText) {
-                    switch (tag) {
-                        case 'i':
-                            italic = true;
-                            break;
-                        case 'b':
-                            bold = true;
-                            break;
+            if (tag == 'textstyle') {
+                const attrs = sax.getAttrsSync(tail);
+                const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
+                const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
+                const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : '');
+
+                if (fontId) {
+                    const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
+                    const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
+
+                    if (fontStyle) {
+                        const styles = fontStyle.split(' ');
+                        styles.forEach(style => {
+                            const s = styleTags[style];
+                            if (s) {
+                                f.tOpen += `<${s}>`;
+                                f.tClose = `</${s}>${f.tClose}`;
+                                if (s == 'b')
+                                    f.isBold = true;
+                            }
+                        });
                     }
                 }
+            }
 
-                if (tag == 'text' && !inText) {
-                    let attrs = sax.getAttrsSync(tail);
-                    const line = {
-                        text: '',
-                        top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
-                        left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
-                        width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
-                        height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
-                    };
-
-                    if (line.width != 0 || line.height != 0) {
-                        inText = true;
-                        if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
-                            putImage(line.top);
-                            i++;
-                            lines[i] = line;
-                        }
-                        prevTop = line.top;
+            if (tag == 'page') {
+                const attrs = sax.getAttrsSync(tail);
+                page = {
+                    width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
+                };
+
+                putPageLines();
+            }
+
+            if (tag == 'textline') {
+                const attrs = sax.getAttrsSync(tail);
+                line = {
+                    text: '',
+                    top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
+                    left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
+                    width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
+                    height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
+                    tOpen: '',
+                    tClose: '',
+                    isSectionTitle: false,
+                    isSubtitle: false,
+                    pageWidth: page.width,
+                    fonts: [],
+                };
+
+                if (line.width != 0 || line.height != 0) {
+                    pagelines.push(line);
+                }
+            }
+
+            if (tag == 'string') {
+                const attrs = sax.getAttrsSync(tail);
+                if (attrs.content && attrs.content.value) {
+
+                    let tOpen = '';
+                    let tClose = '';
+                    const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
+                    if (fontId && fonts[fontId]) {
+                        tOpen = fonts[fontId].tOpen;
+                        tClose = fonts[fontId].tClose;
+                        if (!line.fonts.length || line.fonts[0] != fontId)
+                            line.fonts.push(fontId);
+                    }
+
+                    if (line.tOpen != tOpen) {
+                        line.text += line.tClose + tOpen;
+                        line.tOpen = tOpen;
+                        line.tClose = tClose;
                     }
+
+                    line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
                 }
+            }
 
-                if (tag == 'image') {
-                    const attrs = sax.getAttrsSync(tail);
-                    const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
+            if (tag == 'illustration') {
+                const attrs = sax.getAttrsSync(tail);
+                if (attrs.type && attrs.type.value == 'image') {
+                    let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
                     if (src) {
                         const image = {
                             isImage: true,
                             src,
                             data: '',
                             type: '',
-                            top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
+                            top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
+                            left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
+                            width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
+                            height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
                         };
-                        loading.push(loadImage(image));
-                        images.push(image);
-                        images.sort((a, b) => a.top - b.top)
+                        const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
+                        if (!exists.length) {
+                            loading.push(loadImage(image));
+                            images.push(image);
+                            images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
+                        }
                     }
                 }
-
-                if (tag == 'page') {
-                    putImage(100000);
-                }
             }
         };
 
-        const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
-            if (inText) {
-                switch (tag) {
-                    case 'i':
-                        italic = false;
-                        break;
-                    case 'b':
-                        bold = false;
-                        break;
-                }
-            }
-
-            if (tag == 'text')
-                inText = false;
-        };
-
         let buf = this.decode(data).toString();
         sax.parseSync(buf, {
-            onStartNode, onEndNode, onTextNode
+            onStartNode
         });
 
-        putImage(100000);
+        putPageLines();
 
         await Promise.all(loading);
+        await utils.sleep(100);
 
         //найдем параграфы и отступы
         const indents = [];
@@ -187,11 +277,29 @@ class ConvertPdf extends ConvertHtml {
         }
         indents[0] = 0;
 
-        //формируем текст
-        const limitSize = 2*this.config.maxUploadFileSize;
+        //title
+        if (fs.pathExists(metaFile)) {
+            const metaXmlString = (await fs.readFile(metaFile)).toString();
+            let metaXmlParsed = xmlParser.parseXml(metaXmlString);
+            metaXmlParsed = xmlParser.simplifyXmlParsed(metaXmlParsed);
+            if (metaXmlParsed.metadata) {
+                title = (metaXmlParsed.metadata.title ? metaXmlParsed.metadata.title._t : '');
+                author = (metaXmlParsed.metadata.author ? metaXmlParsed.metadata.author._t : '');
+            }
+        }
+
         if (!title && uploadFileName)
             title = uploadFileName;
-        let text = `<title>${title}</title>`;
+
+        //console.log(JSON.stringify(lines, null, 2));
+        //формируем текст
+        const limitSize = 2*this.config.maxUploadFileSize;
+        let text = '';
+        if (title)
+            text += `<fb2-title>${title}</fb2-title>`;
+        if (author)
+            text += `<fb2-author>${author}</fb2-author>`;
+
         let concat = '';
         let sp = '';
         for (const line of lines) {
@@ -204,6 +312,16 @@ class ConvertPdf extends ConvertHtml {
                 continue;
             }
 
+            if (line.isSectionTitle) {
+                text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
+                continue;
+            }
+
+            if (line.isSubtitle) {
+                text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
+                continue;
+            }
+
             if (concat == '') {
                 const left = line.left || 0;
                 sp = ' '.repeat(indents[left]);
@@ -221,7 +339,9 @@ class ConvertPdf extends ConvertHtml {
         if (concat)
             text += sp + concat + "\n";
 
-        return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
+        //console.log(text);
+        await utils.sleep(100);
+        return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
     }
 }
 

+ 8 - 8
server/core/Reader/BookConverter/ConvertSites.js

@@ -48,7 +48,7 @@ class ConvertSites extends ConvertHtml {
         if (text === false)
             return false;
 
-        return await super.run(Buffer.from(text), {skipCheck: true, cutTitle: true});
+        return await super.run(Buffer.from(text), {skipCheck: true});
     }
 
     getTitle(text) {
@@ -79,7 +79,7 @@ class ConvertSites extends ConvertHtml {
         let book = this.getTitle(text);
         book = book.replace(' (fb2) | Флибуста', '');
 
-        const title = `<title>${author}${(author ? ' - ' : '')}${book}</title>`;
+        const title = `<fb2-title>${author}${(author ? ' - ' : '')}${book}</fb2-title>`;
 
         let begin = '<h3 class="book">';
         if (text.indexOf(begin) <= 0)
@@ -95,12 +95,12 @@ class ConvertSites extends ConvertHtml {
         return text.substring(l, r)
             .replace(/blockquote class="?book"?/g, 'p')
             .replace(/<br\/?>\s*<\/h3>/g, '</h3>')
-            .replace(/<h3 class="?book"?>/g, '<br><br><subtitle>')
-            .replace(/<h5 class="?book"?>/g, '<br><br><subtitle>')
-            .replace(/<h3>/g, '<br><br><subtitle>')
-            .replace(/<h5>/g, '<br><br><subtitle>')
-            .replace(/<\/h3>/g, '</subtitle><br>')
-            .replace(/<\/h5>/g, '</subtitle><br>')
+            .replace(/<h3 class="?book"?>/g, '<br><br><fb2-subtitle>')
+            .replace(/<h5 class="?book"?>/g, '<br><br><fb2-subtitle>')
+            .replace(/<h3>/g, '<br><br><fb2-subtitle>')
+            .replace(/<h5>/g, '<br><br><fb2-subtitle>')
+            .replace(/<\/h3>/g, '</fb2-subtitle><br>')
+            .replace(/<\/h5>/g, '</fb2-subtitle><br>')
             .replace(/<div class="?stanza"?>/g, '<br>')
             .replace(/<div>/g, '<br>')
             + title;

+ 15 - 8
server/core/sax.js

@@ -6,7 +6,8 @@ function parseSync(xstr, options) {
         onCdata: _onCdata = dummy,
         onComment: _onComment = dummy,
         onProgress: _onProgress = dummy,
-        innerCut = new Set()
+        innerCut = new Set(),
+        lowerCase = true,
     } = options;
 
     let i = 0;
@@ -91,7 +92,8 @@ function parseSync(xstr, options) {
             } else {
                 tag = tagData;
             }
-            tag = tag.toLowerCase();
+            if (lowerCase)
+                tag = tag.toLowerCase();
 
             if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
                 if (!cutCounter)
@@ -146,7 +148,8 @@ async function parse(xstr, options) {
         onCdata: _onCdata = dummy,
         onComment: _onComment = dummy,
         onProgress: _onProgress = dummy,
-        innerCut = new Set()
+        innerCut = new Set(),
+        lowerCase = true,
     } = options;
 
     let i = 0;
@@ -231,7 +234,8 @@ async function parse(xstr, options) {
             } else {
                 tag = tagData;
             }
-            tag = tag.toLowerCase();
+            if (lowerCase)
+                tag = tag.toLowerCase();
 
             if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
                 if (!cutCounter)
@@ -276,7 +280,7 @@ async function parse(xstr, options) {
     await _onProgress(100);
 }
 
-function getAttrsSync(tail) {
+function getAttrsSync(tail, lowerCase = true) {
     let result = {};
     let name = '';    
     let value = '';
@@ -287,13 +291,16 @@ function getAttrsSync(tail) {
     let waitEq = false;
 
     const pushResult = () => {
+        if (lowerCase)
+            name = name.toLowerCase();
         if (name != '') {
+            const fn = name;
             let ns = '';
-            if (name.indexOf(':') >= 0) {
-                [ns, name] = name.split(':');
+            if (fn.indexOf(':') >= 0) {
+                [ns, name] = fn.split(':');
             }
 
-            result[name] = {value, ns};
+            result[name] = {value, ns, fn};
         }
         name = '';
         value = '';

+ 143 - 0
server/core/xmlParser.js

@@ -0,0 +1,143 @@
+const sax = require('./sax');
+
+function formatXml(xmlParsed, encoding = 'utf-8', textFilterFunc) { /*
+     * Builds a complete XML document string: an XML declaration followed by
+     * the serialized node tree.
+     *   xmlParsed      - node tree, handed unchanged to formatXmlNode
+     *   encoding       - text written into the encoding attribute of the
+     *                    declaration; the output string itself is not re-encoded here
+     *   textFilterFunc - optional text transform, forwarded to formatXmlNode
+     * Returns the resulting string.
+     */
+    let out = `<?xml version="1.0" encoding="${encoding}"?>`;
+    out += formatXmlNode(xmlParsed, textFilterFunc);
+    return out;
+}
+
+function formatXmlNode(node, textFilterFunc) { /*
+     * Serializes a node tree into XML markup (without an XML declaration)
+     * and returns it as a string.
+     * Node fields read below: _n is the tag name, _attrs the attribute map,
+     * _t the text, _a the child list. Any key that does not start with an
+     * underscore is treated as a child whose tag name is the key itself.
+     * Every text value is passed through textFilterFunc. Attribute values
+     * are written as they are, with no escaping.
+     */
+    textFilterFunc = (textFilterFunc ? textFilterFunc : text => text); // no filter supplied: text is left unchanged
+
+    const formatNode = (node, name) => { // recursive worker; name is the tag used when the node has no _n of its own
+        let out = '';
+
+        if (Array.isArray(node)) { // list: concatenate the items, each serialized without an inherited name
+            for (const n of node) {
+                out += formatNode(n);
+            }
+        } else if (typeof node == 'string') { // plain string: wrapped in the named tag when a name was passed, bare text otherwise
+            if (name)
+                out += `<${name}>${textFilterFunc(node)}</${name}>`;
+            else
+                out += textFilterFunc(node);
+        } else { // everything else is handled as an object node (null or undefined is not guarded against)
+            if (node._n) // an explicit _n takes precedence over the name passed in
+                name = node._n;
+
+            let attrs = '';
+            if (node._attrs) {
+                for (let attrName in node._attrs) { // values are inserted verbatim, quotes and ampersands are not escaped
+                    attrs += ` ${attrName}="${node._attrs[attrName]}"`;
+                }
+            }
+
+            let tOpen = '';
+            let tBody = '';
+            let tClose = '';
+            if (name) // a node without any name produces only its body, with no surrounding tag
+                tOpen += `<${name}${attrs}>`;
+            if (node.hasOwnProperty('_t')) // own text is written before any children
+                tBody += textFilterFunc(node._t);
+
+            for (let nodeName in node) {
+                if (nodeName && nodeName[0] == '_' && nodeName != '_a') // skip service keys (_n, _t, _attrs, _p and the like) but still descend into _a
+                    continue;
+
+                const n = node[nodeName];
+                tBody += formatNode(n, nodeName); // for an array value such as _a the key is dropped by the array branch above
+            }
+            
+            if (name)
+                tClose += `</${name}>`;
+
+            out += `${tOpen}${tBody}${tClose}`;
+        }
+        return out;
+    }
+
+    return formatNode(node); // the root is serialized with no inherited name
+}
+
+function parseXml(xmlString, lowerCase = true) { /*
+     * Parses an XML string into a tree of plain objects by driving
+     * sax.parseSync with the three callbacks defined below.
+     * Each element becomes a node with _n (tag name) and _p (parent node),
+     * plus _attrs (attribute map), _t (text) and _a (child nodes) when present.
+     *   xmlString - XML source text
+     *   lowerCase - passed on to sax.parseSync and sax.getAttrsSync
+     * Returns the first top-level element node, or an empty object when no
+     * element was seen. Because of the _p back references the tree is
+     * cyclic, so it cannot be passed to JSON.stringify directly.
+     */
+    let result = {}; // synthetic container that collects top-level elements in its _a
+    let node = result; // element currently open while parsing
+
+    const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+        node._t = text; // plain assignment: a later text chunk of the same element replaces an earlier one
+    };
+
+    const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+        if (tag == '?xml') // the XML declaration does not become a node
+            return;
+        
+        const newNode = {_n: tag, _p: node};
+
+        if (tail) {
+            const parsedAttrs = sax.getAttrsSync(tail, lowerCase);
+            const atKeys = Object.keys(parsedAttrs);
+            if (atKeys.length) {
+                const attrs = {};
+                for (let i = 0; i < atKeys.length; i++) {
+                    const attrName = atKeys[i];
+                    attrs[parsedAttrs[attrName].fn] = parsedAttrs[attrName].value; // keyed by the fn field of the parsed attribute, keeping only its value
+                }
+
+                newNode._attrs = attrs;
+            }
+        }
+
+        if (!node._a) // the child list is created lazily on the first child
+            node._a = [];
+        node._a.push(newNode);
+        node = newNode; // descend: following text and elements attach to the new node
+    };
+
+    const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+        if (node._p && node._n == tag) // climb only when the closing tag matches the open node; other closing tags are ignored
+            node = node._p;
+    };
+
+    sax.parseSync(xmlString, {
+        onStartNode, onEndNode, onTextNode, lowerCase
+    });
+
+    if (result._a) // unwrap the container; only the first top-level element is returned
+        result = result._a[0];
+    
+    return result;
+}
+
+function simplifyXmlParsed(node) { /*
+     * Converts a node tree (fields _n, _a, _t, _attrs as read below) into
+     * nested plain objects keyed by tag name, so values can be reached by
+     * path, for example metadata.title._t.
+     * At every level only the first child with a given tag name is kept.
+     * Returns an object whose single key is the tag name of the given node
+     * (or an empty object when the node has no _n).
+     */
+    
+    const simplifyNodeArray = (a) => { // turns a list of sibling nodes into one object keyed by tag name
+        const result = {};
+
+        for (let i = 0; i < a.length; i++) {
+            const child = a[i];
+            if (child._n && !result[child._n]) { // later siblings with an already seen name are dropped
+                result[child._n] = {};
+                if (child._a) { // children present: the placeholder is replaced by the simplified subtree
+                    result[child._n] = simplifyNodeArray(child._a);
+                }
+                if (child._t) { // empty text is not copied
+                    result[child._n]._t = child._t;
+                }
+                if (child._attrs) { // the attribute map is shared by reference, not cloned
+                    result[child._n]._attrs = child._attrs;
+                }
+            }
+        }
+
+        return result;
+    };
+
+    return simplifyNodeArray([node]); // wrap the root so it is processed like any other sibling list
+}
+
+module.exports = { // the four functions this module makes available through require
+    formatXml,
+    formatXmlNode,
+    parseXml,
+    simplifyXmlParsed
+}