Prechádzať zdrojové kódy

Работа над конвертером Pdf

Book Pauk 4 rokov pred
rodič
commit
ef0d6eab89

+ 19 - 3
server/core/Reader/BookConverter/ConvertHtml.js

@@ -45,6 +45,7 @@ class ConvertHtml extends ConvertBase {
         let title = '';
         let author = '';
         let inTitle = false;
+        let inSectionTitle = false;
         let inAuthor = false;
         let inSubTitle = false;
         let inImage = false;
@@ -63,7 +64,7 @@ class ConvertHtml extends ConvertBase {
         };
 
         const growParagraph = (text) => {
-            if (!pars.length)
+            if (!pars.length || pars[pars.length - 1]._n != 'p')
                 newParagraph();
 
             const l = pars.length;
@@ -95,7 +96,7 @@ class ConvertHtml extends ConvertBase {
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             text = this.escapeEntities(text);
 
-            if (!(cutCounter || inTitle) || inSubTitle) {
+            if (!(cutCounter || inTitle || inSectionTitle || inSubTitle)) {
                 let tOpen = '';
                 tOpen += (inSubTitle ? '<subtitle>' : '');
                 tOpen += (bold ? '<strong>' : '');
@@ -114,12 +115,19 @@ class ConvertHtml extends ConvertBase {
             if (inAuthor && !author)
                 author = text;
 
+            if (inSectionTitle) {
+                pars.unshift({_n: 'title', _t: text});
+            }
+
+            if (inSubTitle) {
+                pars.push({_n: 'subtitle', _t: text});
+            }
+
             if (inImage) {
                 image._t = text;
                 binary.push(image);
 
                 pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
-                newParagraph();
             }
 
         };
@@ -154,6 +162,10 @@ class ConvertHtml extends ConvertBase {
                 inAuthor = true;
             }
 
+            if (tag == 'fb2-section-title') {
+                inSectionTitle = true;
+            }
+
             if (tag == 'fb2-subtitle') {
                 inSubTitle = true;
             }
@@ -194,6 +206,10 @@ class ConvertHtml extends ConvertBase {
                 inAuthor = false;
             }
 
+            if (tag == 'fb2-section-title') {
+                inSectionTitle = false;
+            }
+
             if (tag == 'fb2-subtitle')
                 inSubTitle = false;
 

+ 46 - 3
server/core/Reader/BookConverter/ConvertPdf.js

@@ -50,7 +50,9 @@ class ConvertPdf extends ConvertHtml {
         let lines = [];
         let pagelines = [];
         let line = {text: ''};
+        let page = {};
         let fonts = {};
+        let sectionTitleFound = false;
 
         let images = [];
         let loading = [];
@@ -95,11 +97,26 @@ class ConvertPdf extends ConvertHtml {
                 //добавим закрывающий тег стиля
                 line.text += line.tClose;
 
+                //проверим, возможно это заголовок
+                if (line.fonts.length == 1 && line.pageWidth) {
+                    const f = fonts[line.fonts[0]];
+                    const centerLeft = (line.pageWidth - line.width)/2;
+                    if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
+                        if (!sectionTitleFound) {
+                            line.isSectionTitle = true;
+                            sectionTitleFound = true;
+                        } else {
+                            line.isSubtitle = true;
+                        }
+                    }
+                }
+
+                //объединяем
                 if (Math.abs(pt - line.top) > 3) {
                     j++;
                     pl[j] = line;
                 } else {
-                    pl[j].text += line.text;
+                    pl[j].text += ` ${line.text}`;
                 }
                 pt = line.top;
             });
@@ -111,6 +128,7 @@ class ConvertPdf extends ConvertHtml {
                 lines[i] = line;
             });
             pagelines = [];
+            prevTop = 0;
         };
 
         const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
@@ -122,19 +140,26 @@ class ConvertPdf extends ConvertHtml {
                 if (fontId && fontStyle) {
                     const styles = fontStyle.split(' ');
                     const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
-                    const f = fonts[fontId] = {tOpen: '', tClose: ''};
+                    const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false};
 
                     styles.forEach(style => {
                         const s = styleTags[style];
                         if (s) {
                             f.tOpen += `<${s}>`;
                             f.tClose = `</${s}>${f.tClose}`;
+                            if (s == 'b')
+                                f.isBold = true;
                         }
                     });
                 }
             }
 
             if (tag == 'page') {
+                const attrs = sax.getAttrsSync(tail);
+                page = {
+                    width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
+                };
+
                 putPageLines();
                 putImage(100000);
             }
@@ -149,13 +174,17 @@ class ConvertPdf extends ConvertHtml {
                     height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
                     tOpen: '',
                     tClose: '',
+                    isSectionTitle: false,
+                    isSubtitle: false,
+                    pageWidth: page.width,
+                    fonts: [],
                 };
 
                 if (line.width != 0 || line.height != 0) {
                     if (Math.abs(prevTop - line.top) > 3) {
                         putImage(line.top);
-                        pagelines.push(line);
                     }
+                    pagelines.push(line);
                     prevTop = line.top;
                 }
             }
@@ -170,6 +199,8 @@ class ConvertPdf extends ConvertHtml {
                     if (fontId && fonts[fontId]) {
                         tOpen = fonts[fontId].tOpen;
                         tClose = fonts[fontId].tClose;
+                        if (!line.fonts.length || line.fonts[0] != fontId)
+                            line.fonts.push(fontId);
                     }
 
                     if (line.tOpen != tOpen) {
@@ -252,6 +283,7 @@ class ConvertPdf extends ConvertHtml {
         if (!title && uploadFileName)
             title = uploadFileName;
 
+        //console.log(JSON.stringify(lines, null, 2));
         //формируем текст
         const limitSize = 2*this.config.maxUploadFileSize;
         let text = '';
@@ -270,6 +302,16 @@ class ConvertPdf extends ConvertHtml {
                 continue;
             }
 
+            if (line.isSectionTitle) {
+                text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
+                continue;
+            }
+
+            if (line.isSubtitle) {
+                text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
+                continue;
+            }
+
             if (concat == '') {
                 const left = line.left || 0;
                 sp = ' '.repeat(indents[left]);
@@ -287,6 +329,7 @@ class ConvertPdf extends ConvertHtml {
         if (concat)
             text += sp + concat + "\n";
 
+        //console.log(text);
         await utils.sleep(100);
         return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
     }