Forráskód Böngészése

Работа над конвертером pdf

Book Pauk 4 éve
szülő
commit
9906dd43c7
1 módosított fájl, 93 hozzáadás és 80 törlés
  1. 93 80
      server/core/Reader/BookConverter/ConvertPdf.js

+ 93 - 80
server/core/Reader/BookConverter/ConvertPdf.js

@@ -55,6 +55,10 @@ class ConvertPdf extends ConvertHtml {
         let images = [];
         let loading = [];
 
+        let inText = false;
+        let bold = false;
+        let italic = false;
+
         let i = -1;
 
         const loadImage = async(image) => {
@@ -81,22 +85,30 @@ class ConvertPdf extends ConvertHtml {
             }
         };
 
+        const isTextBold = (text) => {
+            const m = text.trim().match(/^<b>(.*)<\/b>$/);
+            return m && !m[1].match(/<b>|<\/b>|<i>|<\/i>/g);
+        };
+
+        const isTextEmpty = (text) => {
+            return text.replace(/<b>|<\/b>|<i>|<\/i>/g, '').trim() == '';
+        };
+
         const putPageLines = () => {
-            pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left))
+            pagelines.sort((a, b) => (Math.abs(a.top - b.top) > 3 ? a.top - b.top : 0)*10000 + (a.left - b.left))
             
             //объединяем в одну строку равные по высоте
             const pl = [];
             let pt = 0;
             let j = -1;
             pagelines.forEach(line => {
-                //добавим закрывающий тег стиля
-                line.text += line.tClose;
+                if (isTextEmpty(line.text))
+                    return;
 
                 //проверим, возможно это заголовок
-                if (line.fonts.length == 1 && line.pageWidth) {
-                    const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
+                if (line.fontId && line.pageWidth) {
                     const centerLeft = (line.pageWidth - line.width)/2;
-                    if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
+                    if (isTextBold(line.text) && Math.abs(centerLeft - line.left) < 10) {
                         if (!sectionTitleFound) {
                             line.isSectionTitle = true;
                             sectionTitleFound = true;
@@ -124,8 +136,8 @@ class ConvertPdf extends ConvertHtml {
                 //добавим пустую строку, если надо
                 const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
                 if (prevLine && !prevLine.isImage) {
-                    const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null));
-                    if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) {
+                    const f = (prevLine.fontId ? fonts[prevLine.fontId] : (line.fontId ? fonts[line.fontId] : null));
+                    if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize * 1.8) {
                         i++;
                         lines[i] = {text: '<br>'};
                     }
@@ -138,29 +150,26 @@ class ConvertPdf extends ConvertHtml {
             putImage(100000);
         };
 
-        const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
-            if (tag == 'textstyle') {
-                const attrs = sax.getAttrsSync(tail);
-                const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
-                const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
-                const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : '');
+        const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            if (!cutCounter && inText) {
+                let tOpen = (bold ? '<b>' : '');
+                tOpen += (italic ? '<i>' : '');
+                let tClose = (italic ? '</i>' : '');
+                tClose += (bold ? '</b>' : '');
 
-                if (fontId) {
-                    const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
-                    const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
-
-                    if (fontStyle) {
-                        const styles = fontStyle.split(' ');
-                        styles.forEach(style => {
-                            const s = styleTags[style];
-                            if (s) {
-                                f.tOpen += `<${s}>`;
-                                f.tClose = `</${s}>${f.tClose}`;
-                                if (s == 'b')
-                                    f.isBold = true;
-                            }
-                        });
-                    }
+                line.text += ` ${tOpen}${text}${tClose}`;
+            }
+        };
+
+        const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            if (inText) {
+                switch (tag) {
+                    case 'i':
+                        italic = true;
+                        break;
+                    case 'b':
+                        bold = true;
+                        break;
                 }
             }
 
@@ -173,80 +182,78 @@ class ConvertPdf extends ConvertHtml {
                 putPageLines();
             }
 
-            if (tag == 'textline') {
+            if (tag == 'fontspec') {
+                const attrs = sax.getAttrsSync(tail);
+                const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
+                const fontSize = (attrs.size && attrs.size.value ? attrs.size.value : '');
+
+                if (fontId) {
+                    fonts[fontId] = {fontSize};
+
+                }
+            }
+
+            if (tag == 'text' && !inText) {
                 const attrs = sax.getAttrsSync(tail);
                 line = {
                     text: '',
-                    top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
-                    left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
+                    top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
+                    left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
                     width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
                     height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
-                    tOpen: '',
-                    tClose: '',
                     isSectionTitle: false,
                     isSubtitle: false,
                     pageWidth: page.width,
-                    fonts: [],
+                    fontId: (attrs.font && attrs.font.value ? attrs.font.value : ''),
                 };
 
                 if (line.width != 0 || line.height != 0) {
+                    inText = true;
                     pagelines.push(line);
                 }
             }
 
-            if (tag == 'string') {
+            if (tag == 'image') {
                 const attrs = sax.getAttrsSync(tail);
-                if (attrs.content && attrs.content.value) {
-
-                    let tOpen = '';
-                    let tClose = '';
-                    const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
-                    if (fontId && fonts[fontId]) {
-                        tOpen = fonts[fontId].tOpen;
-                        tClose = fonts[fontId].tClose;
-                        if (!line.fonts.length || line.fonts[0] != fontId)
-                            line.fonts.push(fontId);
-                    }
-
-                    if (line.tOpen != tOpen) {
-                        line.text += line.tClose + tOpen;
-                        line.tOpen = tOpen;
-                        line.tClose = tClose;
-                    }
-
-                    line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
+                let src = (attrs.src && attrs.src.value ? attrs.src.value : '');
+                if (src) {
+                    const image = {
+                        isImage: true,
+                        src,
+                        data: '',
+                        type: '',
+                        top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
+                        left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10) || 0,
+                        width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
+                        height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
+                    };
+
+                    loading.push(loadImage(image));
+                    images.push(image);
+                    images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
                 }
             }
+        };
 
-            if (tag == 'illustration') {
-                const attrs = sax.getAttrsSync(tail);
-                if (attrs.type && attrs.type.value == 'image') {
-                    let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
-                    if (src) {
-                        const image = {
-                            isImage: true,
-                            src,
-                            data: '',
-                            type: '',
-                            top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
-                            left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
-                            width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
-                            height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
-                        };
-                        const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
-                        if (!exists.length) {
-                            loading.push(loadImage(image));
-                            images.push(image);
-                            images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
-                        }
-                    }
+        const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            if (inText) {
+                switch (tag) {
+                    case 'i':
+                        italic = false;
+                        break;
+                    case 'b':
+                        bold = false;
+                        break;
                 }
             }
+
+            if (tag == 'text')
+                inText = false;
         };
 
         let buf = this.decode(data).toString();
         sax.parseSync(buf, {
-            onStartNode
+            onStartNode, onEndNode, onTextNode
         });
 
         putPageLines();
@@ -290,6 +297,7 @@ class ConvertPdf extends ConvertHtml {
 
         let concat = '';
         let sp = '';
+        let firstLine = true;
         for (const line of lines) {
             if (text.length > limitSize) {
                 throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
@@ -301,10 +309,15 @@ class ConvertPdf extends ConvertHtml {
             }
 
             if (line.isSectionTitle) {
-                text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
+                if (firstLine)
+                    text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
+                else
+                    text += `<fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
                 continue;
             }
 
+            firstLine = false;
+
             if (line.isSubtitle) {
                 text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
                 continue;