Prechádzať zdrojové kódy

Улучшение парсинга pdf и html

Book Pauk pred 6 rokmi
rodič
commit
b3e579d8b7

+ 1 - 1
server/core/BookConverter/ConvertHtml.js

@@ -76,7 +76,7 @@ class ConvertHtml extends ConvertBase {
             }
             }
         };
         };
 
 
-        const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
+        const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
 
 
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             if (!cutCounter && !(cutTitle && inTitle)) {
             if (!cutCounter && !(cutTitle && inTitle)) {

+ 30 - 1
server/core/BookConverter/ConvertPdf.js

@@ -38,6 +38,8 @@ class ConvertPdf extends ConvertHtml {
         let images = [];
         let images = [];
         let loading = [];
         let loading = [];
         let inText = false;
         let inText = false;
+        let bold = false;
+        let italic = false;
         let title = '';
         let title = '';
         let prevTop = 0;
         let prevTop = 0;
         let i = -1;
         let i = -1;
@@ -68,7 +70,12 @@ class ConvertPdf extends ConvertHtml {
 
 
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             if (!cutCounter && inText) {
             if (!cutCounter && inText) {
-                lines[i].text += text + ' ';
+                let tOpen = (bold ? '<b>' : '');
+                tOpen += (italic ? '<i>' : '');
+                let tClose = (italic ? '</i>' : '');
+                tClose += (bold ? '</b>' : '');
+
+                lines[i].text += `${tOpen}${text}${tClose} `;
                 if (i < 2)
                 if (i < 2)
                     title += text + ' ';
                     title += text + ' ';
             }
             }
@@ -76,6 +83,17 @@ class ConvertPdf extends ConvertHtml {
 
 
         const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
         const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             if (!cutCounter) {
             if (!cutCounter) {
+                if (inText) {
+                    switch (tag) {
+                        case 'i':
+                            italic = true;
+                            break;
+                        case 'b':
+                            bold = true;
+                            break;
+                    }
+                }
+
                 if (tag == 'text' && !inText) {
                 if (tag == 'text' && !inText) {
                     let attrs = sax.getAttrsSync(tail);
                     let attrs = sax.getAttrsSync(tail);
                     const line = {
                     const line = {
@@ -121,6 +139,17 @@ class ConvertPdf extends ConvertHtml {
         };
         };
 
 
         const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
         const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            if (inText) {
+                switch (tag) {
+                    case 'i':
+                        italic = false;
+                        break;
+                    case 'b':
+                        bold = false;
+                        break;
+                }
+            }
+
             if (tag == 'text')
             if (tag == 'text')
                 inText = false;
                 inText = false;
         };
         };