Просмотр исходного кода

Улучшение парсинга Pdf

Book Pauk 6 лет назад
Родитель
Commit
2f8b68ec62
2 изменённых файла: 30 добавлений и 4 удаления
  1. 28 2
      server/core/BookConverter/ConvertHtml.js
  2. 2 2
      server/core/BookConverter/ConvertPdf.js

+ 28 - 2
server/core/BookConverter/ConvertHtml.js

@@ -34,10 +34,13 @@ class ConvertHtml extends ConvertBase {
         let desc = {_n: 'description', 'title-info': titleInfo};
         let pars = [];
         let body = {_n: 'body', section: {_a: []}};
-        let fb2 = [desc, body];
+        let binary = [];
+        let fb2 = [desc, body, binary];
 
         let title = '';
         let inTitle = false;
+        let inImage = false;
+        let image = {};
 
         let spaceCounter = [];
 
@@ -80,6 +83,15 @@ class ConvertHtml extends ConvertBase {
 
             if (inTitle && !title)
                 title = text;
+
+            if (inImage) {
+                image._t = text;
+                binary.push(image);
+
+                pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
+                newParagraph();
+            }
+
         };
 
         const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
@@ -90,18 +102,27 @@ class ConvertHtml extends ConvertBase {
 
             if (tag == 'title')
                 inTitle = true;
+
+            if (tag == 'fb2-image') {
+                inImage = true;
+                const attrs = sax.getAttrsSync(tail);
+                image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
+            }
         };
 
         const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             if (tag == 'title')
                 inTitle = false;
+
+            if (tag == 'fb2-image')
+                inImage = false;
         };
 
         let buf = this.decode(data).toString();
 
         sax.parseSync(buf, {
             onStartNode, onEndNode, onTextNode,
-            innerCut: new Set(['head', 'script', 'style', 'binary'])
+            innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
         });
 
         titleInfo['book-title'] = title;
@@ -148,6 +169,11 @@ class ConvertHtml extends ConvertBase {
 
             i = 0;
             for (const par of pars) {
+                if (par._n != 'p') {
+                    newPars.push(par);
+                    continue;
+                }
+
                 if (i > 0)
                     newPar();
                 i++;

+ 2 - 2
server/core/BookConverter/ConvertPdf.js

@@ -86,7 +86,7 @@ class ConvertPdf extends ConvertHtml {
                         height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
                     };
 
-                    if (line.width !== '0' || line.height !== '0') {
+                    if (line.width != 0 || line.height != 0) {
                         inText = true;
                         if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
                             putImage(line.top);
@@ -98,7 +98,7 @@ class ConvertPdf extends ConvertHtml {
                 }
 
                 if (tag == 'image') {
-                    let attrs = sax.getAttrsSync(tail);
+                    const attrs = sax.getAttrsSync(tail);
                     const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
                     if (src) {
                         const image = {