Эх сурвалжийг харах

Рефакторинг с улучшениями

Book Pauk 6 жил өмнө
parent
commit
1001051422

+ 65 - 116
server/core/BookConverter/index.js

@@ -3,6 +3,7 @@ const URL = require('url').URL;
 const iconv = require('iconv-lite');
 const chardet = require('chardet');
 const _ = require('lodash');
+const sax = require('./sax');
 
 const FileDetector = require('../FileDetector');
 
@@ -54,60 +55,6 @@ class BookConverter {
         return iconv.decode(data, selected);
     }
 
-    parseHtml(buf, onNode, onText, innerCut) {
-        if (!onNode)
-            onNode = () => {};
-        if (!onText)
-            onText = () => {};
-        if (!innerCut)
-            innerCut = new Set();
-
-        buf = buf.replace(/ /g, ' '); 
-
-        let i = 0;
-        const len = buf.length;
-        let cutCounter = 0;
-        let cutTag = '';
-        while (i < len) {
-            let left = buf.indexOf('<', i);
-            if (left < 0)
-                break;
-            let right = buf.indexOf('>', left + 1);
-            if (right < 0)
-                break;
-
-            let tag = buf.substr(left + 1, right - left - 1).trim().toLowerCase();
-            let tail = '';
-            const firstSpace = tag.indexOf(' ');
-            if (firstSpace >= 0) {
-                tail = tag.substr(firstSpace);
-                tag = tag.substr(0, firstSpace);
-            }
-
-            const text = buf.substr(i, left - i);
-
-            onText(text, cutCounter, cutTag);
-            onNode(tag, tail, cutCounter, cutTag);
-
-            if (innerCut.has(tag) && (!cutCounter || cutTag == tag)) {
-                if (!cutCounter)
-                    cutTag = tag;
-                cutCounter++;
-            }
-
-            if (tag != '' && tag.charAt(0) == '/' && cutTag == tag.substr(1)) {
-                cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
-                if (!cutCounter)
-                    cutTag = '';
-            }
-
-            i = right + 1;
-        }
-
-        if (i < len)
-            onText(buf.substr(i, len - i), cutCounter, cutTag);
-    }
-
     convertHtml(data, isText) {
         let titleInfo = {};
         let desc = {_n: 'description', 'title-info': titleInfo};
@@ -267,75 +214,75 @@ class BookConverter {
 
         newParagraph();
 
-        const onNode = (elemName, tail, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+        const onStartNode = (elemName, tail, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             if (elemName == '')
                 return;
-            if (elemName[0] == '!') {//comment
-                const text = elemName + tail;
-                if (text == '!----------- собственно произведение ---------------')
-                    inText = true;
-                if (text == '!---------------------------------------------------')
-                    inText = false;
-            } else if (elemName[0] != '/') {//open tag
-                if (!inText) {
-                    path += '/' + elemName;
-                    tag = elemName;
-                } else {
-                    if (!inSubtitle && (elemName == 'p' || elemName == 'dd')) {
-                        newParagraph();
-                    }
+            if (!inText) {
+                path += '/' + elemName;
+                tag = elemName;
+            } else {
+                if (!inSubtitle && (elemName == 'p' || elemName == 'dd')) {
+                    newParagraph();
+                }
 
-                    switch (elemName) {
-                        case 'i':
-                            openTag('emphasis');
-                            break;
-                        case 'b':
-                            openTag('strong');
-                            break;
-                        case 'div':
-                            if (tail.indexOf('align="center"') >= 0) {
-                                openTag('subtitle');
-                                inSubtitle = true;
-                            }
-                            break;
-                    }
+                switch (elemName) {
+                    case 'i':
+                        openTag('emphasis');
+                        break;
+                    case 'b':
+                        openTag('strong');
+                        break;
+                    case 'div':
+                        if (tail.indexOf('align="center"') >= 0) {
+                            openTag('subtitle');
+                            inSubtitle = true;
+                        }
+                        break;
                 }
-            } else if (elemName[0] == '/') {//close tag
-                elemName = elemName.substr(1);
-                if (!inText) {
-                    const oldPath = path;
-                    let t = '';
-                    do  {
-                        let i = path.lastIndexOf('/');
-                        t = path.substr(i + 1);
-                        path = path.substr(0, i);
-                    } while (t != elemName && path);
-
-                    if (t != elemName) {
-                        path = oldPath;
-                    }
+            }
+        };
 
+        const onEndNode = (elemName, tail, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            if (!inText) {
+                const oldPath = path;
+                let t = '';
+                do  {
                     let i = path.lastIndexOf('/');
-                    tag = path.substr(i + 1);
-                } else {
-                    switch (elemName) {
-                        case 'i':
-                            closeTag('emphasis');
-                            break;
-                        case 'b':
-                            closeTag('strong');
-                            break;
-                        case 'div':
-                            if (inSubtitle)
-                                closeTag('subtitle');
-                            inSubtitle = false;
-                            break;
-                    }
+                    t = path.substr(i + 1);
+                    path = path.substr(0, i);
+                } while (t != elemName && path);
+
+                if (t != elemName) {
+                    path = oldPath;
+                }
+
+                let i = path.lastIndexOf('/');
+                tag = path.substr(i + 1);
+            } else {
+                switch (elemName) {
+                    case 'i':
+                        closeTag('emphasis');
+                        break;
+                    case 'b':
+                        closeTag('strong');
+                        break;
+                    case 'div':
+                        if (inSubtitle)
+                            closeTag('subtitle');
+                        inSubtitle = false;
+                        break;
                 }
             }
         };
 
-        const onText = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+        const onComment = (text, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            if (text == '--------- Собственно произведение -------------')
+                inText = true;
+            if (text == '-----------------------------------------------')
+                inText = false;
+        };
+
+        const onTextNode = (text, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
             if (text != ' ' && text.trim() == '')
                 text = text.trim();
 
@@ -363,8 +310,10 @@ class BookConverter {
                 growParagraph(text);
         };
 
-        this.parseHtml(this.decode(data).toString(), 
-            onNode, onText, new Set(['head', 'script', 'style']));
+        sax.parse(this.decode(data).toString(), {
+            onStartNode, onEndNode, onTextNode, onComment,
+            innerCut: new Set(['head', 'script', 'style'])
+        });
 
         const title = (titleInfo['book-title'] ? titleInfo['book-title'] : '');
         let author = '';
@@ -395,7 +344,7 @@ class BookConverter {
     formatFb2Node(node, name) {
         let out = '';
 
-        const repl = (text) => text.replace(/[\t\n\r]/g, ' ');
+        const repl = (text) => text.replace(/&nbsp;|[\t\n\r]/g, ' ');
 
         if (Array.isArray(node)) {
             for (const n of node) {

+ 130 - 0
server/core/BookConverter/sax.js

@@ -0,0 +1,130 @@
+function parse(xstr, options) {
+    let {onStartNode, onEndNode, onTextNode, onCdata, onComment, innerCut} = options;
+
+    if (!onStartNode)
+        onStartNode = () => {};
+    if (!onEndNode)
+        onEndNode = () => {};
+    if (!onTextNode)
+        onTextNode = () => {};
+    if (!onCdata)
+        onCdata = () => {};
+    if (!onComment)
+        onComment = () => {};
+
+    if (!innerCut)
+        innerCut = new Set();
+
+    let i = 0;
+    const len = xstr.length;
+    let cutCounter = 0;
+    let cutTag = '';
+    let inCdata;
+    let inComment;
+    while (i < len) {
+        inCdata = false;
+        inComment = false;
+
+        let left = xstr.indexOf('<', i);
+        if (left < 0)
+            break;
+        let leftData = left;
+
+        if (left < len - 2 && xstr[left + 1] == '!') {
+            if (xstr[left + 2] == '-') {
+                const leftComment = xstr.indexOf('<!--', left);
+                if (leftComment == left) {
+                    inComment = true;
+                    leftData = left + 3;
+                }
+            }
+
+            if (!inComment && xstr[left + 2] == '[') {
+                const leftCdata = xstr.indexOf('<![CDATA[', left);
+                if (leftCdata == left) {
+                    inCdata = true;
+                    leftData = left + 8;
+                }
+            }
+        }
+
+        let right = null;
+        let rightData = null;
+        if (inCdata) {
+            rightData = xstr.indexOf(']]>', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData + 2;
+        } else if (inComment) {
+            rightData = xstr.indexOf('-->', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData + 2;
+        } else {
+            rightData = xstr.indexOf('>', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData;
+        }
+
+        let tagData = xstr.substr(leftData + 1, rightData - leftData - 1);
+
+        if (inCdata) {
+            onCdata(tagData, left, cutCounter, cutTag);
+        } else if (inComment) {
+            onComment(tagData, left, cutCounter, cutTag);
+        } else {
+            let tag = '';
+            let tail = '';
+            const firstSpace = tagData.indexOf(' ');
+            if (firstSpace >= 0) {
+                tail = tagData.substr(firstSpace);
+                tag = tagData.substr(0, firstSpace);
+            } else {
+                tag = tagData;
+            }
+            tag = tag.toLowerCase();
+
+            const text = xstr.substr(i, left - i);
+
+            onTextNode(text, left, cutCounter, cutTag);
+
+            let endTag = '';
+            if (tag === '' || tag[0] !== '/') {
+                onStartNode(tag, tail, left, cutCounter, cutTag);
+            } else {
+                endTag = tag.substr(1);
+                onEndNode(endTag, tail, left, cutCounter, cutTag);
+            }
+
+            if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
+                if (!cutCounter)
+                    cutTag = tag;
+                cutCounter++;
+            }
+
+            if (cutTag === endTag) {
+                cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
+                if (!cutCounter)
+                    cutTag = '';
+            }
+        }
+
+        i = right + 1;
+    }
+
+    if (i < len) {
+        if (inCdata) {
+            onCdata(xstr.substr(i, len - i), len - 1, cutCounter, cutTag);
+        } else if (inComment) {
+            onComment(xstr.substr(i, len - i), len - 1, cutCounter, cutTag);
+        } else {
+            onTextNode(xstr.substr(i, len - i), len - 1, cutCounter, cutTag);
+        }
+    }
+}
+
+
+module.exports = {
+    parse
+}