Преглед изворног кода

Улучшение парсинга html

Book Pauk пре 6 година
родитељ
комит
983d9ee1b9

+ 5 - 0
server/core/BookConverter/ConvertBase.js

@@ -1,6 +1,7 @@
 const fs = require('fs-extra');
 const iconv = require('iconv-lite');
 const chardet = require('chardet');
+const he = require('he');
 
 const textUtils = require('./textUtils');
 const utils = require('../utils');
@@ -80,6 +81,10 @@ class ConvertBase {
         return text.replace(/ |[\t\n\r]/g, ' ');
     }
 
+    escapeEntities(text) {
+        return he.escape(text);
+    }
+
     formatFb2(fb2) {
         let out = '<?xml version="1.0" encoding="utf-8"?>';
         out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';

+ 2 - 0
server/core/BookConverter/ConvertHtml.js

@@ -79,6 +79,8 @@ class ConvertHtml extends ConvertBase {
         const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
 
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            text = this.escapeEntities(text);
+
             if (!cutCounter && !(cutTitle && inTitle)) {
                 let tOpen = (bold ? '<strong>' : '');
                 tOpen += (italic ? '<emphasis>' : '');

+ 2 - 0
server/core/BookConverter/ConvertSamlib.js

@@ -218,6 +218,8 @@ class ConvertSamlib extends ConvertBase {
             if (!text)
                 return;
 
+            text = this.escapeEntities(text);
+
             switch (path) {
                 case '/html/body/center/h2':
                     titleInfo['book-title'] = text;