Преглед на файлове

Улучшение парсинга html

Book Pauk преди 6 години
родител
ревизия
983d9ee1b9
променени са 3 файла, в които са добавени 9 реда и са изтрити 0 реда
  1. 5 0
      server/core/BookConverter/ConvertBase.js
  2. 2 0
      server/core/BookConverter/ConvertHtml.js
  3. 2 0
      server/core/BookConverter/ConvertSamlib.js

+ 5 - 0
server/core/BookConverter/ConvertBase.js

@@ -1,6 +1,7 @@
 const fs = require('fs-extra');
 const fs = require('fs-extra');
 const iconv = require('iconv-lite');
 const iconv = require('iconv-lite');
 const chardet = require('chardet');
 const chardet = require('chardet');
+const he = require('he');
 
 
 const textUtils = require('./textUtils');
 const textUtils = require('./textUtils');
 const utils = require('../utils');
 const utils = require('../utils');
@@ -80,6 +81,10 @@ class ConvertBase {
         return text.replace(/&nbsp;|[\t\n\r]/g, ' ');
         return text.replace(/&nbsp;|[\t\n\r]/g, ' ');
     }
     }
 
 
+    escapeEntities(text) {
+        return he.escape(text);
+    }
+
     formatFb2(fb2) {
     formatFb2(fb2) {
         let out = '<?xml version="1.0" encoding="utf-8"?>';
         let out = '<?xml version="1.0" encoding="utf-8"?>';
         out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';
         out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';

+ 2 - 0
server/core/BookConverter/ConvertHtml.js

@@ -79,6 +79,8 @@ class ConvertHtml extends ConvertBase {
         const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
         const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
 
 
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
         const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
+            text = this.escapeEntities(text);
+
             if (!cutCounter && !(cutTitle && inTitle)) {
             if (!cutCounter && !(cutTitle && inTitle)) {
                 let tOpen = (bold ? '<strong>' : '');
                 let tOpen = (bold ? '<strong>' : '');
                 tOpen += (italic ? '<emphasis>' : '');
                 tOpen += (italic ? '<emphasis>' : '');

+ 2 - 0
server/core/BookConverter/ConvertSamlib.js

@@ -218,6 +218,8 @@ class ConvertSamlib extends ConvertBase {
             if (!text)
             if (!text)
                 return;
                 return;
 
 
+            text = this.escapeEntities(text);
+
             switch (path) {
             switch (path) {
                 case '/html/body/center/h2':
                 case '/html/body/center/h2':
                     titleInfo['book-title'] = text;
                     titleInfo['book-title'] = text;