const ConvertBase = require('./ConvertBase');
const sax = require('./sax');
const textUtils = require('./textUtils');
/**
 * Converter that turns HTML/XML markup (or plain text) into an FB2 node tree
 * and hands that tree to the inherited `formatFb2`.
 *
 * Relies on methods that are not defined in this class and are expected to be
 * inherited: `decode`, `escapeEntities`, `repSpaces`, `formatFb2`.
 */
class ConvertHtml extends ConvertBase {
    /**
     * Decides whether this converter accepts the input.
     *
     * @param {*} data - raw input; only passed on to `textUtils.checkIfText`
     * @param {Object} opts - options; `opts.dataType.ext` is inspected when present
     * @returns {{isText: boolean}|false} `{isText: false}` for html/xml data types,
     *     `{isText: true}` when `textUtils.checkIfText(data)` is truthy, otherwise `false`
     */
    check(data, opts) {
        const {dataType} = opts;
        if (dataType && (dataType.ext === 'html' || dataType.ext === 'xml'))
            return {isText: false};

        // maybe it is plain text?
        if (textUtils.checkIfText(data)) {
            return {isText: true};
        }

        return false;
    }

    /**
     * Parses the input and builds the FB2 tree `[description, body, binary]`.
     *
     * @param {*} data - raw input, decoded with `this.decode(data).toString()`
     * @param {Object} opts - options read here:
     *     `skipCheck` (skip `check()` and take `isText` from `opts.isText`),
     *     `isText` (force plain-text paragraph splitting),
     *     `cutTitle` (do not copy text found inside `title` into the body),
     *     plus whatever `check()` reads
     * @returns {Promise<*|false>} `false` when `check()` rejects the input,
     *     otherwise the value returned by `this.formatFb2(fb2)`
     */
    async run(data, opts) {
        let isText = false;
        if (!opts.skipCheck) {
            const checkResult = this.check(data, opts);
            if (!checkResult)
                return false;

            isText = checkResult.isText;
        } else {
            isText = opts.isText;
        }
        const {cutTitle} = opts;

        const titleInfo = {};
        const desc = {_n: 'description', 'title-info': titleInfo};
        let pars = [];
        const body = {_n: 'body', section: {_a: []}};
        const binary = [];
        const fb2 = [desc, body, binary];

        let title = '';
        let inTitle = false;
        let inImage = false;
        let image = {};
        let bold = false;
        let italic = false;

        // spaceCounter[k] = how many non-blank text lines start with exactly k spaces
        const spaceCounter = [];

        const repCrLfTab = (text) => text.replace(/[\n\r]/g, '').replace(/\t/g, ' ');

        // number of leading space characters in a line
        const countIndent = (line) => {
            let n = 0;
            while (n < line.length && line[n] === ' ')
                n++;
            return n;
        };

        // Wraps text in the inline tags matching the current bold/italic state.
        // The tag names are the ones the clean-up pass at the end of run() recognises.
        const wrapInline = (text) => {
            const tOpen = (bold ? '<strong>' : '') + (italic ? '<emphasis>' : '');
            const tClose = (italic ? '</emphasis>' : '') + (bold ? '</strong>' : '');
            return `${tOpen}${text}${tClose}`;
        };

        const newParagraph = () => {
            pars.push({_n: 'p', _t: ''});
        };

        const growParagraph = (text) => {
            if (!pars.length)
                newParagraph();

            pars[pars.length - 1]._t += text;

            // count text indents so that paragraphs can be detected later
            for (const rawLine of text.split('\n')) {
                if (rawLine.trim() === '')
                    continue;

                const indent = countIndent(repCrLfTab(rawLine));
                spaceCounter[indent] = (spaceCounter[indent] || 0) + 1;
            }
        };

        // tags (opening, or closing when prefixed with '/') that start a new paragraph
        const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);

        const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
            text = this.escapeEntities(text);

            if (!cutCounter && !(cutTitle && inTitle)) {
                growParagraph(wrapInline(text));
            }

            // only the first text node inside <title> becomes the book title
            if (inTitle && !title)
                title = text;

            if (inImage) {
                // text inside fb2-image is stored as the binary payload and
                // an image node referencing it is placed into the body
                image._t = text;
                binary.push(image);

                pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
                newParagraph();
            }
        };

        const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
            if (!cutCounter) {
                if (newPara.has(tag))
                    newParagraph();

                switch (tag) {
                    case 'i':
                    case 'em':
                        italic = true;
                        break;
                    case 'b':
                    case 'strong':
                    case 'h1':
                    case 'h2':
                    case 'h3':
                        bold = true;
                        break;
                }
            }

            if (tag === 'title')
                inTitle = true;

            if (tag === 'fb2-image') {
                inImage = true;
                // NOTE(review): throws if the tag lacks `name` or `type` attributes;
                // presumably fb2-image is always produced with both - confirm.
                const attrs = sax.getAttrsSync(tail);
                image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
            }
        };

        const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
            if (!cutCounter) {
                if (newPara.has('/' + tag))
                    newParagraph();

                switch (tag) {
                    case 'i':
                    case 'em':
                        italic = false;
                        break;
                    case 'b':
                    case 'strong':
                    case 'h1':
                    case 'h2':
                    case 'h3':
                        bold = false;
                        break;
                }
            }

            if (tag === 'title')
                inTitle = false;

            if (tag === 'fb2-image')
                inImage = false;
        };

        const buf = this.decode(data).toString();

        sax.parseSync(buf, {
            onStartNode, onEndNode, onTextNode,
            innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
        });

        titleInfo['book-title'] = title;

        // suspected plain text: it has to be split into paragraphs
        if (isText || pars.length < buf.length/2000) {
            let total = 0;
            let count = 1;// starts at 1, so the mean below never divides by zero
            for (const raw of spaceCounter) {
                const sc = raw || 0;
                if (sc)
                    count++;
                total += sc;
            }

            // how many indent widths occur more often than the mean
            const mid = total/count;
            let d = 0;
            for (const raw of spaceCounter) {
                if ((raw || 0) > mid)
                    d++;
            }

            // if the spread is not too large, pick the paragraph indent:
            // the largest indent width used by at least 1/20 of all counted lines
            let parIndent = 0;
            if (d < 10 && spaceCounter.length) {
                const threshold = total/20;
                let k = spaceCounter.length - 1;
                while (k > 0 && (!spaceCounter[k] || spaceCounter[k] < threshold))
                    k--;
                parIndent = k;
            }

            const newPars = [];
            const newPar = () => {
                newPars.push({_n: 'p', _t: ''});
            };
            const growPar = (text) => {
                // never append text to a non-paragraph node (e.g. an image)
                if (!newPars.length || newPars[newPars.length - 1]._n !== 'p')
                    newPar();

                newPars[newPars.length - 1]._t += text;
            };

            for (const par of pars) {
                if (par._n !== 'p') {
                    newPars.push(par);
                    continue;
                }

                // every source paragraph starts its own output paragraph, so its
                // text can never leak into a preceding non-paragraph node
                newPar();

                let startSeen = false;
                for (const rawLine of par._t.split('\n')) {
                    const line = repCrLfTab(rawLine);

                    // a line indented at least parIndent starts a new paragraph,
                    // except the first such line of this source paragraph
                    if (countIndent(line) >= parIndent) {
                        if (startSeen)
                            newPar();
                        startSeen = true;
                    }
                    growPar(line.trim() + ' ');
                }
            }

            body.section._a[0] = newPars;
        } else {
            body.section._a[0] = pars;
        }

        // clean up and produce valid fb2: splitting into paragraphs can break
        // inline tags apart, so each paragraph is re-parsed and every text
        // fragment is re-wrapped with its own balanced tags
        bold = false;
        italic = false;
        pars = body.section._a[0];
        for (const par of pars) {
            if (par._n !== 'p')
                continue;

            par._t = this.repSpaces(par._t).trim();

            if (par._t.indexOf('<') >= 0) {
                const fragments = [];

                // bold/italic deliberately persist across paragraphs: a tag
                // opened in one paragraph may be closed in a later one
                sax.parseSync(par._t, {
                    onStartNode: (tag) => {
                        if (tag === 'strong')
                            bold = true;
                        if (tag === 'emphasis')
                            italic = true;
                    },
                    onEndNode: (tag) => {
                        if (tag === 'strong')
                            bold = false;
                        if (tag === 'emphasis')
                            italic = false;
                    },
                    onTextNode: (text) => {
                        fragments.push(wrapInline(text));
                    },
                });

                par._t = '';
                par._a = fragments;
            }
        }

        return this.formatFb2(fb2);
    }
}
// Expose the converter class as this module's export.
module.exports = ConvertHtml;