index.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. const fs = require('fs-extra');
  2. const URL = require('url').URL;
  3. const iconv = require('iconv-lite');
  4. const chardet = require('chardet');
  5. const _ = require('lodash');
  6. const FileDetector = require('../FileDetector');
  7. class BookConverter {
  8. constructor() {
  9. this.detector = new FileDetector();
  10. }
  11. async convertToFb2(inputFile, outputFile, url, callback) {
  12. const fileType = await this.detector.detectFile(inputFile);
  13. if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) {
  14. const data = await fs.readFile(inputFile);
  15. if (data.toString().indexOf('<FictionBook') >= 0) {
  16. await fs.writeFile(outputFile, data);
  17. return;
  18. }
  19. const parsedUrl = new URL(url);
  20. if (parsedUrl.hostname == 'samlib.ru' ||
  21. parsedUrl.hostname == 'budclub.ru') {
  22. await fs.writeFile(outputFile, await this.convertSamlib(data));
  23. return;
  24. }
  25. //Заглушка
  26. await fs.writeFile(outputFile, await this.convertHtml(data));
  27. callback(100);
  28. } else {
  29. if (fileType)
  30. throw new Error(`unknown file format: ${fileType.mime}`);
  31. else
  32. throw new Error(`unsupported file format: ${url}`);
  33. }
  34. }
  35. decode(data) {
  36. const charsetAll = chardet.detectAll(data.slice(0, 10000));
  37. let selected = 'ISO-8859-1';
  38. for (const charset of charsetAll) {
  39. if (charset.name.indexOf('ISO-8859') < 0) {
  40. selected = charset.name;
  41. break;
  42. }
  43. }
  44. return iconv.decode(data, selected);
  45. }
  46. parseHtml(buf, onNode, onText, innerCut) {
  47. if (!onNode)
  48. onNode = () => {};
  49. if (!onText)
  50. onText = () => {};
  51. if (!innerCut)
  52. innerCut = new Set();
  53. buf = buf.replace(/&nbsp;/g, ' ');
  54. let i = 0;
  55. const len = buf.length;
  56. let cutCounter = 0;
  57. let cutTag = '';
  58. while (i < len) {
  59. let left = buf.indexOf('<', i);
  60. if (left < 0)
  61. break;
  62. let right = buf.indexOf('>', left + 1);
  63. if (right < 0)
  64. break;
  65. let tag = buf.substr(left + 1, right - left - 1).trim().toLowerCase();
  66. let tail = '';
  67. const firstSpace = tag.indexOf(' ');
  68. if (firstSpace >= 0) {
  69. tail = tag.substr(firstSpace);
  70. tag = tag.substr(0, firstSpace);
  71. }
  72. const text = buf.substr(i, left - i);
  73. onText(text, cutCounter, cutTag);
  74. onNode(tag, tail, cutCounter, cutTag);
  75. if (innerCut.has(tag) && (!cutCounter || cutTag == tag)) {
  76. if (!cutCounter)
  77. cutTag = tag;
  78. cutCounter++;
  79. }
  80. if (tag != '' && tag.charAt(0) == '/' && cutTag == tag.substr(1)) {
  81. cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
  82. if (!cutCounter)
  83. cutTag = '';
  84. }
  85. i = right + 1;
  86. }
  87. if (i < len)
  88. onText(buf.substr(i, len - i), cutCounter, cutTag);
  89. }
  90. convertHtml(data, isText) {
  91. let titleInfo = {};
  92. let desc = {_n: 'description', 'title-info': titleInfo};
  93. let pars = [];
  94. let body = {_n: 'body', section: {_a: []}};
  95. let fb2 = [desc, body];
  96. let title = '';
  97. let inTitle = false;
  98. let spaceCounter = [];
  99. const newParagraph = () => {
  100. pars.push({_n: 'p', _t: ''});
  101. };
  102. const growParagraph = (text) => {
  103. const l = pars.length;
  104. if (l) {
  105. if (pars[l - 1]._t == '')
  106. text = text.trimLeft();
  107. pars[l - 1]._t += text;
  108. }
  109. //посчитаем отступы у текста, чтобы выделить потом параграфы
  110. const lines = text.split('\n');
  111. for (const line of lines) {
  112. const sp = line.split(' ');
  113. let l = 0;
  114. while (l < sp.length && sp[l].trim() == '') {
  115. l++;
  116. }
  117. if (!spaceCounter[l])
  118. spaceCounter[l] = 0;
  119. spaceCounter[l]++;
  120. }
  121. };
  122. newParagraph();
  123. const newPara = new Set(['tr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
  124. const onText = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  125. if (!cutCounter) {
  126. growParagraph(text);
  127. }
  128. if (inTitle && !title)
  129. title = text;
  130. };
  131. const onNode = (tag, tail, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  132. if (!cutCounter) {
  133. if (newPara.has(tag))
  134. newParagraph();
  135. }
  136. if (tag == 'title')
  137. inTitle = true;
  138. else if (tag == '/title')
  139. inTitle = false;
  140. };
  141. let buf = this.decode(data).toString();
  142. this.parseHtml(buf, onNode, onText, new Set(['head', 'script', 'style']));
  143. titleInfo['book-title'] = title;
  144. //подозрение на чистый текст, надо разбить на параграфы
  145. if ((isText || pars.length < buf.length/2000) && spaceCounter.length) {
  146. let total = 0;
  147. for (let i = 0; i < spaceCounter.length; i++) {
  148. total += (spaceCounter[i] ? spaceCounter[i] : 0);
  149. }
  150. total /= 10;
  151. let i = spaceCounter.length - 1;
  152. while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--;
  153. const parIndent = i;
  154. if (parIndent > 0) {//нашли отступ параграфа
  155. let newPars = [];
  156. const newPar = () => {
  157. newPars.push({_n: 'p', _t: ''});
  158. };
  159. const growPar = (text) => {
  160. const l = newPars.length;
  161. if (l) {
  162. newPars[l - 1]._t += text;
  163. }
  164. }
  165. for (const par of pars) {
  166. newPar();
  167. const lines = par._t.split('\n');
  168. for (const line of lines) {
  169. const sp = line.split(' ');
  170. let l = 0;
  171. while (l < sp.length && sp[l].trim() == '') {
  172. l++;
  173. }
  174. if (l >= parIndent)
  175. newPar();
  176. growPar(line.trim() + ' ');
  177. }
  178. }
  179. body.section._a[0] = newPars;
  180. } else {
  181. body.section._a[0] = pars;
  182. }
  183. } else {
  184. body.section._a[0] = pars;
  185. }
  186. return this.formatFb2(fb2);
  187. }
  188. async convertSamlib(data) {
  189. let titleInfo = {};
  190. let desc = {_n: 'description', 'title-info': titleInfo};
  191. let pars = [];
  192. let body = {_n: 'body', section: {_a: [pars]}};
  193. let fb2 = [desc, body];
  194. let inSubtitle = false;
  195. let path = '';
  196. let tag = '';// eslint-disable-line no-unused-vars
  197. let inText = false;
  198. let node = {};
  199. const newParagraph = () => {
  200. node = {_n: 'p', _a: []};
  201. pars.push(node);
  202. };
  203. const openTag = (name) => {
  204. let n = {_n: name, _a: [], _p: node};
  205. node._a.push(n);
  206. node = n;
  207. };
  208. const closeTag = (name) => {
  209. if (node._n == name && node._p) {
  210. node = node._p;
  211. }
  212. };
  213. const growParagraph = (text) => {
  214. if (node._n == 'p' && node._a.length == 0)
  215. text = text.trimLeft();
  216. node._a.push({_t: text});
  217. };
  218. newParagraph();
  219. const onNode = (elemName, tail, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  220. if (elemName == '')
  221. return;
  222. if (elemName[0] == '!') {//comment
  223. const text = elemName + tail;
  224. if (text == '!----------- собственно произведение ---------------')
  225. inText = true;
  226. if (text == '!---------------------------------------------------')
  227. inText = false;
  228. } else if (elemName[0] != '/') {//open tag
  229. if (!inText) {
  230. path += '/' + elemName;
  231. tag = elemName;
  232. } else {
  233. if (!inSubtitle && (elemName == 'p' || elemName == 'dd')) {
  234. newParagraph();
  235. }
  236. switch (elemName) {
  237. case 'i':
  238. openTag('emphasis');
  239. break;
  240. case 'b':
  241. openTag('strong');
  242. break;
  243. case 'div':
  244. if (tail.indexOf('align="center"') >= 0) {
  245. openTag('subtitle');
  246. inSubtitle = true;
  247. }
  248. break;
  249. }
  250. }
  251. } else if (elemName[0] == '/') {//close tag
  252. elemName = elemName.substr(1);
  253. if (!inText) {
  254. const oldPath = path;
  255. let t = '';
  256. do {
  257. let i = path.lastIndexOf('/');
  258. t = path.substr(i + 1);
  259. path = path.substr(0, i);
  260. } while (t != elemName && path);
  261. if (t != elemName) {
  262. path = oldPath;
  263. }
  264. let i = path.lastIndexOf('/');
  265. tag = path.substr(i + 1);
  266. } else {
  267. switch (elemName) {
  268. case 'i':
  269. closeTag('emphasis');
  270. break;
  271. case 'b':
  272. closeTag('strong');
  273. break;
  274. case 'div':
  275. if (inSubtitle)
  276. closeTag('subtitle');
  277. inSubtitle = false;
  278. break;
  279. }
  280. }
  281. }
  282. };
  283. const onText = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  284. if (text != ' ' && text.trim() == '')
  285. text = text.trim();
  286. if (text == '')
  287. return;
  288. switch (path) {
  289. case '/html/body/center/h2':
  290. titleInfo['book-title'] = text;
  291. return;
  292. case '/html/body/div/h3':
  293. if (!titleInfo.author)
  294. titleInfo.author = {};
  295. text = text.replace(':', '').trim().split(' ');
  296. if (text[0])
  297. titleInfo.author['last-name'] = text[0];
  298. if (text[1])
  299. titleInfo.author['first-name'] = text[1];
  300. if (text[2])
  301. titleInfo.author['middle-name'] = text[2];
  302. return;
  303. }
  304. if (inText)
  305. growParagraph(text);
  306. };
  307. this.parseHtml(this.decode(data).toString(),
  308. onNode, onText, new Set(['head', 'script', 'style']));
  309. const title = (titleInfo['book-title'] ? titleInfo['book-title'] : '');
  310. let author = '';
  311. if (titleInfo.author) {
  312. author = _.compact([
  313. (titleInfo.author['last-name'] ? titleInfo.author['last-name'] : ''),
  314. (titleInfo.author['first-name'] ? titleInfo.author['first-name'] : ''),
  315. (titleInfo.author['middle-name'] ? titleInfo.author['middle-name'] : ''),
  316. ]).join(' ');
  317. }
  318. pars.unshift({_n: 'title', _a: [
  319. {_n: 'p', _t: author}, {_n: 'p', _t: ''},
  320. {_n: 'p', _t: title}, {_n: 'p', _t: ''},
  321. ]})
  322. return this.formatFb2(fb2);
  323. }
  324. formatFb2(fb2) {
  325. let out = '<?xml version="1.0" encoding="utf-8"?>';
  326. out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';
  327. out += this.formatFb2Node(fb2);
  328. out += '</FictionBook>';
  329. return out;
  330. }
  331. formatFb2Node(node, name) {
  332. let out = '';
  333. const repl = (text) => text.replace(/[\t\n\r]/g, ' ');
  334. if (Array.isArray(node)) {
  335. for (const n of node) {
  336. out += this.formatFb2Node(n);
  337. }
  338. } else if (typeof node == 'string') {
  339. if (name)
  340. out += `<${name}>${repl(node)}</${name}>`;
  341. else
  342. out += repl(node);
  343. } else {
  344. if (node._n)
  345. name = node._n;
  346. if (name)
  347. out += `<${name}>`;
  348. if (node.hasOwnProperty('_t'))
  349. out += repl(node._t);
  350. for (let nodeName in node) {
  351. if (nodeName && nodeName[0] == '_' && nodeName != '_a')
  352. continue;
  353. const n = node[nodeName];
  354. out += this.formatFb2Node(n, nodeName);
  355. }
  356. if (name)
  357. out += `</${name}>`;
  358. }
  359. return out;
  360. }
  361. }
  362. module.exports = BookConverter;