index.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. const fs = require('fs-extra');
  2. const URL = require('url').URL;
  3. const iconv = require('iconv-lite');
  4. const chardet = require('chardet');
  5. const _ = require('lodash');
  6. const FileDetector = require('../FileDetector');
  7. class BookConverter {
  8. constructor() {
  9. this.detector = new FileDetector();
  10. }
  11. async convertToFb2(inputFile, outputFile, url, callback) {
  12. const fileType = await this.detector.detectFile(inputFile);
  13. if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) {
  14. const data = await fs.readFile(inputFile);
  15. if (data.toString().indexOf('<FictionBook') >= 0) {
  16. await fs.writeFile(outputFile, data);
  17. return;
  18. }
  19. const parsedUrl = new URL(url);
  20. if (parsedUrl.hostname == 'samlib.ru' ||
  21. parsedUrl.hostname == 'budclub.ru') {
  22. await fs.writeFile(outputFile, await this.convertSamlib(data));
  23. return;
  24. }
  25. //Заглушка
  26. await fs.writeFile(outputFile, await this.convertHtml(data));
  27. callback(100);
  28. } else {
  29. if (fileType)
  30. throw new Error(`unknown file format: ${fileType.mime}`);
  31. else
  32. throw new Error(`unsupported file format: ${url}`);
  33. }
  34. }
  35. decode(data) {
  36. const charsetAll = chardet.detectAll(data.slice(0, 10000));
  37. let selected = 'ISO-8859-1';
  38. for (const charset of charsetAll) {
  39. if (charset.name.indexOf('ISO-8859') < 0) {
  40. selected = charset.name;
  41. break;
  42. }
  43. }
  44. return iconv.decode(data, selected);
  45. }
  46. parseHtml(buf, onNode, onText, innerCut) {
  47. if (!onNode)
  48. onNode = () => {};
  49. if (!onText)
  50. onText = () => {};
  51. if (!innerCut)
  52. innerCut = new Set();
  53. buf = buf.replace(/&nbsp;/g, ' ');
  54. let i = 0;
  55. const len = buf.length;
  56. let cutCounter = 0;
  57. let cutTag = '';
  58. while (i < len) {
  59. let left = buf.indexOf('<', i);
  60. if (left < 0)
  61. break;
  62. let right = buf.indexOf('>', left + 1);
  63. if (right < 0)
  64. break;
  65. let tag = buf.substr(left + 1, right - left - 1).trim().toLowerCase();
  66. let tail = '';
  67. const firstSpace = tag.indexOf(' ');
  68. if (firstSpace >= 0) {
  69. tail = tag.substr(firstSpace);
  70. tag = tag.substr(0, firstSpace);
  71. }
  72. const text = buf.substr(i, left - i);
  73. onText(text, cutCounter, cutTag);
  74. onNode(tag, tail, cutCounter, cutTag);
  75. if (innerCut.has(tag) && (!cutCounter || cutTag == tag)) {
  76. if (!cutCounter)
  77. cutTag = tag;
  78. cutCounter++;
  79. }
  80. if (tag != '' && tag.charAt(0) == '/' && cutTag == tag.substr(1)) {
  81. cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
  82. if (!cutCounter)
  83. cutTag = '';
  84. }
  85. i = right + 1;
  86. }
  87. if (i < len)
  88. onText(buf.substr(i, len - i), cutCounter, cutTag);
  89. }
  90. convertHtml(data, isText) {
  91. let titleInfo = {};
  92. let desc = {_n: 'description', 'title-info': titleInfo};
  93. let pars = [];
  94. let body = {_n: 'body', section: {_a: []}};
  95. let fb2 = [desc, body];
  96. let title = '';
  97. let inTitle = false;
  98. let spaceCounter = [];
  99. const newParagraph = () => {
  100. pars.push({_n: 'p', _t: ''});
  101. };
  102. const growParagraph = (text) => {
  103. const l = pars.length;
  104. if (l) {
  105. if (pars[l - 1]._t == '')
  106. text = text.trimLeft();
  107. pars[l - 1]._t += text;
  108. }
  109. //посчитаем отступы у текста, чтобы выделить потом параграфы
  110. const lines = text.split('\n');
  111. for (const line of lines) {
  112. const sp = line.split(' ');
  113. let l = 0;
  114. while (l < sp.length && sp[l].trim() == '') {
  115. l++;
  116. }
  117. if (!spaceCounter[l])
  118. spaceCounter[l] = 0;
  119. spaceCounter[l]++;
  120. }
  121. };
  122. newParagraph();
  123. const newPara = new Set(['tr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
  124. const onText = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  125. if (!cutCounter) {
  126. growParagraph(text);
  127. }
  128. if (inTitle && !title)
  129. title = text;
  130. };
  131. const onNode = (tag, tail, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  132. if (!cutCounter) {
  133. if (newPara.has(tag))
  134. newParagraph();
  135. }
  136. if (tag == 'title')
  137. inTitle = true;
  138. else if (tag == '/title')
  139. inTitle = false;
  140. };
  141. let buf = this.decode(data).toString();
  142. this.parseHtml(buf, onNode, onText, new Set(['head', 'script', 'style']));
  143. titleInfo['book-title'] = title;
  144. //подозрение на чистый текст, надо разбить на параграфы
  145. if ((isText || pars.length < buf.length/2000) && spaceCounter.length) {
  146. let total = 0;
  147. for (let i = 0; i < spaceCounter.length; i++) {
  148. total += (spaceCounter[i] ? spaceCounter[i] : 0);
  149. }
  150. total /= 10;
  151. let i = spaceCounter.length - 1;
  152. while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--;
  153. const parIndent = i;
  154. if (parIndent > 0) {//нашли отступ параграфа
  155. let newPars = [];
  156. const newPar = () => {
  157. newPars.push({_n: 'p', _t: ''});
  158. };
  159. const growPar = (text) => {
  160. const l = newPars.length;
  161. if (l) {
  162. newPars[l - 1]._t += text;
  163. }
  164. }
  165. for (const par of pars) {
  166. newPar();
  167. const lines = par._t.split('\n');
  168. for (const line of lines) {
  169. const sp = line.split(' ');
  170. let l = 0;
  171. while (l < sp.length && sp[l].trim() == '') {
  172. l++;
  173. }
  174. if (l >= parIndent)
  175. newPar();
  176. growPar(line.trim() + ' ');
  177. }
  178. }
  179. body.section._a[0] = newPars;
  180. } else {
  181. body.section._a[0] = pars;
  182. }
  183. } else {
  184. body.section._a[0] = pars;
  185. }
  186. //убрать лишнее
  187. for (let p of body.section._a[0]) {
  188. p._t = p._t.replace(/[\t\n\r]/g, ' ');
  189. }
  190. return this.formatFb2(fb2);
  191. }
  192. async convertSamlib(data) {
  193. let titleInfo = {};
  194. let desc = {_n: 'description', 'title-info': titleInfo};
  195. let pars = [];
  196. let body = {_n: 'body', section: {_a: [pars]}};
  197. let fb2 = [desc, body];
  198. let path = '';
  199. let tag = '';// eslint-disable-line no-unused-vars
  200. let inText = false;
  201. let center = false;
  202. //let italic = false;
  203. //let bold = false;
  204. let node = {};
  205. const newParagraph = () => {
  206. node = {_n: 'p', _t: ''};
  207. pars.push(node);
  208. };
  209. const newSubTitle = () => {
  210. node = {_n: 'subtitle', _t: ''};
  211. pars.push(node);
  212. };
  213. const newItalic = () => {
  214. let n = {_n: 'emphasis', _t: ''};
  215. if (!node._a)
  216. node._a = [];
  217. node._a.push(n);
  218. node = n;
  219. };
  220. const newBold = () => {
  221. let n = {_n: 'strong', _t: ''};
  222. if (!node._a)
  223. node._a = [];
  224. node._a.push(n);
  225. node = n;
  226. };
  227. const growParagraph = (text) => {
  228. if (node._t == '')
  229. text = text.trimLeft();
  230. node._t += text;
  231. };
  232. newParagraph();
  233. const onNode = (elemName, tail, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  234. if (elemName == '')
  235. return;
  236. if (elemName[0] == '!') {//comment
  237. const text = elemName + tail;
  238. if (text == '!----------- собственно произведение ---------------')
  239. inText = true;
  240. if (text == '!---------------------------------------------------')
  241. inText = false;
  242. } else if (elemName[0] != '/') {//open tag
  243. if (!inText) {
  244. path += '/' + elemName;
  245. tag = elemName;
  246. } else {
  247. if (!center && (elemName == 'p' || elemName == 'dd')) {
  248. newParagraph();
  249. }
  250. switch (elemName) {
  251. case 'i':
  252. newItalic();
  253. //italic = true;
  254. break;
  255. case 'b':
  256. newBold();
  257. //bold = true;
  258. break;
  259. case 'div':
  260. if (tail == 'center') {
  261. newSubTitle();
  262. center = true;
  263. }
  264. break;
  265. }
  266. }
  267. } else if (elemName[0] == '/') {//close tag
  268. elemName = elemName.substr(1);
  269. if (!inText) {
  270. const oldPath = path;
  271. let t = '';
  272. do {
  273. let i = path.lastIndexOf('/');
  274. t = path.substr(i + 1);
  275. path = path.substr(0, i);
  276. } while (t != elemName && path);
  277. if (t != elemName) {
  278. path = oldPath;
  279. }
  280. let i = path.lastIndexOf('/');
  281. tag = path.substr(i + 1);
  282. } else {
  283. switch (elemName) {
  284. case 'i':
  285. //italic = false;
  286. break;
  287. case 'b':
  288. //bold = false;
  289. break;
  290. case 'div':
  291. center = false;
  292. break;
  293. }
  294. }
  295. }
  296. };
  297. const onText = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  298. if (text != ' ' && text.trim() == '')
  299. text = text.trim();
  300. if (text == '')
  301. return;
  302. switch (path) {
  303. case '/html/body/center/h2':
  304. titleInfo['book-title'] = text;
  305. return;
  306. case '/html/body/div/h3':
  307. if (!titleInfo.author)
  308. titleInfo.author = {};
  309. text = text.replace(':', '').trim().split(' ');
  310. if (text[0])
  311. titleInfo.author['last-name'] = text[0];
  312. if (text[1])
  313. titleInfo.author['first-name'] = text[1];
  314. if (text[2])
  315. titleInfo.author['middle-name'] = text[2];
  316. return;
  317. }
  318. if (inText)
  319. growParagraph(text);
  320. };
  321. this.parseHtml(this.decode(data).toString(),
  322. onNode, onText, new Set(['head', 'script', 'style']));
  323. const title = (titleInfo['book-title'] ? titleInfo['book-title'] : '');
  324. let author = '';
  325. if (titleInfo.author) {
  326. author = _.compact([
  327. (titleInfo.author['last-name'] ? titleInfo.author['last-name'] : ''),
  328. (titleInfo.author['first-name'] ? titleInfo.author['first-name'] : ''),
  329. (titleInfo.author['middle-name'] ? titleInfo.author['middle-name'] : ''),
  330. ]).join(' ');
  331. }
  332. pars.unshift({_n: 'title', _a: [
  333. {_n: 'p', _t: author}, {_n: 'p', _t: ''},
  334. {_n: 'p', _t: title}, {_n: 'p', _t: ''},
  335. ]})
  336. return this.formatFb2(fb2);
  337. }
  338. formatFb2(fb2) {
  339. let out = '<?xml version="1.0" encoding="utf-8"?>';
  340. out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';
  341. out += this.formatFb2Node(fb2);
  342. out += '</FictionBook>';
  343. return out;
  344. }
  345. formatFb2Node(node, name) {
  346. let out = '';
  347. if (Array.isArray(node)) {
  348. for (const n of node) {
  349. out += this.formatFb2Node(n);
  350. }
  351. } else if (typeof node == 'string') {
  352. out += `<${name}>${node}</${name}>`;
  353. } else {
  354. if (node._n)
  355. name = node._n;
  356. if (!name)
  357. throw new Error(`malformed fb2 object`);
  358. out += `<${name}>`;
  359. if (node.hasOwnProperty('_t'))
  360. out += node._t;
  361. for (let nodeName in node) {
  362. if (nodeName == '_n' || nodeName == '_t')
  363. continue;
  364. const n = node[nodeName];
  365. out += this.formatFb2Node(n, nodeName);
  366. }
  367. out += `</${name}>`;
  368. }
  369. return out;
  370. }
  371. }
  372. module.exports = BookConverter;