ConvertSamlib.js 9.2 KB


  1. const _ = require('lodash');
  2. const URL = require('url').URL;
  3. const sax = require('../../sax');
  4. const ConvertBase = require('./ConvertBase');
  /**
   * Converter for book pages hosted on samlib.ru and its mirrors
   * (budclub.ru, zhurnal.lib.ru).
   *
   * The page HTML is walked with the project's `sax` parser and turned into the
   * node tree (`_n` name, `_a` children, `_t` text, `_attrs` attributes) that is
   * finally handed to `this.formatFb2()`.
   * NOTE(review): `decode`, `escapeEntities` and `formatFb2` are called on `this`
   * and are presumably provided by ConvertBase - confirm there.
   */
  class ConvertSamlib extends ConvertBase {
      /**
       * Decides whether the input is an HTML page from one of the supported hosts.
       *
       * @param {*} data - page content (not inspected here).
       * @param {Object} opts - conversion options.
       * @param {string} opts.url - URL the page was loaded from.
       * @param {Object} [opts.dataType] - detected data type; only `ext` is read.
       * @returns {{hostname: string}|false} the matched hostname, or `false`
       *     when the page is not handled by this converter (including the case
       *     of a missing or malformed URL).
       */
      check(data, opts) {
          const {url, dataType} = opts;
          const supportedHosts = ['samlib.ru', 'budclub.ru', 'zhurnal.lib.ru'];

          let parsedUrl;
          try {
              parsedUrl = new URL(url);
          } catch (e) {
              // `new URL()` throws on a missing/invalid URL; such input simply
              // does not belong to a supported host, so report "no match".
              return false;
          }

          if (dataType && dataType.ext === 'html' && supportedHosts.includes(parsedUrl.hostname)) {
              return {hostname: parsedUrl.hostname};
          }

          return false;
      }

      /**
       * Converts a samlib page into FB2.
       *
       * @param {*} data - raw page content, passed through `this.decode()`.
       * @param {Object} opts - conversion options; `enableSitesFilter` must be
       *     truthy, `url` and `dataType` are forwarded to `check()`.
       * @returns {Promise<*|false>} the result of `this.formatFb2()`, or `false`
       *     when the filter is disabled, the page is not recognised, or the
       *     book text markers were not found on the page.
       */
      async run(data, opts) {
          if (!opts.enableSitesFilter)
              return false;

          const checkResult = this.check(data, opts);
          if (!checkResult)
              return false;

          const {hostname} = checkResult;

          const titleInfo = {};
          const desc = {_n: 'description', 'title-info': titleInfo};
          const pars = [];
          const body = {_n: 'body', section: {_a: pars}};
          const fb2 = [desc, body];

          let inSubtitle = false;
          // NOTE(review): starts as `true`, so the first </div> met inside the
          // book text closes the current paragraph even without a matching
          // justify <div>. Kept as in the original - confirm it is intentional.
          let inJustify = true;
          let isFirstPara = false;
          // Slash-separated chain of open elements, tracked only outside the book text.
          let path = '';
          let inText = false;
          let textFound = false;
          // Current insertion point in the output tree; the root has no `_p`.
          let node = {_a: pars};
          let inPara = false;
          let italic = false;
          let bold = false;

          // Appends a child element to the current node and descends into it.
          const openTag = (name, attrs) => {
              if (name === 'p')
                  inPara = true;
              const n = {_n: name, _attrs: attrs, _a: [], _p: node};
              node._a.push(n);
              node = n;
          };

          // Climbs up the tree until an element called `name` has been left.
          // If no such element is open, this ends up at the root.
          const closeTag = (name) => {
              if (name === 'p')
                  inPara = false;
              while (node._p) {
                  const exact = (node._n === name);
                  node = node._p;
                  if (exact)
                      break;
              }
          };

          // Adds text to the current node, opening a paragraph first when the
          // text would otherwise land directly in the root.
          const growParagraph = (text) => {
              if (!node._p) {
                  if (text.trim() === '')
                      return;
                  openTag('p');
              }
              // No leading whitespace at the very start of a paragraph.
              if (node._n === 'p' && node._a.length === 0)
                  text = text.trimStart();
              node._a.push({_t: text});
          };

          const onStartNode = (elemName, tail) => {
              if (elemName === '')
                  return;

              if (!inText) {
                  path = `${path}/${elemName}`;
                  return;
              }

              switch (elemName) {
                  case 'li':
                  case 'p':
                  case 'dd':
                  case 'br':
                      // The first paragraph break inside a subtitle is swallowed
                      // so that its text stays directly in the subtitle node.
                      if (!(inSubtitle && isFirstPara)) {
                          if (inPara)
                              closeTag('p');
                          openTag('p');
                      }
                      isFirstPara = false;
                      break;
                  case 'h1':
                  case 'h2':
                  case 'h3':
                      if (inPara)
                          closeTag('p');
                      openTag('p');
                      bold = true;
                      break;
                  case 'i':
                  case 'em':
                      italic = true;
                      break;
                  case 'b':
                  case 'strong':
                      bold = true;
                      break;
                  case 'div':
                      if (inPara)
                          closeTag('p');
                      if (tail.indexOf('align="center"') >= 0) {
                          openTag('subtitle');
                          inSubtitle = true;
                          isFirstPara = true;
                      }
                      if (tail.indexOf('align="justify"') >= 0) {
                          openTag('p');
                          inJustify = true;
                      }
                      break;
                  case 'img': {
                      if (inPara)
                          closeTag('p');
                      const attrs = sax.getAttrsSync(tail);
                      if (attrs.src && attrs.src.value) {
                          let href = attrs.src.value;
                          if (href.startsWith('//')) {
                              // Protocol-relative URL: it already names its host.
                              href = `http:${href}`;
                          } else if (href[0] === '/') {
                              href = `http://${hostname}${href}`;
                          }
                          // <img> is a void element and may come without any
                          // closing event, so the image node is closed right
                          // away; otherwise following text and paragraphs
                          // would be nested inside it.
                          openTag('image', {'l:href': href});
                          closeTag('image');
                      }
                      break;
                  }
              }
          };

          const onEndNode = (elemName) => {
              if (!inText) {
                  // Pop path segments up to and including `elemName`; when the
                  // element is not in the path, leave the path untouched.
                  const oldPath = path;
                  let t = '';
                  do {
                      const i = path.lastIndexOf('/');
                      t = path.substring(i + 1);
                      path = path.substring(0, i);
                  } while (t !== elemName && path);

                  if (t !== elemName)
                      path = oldPath;
                  return;
              }

              switch (elemName) {
                  case 'li':
                  case 'p':
                  case 'dd':
                      closeTag('p');
                      break;
                  case 'h1':
                  case 'h2':
                  case 'h3':
                      closeTag('p');
                      bold = false;
                      break;
                  case 'i':
                  case 'em':
                      italic = false;
                      break;
                  case 'b':
                  case 'strong':
                      bold = false;
                      break;
                  case 'div':
                      if (inSubtitle) {
                          closeTag('subtitle');
                          inSubtitle = false;
                          isFirstPara = false;
                      }
                      if (inJustify) {
                          closeTag('p');
                          inJustify = false;
                      }
                      break;
                  // 'img' needs no handling: image nodes are closed when opened.
              }
          };

          // The book text is delimited by two marker comments on the page.
          const onComment = (text) => {
              if (text === '--------- Собственно произведение -------------') {
                  inText = true;
                  textFound = true;
              }
              if (text === '-----------------------------------------------')
                  inText = false;
          };

          const onTextNode = (text) => {
              // Collapse whitespace-only text to a single space (or to nothing
              // when it contains no plain space at all).
              if (text && text.trim() === '')
                  text = (text.includes(' ') ? ' ' : '');
              if (!text)
                  return;

              text = this.escapeEntities(text);

              switch (path) {
                  case '/html/body/center/h2':
                      // Do not let a whitespace-only node wipe a captured title.
                      if (text.trim() !== '')
                          titleInfo['book-title'] = text;
                      return;
                  case '/html/body/div/h3': {
                      if (!titleInfo.author)
                          titleInfo.author = {};
                      // Split on whitespace runs so that doubled spaces do not
                      // shift name parts into the wrong fields.
                      const [lastName, firstName, middleName] = text.replace(':', '').trim().split(/\s+/);
                      if (lastName)
                          titleInfo.author['last-name'] = lastName;
                      if (firstName)
                          titleInfo.author['first-name'] = firstName;
                      if (middleName)
                          titleInfo.author['middle-name'] = middleName;
                      return;
                  }
              }

              if (!inText)
                  return;

              const tOpen = (bold ? '<strong>' : '') + (italic ? '<emphasis>' : '');
              const tClose = (italic ? '</emphasis>' : '') + (bold ? '</strong>' : '');
              growParagraph(`${tOpen}${text}${tClose}`);
          };

          sax.parseSync(this.decode(data).toString().replace(/&nbsp;/g, ' '), {
              onStartNode, onEndNode, onTextNode, onComment,
              innerCut: new Set(['head', 'script', 'style'])
          });

          // The book text was not found on the page, so it could not be
          // processed correctly.
          if (!textFound)
              return false;

          const title = titleInfo['book-title'] || '';
          let author = '';
          if (titleInfo.author) {
              author = _.compact([
                  titleInfo.author['last-name'],
                  titleInfo.author['first-name'],
                  titleInfo.author['middle-name'],
              ]).join(' ');
          }

          pars.unshift({_n: 'title', _a: [
              {_n: 'p', _t: author}, {_n: 'p', _t: ''},
              {_n: 'p', _t: title}, {_n: 'p', _t: ''},
          ]});

          return this.formatFb2(fb2);
      }
  }

  module.exports = ConvertSamlib;