ConvertSamlib.js 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. const _ = require('lodash');
  2. const URL = require('url').URL;
  3. const sax = require('./sax');
  4. const ConvertBase = require('./ConvertBase');
  5. class ConvertSamlib extends ConvertBase {
  6. check(data, opts) {
  7. const {url, dataType} = opts;
  8. const parsedUrl = new URL(url);
  9. if (dataType && dataType.ext == 'html' &&
  10. (parsedUrl.hostname == 'samlib.ru' ||
  11. parsedUrl.hostname == 'budclub.ru' ||
  12. parsedUrl.hostname == 'zhurnal.lib.ru')) {
  13. return {hostname: parsedUrl.hostname};
  14. }
  15. return false;
  16. }
  17. async run(data, opts) {
  18. const checkResult = this.check(data, opts);
  19. if (!checkResult)
  20. return false;
  21. const {hostname} = checkResult;
  22. let titleInfo = {};
  23. let desc = {_n: 'description', 'title-info': titleInfo};
  24. let pars = [];
  25. let body = {_n: 'body', section: {_a: pars}};
  26. let fb2 = [desc, body];
  27. let inSubtitle = false;
  28. let inJustify = true;
  29. let inImage = false;
  30. let isFirstPara = false;
  31. let path = '';
  32. let tag = '';// eslint-disable-line no-unused-vars
  33. let inText = false;
  34. let textFound = false;
  35. let node = {_a: pars};
  36. let inPara = false;
  37. let italic = false;
  38. let bold = false;
  39. const openTag = (name, attrs) => {
  40. if (name == 'p')
  41. inPara = true;
  42. let n = {_n: name, _attrs: attrs, _a: [], _p: node};
  43. node._a.push(n);
  44. node = n;
  45. };
  46. const closeTag = (name) => {
  47. if (name == 'p')
  48. inPara = false;
  49. if (node._p) {
  50. const exact = (node._n == name);
  51. node = node._p;
  52. if (!exact)
  53. closeTag(name);
  54. }
  55. };
  56. const growParagraph = (text) => {
  57. if (!node._p) {
  58. if (text.trim() != '')
  59. openTag('p');
  60. else
  61. return;
  62. }
  63. if (node._n == 'p' && node._a.length == 0)
  64. text = text.trimLeft();
  65. node._a.push({_t: text});
  66. };
  67. const onStartNode = (elemName, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  68. if (elemName == '')
  69. return;
  70. if (!inText) {
  71. path += '/' + elemName;
  72. tag = elemName;
  73. } else {
  74. switch (elemName) {
  75. case 'li':
  76. case 'p':
  77. case 'dd':
  78. case 'br':
  79. if (!(inSubtitle && isFirstPara)) {
  80. if (inPara)
  81. closeTag('p');
  82. openTag('p');
  83. }
  84. isFirstPara = false;
  85. break;
  86. case 'h1':
  87. case 'h2':
  88. case 'h3':
  89. if (inPara)
  90. closeTag('p');
  91. openTag('p');
  92. bold = true;
  93. break;
  94. case 'i':
  95. case 'em':
  96. italic = true;
  97. break;
  98. case 'b':
  99. case 'strong':
  100. bold = true;
  101. break;
  102. case 'div':
  103. if (inPara)
  104. closeTag('p');
  105. if (tail.indexOf('align="center"') >= 0) {
  106. openTag('subtitle');
  107. inSubtitle = true;
  108. isFirstPara = true;
  109. }
  110. if (tail.indexOf('align="justify"') >= 0) {
  111. openTag('p');
  112. inJustify = true;
  113. }
  114. break;
  115. case 'img': {
  116. if (inPara)
  117. closeTag('p');
  118. const attrs = sax.getAttrsSync(tail);
  119. if (attrs.src && attrs.src.value) {
  120. let href = attrs.src.value;
  121. if (href[0] == '/')
  122. href = `http://${hostname}${href}`;
  123. openTag('image', {'l:href': href});
  124. inImage = true;
  125. }
  126. break;
  127. }
  128. }
  129. }
  130. };
  131. const onEndNode = (elemName, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
  132. if (!inText) {
  133. const oldPath = path;
  134. let t = '';
  135. do {
  136. let i = path.lastIndexOf('/');
  137. t = path.substr(i + 1);
  138. path = path.substr(0, i);
  139. } while (t != elemName && path);
  140. if (t != elemName) {
  141. path = oldPath;
  142. }
  143. let i = path.lastIndexOf('/');
  144. tag = path.substr(i + 1);
  145. } else {
  146. switch (elemName) {
  147. case 'li':
  148. case 'p':
  149. case 'dd':
  150. closeTag('p');
  151. break;
  152. case 'h1':
  153. case 'h2':
  154. case 'h3':
  155. closeTag('p');
  156. bold = false;
  157. break;
  158. case 'i':
  159. case 'em':
  160. italic = false;
  161. break;
  162. case 'b':
  163. case 'strong':
  164. bold = false;
  165. break;
  166. case 'div':
  167. if (inSubtitle) {
  168. closeTag('subtitle');
  169. inSubtitle = false;
  170. isFirstPara = false;
  171. }
  172. if (inJustify) {
  173. closeTag('p');
  174. inJustify = false;
  175. }
  176. break;
  177. case 'img':
  178. if (inImage)
  179. closeTag('image');
  180. inImage = false;
  181. break;
  182. }
  183. }
  184. };
  185. const onComment = (text) => {// eslint-disable-line no-unused-vars
  186. if (text == '--------- Собственно произведение -------------') {
  187. inText = true;
  188. textFound = true;
  189. }
  190. if (text == '-----------------------------------------------')
  191. inText = false;
  192. };
  193. const onTextNode = (text) => {// eslint-disable-line no-unused-vars
  194. if (text && text.trim() == '')
  195. text = (text.indexOf(' ') >= 0 ? ' ' : '');
  196. if (!text)
  197. return;
  198. text = this.escapeEntities(text);
  199. switch (path) {
  200. case '/html/body/center/h2':
  201. titleInfo['book-title'] = text;
  202. return;
  203. case '/html/body/div/h3':
  204. if (!titleInfo.author)
  205. titleInfo.author = {};
  206. text = text.replace(':', '').trim().split(' ');
  207. if (text[0])
  208. titleInfo.author['last-name'] = text[0];
  209. if (text[1])
  210. titleInfo.author['first-name'] = text[1];
  211. if (text[2])
  212. titleInfo.author['middle-name'] = text[2];
  213. return;
  214. }
  215. let tOpen = (bold ? '<strong>' : '');
  216. tOpen += (italic ? '<emphasis>' : '');
  217. let tClose = (italic ? '</emphasis>' : '');
  218. tClose += (bold ? '</strong>' : '');
  219. if (inText)
  220. growParagraph(`${tOpen}${text}${tClose}`);
  221. };
  222. sax.parseSync(this.decode(data).toString().replace(/&nbsp;/g, ' '), {
  223. onStartNode, onEndNode, onTextNode, onComment,
  224. innerCut: new Set(['head', 'script', 'style'])
  225. });
  226. //текст не найден на странице, обработать корректно не получилось
  227. if (!textFound)
  228. return false;
  229. const title = (titleInfo['book-title'] ? titleInfo['book-title'] : '');
  230. let author = '';
  231. if (titleInfo.author) {
  232. author = _.compact([
  233. (titleInfo.author['last-name'] ? titleInfo.author['last-name'] : ''),
  234. (titleInfo.author['first-name'] ? titleInfo.author['first-name'] : ''),
  235. (titleInfo.author['middle-name'] ? titleInfo.author['middle-name'] : ''),
  236. ]).join(' ');
  237. }
  238. pars.unshift({_n: 'title', _a: [
  239. {_n: 'p', _t: author}, {_n: 'p', _t: ''},
  240. {_n: 'p', _t: title}, {_n: 'p', _t: ''},
  241. ]})
  242. return this.formatFb2(fb2);
  243. }
  244. }
  245. module.exports = ConvertSamlib;