ConvertSamlib.js 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. const _ = require('lodash');
  2. const URL = require('url').URL;
  3. const sax = require('./sax');
  4. const ConvertBase = require('./ConvertBase');
  /**
   * Converter for pages served by samlib.ru and its mirror hosts.
   *
   * The page HTML is streamed through the project's `sax` parser and turned
   * into a tree of plain objects (`_n` element name, `_attrs` attributes,
   * `_a` children, `_t` text) which is then handed to `this.formatFb2()`.
   * `decode()` and `formatFb2()` are not defined in this class; presumably
   * they are inherited from ConvertBase.
   */
  class ConvertSamlib extends ConvertBase {
      /**
       * Tests whether the page URL belongs to one of the supported hosts.
       *
       * @param {*} data - Page contents; not inspected by this method.
       * @param {Object} opts - Conversion options.
       * @param {string} opts.url - Absolute URL of the page.
       * @returns {{hostname: string}|false} The matched hostname, or `false`
       *     when the host is not supported.
       * @throws {TypeError} If `opts.url` is rejected by the `URL` constructor.
       */
      check(data, opts) {
          const {url} = opts;
          const {hostname} = new URL(url);
          const supportedHosts = ['samlib.ru', 'budclub.ru', 'zhurnal.lib.ru'];

          return (supportedHosts.includes(hostname) ? {hostname} : false);
      }

      /**
       * Converts a page into the FB2 structure.
       *
       * Only the part of the page between the two marker comments (see
       * `onComment` below) is treated as book text; outside of it the parser
       * merely tracks the element path to pick up the title and the author.
       *
       * @param {*} data - Raw page contents, passed to `this.decode()`.
       * @param {Object} opts - Conversion options; `opts.url` is required.
       * @returns {*|false} The result of `this.formatFb2()`, or `false` when
       *     the host is unsupported or the text markers were not found.
       */
      run(data, opts) {
          const checkResult = this.check(data, opts);
          if (!checkResult) {
              return false;
          }
          const {hostname} = checkResult;

          // Comments that delimit the book text inside the page.
          const textStartMark = '--------- Собственно произведение -------------';
          const textEndMark = '-----------------------------------------------';

          // Skeleton of the resulting document.
          const titleInfo = {};
          const desc = {_n: 'description', 'title-info': titleInfo};
          const pars = [];
          const body = {_n: 'body', section: {_a: pars}};
          const fb2 = [desc, body];

          // Parser state shared by the callbacks below.
          let inSubtitle = false;
          // NOTE(review): starts as true, so the first </div> seen inside the
          // text closes the current paragraph even if no align="justify" div
          // was opened. Kept as is; confirm that this is intentional.
          let inJustify = true;
          let inImage = false;
          let isFirstPara = false;
          let path = '';
          let inText = false;
          let textFound = false;
          let node = {_a: pars};// root node: the only one without a parent (_p)
          let inPara = false;
          let italic = false;
          let bold = false;

          // Appends a new element to the current node and descends into it.
          const openTag = (name, attrs) => {
              if (name === 'p') {
                  inPara = true;
              }
              const child = {_n: name, _attrs: attrs, _a: [], _p: node};
              node._a.push(child);
              node = child;
          };

          // Climbs up to the parent of the nearest open element called `name`.
          // If there is no such element, climbs all the way to the root.
          const closeTag = (name) => {
              if (name === 'p') {
                  inPara = false;
              }
              while (node._p) {
                  const matched = (node._n === name);
                  node = node._p;
                  if (matched) {
                      break;
                  }
              }
          };

          // Adds a text chunk to the current node, opening a paragraph first
          // when the text would otherwise land directly in the root.
          const growParagraph = (text) => {
              if (!node._p) {
                  if (text.trim() === '') {
                      return;
                  }
                  openTag('p');
              }
              // The first chunk of a paragraph loses its leading whitespace.
              const startsParagraph = (node._n === 'p' && node._a.length === 0);
              node._a.push({_t: (startsParagraph ? text.trimStart() : text)});
          };

          const onStartNode = (elemName, tail) => {
              if (elemName === '') {
                  return;
              }
              if (!inText) {
                  // Outside the text only the element path is tracked.
                  path += `/${elemName}`;
                  return;
              }

              switch (elemName) {
                  case 'li':
                  case 'p':
                  case 'dd':
                  case 'br':
                      // The first break inside a subtitle does not start a new paragraph.
                      if (!(inSubtitle && isFirstPara)) {
                          if (inPara) {
                              closeTag('p');
                          }
                          openTag('p');
                      }
                      isFirstPara = false;
                      break;
                  case 'h1':
                  case 'h2':
                  case 'h3':
                      // Headings become separate bold paragraphs.
                      if (inPara) {
                          closeTag('p');
                      }
                      openTag('p');
                      bold = true;
                      break;
                  case 'i':
                  case 'em':
                      italic = true;
                      break;
                  case 'b':
                  case 'strong':
                      bold = true;
                      break;
                  case 'div':
                      if (inPara) {
                          closeTag('p');
                      }
                      if (tail.includes('align="center"')) {
                          openTag('subtitle');
                          inSubtitle = true;
                          isFirstPara = true;
                      }
                      if (tail.includes('align="justify"')) {
                          openTag('p');
                          inJustify = true;
                      }
                      break;
                  case 'img': {
                      if (inPara) {
                          closeTag('p');
                      }
                      const attrs = sax.getAttrsSync(tail);
                      if (attrs.src && attrs.src.value) {
                          let href = attrs.src.value;
                          if (href.startsWith('//')) {
                              // Protocol-relative URL: it already names its host,
                              // only the scheme has to be supplied.
                              href = `http:${href}`;
                          } else if (href.startsWith('/')) {
                              // Host-relative path: resolve against the page host.
                              href = `http://${hostname}${href}`;
                          }
                          openTag('image', {href});
                          inImage = true;
                      }
                      break;
                  }
              }
          };

          const onEndNode = (elemName) => {
              if (!inText) {
                  // Pop path segments up to and including `elemName`; if the
                  // path holds no such segment, leave it unchanged.
                  const savedPath = path;
                  let last = '';
                  do {
                      const cut = path.lastIndexOf('/');
                      last = path.substring(cut + 1);
                      path = path.substring(0, cut);
                  } while (last !== elemName && path);
                  if (last !== elemName) {
                      path = savedPath;
                  }
                  return;
              }

              switch (elemName) {
                  case 'li':
                  case 'p':
                  case 'dd':
                      closeTag('p');
                      break;
                  case 'h1':
                  case 'h2':
                  case 'h3':
                      closeTag('p');
                      bold = false;
                      break;
                  case 'i':
                  case 'em':
                      italic = false;
                      break;
                  case 'b':
                  case 'strong':
                      bold = false;
                      break;
                  case 'div':
                      if (inSubtitle) {
                          closeTag('subtitle');
                          inSubtitle = false;
                          isFirstPara = false;
                      }
                      if (inJustify) {
                          closeTag('p');
                          inJustify = false;
                      }
                      break;
                  case 'img':
                      if (inImage) {
                          closeTag('image');
                      }
                      inImage = false;
                      break;
              }
          };

          const onComment = (text) => {
              if (text === textStartMark) {
                  inText = true;
                  textFound = true;
              }
              if (text === textEndMark) {
                  inText = false;
              }
          };

          const onTextNode = (text) => {
              let content = text;
              // Whitespace-only text collapses to one space, or to nothing
              // when it contains no space character at all.
              if (content && content.trim() === '') {
                  content = (content.includes(' ') ? ' ' : '');
              }
              if (!content) {
                  return;
              }

              if (path === '/html/body/center/h2') {
                  titleInfo['book-title'] = content;
                  return;
              }
              if (path === '/html/body/div/h3') {
                  if (!titleInfo.author) {
                      titleInfo.author = {};
                  }
                  // The first colon is dropped, then the words are taken in
                  // the order last, first, middle name.
                  const [lastName, firstName, middleName] = content.replace(':', '').trim().split(' ');
                  if (lastName) {
                      titleInfo.author['last-name'] = lastName;
                  }
                  if (firstName) {
                      titleInfo.author['first-name'] = firstName;
                  }
                  if (middleName) {
                      titleInfo.author['middle-name'] = middleName;
                  }
                  return;
              }

              if (!inText) {
                  return;
              }
              // Inline formatting is embedded into the text chunk as markup.
              const opening = `${bold ? '<strong>' : ''}${italic ? '<emphasis>' : ''}`;
              const closing = `${italic ? '</emphasis>' : ''}${bold ? '</strong>' : ''}`;
              growParagraph(`${opening}${content}${closing}`);
          };

          sax.parseSync(this.decode(data).toString().replace(/&nbsp;/g, ' '), {
              onStartNode, onEndNode, onTextNode, onComment,
              innerCut: new Set(['head', 'script', 'style']),
          });

          // The text was not found on the page, so it could not be processed correctly.
          if (!textFound) {
              return false;
          }

          const title = titleInfo['book-title'] || '';
          let author = '';
          if (titleInfo.author) {
              // _.compact drops the name parts that were never filled in.
              author = _.compact([
                  titleInfo.author['last-name'],
                  titleInfo.author['first-name'],
                  titleInfo.author['middle-name'],
              ]).join(' ');
          }

          // The title block goes first in the section.
          pars.unshift({_n: 'title', _a: [
              {_n: 'p', _t: author}, {_n: 'p', _t: ''},
              {_n: 'p', _t: title}, {_n: 'p', _t: ''},
          ]});

          return this.formatFb2(fb2);
      }
  }
  // Export the converter class as the value of this module.
  243. module.exports = ConvertSamlib;