// DbCreator.js


const fs = require('fs-extra');
const InpxParser = require('./InpxParser');
const InpxHashCreator = require('./InpxHashCreator');
const utils = require('./utils');
  5. const emptyFieldValue = '?';
  6. class DbCreator {
  7. constructor(config) {
  8. this.config = config;
  9. }
  10. async loadInpxFilter() {
  11. const inpxFilterFile = this.config.inpxFilterFile;
  12. if (await fs.pathExists(inpxFilterFile)) {
  13. let filter = await fs.readFile(inpxFilterFile, 'utf8');
  14. filter = JSON.parse(filter);
  15. if (filter.includeAuthors) {
  16. filter.includeAuthors = filter.includeAuthors.map(a => a.toLowerCase());
  17. filter.includeSet = new Set(filter.includeAuthors);
  18. }
  19. if (filter.excludeAuthors) {
  20. filter.excludeAuthors = filter.excludeAuthors.map(a => a.toLowerCase());
  21. filter.excludeSet = new Set(filter.excludeAuthors);
  22. }
  23. return filter;
  24. } else {
  25. return false;
  26. }
  27. }
  28. //процедура формировани БД несколько усложнена, в целях экономии памяти
  29. async run(db, callback) {
  30. const config = this.config;
  31. callback({jobStepCount: 5});
  32. callback({job: 'load inpx', jobMessage: 'Загрузка INPX', jobStep: 1, progress: 0});
  33. //временная таблица
  34. await db.create({
  35. table: 'book',
  36. cacheSize: (config.lowMemoryMode ? 5 : 500),
  37. });
  38. //поисковые таблицы, позже сохраним в БД
  39. let authorMap = new Map();//авторы
  40. let authorArr = [];
  41. let seriesMap = new Map();//серии
  42. let seriesArr = [];
  43. let titleMap = new Map();//названия
  44. let titleArr = [];
  45. let genreMap = new Map();//жанры
  46. let genreArr = [];
  47. let langMap = new Map();//языки
  48. let langArr = [];
  49. let delMap = new Map();//удаленные
  50. let delArr = [];
  51. let dateMap = new Map();//дата поступления
  52. let dateArr = [];
  53. let librateMap = new Map();//оценка
  54. let librateArr = [];
  55. let extMap = new Map();//тип файла
  56. let extArr = [];
  57. let uidSet = new Set();//уникальные идентификаторы
  58. //stats
  59. let authorCount = 0;
  60. let bookCount = 0;
  61. let noAuthorBookCount = 0;
  62. let bookDelCount = 0;
  63. //stuff
  64. let recsLoaded = 0;
  65. callback({recsLoaded});
  66. let chunkNum = 0;
  67. //фильтр
  68. const inpxFilter = await this.loadInpxFilter();
  69. let filter = () => true;
  70. if (inpxFilter) {
  71. let recFilter = () => true;
  72. if (inpxFilter.filter) {
  73. if (config.allowUnsafeFilter)
  74. recFilter = new Function(`'use strict'; return ${inpxFilter.filter}`)();
  75. else
  76. throw new Error(`Unsafe property 'filter' detected in ${this.config.inpxFilterFile}. Please specify '--unsafe-filter' param if you know what you're doing.`);
  77. }
  78. filter = (rec) => {
  79. let author = rec.author;
  80. if (!author)
  81. author = emptyFieldValue;
  82. author = author.toLowerCase();
  83. let excluded = false;
  84. if (inpxFilter.excludeSet) {
  85. const authors = author.split(',');
  86. for (const a of authors) {
  87. if (inpxFilter.excludeSet.has(a)) {
  88. excluded = true;
  89. break;
  90. }
  91. }
  92. }
  93. return recFilter(rec)
  94. && (!inpxFilter.includeSet || inpxFilter.includeSet.has(author))
  95. && !excluded
  96. ;
  97. };
  98. }
  99. //вспомогательные функции
  100. const splitAuthor = (author) => {
  101. if (!author)
  102. author = emptyFieldValue;
  103. const result = author.split(',');
  104. if (result.length > 1)
  105. result.push(author);
  106. return result;
  107. }
  108. let totalFiles = 0;
  109. const readFileCallback = async(readState) => {
  110. callback(readState);
  111. if (readState.totalFiles)
  112. totalFiles = readState.totalFiles;
  113. if (totalFiles)
  114. callback({progress: (readState.current || 0)/totalFiles});
  115. };
  116. const parseField = (fieldValue, fieldMap, fieldArr, bookId, rec, fillBookIds = true) => {
  117. let value = fieldValue;
  118. if (typeof(fieldValue) == 'string') {
  119. if (!fieldValue)
  120. fieldValue = emptyFieldValue;
  121. value = fieldValue.toLowerCase();
  122. }
  123. let fieldRec;
  124. if (fieldMap.has(value)) {
  125. const fieldId = fieldMap.get(value);
  126. fieldRec = fieldArr[fieldId];
  127. } else {
  128. fieldRec = {id: fieldArr.length, value, bookIds: new Set()};
  129. if (rec !== undefined) {
  130. fieldRec.name = fieldValue;
  131. fieldRec.bookCount = 0;
  132. fieldRec.bookDelCount = 0;
  133. }
  134. fieldArr.push(fieldRec);
  135. fieldMap.set(value, fieldRec.id);
  136. }
  137. if (fieldValue !== emptyFieldValue || fillBookIds)
  138. fieldRec.bookIds.add(bookId);
  139. if (rec !== undefined) {
  140. if (!rec.del)
  141. fieldRec.bookCount++;
  142. else
  143. fieldRec.bookDelCount++;
  144. }
  145. };
  146. const parseBookRec = (rec) => {
  147. //авторы
  148. const author = splitAuthor(rec.author);
  149. for (let i = 0; i < author.length; i++) {
  150. const a = author[i];
  151. //статистика
  152. if (!authorMap.has(a.toLowerCase()) && (author.length == 1 || i < author.length - 1)) //без соавторов
  153. authorCount++;
  154. parseField(a, authorMap, authorArr, rec.id, rec);
  155. }
  156. //серии
  157. parseField(rec.series, seriesMap, seriesArr, rec.id, rec, false);
  158. //названия
  159. parseField(rec.title, titleMap, titleArr, rec.id, rec);
  160. //жанры
  161. let genre = rec.genre || emptyFieldValue;
  162. genre = rec.genre.split(',');
  163. for (let g of genre) {
  164. parseField(g, genreMap, genreArr, rec.id);
  165. }
  166. //языки
  167. parseField(rec.lang, langMap, langArr, rec.id);
  168. //удаленные
  169. parseField(rec.del, delMap, delArr, rec.id);
  170. //дата поступления
  171. parseField(rec.date, dateMap, dateArr, rec.id);
  172. //оценка
  173. parseField(rec.librate, librateMap, librateArr, rec.id);
  174. //тип файла
  175. parseField(rec.ext, extMap, extArr, rec.id);
  176. };
  177. //основная процедура парсинга
  178. let id = 0;
  179. const parsedCallback = async(chunk) => {
  180. let filtered = false;
  181. for (const rec of chunk) {
  182. //сначала фильтр
  183. if (!filter(rec) || uidSet.has(rec._uid)) {
  184. rec.id = 0;
  185. filtered = true;
  186. continue;
  187. }
  188. rec.id = ++id;
  189. uidSet.add(rec._uid);
  190. if (!rec.del) {
  191. bookCount++;
  192. if (!rec.author)
  193. noAuthorBookCount++;
  194. } else {
  195. bookDelCount++;
  196. }
  197. parseBookRec(rec);
  198. }
  199. let saveChunk = [];
  200. if (filtered) {
  201. saveChunk = chunk.filter(r => r.id);
  202. } else {
  203. saveChunk = chunk;
  204. }
  205. await db.insert({table: 'book', rows: saveChunk});
  206. recsLoaded += chunk.length;
  207. callback({recsLoaded});
  208. if (chunkNum++ % 10 == 0 && config.lowMemoryMode)
  209. utils.freeMemory();
  210. };
  211. //парсинг
  212. const parser = new InpxParser();
  213. await parser.parse(config.inpxFile, readFileCallback, parsedCallback);
  214. //чистка памяти, ибо жрет как не в себя
  215. authorMap = null;
  216. seriesMap = null;
  217. titleMap = null;
  218. genreMap = null;
  219. langMap = null;
  220. delMap = null;
  221. dateMap = null;
  222. librateMap = null;
  223. extMap = null;
  224. uidSet = null;
  225. await db.close({table: 'book'});
  226. await db.freeMemory();
  227. utils.freeMemory();
  228. //отсортируем таблицы выдадим им правильные id
  229. //порядок id соответствует ASC-сортировке по value
  230. callback({job: 'sort', jobMessage: 'Сортировка', jobStep: 2, progress: 0});
  231. await utils.sleep(100);
  232. //сортировка авторов
  233. authorArr.sort((a, b) => a.value.localeCompare(b.value));
  234. callback({progress: 0.2});
  235. await utils.sleep(100);
  236. id = 0;
  237. for (const authorRec of authorArr) {
  238. authorRec.id = ++id;
  239. }
  240. callback({progress: 0.3});
  241. await utils.sleep(100);
  242. //сортировка серий
  243. seriesArr.sort((a, b) => a.value.localeCompare(b.value));
  244. callback({progress: 0.5});
  245. await utils.sleep(100);
  246. id = 0;
  247. for (const seriesRec of seriesArr) {
  248. seriesRec.id = ++id;
  249. }
  250. callback({progress: 0.6});
  251. await utils.sleep(100);
  252. //сортировка названий
  253. titleArr.sort((a, b) => a.value.localeCompare(b.value));
  254. callback({progress: 0.8});
  255. await utils.sleep(100);
  256. id = 0;
  257. for (const titleRec of titleArr) {
  258. titleRec.id = ++id;
  259. }
  260. //stats
  261. const stats = {
  262. filesCount: 0,//вычислим позднее
  263. filesCountAll: 0,//вычислим позднее
  264. filesDelCount: 0,//вычислим позднее
  265. recsLoaded,
  266. authorCount,
  267. authorCountAll: authorArr.length,
  268. bookCount,
  269. bookCountAll: bookCount + bookDelCount,
  270. bookDelCount,
  271. noAuthorBookCount,
  272. titleCount: titleArr.length,
  273. seriesCount: seriesArr.length,
  274. genreCount: genreArr.length,
  275. langCount: langArr.length,
  276. };
  277. //console.log(stats);
  278. //сохраним поисковые таблицы
  279. const chunkSize = 10000;
  280. const saveTable = async(table, arr, nullArr, indexType = 'string', delEmpty = false) => {
  281. if (indexType == 'string')
  282. arr.sort((a, b) => a.value.localeCompare(b.value));
  283. else
  284. arr.sort((a, b) => a.value - b.value);
  285. await db.create({
  286. table,
  287. index: {field: 'value', unique: true, type: indexType, depth: 1000000},
  288. });
  289. //вставка в БД по кусочкам, экономим память
  290. for (let i = 0; i < arr.length; i += chunkSize) {
  291. const chunk = arr.slice(i, i + chunkSize);
  292. for (const rec of chunk)
  293. rec.bookIds = Array.from(rec.bookIds);
  294. await db.insert({table, rows: chunk});
  295. if (i % 5 == 0) {
  296. await db.freeMemory();
  297. await utils.sleep(10);
  298. }
  299. callback({progress: i/arr.length});
  300. }
  301. if (delEmpty) {
  302. const delResult = await db.delete({table, where: `@@indexLR('value', '?', '?')`});
  303. const statField = `${table}Count`;
  304. if (stats[statField])
  305. stats[statField] -= delResult.deleted;
  306. }
  307. nullArr();
  308. await db.close({table});
  309. utils.freeMemory();
  310. await db.freeMemory();
  311. };
  312. //author
  313. callback({job: 'author save', jobMessage: 'Сохранение индекса авторов', jobStep: 3, progress: 0});
  314. await saveTable('author', authorArr, () => {authorArr = null});
  315. //series
  316. callback({job: 'series save', jobMessage: 'Сохранение индекса серий', jobStep: 4, progress: 0});
  317. await saveTable('series', seriesArr, () => {seriesArr = null}, 'string', true);
  318. //title
  319. callback({job: 'title save', jobMessage: 'Сохранение индекса названий', jobStep: 5, progress: 0});
  320. await saveTable('title', titleArr, () => {titleArr = null});
  321. //genre
  322. callback({job: 'genre save', jobMessage: 'Сохранение индекса жанров', jobStep: 6, progress: 0});
  323. await saveTable('genre', genreArr, () => {genreArr = null});
  324. callback({job: 'others save', jobMessage: 'Сохранение остальных индексов', jobStep: 7, progress: 0});
  325. //lang
  326. await saveTable('lang', langArr, () => {langArr = null});
  327. //del
  328. await saveTable('del', delArr, () => {delArr = null}, 'number');
  329. //date
  330. await saveTable('date', dateArr, () => {dateArr = null});
  331. //librate
  332. await saveTable('librate', librateArr, () => {librateArr = null}, 'number');
  333. //ext
  334. await saveTable('ext', extArr, () => {extArr = null});
  335. //кэш-таблицы запросов
  336. await db.create({table: 'query_cache'});
  337. await db.create({table: 'query_time'});
  338. //кэш-таблица имен файлов и их хешей
  339. await db.create({table: 'file_hash'});
  340. //-- завершающие шаги --------------------------------
  341. await db.open({
  342. table: 'book',
  343. cacheSize: (config.lowMemoryMode ? 5 : 500),
  344. });
  345. callback({job: 'optimization', jobMessage: 'Оптимизация', jobStep: 8, progress: 0});
  346. await this.optimizeTable('author', db, (p) => {
  347. if (p.progress)
  348. p.progress = 0.3*p.progress;
  349. callback(p);
  350. });
  351. await this.optimizeTable('series', db, (p) => {
  352. if (p.progress)
  353. p.progress = 0.3 + 0.2*p.progress;
  354. callback(p);
  355. });
  356. await this.optimizeTable('title', db, (p) => {
  357. if (p.progress)
  358. p.progress = 0.5 + 0.5*p.progress;
  359. callback(p);
  360. });
  361. callback({job: 'stats count', jobMessage: 'Подсчет статистики', jobStep: 9, progress: 0});
  362. await this.countStats(db, callback, stats);
  363. //чистка памяти, ибо жрет как не в себя
  364. await db.close({table: 'book'});
  365. await db.freeMemory();
  366. utils.freeMemory();
  367. //config сохраняем в самом конце, нет конфига - с базой что-то не так
  368. const inpxHashCreator = new InpxHashCreator(config);
  369. await db.create({
  370. table: 'config'
  371. });
  372. const inpxInfo = parser.info;
  373. if (inpxFilter && inpxFilter.info) {
  374. if (inpxFilter.info.collection)
  375. inpxInfo.collection = inpxFilter.info.collection;
  376. if (inpxFilter.info.version)
  377. inpxInfo.version = inpxFilter.info.version;
  378. }
  379. await db.insert({table: 'config', rows: [
  380. {id: 'inpxInfo', value: inpxInfo},
  381. {id: 'stats', value: stats},
  382. {id: 'inpxHash', value: await inpxHashCreator.getHash()},
  383. ]});
  384. callback({job: 'done', jobMessage: ''});
  385. }
  386. async optimizeTable(from, db, callback) {
  387. const config = this.config;
  388. const to = `${from}_book`;
  389. await db.open({table: from});
  390. await db.create({table: to});
  391. let bookId2RecId = new Map();
  392. const saveChunk = async(chunk) => {
  393. const ids = [];
  394. for (const rec of chunk) {
  395. for (const id of rec.bookIds) {
  396. let b2r = bookId2RecId.get(id);
  397. if (!b2r) {
  398. b2r = [];
  399. bookId2RecId.set(id, b2r);
  400. }
  401. b2r.push(rec.id);
  402. ids.push(id);
  403. }
  404. }
  405. if (config.fullOptimization) {
  406. ids.sort((a, b) => a - b);// обязательно, иначе будет тормозить - особенности JembaDb
  407. const rows = await db.select({table: 'book', where: `@@id(${db.esc(ids)})`});
  408. const bookArr = new Map();
  409. for (const row of rows)
  410. bookArr.set(row.id, row);
  411. for (const rec of chunk) {
  412. rec.books = [];
  413. for (const id of rec.bookIds) {
  414. const book = bookArr.get(id);
  415. if (book) {//на всякий случай
  416. rec.books.push(book);
  417. }
  418. }
  419. delete rec.name;
  420. delete rec.value;
  421. delete rec.bookIds;
  422. }
  423. await db.insert({
  424. table: to,
  425. rows: chunk,
  426. });
  427. }
  428. };
  429. const rows = await db.select({table: from, count: true});
  430. const fromLength = rows[0].count;
  431. let processed = 0;
  432. while (1) {// eslint-disable-line
  433. const chunk = await db.select({
  434. table: from,
  435. where: `
  436. let iter = @getItem('optimize');
  437. if (!iter) {
  438. iter = @all();
  439. @setItem('optimize', iter);
  440. }
  441. const ids = new Set();
  442. let bookIdsLen = 0;
  443. let id = iter.next();
  444. while (!id.done) {
  445. ids.add(id.value);
  446. const row = @row(id.value);
  447. bookIdsLen += row.bookIds.length;
  448. if (bookIdsLen >= 50000)
  449. break;
  450. id = iter.next();
  451. }
  452. return ids;
  453. `
  454. });
  455. if (chunk.length) {
  456. await saveChunk(chunk);
  457. processed += chunk.length;
  458. callback({progress: 0.9*processed/fromLength});
  459. } else
  460. break;
  461. if (this.config.lowMemoryMode) {
  462. await utils.sleep(10);
  463. utils.freeMemory();
  464. await db.freeMemory();
  465. }
  466. }
  467. await db.close({table: to});
  468. await db.close({table: from});
  469. const idMap = {arr: [], map: []};
  470. for (const [id, value] of bookId2RecId) {
  471. if (value.length > 1) {
  472. idMap.map.push([id, value]);
  473. idMap.arr[id] = 0;
  474. } else {
  475. idMap.arr[id] = value[0];
  476. }
  477. }
  478. callback({progress: 1});
  479. await fs.writeFile(`${this.config.dataDir}/db/${from}_id.map`, JSON.stringify(idMap));
  480. bookId2RecId = null;
  481. utils.freeMemory();
  482. }
  483. async countStats(db, callback, stats) {
  484. //статистика по количеству файлов
  485. //эмуляция прогресса
  486. let countDone = false;
  487. (async() => {
  488. let i = 0;
  489. while (!countDone) {
  490. callback({progress: i/100});
  491. i = (i < 100 ? i + 5 : 100);
  492. await utils.sleep(1000);
  493. }
  494. })();
  495. //подчсет
  496. const countRes = await db.select({table: 'book', rawResult: true, where: `
  497. const files = new Set();
  498. const filesDel = new Set();
  499. for (const id of @all()) {
  500. const r = @row(id);
  501. const file = ${"`${r.folder}/${r.file}.${r.ext}`"};
  502. if (!r.del) {
  503. files.add(file);
  504. } else {
  505. filesDel.add(file);
  506. }
  507. }
  508. for (const file of filesDel)
  509. if (files.has(file))
  510. filesDel.delete(file);
  511. return {filesCount: files.size, filesDelCount: filesDel.size};
  512. `});
  513. if (countRes.length) {
  514. const res = countRes[0].rawResult;
  515. stats.filesCount = res.filesCount;
  516. stats.filesCountAll = res.filesCount + res.filesDelCount;
  517. stats.filesDelCount = res.filesDelCount;
  518. }
  519. //заодно добавим нужный индекс
  520. await db.create({
  521. in: 'book',
  522. hash: {field: '_uid', type: 'string', depth: 100, unique: true},
  523. });
  524. countDone = true;
  525. }
  526. }
  527. module.exports = DbCreator;