Pārlūkot izejas kodu

Работа над XmlParser

Book Pauk 2 gadi atpakaļ
vecāks
revīzija
a40d9e25b0

+ 67 - 0
server/core/xml/Fb2Parser.js

@@ -0,0 +1,67 @@
+const fs = require('fs-extra');
+const iconv = require('iconv-lite');
+const textUtils = require('./textUtils');
+
+const xmlParser = require('./xmlParser');
+const utils = require('../utils');
+
+class Fb2Parser {
+    checkEncoding(data) {
+        //Корректируем кодировку UTF-16
+        let encoding = textUtils.getEncoding(data);
+        if (encoding.indexOf('UTF-16') == 0) {
+            data = Buffer.from(iconv.decode(data, encoding));
+            encoding = 'utf-8';
+        }
+
+        //Корректируем пробелы, всякие файлы попадаются :(
+        if (data[0] == 32) {
+            data = Buffer.from(data.toString().trim());
+        }
+
+        //Окончательно корректируем кодировку
+        let result = data;
+
+        let left = data.indexOf('<?xml version="1.0"');
+        if (left < 0) {
+            left = data.indexOf('<?xml version=\'1.0\'');
+        }
+
+        if (left >= 0) {
+            const right = data.indexOf('?>', left);
+            if (right >= 0) {
+                const head = data.slice(left, right + 2).toString();
+                const m = head.match(/encoding=['"](.*?)['"]/);
+                if (m) {
+                    let enc = m[1].toLowerCase();
+                    if (enc != 'utf-8') {
+                        //enc может не соответсвовать реальной кодировке файла, поэтому:
+                        if (encoding.indexOf('ISO-8859') >= 0) {
+                            encoding = enc;
+                        }
+
+                        result = iconv.decode(data, encoding);
+                        result = Buffer.from(result.toString().replace(m[0], `encoding="utf-8"`));
+                    }
+                }
+            }
+        }
+
+        return result;
+    }
+
+    async getDescAndCover(bookFile) {
+        let data = await fs.readFile(bookFile);
+        data = await utils.gunzipBuffer(data);
+        //data = this.checkEncoding(data);
+
+        const result = xmlParser.parseXml(data.toString(), true, (route) => {
+            console.log(route);
+            return true;
+        });
+
+        return xmlParser.simplifyXmlParsed(result);
+    }
+}
+
+module.exports = Fb2Parser;

+ 342 - 0
server/core/xml/XmlParser.js

@@ -0,0 +1,342 @@
+//Node type codes; code in this file compares element 0 of a raw node against them
+const NODE = 1;
+const TEXT = 2;
+const CDATA = 3;
+const COMMENT = 4;
+
+const name2type = { //selector type name (the part after '*') -> type code
+    'NODE': NODE,
+    'TEXT': TEXT,
+    'CDATA': CDATA,
+    'COMMENT': COMMENT,
+};
+
+const type2name = { //type code -> name, used to validate type codes
+    [NODE]: 'NODE',
+    [TEXT]: 'TEXT',
+    [CDATA]: 'CDATA',
+    [COMMENT]: 'COMMENT',
+};
+
+class NodeBase {
+    makeSelectorObj(selectorString) {
+        const result = {all: false, before: false, type: 0, name: ''};
+
+        if (selectorString === '') {
+            result.before = true;
+        } else if (selectorString === '*') {
+            result.all = true;        
+        } else if (selectorString[0] === '*') {
+            const typeName = selectorString.substring(1);
+            result.type = name2type[typeName];
+            if (!result.type)
+                throw new Error(`Unknown selector type: ${typeName}`);
+        } else {
+            result.name = selectorString;
+        }
+
+        return result;
+    }
+
+    checkNode(rawNode, selectorObj) {
+        return selectorObj.all || selectorObj.before
+            || (selectorObj.type && rawNode[0] === selectorObj.type)
+            || (rawNode[0] === NODE && rawNode[1] === selectorObj.name);
+    }
+
+    findNodeIndex(nodes, selectorObj) {
+        for (let i = 0; i < nodes.length; i++)
+            if (this.checkNode(nodes[i], selectorObj))
+                return i;
+    }
+
+    rawAdd(nodes, rawNode, selectorObj) {
+        if (selectorObj.all) {
+            nodes.push(rawNode);
+        } else if (selectorObj.before) {
+            nodes.unshift(rawNode);
+        } else {
+            const index = this.findNodeIndex(nodes, selectorObj);
+            if (index >= 0)
+                nodes.splice(index, 0, rawNode);
+            else 
+                nodes.push(rawNode);
+        }
+    }
+
+    rawRemove(nodes, selectorObj) {
+        if (selectorObj.before)
+            return;
+
+        for (let i = nodes.length - 1; i >= 0; i--) {
+            if (this.checkNode(nodes[i], selectorObj))
+                nodes.splice(i, 1);
+        }
+    }
+}
+
+class NodeObject extends NodeBase {
+    constructor(rawNode) {
+        super();
+
+        if (rawNode)
+            this.raw = rawNode;
+        else
+            this.raw = [];
+    }
+
+    get type() {
+        return this.raw[0] || null;
+    }
+
+    get name() {
+        if (this.type === NODE)
+            return this.raw[1] || null;
+
+        return null;
+    }
+
+    set name(value) {
+        if (this.type === NODE)
+            this.raw[1] = value;
+    }
+
+    get attrs() {
+        if (this.type === NODE && Array.isArray(this.raw[2]))
+            return new Map(this.raw[2]);
+
+        return null;
+    }
+
+    set attrs(value) {
+        if (this.type === NODE)
+            if (value && value.size)
+                this.raw[2] = Array.from(value);
+            else
+                this.raw[2] = null;
+    }
+
+    get value() {
+        switch (this.type) {
+            case NODE:
+                return this.raw[3] || null;
+            case TEXT:
+            case CDATA:
+            case COMMENT:
+                return this.raw[1] || null;
+        }
+
+        return null;
+    }
+
+    add(node, after = '*') {
+        if (this.type !== NODE)
+            return;
+
+        const selectorObj = this.makeSelectorObj(after);
+
+        if (!Array.isArray(this.raw[3]))
+            this.raw[3] = [];
+        this.rawAdd(this.raw[3], node.raw, selectorObj);
+    }
+
+    remove(selector = '') {
+        if (this.type !== NODE || !this.raw[3])
+            return;
+
+        const selectorObj = this.makeSelectorObj(selector);
+
+        this.rawRemove(this.raw[3], selectorObj);
+        if (!this.raw[3].length)
+            this.raw[3] = null;
+    }
+
+    each(callback) {
+        if (this.type !== NODE || !this.raw[3])
+            return;
+
+        for (const n of this.raw[3]) {
+            callback(new NodeObject(n));
+        }
+    }
+}
+
+class XmlParser extends NodeBase {
+    constructor(rawNodes = []) {
+        super();
+
+        this.NODE = NODE;
+        this.TEXT = TEXT;
+        this.CDATA = CDATA;
+        this.COMMENT = COMMENT;
+
+        this.rawNodes = rawNodes;
+    }
+
+    get count() {
+        return this.rawNodes.length;
+    }
+
+    toObject(node) {
+        return new NodeObject(node);
+    }
+
+    newParser(nodes) {
+        return new XmlParser(nodes);
+    }
+
+    checkType(type) {
+        if (!type2name[type])
+            throw new Error(`Invalid type: ${type}`);
+    }
+
+    createTypedNode(type, nameOrValue, attrs = null, value = null) {
+        this.checkType(type);
+        switch (type) {
+            case NODE:
+                if (!nameOrValue || typeof(nameOrValue) !== 'string')
+                    throw new Error('Node name must be non-empty string');
+                return new NodeObject([type, nameOrValue, attrs, value]);
+            case TEXT:
+            case CDATA:
+            case COMMENT:
+                if (typeof(nameOrValue) !== 'string')
+                    throw new Error('Node value must be of type string');
+                return new NodeObject([type, nameOrValue]);
+        }
+    }
+
+    createNode(name, attrs = null, value = null) {
+        return this.createTypedNode(NODE, name, attrs, value);
+    }
+
+    createText(value = null) {
+        return this.createTypedNode(TEXT, value);
+    }
+
+    createCdata(value = null) {
+        return this.createTypedNode(CDATA, value);
+    }
+
+    createComment(value = null) {
+        return this.createTypedNode(COMMENT, value);
+    }
+
+    add(node, after = '*') {
+        const selectorObj = this.makeSelectorObj(after);
+
+        for (const n of this.rawNodes) {
+            if (n && n[0] === NODE) {
+                if (!Array.isArray(n[3]))
+                    n[3] = [];
+                this.rawAdd(n[3], node.raw, selectorObj);
+            }
+        }
+    }
+
+    addRoot(node, after = '*') {
+        const selectorObj = this.makeSelectorObj(after);
+
+        this.rawAdd(this.rawNodes, node.raw, selectorObj);
+    }
+
+    remove(selector = '') {
+        const selectorObj = this.makeSelectorObj(selector);
+
+        for (const n of this.rawNodes) {
+            if (n && n[0] === NODE && Array.isArray(n[3])) {
+                this.rawRemove(n[3], selectorObj);
+                if (!n[3].length)
+                    n[3] = null;
+            }
+        }
+    }
+
+    removeRoot(selector = '') {
+        const selectorObj = this.makeSelectorObj(selector);
+
+        this.rawRemove(this.rawNodes, selectorObj);
+    }
+
+    each(callback) {
+        for (const n of this.rawNodes) {
+            callback(new NodeObject(n));
+        }
+    }
+
+    rawSelect(nodes, selectorObj, callback) {
+        for (const n of nodes)
+            if (this.checkNode(n, selectorObj))
+                callback(n);
+    }
+
+    select(selector = '', self = false) {
+        let newRawNodes = [];
+
+        if (selector.indexOf('/') >= 0) {
+            const selectors = selector.split('/');
+            let res = this;
+            for (const sel of selectors) {
+                res = res.select(sel, self);
+                self = false;
+            }
+
+            newRawNodes = res.rawNodes;
+        } else {
+            const selectorObj = this.makeSelectorObj(selector);
+
+            if (self) {
+                this.rawSelect(this.rawNodes, selectorObj, (node) => {
+                    newRawNodes.push(node);
+                })
+            } else {
+                for (const n of this.rawNodes) {
+                    if (n && n[0] === NODE && Array.isArray(n[3])) {
+                        this.rawSelect(n[3], selectorObj, (node) => {
+                            newRawNodes.push(node);
+                        })
+                    }
+                }
+            }
+        }
+
+        return new XmlParser(newRawNodes);
+    }
+
+    s(selector, self) {
+        return this.select(selector, self);
+    }
+
+    selectFirst(selector, self) {
+        const result = this.select(selector, self);
+        const node = (result.count ? result.rawNodes[0] : null);
+        return this.toObject(node);
+    }
+
+    sf(selector, self) {
+        return this.selectFirst(selector, self);
+    }
+
+    toJson(format = false) {
+        if (format)
+            return JSON.stringify(this.rawNodes, null, 2);
+        else
+            return JSON.stringify(this.rawNodes);
+    }
+
+    fromJson(jsonString) {
+        const parsed = JSON.parse(jsonString);
+        if (!Array.isArray(parsed))
+            throw new Error('JSON parse error: root element must be array');
+
+        this.rawNodes = parsed;
+    }
+
+    toString() {
+    }
+
+    fromSrtring() {
+    }
+}
+
+module.exports = XmlParser;

+ 366 - 0
server/core/xml/sax.js

@@ -0,0 +1,366 @@
+function parseSync(xstr, options) {
+    const dummy = () => {};
+    let {onStartNode: _onStartNode = dummy,
+        onEndNode: _onEndNode = dummy,
+        onTextNode: _onTextNode = dummy,
+        onCdata: _onCdata = dummy,
+        onComment: _onComment = dummy,
+        onProgress: _onProgress = dummy,
+        innerCut = new Set(),
+        lowerCase = true,
+    } = options;
+
+    let i = 0;
+    const len = xstr.length;
+    const progStep = len/20;
+    let nextProg = 0;
+
+    let cutCounter = 0;
+    let cutTag = '';
+    let inCdata;
+    let inComment;
+    let leftData = 0;
+    while (i < len) {
+        inCdata = false;
+        inComment = false;
+        let singleTag = false;
+
+        let left = xstr.indexOf('<', i);
+        if (left < 0)
+            break;
+        leftData = left;
+
+        if (left < len - 2 && xstr[left + 1] == '!') {
+            if (xstr[left + 2] == '-') {
+                const leftComment = xstr.indexOf('<!--', left);
+                if (leftComment == left) {
+                    inComment = true;
+                    leftData = left + 3;
+                }
+            }
+
+            if (!inComment && xstr[left + 2] == '[') {
+                const leftCdata = xstr.indexOf('<![CDATA[', left);
+                if (leftCdata == left) {
+                    inCdata = true;
+                    leftData = left + 8;
+                }
+            }
+        }
+
+        if (left != i) {
+            const text = xstr.substr(i, left - i);
+            _onTextNode(text, cutCounter, cutTag);
+        }
+
+        let right = null;
+        let rightData = null;
+        if (inCdata) {
+            rightData = xstr.indexOf(']]>', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData + 2;
+        } else if (inComment) {
+            rightData = xstr.indexOf('-->', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData + 2;
+        } else {
+            rightData = xstr.indexOf('>', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData;
+            if (xstr[right - 1] === '/') {
+                singleTag = true;
+                rightData--;
+            }
+        }
+
+        let tagData = xstr.substr(leftData + 1, rightData - leftData - 1);
+
+        if (inCdata) {
+            _onCdata(tagData, cutCounter, cutTag);
+        } else if (inComment) {
+            _onComment(tagData, cutCounter, cutTag);
+        } else {
+            let tag = '';
+            let tail = '';
+            const firstSpace = tagData.indexOf(' ');
+            if (firstSpace >= 0) {
+                tail = tagData.substr(firstSpace);
+                tag = tagData.substr(0, firstSpace);
+            } else {
+                tag = tagData;
+            }
+            if (lowerCase)
+                tag = tag.toLowerCase();
+
+            if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
+                if (!cutCounter)
+                    cutTag = tag;
+                cutCounter++;
+            }
+
+            let endTag = (singleTag ? tag : '');
+            if (tag === '' || tag[0] !== '/') {
+                _onStartNode(tag, tail, singleTag, cutCounter, cutTag);
+            } else {
+                endTag = tag.substr(1);
+            }
+
+            if (endTag)
+                _onEndNode(endTag, tail, singleTag, cutCounter, cutTag);
+
+            if (cutTag === endTag) {
+                cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
+                if (!cutCounter)
+                    cutTag = '';
+            }
+        }
+
+        if (right >= nextProg) {
+            _onProgress(Math.round(right/(len + 1)*100));
+            nextProg += progStep;
+        }
+        i = right + 1;
+    }
+
+    if (i < len) {
+        if (inCdata) {
+            _onCdata(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
+        } else if (inComment) {
+            _onComment(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
+        } else {
+            _onTextNode(xstr.substr(i, len - i), cutCounter, cutTag);
+        }
+    }
+
+    _onProgress(100);
+}
+
+//асинхронная копия parseSync
+//делается заменой "_on" => "await _on" после while
+async function parse(xstr, options) {
+    const dummy = () => {};
+    let {onStartNode: _onStartNode = dummy,
+        onEndNode: _onEndNode = dummy,
+        onTextNode: _onTextNode = dummy,
+        onCdata: _onCdata = dummy,
+        onComment: _onComment = dummy,
+        onProgress: _onProgress = dummy,
+        innerCut = new Set(),
+        lowerCase = true,
+    } = options;
+
+    let i = 0;
+    const len = xstr.length;
+    const progStep = len/20;
+    let nextProg = 0;
+
+    let cutCounter = 0;
+    let cutTag = '';
+    let inCdata;
+    let inComment;
+    let leftData = 0;
+    while (i < len) {
+        inCdata = false;
+        inComment = false;
+        let singleTag = false;
+
+        let left = xstr.indexOf('<', i);
+        if (left < 0)
+            break;
+        leftData = left;
+
+        if (left < len - 2 && xstr[left + 1] == '!') {
+            if (xstr[left + 2] == '-') {
+                const leftComment = xstr.indexOf('<!--', left);
+                if (leftComment == left) {
+                    inComment = true;
+                    leftData = left + 3;
+                }
+            }
+
+            if (!inComment && xstr[left + 2] == '[') {
+                const leftCdata = xstr.indexOf('<![CDATA[', left);
+                if (leftCdata == left) {
+                    inCdata = true;
+                    leftData = left + 8;
+                }
+            }
+        }
+
+        if (left != i) {
+            const text = xstr.substr(i, left - i);
+            await _onTextNode(text, cutCounter, cutTag);
+        }
+
+        let right = null;
+        let rightData = null;
+        if (inCdata) {
+            rightData = xstr.indexOf(']]>', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData + 2;
+        } else if (inComment) {
+            rightData = xstr.indexOf('-->', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData + 2;
+        } else {
+            rightData = xstr.indexOf('>', leftData + 1);
+            if (rightData < 0)
+                break;
+            right = rightData;
+            if (xstr[right - 1] === '/') {
+                singleTag = true;
+                rightData--;
+            }
+        }
+
+        let tagData = xstr.substr(leftData + 1, rightData - leftData - 1);
+
+        if (inCdata) {
+            await _onCdata(tagData, cutCounter, cutTag);
+        } else if (inComment) {
+            await _onComment(tagData, cutCounter, cutTag);
+        } else {
+            let tag = '';
+            let tail = '';
+            const firstSpace = tagData.indexOf(' ');
+            if (firstSpace >= 0) {
+                tail = tagData.substr(firstSpace);
+                tag = tagData.substr(0, firstSpace);
+            } else {
+                tag = tagData;
+            }
+            if (lowerCase)
+                tag = tag.toLowerCase();
+
+            if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
+                if (!cutCounter)
+                    cutTag = tag;
+                cutCounter++;
+            }
+
+            let endTag = (singleTag ? tag : '');
+            if (tag === '' || tag[0] !== '/') {
+                await _onStartNode(tag, tail, singleTag, cutCounter, cutTag);
+            } else {
+                endTag = tag.substr(1);
+            }
+
+            if (endTag)
+                await _onEndNode(endTag, tail, singleTag, cutCounter, cutTag);
+
+            if (cutTag === endTag) {
+                cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
+                if (!cutCounter)
+                    cutTag = '';
+            }
+        }
+
+        if (right >= nextProg) {
+            await _onProgress(Math.round(right/(len + 1)*100));
+            nextProg += progStep;
+        }
+        i = right + 1;
+    }
+
+    if (i < len) {
+        if (inCdata) {
+            await _onCdata(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
+        } else if (inComment) {
+            await _onComment(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
+        } else {
+            await _onTextNode(xstr.substr(i, len - i), cutCounter, cutTag);
+        }
+    }
+
+    await _onProgress(100);
+}
+
+function getAttrsSync(tail, lowerCase = true) {
+    let result = {};
+    let name = '';    
+    let value = '';
+    let vOpen = '';
+    let inName = false;
+    let inValue = false;
+    let waitValue = false;
+    let waitEq = false;
+
+    const pushResult = () => {
+        if (lowerCase)
+            name = name.toLowerCase();
+        if (name != '') {
+            const fn = name;
+            let ns = '';
+            if (fn.indexOf(':') >= 0) {
+                [ns, name] = fn.split(':');
+            }
+
+            result[name] = {value, ns, fn};
+        }
+        name = '';
+        value = '';
+        vOpen = '';
+        inName = false;
+        inValue = false;
+        waitValue = false;
+        waitEq = false;
+    };
+
+    tail = tail.replace(/[\t\n\r]/g, ' ');
+    for (let i = 0; i < tail.length; i++) {
+        const c = tail.charAt(i);
+        if (c == ' ') {
+            if (inValue) {
+                if (vOpen == '"')
+                    value += c;
+                else
+                    pushResult();
+            } else if (inName) {
+                waitEq = true;
+                inName = false;
+            }
+        } else if (!inValue && c == '=') {
+            waitEq = false;
+            waitValue = true;
+            inName = false;
+        } else if (c == '"') {
+            if (inValue) {
+                pushResult();
+            } else if (waitValue) {
+                inValue = true;
+                vOpen = '"';
+            }
+        } else if (inValue) {
+            value += c;
+        } else if (inName) {
+            name += c;
+        } else if (waitEq) {
+            pushResult();
+            inName = true;
+            name = c;
+        } else if (waitValue) {
+            waitValue = false;
+            inValue = true;
+            vOpen = ' ';
+            value = c;
+        } else {
+            inName = true;
+            name = c;
+        }
+    }
+    if (name != '')
+        pushResult();
+
+    return result;
+}
+
+module.exports = {
+    parseSync,
+    getAttrsSync,
+    parse
+}

+ 130 - 0
server/core/xml/textUtils.js

@@ -0,0 +1,130 @@
+const chardet = require('chardet');
+
+function getEncoding(buf) {
+    let selected = getEncodingLite(buf);
+
+    if (selected == 'ISO-8859-5' && buf.length > 10) {
+        const charsetAll = chardet.analyse(buf.slice(0, 20000));
+        for (const charset of charsetAll) {
+            if (charset.name.indexOf('ISO-8859') < 0) {
+                selected = charset.name;
+                break;
+            }
+        }
+    }
+
+    return selected;
+}
+
+
+function getEncodingLite(buf, returnAll) {
+    const lowerCase = 3;
+    const upperCase = 1;
+
+    const codePage = {
+        'k': 'koi8-r',
+        'w': 'Windows-1251',
+        'd': 'cp866',
+        'i': 'ISO-8859-5',
+        'm': 'maccyrillic',
+        'u': 'utf-8',
+    };
+
+    let charsets = {
+        'k': 0,
+        'w': 0,
+        'd': 0,
+        'i': 0,
+        'm': 0,
+        'u': 0,
+    };
+
+    const len = buf.length;
+    const blockSize = (len > 5*3000 ? 3000 : len);
+    let counter = 0;
+    let i = 0;
+    let totalChecked = 0;
+    while (i < len) {
+        const char = buf[i];
+        const nextChar = (i < len - 1 ? buf[i + 1] : 0);
+        totalChecked++;
+        i++;
+        //non-russian characters
+        if (char < 128 || char > 256)
+            continue;
+        //UTF-8
+        if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190)
+            charsets['u'] += lowerCase;
+        else {
+            //CP866
+            if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
+            if ((char > 127 && char < 160)) charsets['d'] += upperCase;
+
+            //KOI8-R
+            if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
+            if ((char > 222 && char < 256)) charsets['k'] += upperCase;
+
+            //WIN-1251
+            if (char > 223 && char < 256) charsets['w'] += lowerCase;
+            if (char > 191 && char < 224) charsets['w'] += upperCase;
+
+            //MAC
+            if (char > 221 && char < 255) charsets['m'] += lowerCase;
+            if (char > 127 && char < 160) charsets['m'] += upperCase;
+
+            //ISO-8859-5
+            if (char > 207 && char < 240) charsets['i'] += lowerCase;
+            if (char > 175 && char < 208) charsets['i'] += upperCase;
+        }
+
+        counter++;
+
+        if (counter > blockSize) {
+            counter = 0;
+            i += Math.round(len/2 - 2*blockSize);
+        }
+    }
+
+    let sorted = Object.keys(charsets).map(function(key) {
+        return { codePage: codePage[key], c: charsets[key], totalChecked };
+    });
+
+    sorted.sort((a, b) => b.c - a.c);
+
+    if (returnAll)
+        return sorted;
+    else if (sorted[0].c > 0 && sorted[0].c > sorted[0].totalChecked/2)
+        return sorted[0].codePage;
+    else
+        return 'ISO-8859-5';
+}
+
+function checkIfText(buf) {
+    const enc = getEncodingLite(buf, true);
+    if (enc[0].c > enc[0].totalChecked*0.9)
+        return true;
+
+    let spaceCount = 0;
+    let crCount = 0;
+    let lfCount = 0;
+    for (let i = 0; i < buf.length; i++) {
+        if (buf[i] == 32)
+            spaceCount++;
+        if (buf[i] == 13)
+            crCount++;
+        if (buf[i] == 10)
+            lfCount++;
+    }
+
+    const spaceFreq = spaceCount/(buf.length + 1);
+    const crFreq = crCount/(buf.length + 1);
+    const lfFreq = lfCount/(buf.length + 1);
+
+    return (buf.length < 1000 || spaceFreq > 0.1 || crFreq > 0.03 || lfFreq > 0.03);
+}
+
+module.exports = {
+    getEncoding,
+    getEncodingLite,
+    checkIfText,
+}