Selaa lähdekoodia

Add HTMLParser

Chris Watson 5 vuotta sitten
vanhempi
commit
06500063d5

+ 3 - 3
gramjs/Utils.js

@@ -1,7 +1,7 @@
 const path = require('path')
 const mime = require('mime-types')
 const struct = require('python-struct')
-const { MarkdownParser, HTMLParser } = require('./extensions')
+const { markdown, html } = require('./extensions')
 const { types } = require('./tl')
 
 const USERNAME_RE = new RegExp('@|(?:https?:\\/\\/)?(?:www\\.)?' +
@@ -892,10 +892,10 @@ function sanitizeParseMode(mode) {
         switch (mode.toLowerCase()) {
         case 'md':
         case 'markdown':
-            return MarkdownParser
+            return markdown
         case 'htm':
         case 'html':
-            return HTMLParser
+            return html
         default:
             throw new Error(`Unknown parse mode ${mode}`)
         }

+ 269 - 0
gramjs/extensions/HTML.js

@@ -0,0 +1,269 @@
+/* eslint-disable no-extend-native */
+/* eslint-disable no-case-declarations, no-fallthrough */
+const Scanner = require('./Scanner')
+const {
+    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
+    MessageEntityPre, MessageEntityEmail, MessageEntityTextUrl,
+    MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote,
+} = require('../tl/types')
+
+/**
+ * Converts a small HTML subset into plain text plus message entities, and
+ * renders text plus entities back into HTML (see the static `unparse`).
+ *
+ * Recognised tags: b/strong, i/em, u, s/del, blockquote, code, pre and a.
+ * Any other tag is removed from the text without producing an entity.
+ */
+class HTMLParser extends Scanner {
+    /**
+     * @param {string} str HTML source, handed to the Scanner base class.
+     */
+    constructor(str) {
+        super(str)
+        // Text accumulated so far, with all tags removed.
+        this.text = ''
+        // Finished entities, in the order their closing tags were seen.
+        this.entities = []
+        // Entities whose tag is still open, keyed by tag name. Created without
+        // a prototype so that tag names such as `constructor` or `toString`
+        // can never resolve to inherited Object members.
+        this._buildingEntities = Object.create(null)
+        this._openTags = []
+        this._openTagsMeta = []
+    }
+
+    /**
+     * Walks the input once, routing tags to handleStartTag/handleEndTag and
+     * every other character to handleData.
+     *
+     * @return {Array} `[text, entities]`: the text with the tags removed and
+     *     the entities collected from the tags that were closed.
+     */
+    parse() {
+        while (!this.eof()) {
+            if (this.peek(1) !== '<') {
+                this.handleData(this.chr)
+                this.pos += 1
+                continue
+            }
+
+            // Consume the opening bracket
+            this.consume(1)
+
+            const isClosingTag = this.peek(1) === '/'
+            if (isClosingTag) {
+                this.consume(1)
+            }
+
+            const raw = this.scanUntil('>').trim()
+
+            // Consume the closing bracket
+            this.consume(1)
+
+            if (isClosingTag) {
+                this.handleEndTag(raw)
+            } else {
+                const [tag, attrs] = HTMLParser._splitTag(raw)
+                this.handleStartTag(tag, attrs)
+            }
+        }
+
+        return [this.text, this.entities]
+    }
+
+    /**
+     * Splits the inside of an opening tag into its name and attributes.
+     *
+     * Values may be double quoted, single quoted or bare; quotes are removed.
+     * Quoted values may contain whitespace and `=` (for example URLs with a
+     * query string). Attributes without a value are stored as `true`.
+     *
+     * @param {string} raw Trimmed text found between `<` and `>`.
+     * @return {Array} `[tag, attrs]` where `attrs` maps attribute names to
+     *     their string value or `true`.
+     */
+    static _splitTag(raw) {
+        const nameMatch = raw.match(/^\S+/)
+        const tag = nameMatch ? nameMatch[0] : ''
+        const attrs = {}
+        const attrRe = /([^\s=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g
+        const rest = raw.slice(tag.length)
+
+        let match
+        while ((match = attrRe.exec(rest)) !== null) {
+            const [, name, doubleQuoted, singleQuoted, bare] = match
+            const value = [doubleQuoted, singleQuoted, bare]
+                .find((candidate) => candidate !== undefined)
+            attrs[name] = value === undefined ? true : value
+        }
+
+        return [tag, attrs]
+    }
+
+    /**
+     * Tells whether `index` falls between the two halves of a UTF-16
+     * surrogate pair, i.e. cutting `text` there would split one character.
+     *
+     * @param {string} text
+     * @param {number} index
+     * @return {boolean}
+     */
+    static _splitsSurrogatePair(text, index) {
+        if (index <= 0 || index >= text.length) return false
+
+        const before = text.charCodeAt(index - 1)
+        const after = text.charCodeAt(index)
+        return before >= 0xd800 && before <= 0xdbff &&
+               after >= 0xdc00 && after <= 0xdfff
+    }
+
+    /**
+     * Renders `text` and its entities as HTML. Entities nested inside another
+     * entity are rendered by recursive calls; `_offset` and `_length` describe
+     * the slice of the original text a recursive call is working on.
+     *
+     * The loop stops at the first entity that starts past the current slice,
+     * so `entities` is expected to be ordered by offset.
+     *
+     * NOTE(review): neither the text nor the attribute values are HTML-escaped
+     * here, so text containing `<` will not survive a parse/unparse round trip.
+     *
+     * @param {string} text
+     * @param {Array} entities
+     * @param {number} _offset Internal: offset of `text` in the full message.
+     * @param {?number} _length Internal: length of the slice being rendered.
+     * @return {string} The HTML, or `text` unchanged when there is nothing
+     *     to render.
+     */
+    static unparse(text, entities, _offset = 0, _length = null) {
+        if (!text || !entities || entities.length === 0) {
+            return text
+        }
+
+        if (!_length) {
+            _length = text.length
+        }
+
+        const html = []
+        let lastOffset = 0
+
+        for (const [i, entity] of entities.entries()) {
+            // `>=` rather than `>`: an entity starting exactly where this slice
+            // ends is a sibling and must not be rendered as an empty child.
+            if (entity.offset >= _offset + _length) {
+                break
+            }
+
+            const relativeOffset = entity.offset - _offset
+            if (relativeOffset > lastOffset) {
+                html.push(text.substring(lastOffset, relativeOffset))
+            } else if (relativeOffset < lastOffset) {
+                // Starts inside text that was already emitted, e.g. nested in
+                // the previous entity, which the recursive call took care of.
+                continue
+            }
+
+            let skipEntity = false
+            let length = entity.length
+
+            // Never end an entity between the halves of a surrogate pair.
+            while (HTMLParser._splitsSurrogatePair(text, relativeOffset + length)) {
+                length += 1
+            }
+
+            // Referenced through the class, not `this`, so the function keeps
+            // working when it is exported and called without a receiver.
+            const entityText = HTMLParser.unparse(
+                text.substring(relativeOffset, relativeOffset + length),
+                entities.slice(i + 1),
+                entity.offset,
+                length,
+            )
+
+            const entityType = entity.constructor.name
+
+            switch (entityType) {
+            case 'MessageEntityBold':
+                html.push(`<strong>${entityText}</strong>`)
+                break
+            case 'MessageEntityItalic':
+                html.push(`<em>${entityText}</em>`)
+                break
+            case 'MessageEntityCode':
+                html.push(`<code>${entityText}</code>`)
+                break
+            case 'MessageEntityUnderline':
+                html.push(`<u>${entityText}</u>`)
+                break
+            case 'MessageEntityStrike':
+                html.push(`<del>${entityText}</del>`)
+                break
+            case 'MessageEntityBlockquote':
+                html.push(`<blockquote>${entityText}</blockquote>`)
+                break
+            case 'MessageEntityPre':
+                if (entity.language) {
+                    // Kept on one line: whitespace inside <pre> is part of
+                    // the content and would end up in the code block.
+                    html.push(
+                        `<pre><code class="language-${entity.language}">` +
+                        `${entityText}</code></pre>`,
+                    )
+                } else {
+                    html.push(`<pre>${entityText}</pre>`)
+                }
+                break
+            case 'MessageEntityEmail':
+                html.push(`<a href="mailto:${entityText}">${entityText}</a>`)
+                break
+            case 'MessageEntityUrl':
+                html.push(`<a href="${entityText}">${entityText}</a>`)
+                break
+            case 'MessageEntityTextUrl':
+                html.push(`<a href="${entity.url}">${entityText}</a>`)
+                break
+            case 'MessageEntityMentionName':
+                html.push(`<a href="tg://user?id=${entity.userId}">${entityText}</a>`)
+                break
+            default:
+                // Unknown entity: its text is emitted later as plain text.
+                skipEntity = true
+            }
+
+            lastOffset = relativeOffset + (skipEntity ? 0 : length)
+        }
+
+        // Do not resume in the middle of a surrogate pair.
+        while (HTMLParser._splitsSurrogatePair(text, lastOffset)) {
+            lastOffset += 1
+        }
+
+        html.push(text.substring(lastOffset))
+        return html.join('')
+    }
+
+    /**
+     * Records an opening tag and, for recognised tags, starts building the
+     * matching entity at the current end of `this.text`.
+     *
+     * @param {string} tag Tag name as written in the source.
+     * @param {Object} attrs Attribute name to value (string, or `true` for
+     *     attributes without a value).
+     */
+    handleStartTag(tag, attrs = {}) {
+        this._openTags.unshift(tag)
+        this._openTagsMeta.unshift(null)
+
+        let EntityType
+        const args = {}
+
+        switch (tag) {
+        case 'b':
+        case 'strong':
+            EntityType = MessageEntityBold
+            break
+        case 'i':
+        case 'em':
+            EntityType = MessageEntityItalic
+            break
+        case 'u':
+            EntityType = MessageEntityUnderline
+            break
+        case 's':
+        case 'del':
+            EntityType = MessageEntityStrike
+            break
+        case 'blockquote':
+            EntityType = MessageEntityBlockquote
+            break
+        case 'code': {
+            // If we're in the middle of a <pre> tag, this <code> tag is
+            // probably intended for syntax highlighting.
+            //
+            // Syntax highlighting is set with
+            //     <code class='language-...'>codeblock</code>
+            // inside <pre> tags
+            const pre = this._buildingEntities['pre']
+            const className = attrs['class']
+            // A class without a `language-` prefix simply yields no language.
+            const languageMatch = typeof className === 'string' ?
+                className.match(/language-(\S+)/) :
+                null
+            const language = languageMatch ? languageMatch[1] : null
+            if (pre && language) {
+                pre.language = language
+            } else {
+                EntityType = MessageEntityCode
+            }
+            break
+        }
+        case 'pre':
+            EntityType = MessageEntityPre
+            args['language'] = ''
+            break
+        case 'a': {
+            const href = attrs['href']
+            // A missing, empty or valueless (`true`) href produces no entity.
+            if (typeof href !== 'string' || !href) return
+
+            let meta = href
+            if (href.indexOf('mailto:') === 0) {
+                EntityType = MessageEntityEmail
+            } else {
+                EntityType = MessageEntityTextUrl
+                args['url'] = href
+                meta = null
+            }
+
+            // Replace the placeholder pushed at the top of this method.
+            this._openTagsMeta[0] = meta
+            break
+        }
+        default:
+            // Do nothing
+        }
+
+        // Only one entity per tag name is built at a time; a nested tag with
+        // the same name is covered by the entity that is already open.
+        if (EntityType && !(tag in this._buildingEntities)) {
+            this._buildingEntities[tag] = new EntityType({
+                offset: this.text.length,
+                // The length will be determined when closing the tag.
+                length: 0,
+                ...args,
+            })
+        }
+    }
+
+    /**
+     * Appends plain text and grows every entity that is currently open.
+     *
+     * @param {string} text
+     */
+    handleData(text) {
+        for (const [, entity] of Object.entries(this._buildingEntities)) {
+            entity.length += text.length
+        }
+
+        this.text += text
+    }
+
+    /**
+     * Records a closing tag and, when an entity was being built for that tag
+     * name, moves it to `this.entities`.
+     *
+     * @param {string} tag
+     */
+    handleEndTag(tag) {
+        this._openTags.shift()
+        this._openTagsMeta.shift()
+
+        const entity = this._buildingEntities[tag]
+        if (entity) {
+            delete this._buildingEntities[tag]
+            this.entities.push(entity)
+        }
+    }
+}
+
+/**
+ * Runs a fresh HTMLParser over `str` and returns the result of its
+ * `parse()` call, i.e. the `[text, entities]` pair.
+ */
+const parse = (str) => new HTMLParser(str).parse()
+
+const unparse = HTMLParser.unparse // Alias of the static HTMLParser.unparse
+
+module.exports = { // The parser class plus the module-level parse/unparse helpers
+    HTMLParser,
+    parse,
+    unparse,
+}

+ 23 - 12
gramjs/extensions/Markdown.js

@@ -19,17 +19,17 @@ const DELIMITERS = {
 class MarkdownParser extends Scanner {
     constructor(str) {
         super(str)
-        this.stripped = ''
+        this.text = ''
         this.entities = []
     }
 
-    get strippedPos() {
-        return this.stripped.length - 1
+    get textPos() {
+        return this.text.length - 1
     }
 
     parse() {
         // Do a little reset
-        this.stripped = ''
+        this.text = ''
         this.entities = []
 
         while (!this.eof()) {
@@ -55,12 +55,12 @@ class MarkdownParser extends Scanner {
             case '[':
                 if (this.parseURL()) break
             default:
-                this.stripped += this.chr
+                this.text += this.chr
                 this.pos += 1
             }
         }
 
-        return [this.stripped, this.entities]
+        return [this.text, this.entities]
     }
 
     static unparse(text, entities) {
@@ -107,8 +107,8 @@ class MarkdownParser extends Scanner {
 
     parseEntity(EntityType, delimiter) {
         // The offset for this entity should be the end of the
-        // stripped string
-        const offset = this.strippedPos
+        // text string
+        const offset = this.textPos
 
         // Consume the delimiter
         this.consume(delimiter.length)
@@ -121,8 +121,8 @@ class MarkdownParser extends Scanner {
             // Consume the delimiter again
             this.consume(delimiter.length)
 
-            // Add the entire content to the stripped content
-            this.stripped += content
+            // Add the entire content to the text
+            this.text += content
 
             // Create and return a new Entity
             const entity = new EntityType({
@@ -141,7 +141,7 @@ class MarkdownParser extends Scanner {
         const [full, txt, url] = match
         const len = full.length
 
-        this.stripped += txt
+        this.text += txt
 
         const entity = new MessageEntityTextUrl({
             offset: this.pos,
@@ -156,4 +156,15 @@ class MarkdownParser extends Scanner {
     }
 }
 
-module.exports = MarkdownParser
+/**
+ * Runs a fresh MarkdownParser over `str` and returns the result of its
+ * `parse()` call, i.e. the `[text, entities]` pair.
+ */
+const parse = (str) => new MarkdownParser(str).parse()
+
+const unparse = MarkdownParser.unparse // Alias of the static MarkdownParser.unparse
+
+module.exports = { // The parser class plus the module-level parse/unparse helpers
+    MarkdownParser,
+    parse,
+    unparse,
+}

+ 2 - 2
gramjs/extensions/Scanner.js

@@ -41,11 +41,11 @@ class Scanner {
     }
 
     bof() {
-        return this.pos === 0
+        return this.pos <= 0
     }
 
     eof() {
-        return this.pos === this.str.length
+        return this.pos >= this.str.length
     }
 }
 

+ 10 - 4
gramjs/extensions/index.js

@@ -6,8 +6,8 @@ const MessagePacker = require('./MessagePacker')
 const AsyncQueue = require('./AsyncQueue')
 const PromisedNetSocket = require('./PromisedNetSockets')
 const Scanner = require('./Scanner')
-const MarkdownParser = require('./Markdown')
-const HTMLParser = null
+const markdown = require('./Markdown')
+const html = require('./HTML')
 
 module.exports = {
     BinaryWriter,
@@ -18,6 +18,12 @@ module.exports = {
     PromisedWebSockets,
     PromisedNetSocket,
     Scanner,
-    MarkdownParser,
-    HTMLParser,
+    markdown: {
+        parse: markdown.parse,
+        unparse: markdown.unparse,
+    },
+    html: {
+        parse: html.parse,
+        unparse: html.unparse,
+    }
 }