|
@@ -0,0 +1,563 @@
|
|
|
+/**
|
|
|
+ * HTML2Markdown - An HTML to Markdown converter.
|
|
|
+ *
|
|
|
+ * This implementation uses HTML DOM parsing for conversion. Parsing code was
|
|
|
+ * abstracted out in a parsing function which should be easy to remove in favor
|
|
|
+ * of other parsing libraries.
|
|
|
+ *
|
|
|
+ * Converted MarkDown was tested with ShowDown library for HTML rendering. And
|
|
|
+ * it tries to create MarkDown that does not confuse ShowDown when certain
|
|
|
+ * combination of HTML tags come together.
|
|
|
+ *
|
|
|
+ * @author Himanshu Gilani
|
|
|
+ * @author Kates Gasis (original author)
|
|
|
+ *
|
|
|
+ */
|
|
|
+
|
|
|
+/**
|
|
|
+ * HTML2Markdown
|
|
|
+ * @param html - html string to convert
|
|
|
+ * @return converted markdown text
|
|
|
+ */
|
|
|
+function HTML2Markdown(html, opts) {
|
|
|
+ var logging = false;
|
|
|
+ var nodeList = [];
|
|
|
+ var listTagStack = [];
|
|
|
+ var linkAttrStack = [];
|
|
|
+ var blockquoteStack = [];
|
|
|
+ var preStack = [];
|
|
|
+
|
|
|
+ var links = [];
|
|
|
+
|
|
|
+ opts = opts || {};
|
|
|
+ var inlineStyle = opts['inlineStyle'] || false;
|
|
|
+
|
|
|
+ var markdownTags = {
|
|
|
+ "hr": "- - -\n\n",
|
|
|
+ "br": " \n",
|
|
|
+ "title": "# ",
|
|
|
+ "h1": "# ",
|
|
|
+ "h2": "## ",
|
|
|
+ "h3": "### ",
|
|
|
+ "h4": "#### ",
|
|
|
+ "h5": "##### ",
|
|
|
+ "h6": "###### ",
|
|
|
+ "b": "**",
|
|
|
+ "strong": "**",
|
|
|
+ "i": "_",
|
|
|
+ "em": "_",
|
|
|
+ "dfn": "_",
|
|
|
+ "var": "_",
|
|
|
+ "cite": "_",
|
|
|
+ "span": " ",
|
|
|
+ "ul": "* ",
|
|
|
+ "ol": "1. ",
|
|
|
+ "dl": "- ",
|
|
|
+ "blockquote": "> "
|
|
|
+ };
|
|
|
+
|
|
|
+ function getListMarkdownTag() {
|
|
|
+ var listItem = "";
|
|
|
+ if(listTagStack) {
|
|
|
+ for ( var i = 0; i < listTagStack.length - 1; i++) {
|
|
|
+ listItem += " ";
|
|
|
+ }
|
|
|
+ }
|
|
|
+ listItem += peek(listTagStack);
|
|
|
+ return listItem;
|
|
|
+ }
|
|
|
+
|
|
|
+ function convertAttrs(attrs) {
|
|
|
+ var attributes = {};
|
|
|
+ for(var k in attrs) {
|
|
|
+ var attr = attrs[k];
|
|
|
+ attributes[attr.name] = attr;
|
|
|
+ }
|
|
|
+ return attributes;
|
|
|
+ }
|
|
|
+
|
|
|
+ function peek(list) {
|
|
|
+ if(list && list.length > 0) {
|
|
|
+ return list.slice(-1)[0];
|
|
|
+ }
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+
|
|
|
+ function peekTillNotEmpty(list) {
|
|
|
+ if(!list) {
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+
|
|
|
+ for(var i = list.length - 1; i>=0; i-- ){
|
|
|
+ if(list[i] != "") {
|
|
|
+ return list[i];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+
|
|
|
+ function removeIfEmptyTag(start) {
|
|
|
+ var cleaned = false;
|
|
|
+ if(start == peekTillNotEmpty(nodeList)) {
|
|
|
+ while(peek(nodeList) != start) {
|
|
|
+ nodeList.pop();
|
|
|
+ }
|
|
|
+ nodeList.pop();
|
|
|
+ cleaned = true;
|
|
|
+ }
|
|
|
+ return cleaned;
|
|
|
+ }
|
|
|
+
|
|
|
+ function sliceText(start) {
|
|
|
+ var text = [];
|
|
|
+ while(nodeList.length > 0 && peek(nodeList) != start) {
|
|
|
+ var t = nodeList.pop();
|
|
|
+ text.unshift(t);
|
|
|
+ }
|
|
|
+ return text.join("");
|
|
|
+ }
|
|
|
+
|
|
|
+ function block(isEndBlock) {
|
|
|
+ var lastItem = nodeList.pop();
|
|
|
+ if (!lastItem) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ if(!isEndBlock) {
|
|
|
+ var block;
|
|
|
+ if(/\s*\n\n\s*$/.test(lastItem)) {
|
|
|
+ lastItem = lastItem.replace(/\s*\n\n\s*$/, "\n\n");
|
|
|
+ block = "";
|
|
|
+ } else if(/\s*\n\s*$/.test(lastItem)) {
|
|
|
+ lastItem = lastItem.replace(/\s*\n\s*$/, "\n");
|
|
|
+ block = "\n";
|
|
|
+ } else if(/\s+$/.test(lastItem)) {
|
|
|
+ block = "\n\n";
|
|
|
+ } else {
|
|
|
+ block = "\n\n";
|
|
|
+ }
|
|
|
+
|
|
|
+ nodeList.push(lastItem);
|
|
|
+ nodeList.push(block);
|
|
|
+ } else {
|
|
|
+ nodeList.push(lastItem);
|
|
|
+ if(!lastItem.endsWith("\n")) {
|
|
|
+ nodeList.push("\n\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ function listBlock() {
|
|
|
+ if(nodeList.length > 0) {
|
|
|
+ var li = peek(nodeList);
|
|
|
+
|
|
|
+ if(!li.endsWith("\n")) {
|
|
|
+ nodeList.push("\n");
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ nodeList.push("\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ try {
|
|
|
+ var dom;
|
|
|
+ if(html) {
|
|
|
+ var e = document.createElement('div');
|
|
|
+ e.innerHTML = html;
|
|
|
+ dom = e;
|
|
|
+ } else {
|
|
|
+ dom = window.document.body;
|
|
|
+ }
|
|
|
+
|
|
|
+ HTMLParser(dom,{
|
|
|
+ start: function(tag, attrs, unary) {
|
|
|
+ tag = tag.toLowerCase();
|
|
|
+ if(logging) {
|
|
|
+ console.log("start: "+ tag);
|
|
|
+ }
|
|
|
+
|
|
|
+ if(unary && (tag != "br" && tag != "hr" && tag != "img")) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ switch (tag) {
|
|
|
+ case "br":
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ break;
|
|
|
+ case "hr":
|
|
|
+ block();
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ break;
|
|
|
+ case "title":
|
|
|
+ case "h1":
|
|
|
+ case "h2":
|
|
|
+ case "h3":
|
|
|
+ case "h4":
|
|
|
+ case "h5":
|
|
|
+ case "h6":
|
|
|
+ block();
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ break;
|
|
|
+ case "b":
|
|
|
+ case "strong":
|
|
|
+ case "i":
|
|
|
+ case "em":
|
|
|
+ case "dfn":
|
|
|
+ case "var":
|
|
|
+ case "cite":
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ break;
|
|
|
+ case "span":
|
|
|
+ if(! /\s+$/.test(peek(nodeList))) {
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "p":
|
|
|
+ case "div":
|
|
|
+ case "td":
|
|
|
+ block();
|
|
|
+ break;
|
|
|
+ case "ul":
|
|
|
+ case "ol":
|
|
|
+ case "dl":
|
|
|
+ listTagStack.push(markdownTags[tag]);
|
|
|
+ // lists are block elements
|
|
|
+ if(listTagStack.length > 1) {
|
|
|
+ listBlock();
|
|
|
+ } else {
|
|
|
+ block();
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "li":
|
|
|
+ case "dt":
|
|
|
+ var li = getListMarkdownTag();
|
|
|
+ nodeList.push(li);
|
|
|
+ break;
|
|
|
+ case "a":
|
|
|
+ var attribs = convertAttrs(attrs);
|
|
|
+ linkAttrStack.push(attribs);
|
|
|
+ nodeList.push("[");
|
|
|
+ break;
|
|
|
+ case "img":
|
|
|
+ var attribs = convertAttrs(attrs);
|
|
|
+ var alt, title, url;
|
|
|
+
|
|
|
+ attribs["src"] ? url = getNormalizedUrl(attribs["src"].value) : url = "";
|
|
|
+ if(!url) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ attribs['alt'] ? alt = attribs['alt'].value.trim() : alt = "";
|
|
|
+ attribs['title'] ? title = attribs['title'].value.trim() : title = "";
|
|
|
+
|
|
|
+ // if parent of image tag is nested in anchor tag use inline style
|
|
|
+ if(!inlineStyle && !peekTillNotEmpty(nodeList).startsWith("[")) {
|
|
|
+ var l = links.indexOf(url);
|
|
|
+ if(l == -1) {
|
|
|
+ links.push(url);
|
|
|
+ l=links.length-1;
|
|
|
+ }
|
|
|
+
|
|
|
+ block();
|
|
|
+ nodeList.push("![");
|
|
|
+ if(alt!= "") {
|
|
|
+ nodeList.push(alt);
|
|
|
+ } else if (title != null) {
|
|
|
+ nodeList.push(title);
|
|
|
+ }
|
|
|
+
|
|
|
+ nodeList.push("][" + l + "]");
|
|
|
+ block();
|
|
|
+ } else {
|
|
|
+ //if image is not a link image then treat images as block elements
|
|
|
+ if(!peekTillNotEmpty(nodeList).startsWith("[")) {
|
|
|
+ block();
|
|
|
+ }
|
|
|
+
|
|
|
+ nodeList.push(" + ")");
|
|
|
+
|
|
|
+ if(!peekTillNotEmpty(nodeList).startsWith("[")) {
|
|
|
+ block(true);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "blockquote":
|
|
|
+ block();
|
|
|
+ blockquoteStack.push(markdownTags[tag]);
|
|
|
+ nodeList.push(blockquoteStack.join(""));
|
|
|
+ break;
|
|
|
+ case "pre":
|
|
|
+ case "code":
|
|
|
+ block();
|
|
|
+ preStack.push(true);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ },
|
|
|
+ chars: function(text) {
|
|
|
+ if(preStack.length > 0) {
|
|
|
+ text = " " + text.replace(/\n/g,"\n ");
|
|
|
+ } else if(text.trim() != "") {
|
|
|
+ text = text.replace(/\s+/g, " ");
|
|
|
+
|
|
|
+ var prevText = peekTillNotEmpty(nodeList);
|
|
|
+ if(/\s+$/.test(prevText)) {
|
|
|
+ text = text.replace(/^\s+/g, "");
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ nodeList.push("");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ if(logging) {
|
|
|
+ console.log("text: "+ text);
|
|
|
+ }
|
|
|
+
|
|
|
+ nodeList.push(text);
|
|
|
+ },
|
|
|
+ end: function(tag) {
|
|
|
+ tag = tag.toLowerCase();
|
|
|
+ if(logging) {
|
|
|
+ console.log("end: "+ tag);
|
|
|
+ }
|
|
|
+ switch (tag) {
|
|
|
+ case "title":
|
|
|
+ case "h1":
|
|
|
+ case "h2":
|
|
|
+ case "h3":
|
|
|
+ case "h4":
|
|
|
+ case "h5":
|
|
|
+ case "h6":
|
|
|
+ if(!removeIfEmptyTag(markdownTags[tag])) {
|
|
|
+ block(true);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "p":
|
|
|
+ case "div":
|
|
|
+ case "td":
|
|
|
+ while(nodeList.length > 0 && peek(nodeList).trim() == "") {
|
|
|
+ nodeList.pop();
|
|
|
+ }
|
|
|
+ block(true);
|
|
|
+ break;
|
|
|
+ case "b":
|
|
|
+ case "strong":
|
|
|
+ case "i":
|
|
|
+ case "em":
|
|
|
+ case "dfn":
|
|
|
+ case "var":
|
|
|
+ case "cite":
|
|
|
+ if(!removeIfEmptyTag(markdownTags[tag])) {
|
|
|
+ nodeList.push(sliceText(markdownTags[tag]).trim());
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "a":
|
|
|
+ var text = sliceText("[");
|
|
|
+ text = text.replace(/\s+/g, " ");
|
|
|
+ text = text.trim();
|
|
|
+
|
|
|
+ if(text == "") {
|
|
|
+ nodeList.pop();
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ var attrs = linkAttrStack.pop();
|
|
|
+ var url;
|
|
|
+ attrs["href"] && attrs["href"].value != "" ? url = getNormalizedUrl(attrs["href"].value) : url = "";
|
|
|
+
|
|
|
+ if(url == "") {
|
|
|
+ nodeList.pop();
|
|
|
+ nodeList.push(text);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ nodeList.push(text);
|
|
|
+
|
|
|
+ if(!inlineStyle && !peek(nodeList).startsWith("!")){
|
|
|
+ var l = links.indexOf(url);
|
|
|
+ if(l == -1) {
|
|
|
+ links.push(url);
|
|
|
+ l=links.length-1;
|
|
|
+ }
|
|
|
+ nodeList.push("][" + l + "]");
|
|
|
+ } else {
|
|
|
+ if(peek(nodeList).startsWith("!")){
|
|
|
+ var text = nodeList.pop();
|
|
|
+ text = nodeList.pop() + text;
|
|
|
+ block();
|
|
|
+ nodeList.push(text);
|
|
|
+ }
|
|
|
+
|
|
|
+ var title = attrs["title"];
|
|
|
+ nodeList.push("](" + url + (title ? " \"" + title.value.trim().replace(/\s+/g, " ") + "\"" : "") + ")");
|
|
|
+
|
|
|
+ if(peek(nodeList).startsWith("!")){
|
|
|
+ block(true);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "ul":
|
|
|
+ case "ol":
|
|
|
+ case "dl":
|
|
|
+ listBlock();
|
|
|
+ listTagStack.pop();
|
|
|
+ break;
|
|
|
+ case "li":
|
|
|
+ case "dt":
|
|
|
+ var li = getListMarkdownTag();
|
|
|
+ if(!removeIfEmptyTag(li)) {
|
|
|
+ var text = sliceText(li).trim();
|
|
|
+
|
|
|
+ if(text.startsWith("[![")) {
|
|
|
+ nodeList.pop();
|
|
|
+ block();
|
|
|
+ nodeList.push(text);
|
|
|
+ block(true);
|
|
|
+ } else {
|
|
|
+ nodeList.push(text);
|
|
|
+ listBlock();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "blockquote":
|
|
|
+ blockquoteStack.pop();
|
|
|
+ break;
|
|
|
+ case "pre":
|
|
|
+ case "code":
|
|
|
+ block(true);
|
|
|
+ preStack.pop();
|
|
|
+ break;
|
|
|
+ case "span":
|
|
|
+ if(peek(nodeList).trim() == "") {
|
|
|
+ nodeList.pop();
|
|
|
+ if(peek(nodeList) == " ") {
|
|
|
+ nodeList.pop();
|
|
|
+ } else {
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ var text = nodeList.pop();
|
|
|
+ nodeList.push(text.trim());
|
|
|
+ nodeList.push(markdownTags[tag]);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case "br":
|
|
|
+ case "hr":
|
|
|
+ case "img":
|
|
|
+ case "table":
|
|
|
+ case "tr":
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }, {"nodesToIgnore": ["script", "noscript", "object", "iframe", "frame", "head", "style", "label"]});
|
|
|
+
|
|
|
+ if(!inlineStyle) {
|
|
|
+ for ( var i = 0; i < links.length; i++) {
|
|
|
+ if(i == 0) {
|
|
|
+ var lastItem = nodeList.pop();
|
|
|
+ nodeList.push(lastItem.replace(/\s+$/g, ""));
|
|
|
+ nodeList.push("\n\n[" + i + "]: " + links[i]);
|
|
|
+ } else {
|
|
|
+ nodeList.push("\n[" + i + "]: " + links[i]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch(e) {
|
|
|
+ console.log(e.stack);
|
|
|
+ console.trace();
|
|
|
+ }
|
|
|
+
|
|
|
+ return nodeList.join("");
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+function getNormalizedUrl(s) {
|
|
|
+ var urlBase = location.href;
|
|
|
+ var urlDir = urlBase.replace(/\/[^\/]*$/, '/');
|
|
|
+ var urlPage = urlBase.replace(/#[^\/#]*$/, '');
|
|
|
+
|
|
|
+ var url;
|
|
|
+ if(/^[a-zA-Z]([a-zA-Z0-9 -.])*:/.test(s)) {
|
|
|
+ // already absolute url
|
|
|
+ url = s;
|
|
|
+ } else if(/^\x2f/.test(s)) {// %2f --> /
|
|
|
+ // url is relative to site
|
|
|
+ location.protocol != "" ? url = location.protocol + "//" : url ="";
|
|
|
+ url+= location.hostname;
|
|
|
+ if(location.port != "80") {
|
|
|
+ url+=":"+location.port;
|
|
|
+ }
|
|
|
+ url += s;
|
|
|
+ } else if(/^#/.test(s)) {
|
|
|
+ // url is relative to page
|
|
|
+ url = urlPage + s;
|
|
|
+ } else {
|
|
|
+ url = urlDir + s;
|
|
|
+ }
|
|
|
+ return encodeURI(url);
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof exports != "undefined") {
|
|
|
+ exports.HTML2Markdown = HTML2Markdown;
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof exports != "undefined") {
|
|
|
+ exports.HTML2MarkDown = HTML2MarkDown;
|
|
|
+}
|
|
|
+
|
|
|
+/* add the useful functions to String object*/
|
|
|
+if (typeof String.prototype.trim != 'function') {
|
|
|
+ String.prototype.trim = function() {
|
|
|
+ return replace(/^\s+|\s+$/g,"");
|
|
|
+ };
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof String.prototype.isNotEmpty != 'function') {
|
|
|
+ String.prototype.isNotEmpty = function() {
|
|
|
+ if (/\S/.test(this)) {
|
|
|
+ return true;
|
|
|
+ } else {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ };
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof String.prototype.replaceAll != 'function') {
|
|
|
+ String.prototype.replaceAll = function(stringToFind,stringToReplace){
|
|
|
+ var temp = this;
|
|
|
+ var index = temp.indexOf(stringToFind);
|
|
|
+ while(index != -1){
|
|
|
+ temp = temp.replace(stringToFind,stringToReplace);
|
|
|
+ index = temp.indexOf(stringToFind);
|
|
|
+ }
|
|
|
+ return temp;
|
|
|
+ };
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof String.prototype.startsWith != 'function') {
|
|
|
+ String.prototype.startsWith = function(str) {
|
|
|
+ return this.indexOf(str) == 0;
|
|
|
+ };
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof String.prototype.endsWith != 'function') {
|
|
|
+ String.prototype.endsWith = function(suffix) {
|
|
|
+ return this.match(suffix+"$") == suffix;
|
|
|
+ };
|
|
|
+}
|
|
|
+
|
|
|
+if (typeof Array.prototype.indexOf != 'function') {
|
|
|
+ Array.prototype.indexOf = function(obj, fromIndex) {
|
|
|
+ if (fromIndex == null) {
|
|
|
+ fromIndex = 0;
|
|
|
+ } else if (fromIndex < 0) {
|
|
|
+ fromIndex = Math.max(0, this.length + fromIndex);
|
|
|
+ }
|
|
|
+ for ( var i = fromIndex, j = this.length; i < j; i++) {
|
|
|
+ if (this[i] === obj)
|
|
|
+ return i;
|
|
|
+ }
|
|
|
+ return -1;
|
|
|
+ };
|
|
|
+}
|