234 lines
5.5 KiB
JavaScript
234 lines
5.5 KiB
JavaScript
/*
|
|
* to-markdown - an HTML to Markdown converter
|
|
*
|
|
* Copyright 2011+, Dom Christie
|
|
* Licenced under the MIT licence
|
|
*
|
|
*/
|
|
|
|
'use strict'
|
|
|
|
var toMarkdown
|
|
var converters
|
|
var mdConverters = require('./lib/md-converters')
|
|
var gfmConverters = require('./lib/gfm-converters')
|
|
var HtmlParser = require('./lib/html-parser')
|
|
var collapse = require('collapse-whitespace')
|
|
|
|
/*
|
|
* Utilities
|
|
*/
|
|
|
|
/*
 * Lower-case tag names of the elements this module treats as block-level.
 */
var blocks = [
  'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas',
  'center', 'dd', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure',
  'footer', 'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header',
  'hgroup', 'hr', 'html', 'isindex', 'li', 'main', 'menu', 'nav', 'noframes',
  'noscript', 'ol', 'output', 'p', 'pre', 'section', 'table', 'tbody', 'td',
  'tfoot', 'th', 'thead', 'tr', 'ul'
]

/**
 * Tells whether a node's tag name is in the `blocks` list.
 * The comparison is case-insensitive because the node name is lower-cased
 * before the lookup.
 *
 * @param {Object} node - Any object exposing a string `nodeName`.
 * @returns {boolean} `true` when the node is a block-level element.
 */
function isBlock (node) {
  var tagName = node.nodeName.toLowerCase()
  return blocks.indexOf(tagName) > -1
}
|
|
|
|
/*
 * Lower-case tag names of the elements this module treats as void
 * (elements that never have content).
 */
var voids = [
  'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img',
  'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]

/**
 * Tells whether a node's tag name is in the `voids` list.
 * The node name is lower-cased first, so the check is case-insensitive.
 *
 * @param {Object} node - Any object exposing a string `nodeName`.
 * @returns {boolean} `true` when the node is a void element.
 */
function isVoid (node) {
  var tagName = node.nodeName.toLowerCase()
  return voids.indexOf(tagName) > -1
}
|
|
|
|
/**
 * Parses an HTML string into a document and collapses its whitespace.
 *
 * A fresh `HtmlParser` parses the string as `text/html`; the document's root
 * element is then handed to `collapse` together with `isBlock`, which
 * `collapse` receives as its block-element test.
 *
 * @param {string} string - HTML source to parse.
 * @returns {Object} The parsed document, after whitespace collapsing.
 */
function htmlToDom (string) {
  var parser = new HtmlParser()
  var doc = parser.parseFromString(string, 'text/html')

  collapse(doc.documentElement, isBlock)

  return doc
}
|
|
|
|
/*
|
|
* Flattens DOM tree into single array
|
|
*/
|
|
|
|
/**
 * Flattens the element descendants of `node` into one array, in
 * breadth-first order. `node` itself is not included, and only children
 * whose `nodeType` is 1 (elements) are visited.
 *
 * A single array doubles as the work queue and the result: `cursor` marks
 * the next entry to expand. This avoids calling `shift()` on every step,
 * which re-indexes the whole array each time.
 *
 * @param {Object} node - Root node; must expose `childNodes`.
 * @returns {Array} Descendant elements, shallowest first.
 */
function bfsOrder (node) {
  var queue = [node]
  var cursor
  var children
  var i

  // `queue.length` grows while we iterate; the loop ends once every
  // discovered element has had its own children enqueued.
  for (cursor = 0; cursor < queue.length; cursor++) {
    children = queue[cursor].childNodes
    for (i = 0; i < children.length; i++) {
      if (children[i].nodeType === 1) queue.push(children[i])
    }
  }

  // Drop the root itself: callers only want its descendants.
  return queue.slice(1)
}
|
|
|
|
/*
|
|
* Constructs a Markdown string of replacement text for a given node
|
|
*/
|
|
|
|
/**
 * Builds the converted text content of a node from its direct children.
 *
 * Element children (nodeType 1) contribute their `_replacement` value, text
 * children (nodeType 3) contribute their `data`, and every other kind of
 * child is ignored.
 *
 * @param {Object} node - Node exposing `childNodes`.
 * @returns {string} The concatenated pieces, in document order.
 */
function getContent (node) {
  var pieces = ''
  var child
  var index = 0

  while (index < node.childNodes.length) {
    child = node.childNodes[index]
    index += 1

    switch (child.nodeType) {
      case 1:
        pieces += child._replacement
        break
      case 3:
        pieces += child.data
        break
      default:
        // comments, processing instructions, etc. add nothing
        break
    }
  }

  return pieces
}
|
|
|
|
/*
|
|
* Returns the HTML string of an element with its contents converted
|
|
*/
|
|
|
|
/**
 * Returns the HTML string of an element with its contents replaced by
 * already-converted content.
 *
 * The element is shallow-cloned (no children) and `content` is inserted at
 * the first `><` of the clone's `outerHTML`. If the serialisation contains
 * no `><`, it is returned unchanged.
 *
 * @param {Object} node - Element exposing `cloneNode`.
 * @param {string} content - Converted content to place inside the tags.
 * @returns {string} The element's outer HTML wrapping `content`.
 */
function outer (node, content) {
  var shell = node.cloneNode(false).outerHTML

  // Use a function replacer so `content` is inserted verbatim. A string
  // replacement would interpret `$&`, `$$`, "$`" and `$'` inside `content`
  // as special patterns and corrupt text that contains dollar signs.
  return shell.replace('><', function () {
    return '>' + content + '<'
  })
}
|
|
|
|
/**
 * Decides whether a converter's `filter` applies to a node.
 *
 * - string: matches when it equals the node's lower-cased tag name
 * - array: matches when it contains the node's lower-cased tag name
 * - function: called with `toMarkdown` as `this` and the node as its only
 *   argument; its return value is passed through untouched
 *
 * @param {Object} node - Node exposing `nodeName`.
 * @param {string|Array|Function} filter - The converter's filter.
 * @returns {*} A boolean for string/array filters, otherwise whatever the
 *   filter function returns.
 * @throws {TypeError} When `filter` is not a string, array, or function.
 */
function canConvert (node, filter) {
  var kind = typeof filter
  var tagName

  if (kind === 'function') {
    return filter.call(toMarkdown, node)
  }

  if (kind !== 'string' && !Array.isArray(filter)) {
    throw new TypeError('`filter` needs to be a string, array, or function')
  }

  tagName = node.nodeName.toLowerCase()
  return kind === 'string' ? filter === tagName : filter.indexOf(tagName) !== -1
}
|
|
|
|
/**
 * Tells whether the sibling on one side of `node` already provides a space
 * directly next to it.
 *
 * For `side === 'left'` the previous sibling must end with a space; for any
 * other value the next sibling must start with one. Text siblings are tested
 * through `nodeValue`, non-block element siblings through `textContent`.
 * A missing sibling, a block-level sibling, or any other node type counts as
 * "not flanked".
 *
 * @param {string} side - `'left'` to inspect the previous sibling; anything
 *   else inspects the next sibling.
 * @param {Object} node - Node whose neighbour is inspected.
 * @returns {boolean} Always a real boolean (never `undefined`).
 */
function isFlankedByWhitespace (side, node) {
  var sibling
  var regExp

  if (side === 'left') {
    sibling = node.previousSibling
    regExp = / $/
  } else {
    sibling = node.nextSibling
    regExp = /^ /
  }

  if (!sibling) return false

  if (sibling.nodeType === 3) {
    return regExp.test(sibling.nodeValue)
  }

  if (sibling.nodeType === 1 && !isBlock(sibling)) {
    return regExp.test(sibling.textContent)
  }

  // Block elements and non-text, non-element siblings never flank.
  return false
}
|
|
|
|
/**
 * Works out which single spaces must be re-added around an inline node's
 * replacement.
 *
 * Block nodes never get flanking spaces. For inline nodes, a leading
 * (trailing) space is requested when `content` starts (ends) with a space,
 * carriage return, newline, or tab and the neighbouring sibling on that side
 * does not already supply a space.
 *
 * @param {Object} node - Node being converted.
 * @param {string} content - The node's converted content.
 * @returns {{leading: string, trailing: string}} Each field is `' '` or `''`.
 */
function flankingWhitespace (node, content) {
  var edges = { leading: '', trailing: '' }

  if (isBlock(node)) return edges

  if (/^[ \r\n\t]/.test(content) && !isFlankedByWhitespace('left', node)) {
    edges.leading = ' '
  }

  if (/[ \r\n\t]$/.test(content) && !isFlankedByWhitespace('right', node)) {
    edges.trailing = ' '
  }

  return edges
}
|
|
|
|
/*
|
|
* Finds a Markdown converter, gets the replacement, and sets it on
|
|
* `_replacement`
|
|
*/
|
|
|
|
/**
 * Finds the first converter whose filter accepts `node`, computes the
 * Markdown replacement, and stores it on `node._replacement`.
 *
 * Blank nodes (content that is empty or only whitespace) are replaced by an
 * empty string, except for void elements and for `a`, `th` and `td`
 * elements, which are still handed to the converters when blank.
 *
 * For inline nodes whose content has outer whitespace that the siblings do
 * not already provide, the content is trimmed before conversion and a single
 * space is re-added outside the replacement on the affected side(s).
 *
 * If no converter accepts the node, `_replacement` is left `undefined`.
 *
 * @param {Object} node - Element whose children already carry `_replacement`.
 * @returns {undefined}
 * @throws {TypeError} When the matching converter's `replacement` is not a
 *   function (or when a converter filter is of an unsupported type).
 */
function process (node) {
  var replacement
  var content = getContent(node)
  // Elements that are kept even when blank. Matched by exact tag name: an
  // unanchored pattern such as /A|TH|TD/ would also hit SPAN, TABLE, NAV,
  // THEAD, ... and stop those blank elements from being removed.
  var keptWhenBlank = ['a', 'th', 'td']
  var tagName = node.nodeName.toLowerCase()

  // Remove blank nodes
  if (!isVoid(node) && keptWhenBlank.indexOf(tagName) === -1 && /^\s*$/.test(content)) {
    node._replacement = ''
    return
  }

  for (var i = 0; i < converters.length; i++) {
    var converter = converters[i]

    if (canConvert(node, converter.filter)) {
      if (typeof converter.replacement !== 'function') {
        throw new TypeError(
          '`replacement` needs to be a function that returns a string'
        )
      }

      var whitespace = flankingWhitespace(node, content)

      // The space is re-added outside the replacement, so strip it from the
      // content first to avoid output like `** bold **`.
      if (whitespace.leading || whitespace.trailing) {
        content = content.trim()
      }
      replacement = whitespace.leading +
        converter.replacement.call(toMarkdown, content, node) +
        whitespace.trailing
      break
    }
  }

  node._replacement = replacement
}
|
|
|
|
/**
 * Converts an HTML string to Markdown.
 *
 * The converter list is rebuilt on every call: a copy of the standard
 * Markdown converters, with the GFM converters placed in front when
 * `options.gfm` is truthy, and `options.converters` placed in front of
 * everything when supplied. Earlier converters win.
 *
 * @param {string} input - HTML source.
 * @param {Object} [options]
 * @param {boolean} [options.gfm] - Enable the GFM converters.
 * @param {Array} [options.converters] - Custom converters, tried first.
 * @returns {string} The Markdown text (`''` for empty input).
 * @throws {TypeError} When `input` is not a string.
 */
toMarkdown = function (input, options) {
  var settings = options || {}

  if (typeof input !== 'string') {
    throw new TypeError(input + ' is not a string')
  }

  if (input === '') return ''

  // Escape potential ol triggers ("1. " -> "1\. ") before parsing
  var escaped = input.replace(/(\d+)\. /g, '$1\\. ')

  var body = htmlToDom(escaped).body
  var elements = bfsOrder(body)

  converters = mdConverters.slice(0)
  if (settings.gfm) {
    converters = gfmConverters.concat(converters)
  }
  if (settings.converters) {
    converters = settings.converters.concat(converters)
  }

  // Walk the breadth-first list backwards so the deepest elements are
  // converted before the parents that embed their replacements.
  var index = elements.length
  while (index-- > 0) {
    process(elements[index])
  }

  var markdown = getContent(body)

  // Tidy up: strip leading tabs/newlines and trailing whitespace, blank out
  // whitespace-only lines, then cap consecutive newlines at two.
  return markdown
    .replace(/^[\t\r\n]+|[\t\r\n\s]+$/g, '')
    .replace(/\n\s+\n/g, '\n\n')
    .replace(/\n{3,}/g, '\n\n')
}
|
|
|
|
// Converter filters and replacements are invoked with `toMarkdown` as
// `this`, so the helpers are attached to it for use from inside converters.
toMarkdown.outer = outer
toMarkdown.isVoid = isVoid
toMarkdown.isBlock = isBlock

// The conversion function itself is the module's single export.
module.exports = toMarkdown
|