medium-to-markdown/node_modules/to-markdown/index.js

234 lines
5.5 KiB
JavaScript

/*
* to-markdown - an HTML to Markdown converter
*
* Copyright 2011+, Dom Christie
* Licenced under the MIT licence
*
*/
'use strict'
var toMarkdown
var converters
var mdConverters = require('./lib/md-converters')
var gfmConverters = require('./lib/gfm-converters')
var HtmlParser = require('./lib/html-parser')
var collapse = require('collapse-whitespace')
/*
* Utilities
*/
// Lower-case tag names that this converter treats as block-level elements.
var blocks = (
  'address article aside audio blockquote body canvas center dd dir div dl ' +
  'dt fieldset figcaption figure footer form frameset h1 h2 h3 h4 h5 h6 ' +
  'header hgroup hr html isindex li main menu nav noframes noscript ol ' +
  'output p pre section table tbody td tfoot th thead tr ul'
).split(' ')

/**
 * Tells whether a node's tag name appears in the `blocks` list.
 *
 * @param {Object} node - any object exposing a string `nodeName`
 * @returns {boolean} true when the lower-cased `nodeName` is a listed block tag
 */
function isBlock (node) {
  var tagName = node.nodeName.toLowerCase()
  return blocks.indexOf(tagName) >= 0
}
// Lower-case tag names that this converter treats as void elements.
var voids = (
  'area base br col command embed hr img input keygen link meta param ' +
  'source track wbr'
).split(' ')

/**
 * Tells whether a node's tag name appears in the `voids` list.
 *
 * @param {Object} node - any object exposing a string `nodeName`
 * @returns {boolean} true when the lower-cased `nodeName` is a listed void tag
 */
function isVoid (node) {
  var tagName = node.nodeName.toLowerCase()
  return voids.indexOf(tagName) >= 0
}
/**
 * Parses an HTML string into a document and runs `collapse` over it.
 *
 * @param {string} string - HTML source handed to the parser as 'text/html'
 * @returns {Object} the document returned by `HtmlParser#parseFromString`
 */
function htmlToDom (string) {
  var parser = new HtmlParser()
  var doc = parser.parseFromString(string, 'text/html')
  // `collapse` works in place on the root element; `isBlock` is passed as its
  // block-element predicate (presumably to decide where whitespace may be
  // collapsed - see the collapse-whitespace package).
  collapse(doc.documentElement, isBlock)
  return doc
}
/**
 * Flattens a DOM tree into a single array of its descendant elements,
 * listed in breadth-first order.
 *
 * Only element children (nodeType 1) are followed; the starting node itself
 * is not part of the result.
 *
 * @param {Object} node - root node exposing `childNodes`
 * @returns {Array} descendant element nodes, shallowest first
 */
function bfsOrder (node) {
  // `visited` doubles as the work queue: entries before `cursor` are done,
  // entries from `cursor` onwards are still waiting to be expanded.
  var visited = [node]
  for (var cursor = 0; cursor < visited.length; cursor++) {
    var kids = visited[cursor].childNodes
    for (var k = 0; k < kids.length; k++) {
      if (kids[k].nodeType === 1) visited.push(kids[k])
    }
  }
  // Drop the root, which was only seeded to start the walk.
  return visited.slice(1)
}
/**
 * Constructs a Markdown string of replacement text for a given node.
 *
 * Walks the node's direct children in order: element children (nodeType 1)
 * contribute their `_replacement` value, text children (nodeType 3)
 * contribute their `data`, and every other node type is skipped.
 *
 * @param {Object} node - node exposing `childNodes`
 * @returns {string} the concatenated content of the node's children
 */
function getContent (node) {
  var children = node.childNodes
  var markdown = ''
  var child
  for (var idx = 0; idx < children.length; idx++) {
    child = children[idx]
    switch (child.nodeType) {
      case 1:
        markdown += child._replacement
        break
      case 3:
        markdown += child.data
        break
    }
  }
  return markdown
}
/**
 * Returns the HTML string of an element with its contents converted.
 *
 * The element is cloned without its children, serialised via `outerHTML`,
 * and `content` is inserted at the first `><` boundary. Markup that has no
 * `><` sequence is returned unchanged.
 *
 * @param {Object} node - element exposing `cloneNode`
 * @param {string} content - already-converted inner content
 * @returns {string} the element's own markup wrapped around `content`
 */
function outer (node, content) {
  var shell = node.cloneNode(false).outerHTML
  // A function replacer inserts `content` verbatim. A plain string
  // replacement would expand `$&`, `$1`, `$$`, "$`" and "$'" found inside
  // the content and corrupt text that contains dollar sequences.
  return shell.replace('><', function () {
    return '>' + content + '<'
  })
}
/**
 * Decides whether a converter's `filter` accepts the given node.
 *
 * - string: compared against the node's lower-cased `nodeName`
 * - array: accepted when it contains the lower-cased `nodeName`
 * - function: called with `toMarkdown` as `this` and the node as argument;
 *   its return value is passed through unchanged
 *
 * @param {Object} node - node exposing `nodeName`
 * @param {string|Array|Function} filter - the converter's filter
 * @returns {*} boolean for string/array filters, otherwise the function's result
 * @throws {TypeError} when `filter` is none of the supported kinds
 */
function canConvert (node, filter) {
  var kind = Array.isArray(filter) ? 'array' : typeof filter
  switch (kind) {
    case 'string':
      return filter === node.nodeName.toLowerCase()
    case 'array':
      return filter.indexOf(node.nodeName.toLowerCase()) !== -1
    case 'function':
      return filter.call(toMarkdown, node)
    default:
      throw new TypeError('`filter` needs to be a string, array, or function')
  }
}
/**
 * Checks whether the sibling on one side of a node puts a space character
 * right next to it.
 *
 * For `side === 'left'` the previous sibling must end with a space; for any
 * other `side` the next sibling must start with one. Text siblings are
 * tested via `nodeValue`, non-block element siblings via `textContent`.
 *
 * @param {string} side - 'left' for the previous sibling, otherwise the next
 * @param {Object} node - node whose neighbour is inspected
 * @returns {boolean|undefined} the test result, or undefined when there is no
 *   sibling, or the sibling is a block element or another node type
 */
function isFlankedByWhitespace (side, node) {
  var onLeft = side === 'left'
  var neighbour = onLeft ? node.previousSibling : node.nextSibling
  var edgeSpace = onLeft ? / $/ : /^ /

  if (!neighbour) return undefined

  if (neighbour.nodeType === 3) {
    return edgeSpace.test(neighbour.nodeValue)
  }
  if (neighbour.nodeType === 1 && !isBlock(neighbour)) {
    return edgeSpace.test(neighbour.textContent)
  }
  return undefined
}
/**
 * Works out which single spaces must be re-attached around a converted
 * non-block node.
 *
 * A side gets a space when `content` begins (or ends) with a space, CR, LF
 * or tab and `isFlankedByWhitespace` does not report a space on that side.
 * Block nodes never receive flanking whitespace.
 *
 * @param {Object} node - the node being converted
 * @param {string} content - the node's converted content
 * @returns {{leading: string, trailing: string}} each either '' or ' '
 */
function flankingWhitespace (node, content) {
  var flanks = { leading: '', trailing: '' }
  if (isBlock(node)) return flanks

  var startsWithSpace = /^[ \r\n\t]/.test(content)
  var endsWithSpace = /[ \r\n\t]$/.test(content)

  if (startsWithSpace && !isFlankedByWhitespace('left', node)) {
    flanks.leading = ' '
  }
  if (endsWithSpace && !isFlankedByWhitespace('right', node)) {
    flanks.trailing = ' '
  }
  return flanks
}
/**
 * Finds a Markdown converter, gets the replacement, and sets it on
 * `_replacement`.
 *
 * The node's content is assembled from its children first. Blank nodes are
 * replaced by an empty string; otherwise the first entry of `converters`
 * whose filter accepts the node produces the replacement, wrapped in any
 * flanking whitespace that has to be preserved.
 *
 * @param {Object} node - element node to convert; mutated via `_replacement`
 * @throws {TypeError} when the matching converter's `replacement` is not a
 *   function, or when its `filter` is of an unsupported kind
 */
function process (node) {
  var replacement
  var content = getContent(node)

  // Remove blank nodes. Void elements and A / TH / TD are exempt. The name
  // test is anchored so that only those exact names are exempt; an
  // unanchored /A|TH|TD/ also matched SPAN, TABLE, NAV, MAIN, ... and kept
  // their blank instances in the output.
  if (!isVoid(node) && !/^(?:A|TH|TD)$/.test(node.nodeName) && /^\s*$/i.test(content)) {
    node._replacement = ''
    return
  }

  for (var i = 0; i < converters.length; i++) {
    var converter = converters[i]

    if (canConvert(node, converter.filter)) {
      if (typeof converter.replacement !== 'function') {
        throw new TypeError(
          '`replacement` needs to be a function that returns a string'
        )
      }

      // Edge whitespace is re-added outside the converter's output, so strip
      // it from the content before the converter sees it.
      var whitespace = flankingWhitespace(node, content)
      if (whitespace.leading || whitespace.trailing) {
        content = content.trim()
      }
      replacement = whitespace.leading +
        converter.replacement.call(toMarkdown, content, node) +
        whitespace.trailing
      // First matching converter wins.
      break
    }
  }

  node._replacement = replacement
}
/**
 * Converts an HTML string to Markdown.
 *
 * Converter precedence, first match wins: `options.converters`, then the GFM
 * converters (only when `options.gfm` is truthy), then the standard Markdown
 * converters.
 *
 * @param {string} input - HTML source
 * @param {Object} [options]
 * @param {boolean} [options.gfm] - prepend the GFM converters
 * @param {Array} [options.converters] - extra converters, tried first
 * @returns {string} the Markdown text ('' for empty input)
 * @throws {TypeError} when `input` is not a string
 */
toMarkdown = function (input, options) {
  var settings = options || {}

  if (typeof input !== 'string') {
    throw new TypeError(input + ' is not a string')
  }
  if (input === '') return ''

  // Escape potential ol triggers: "<digits>. " in the raw input would
  // otherwise read as an ordered-list marker in the Markdown output.
  var escaped = input.replace(/(\d+)\. /g, '$1\\. ')

  var root = htmlToDom(escaped).body
  var nodes = bfsOrder(root)

  // Rebuild the module-level converter list for this call.
  converters = mdConverters.slice(0)
  if (settings.gfm) {
    converters = gfmConverters.concat(converters)
  }
  if (settings.converters) {
    converters = settings.converters.concat(converters)
  }

  // Walk the breadth-first list backwards so the deepest elements are
  // converted before the parents that embed their replacements.
  var remaining = nodes.length
  while (remaining--) {
    process(nodes[remaining])
  }

  // Trim the edges, blank out whitespace-only lines, and cap consecutive
  // newlines at two.
  return getContent(root)
    .replace(/^[\t\r\n]+|[\t\r\n\s]+$/g, '')
    .replace(/\n\s+\n/g, '\n\n')
    .replace(/\n{3,}/g, '\n\n')
}

// Helpers exposed on the function; converter callbacks are invoked with
// `toMarkdown` as `this`, so they are reachable as `this.isBlock` etc.
toMarkdown.isBlock = isBlock
toMarkdown.isVoid = isVoid
toMarkdown.outer = outer

module.exports = toMarkdown