94 lines
2.6 KiB
JavaScript
94 lines
2.6 KiB
JavaScript
'use strict';
|
|
|
|
const request = require('request');
|
|
const cheerio = require('cheerio');
|
|
const TurndownService = require('turndown')
|
|
const gfm = require('turndown-plugin-gfm').gfm
|
|
|
|
const converters = require('./mdConverters');
|
|
|
|
|
|
// Single Turndown instance shared by every conversion in this module.
// The GFM plugin is installed first, then the project-specific converters.
const turndownService = new TurndownService();
turndownService.use(gfm);

// Each converter object is itself the rule definition and is registered
// under its own `filter` value as the rule key.
converters.forEach((rule) => turndownService.addRule(rule.filter, rule));
|
|
|
|
// following block adapted from https://github.com/domchristie/turndown/blob/61c2748c99fc53699896c1449f953ea492311c5b/src/commonmark-rules.js#L131
|
|
// Inline-link rule that turns Medium's site-relative hrefs into absolute
// https://medium.com URLs before emitting the Markdown link.
turndownService.addRule('mediumInlineLink', {
  // Matches <a> nodes that have an href while the inlined link style is active.
  filter: function (node, options) {
    return (
      options.linkStyle === 'inlined' &&
      node.nodeName === 'A' &&
      node.getAttribute('href')
    );
  },

  // Emits `[content](href "title")`; the quoted title is added only when the
  // node has a non-empty title.
  replacement: function (content, node) {
    let href = node.getAttribute('href');

    // Medium uses site-relative URLs for its own articles. Left as-is, the
    // link to article "foo" on the new website would point to
    // https://newwebsite.com/@username/foo-a16a6fcf49c7, which doesn't exist.
    // Protocol-relative hrefs ("//host/path") also start with "/" but already
    // name their host, so they must not be prefixed.
    if (href.startsWith('/') && !href.startsWith('//')) {
      href = 'https://medium.com' + href;
    }

    const title = node.title ? ' "' + node.title + '"' : '';
    return '[' + content + '](' + href + title + ')';
  }
});
|
|
|
|
// Medium has these weird hidden images that are in the html and get rendered
|
|
// by turndown. We filter these out.
|
|
turndownService.addRule('noHiddenImages', {
  // Selects <img> nodes whose src attribute ends with the "?q=20" suffix.
  filter(node, options) {
    if (node.nodeName !== 'IMG') {
      return false;
    }

    const src = node.getAttribute('src');
    return src && src.endsWith('?q=20');
  },

  // Matching images contribute nothing to the Markdown output.
  replacement: () => ''
});
|
|
|
|
// Renders every <pre> element as a fenced block.
turndownService.addRule('code blocks', {
  filter: 'pre',

  // Puts the converted content between an opening and a closing ``` line.
  replacement(content, node) {
    const fence = '```';
    return `${fence}\n${content}\n${fence}`;
  }
});
|
|
|
|
// TODO: filter out the profile header (the div right below the title that
// holds the author's profile picture, name and the article's reading time).
// Unfortunately Medium uses randomly generated CSS class names, which makes it
// hard to identify the header and strip it out. For example, I could strip the
// div with the class "eq", but the next time Medium updates its CSS the div
// would have a different class name and the filter wouldn't work anymore.
|
|
|
|
/**
 * Fetches the page at `url` with a GET request and converts the inner HTML of
 * its <article> element to Markdown using the shared turndown service.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Resolves with the generated Markdown. When the
 *   page has no <article> content, an empty string is converted instead.
 *   Rejects with the request error, or with any error thrown while parsing
 *   or converting the response body.
 */
function convertFromUrl(url) {
  return new Promise(function (resolve, reject) {
    request({
      uri: url,
      method: 'GET'
    }, function (err, httpResponse, body) {
      if (err) {
        return reject(err);
      }

      // This callback runs after the promise executor has returned, so an
      // exception thrown here would not reject the promise by itself; it
      // would go uncaught and leave the promise pending. Forward it instead.
      try {
        const $ = cheerio.load(body);
        const html = $('article').html() || '';
        const markdown = turndownService.turndown(html);

        resolve(markdown);
      } catch (conversionError) {
        reject(conversionError);
      }
    });
  });
}
|
|
|
|
// The module's only export is the URL-to-Markdown converter function.
module.exports = convertFromUrl;
|