diff options
author | liam4 <towerofnix@gmail.com> | 2017-05-30 00:53:03 +0000 |
---|---|---|
committer | liam4 <towerofnix@gmail.com> | 2017-05-30 00:53:03 +0000 |
commit | c96672fc6d7ad37c15a43c542c9e6d692ddfa3c6 (patch) | |
tree | 1e7e3aa97d082f2e7ad4bdb0d2aa9c6fc90bc194 /crawl-links.js | |
parent | 37caa6c403d50a22b5bab9f48da8d7fdb526ec3f (diff) |
Crawl links
Diffstat (limited to 'crawl-links.js')
-rw-r--r-- | crawl-links.js | 33 |
1 files changed, 33 insertions, 0 deletions
'use strict'

const fetch = require('node-fetch')
const $ = require('cheerio')
const url = require('url')

// File extensions matched when none are given on the command line.
const DEFAULT_EXTENSIONS = [
  'mp3', 'wav'
]

/**
 * Extract every anchor (`<a>`) from a string of HTML.
 *
 * @param {string} text - HTML markup (a full document or a fragment).
 * @returns {Array<[string, (string|undefined)]>} One `[linkText, href]` pair
 *   per anchor, in document order. `href` is `undefined` for anchors that
 *   have no `href` attribute; it is returned as written in the markup and is
 *   not resolved against any base URL.
 */
function getHTMLLinks(text) {
  // Never parse HTML with a regex!

  // Load the markup as a document and query it from the root, so anchors at
  // the top level of a fragment are found too (`$(text).find('a')` only
  // searches descendants of the parsed root nodes).
  const $doc = $.load(text)

  return $doc('a').get().map(a => {
    const $a = $doc(a)
    return [$a.text(), $a.attr('href')]
  })
}

module.exports.getHTMLLinks = getHTMLLinks

// Command-line usage: node crawl-links.js <url> [extension ...]
// Prints a JSON array of [linkText, absoluteURL] pairs for every link on the
// page whose href ends with one of the given extensions.
if (require.main === module) {
  const urlString = process.argv[2]
  const exts = process.argv.length > 3 ? process.argv.slice(3) : DEFAULT_EXTENSIONS

  if (!urlString) {
    console.error('Usage: node crawl-links.js <url> [extension ...]')
    process.exitCode = 1
  } else {
    fetch(urlString)
      .then(res => {
        // Do not scrape an error page as if it were the requested document.
        if (!res.ok) {
          throw new Error(`Request failed: ${res.status} ${res.statusText}`)
        }
        return res.text()
      })
      .then(text => getHTMLLinks(text))
      // Anchors without an href (e.g. <a name="top">) yield undefined; skip
      // them instead of crashing on `undefined.endsWith`.
      .then(links => links.filter(l =>
        typeof l[1] === 'string' && exts.some(e => l[1].endsWith('.' + e))
      ))
      // Turn relative hrefs into absolute URLs based on the crawled page.
      .then(links => links.map(l => [l[0], url.resolve(urlString, l[1])]))
      .then(links => console.log(JSON.stringify(links, null, 2)))
      .catch(err => {
        console.error(err)
        // Signal failure to the calling shell/script.
        process.exitCode = 1
      })
  }
}