From c96672fc6d7ad37c15a43c542c9e6d692ddfa3c6 Mon Sep 17 00:00:00 2001
From: liam4
Date: Tue, 30 May 2017 00:53:03 +0000
Subject: Crawl links

---
 crawl-links.js | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 crawl-links.js

diff --git a/crawl-links.js b/crawl-links.js
new file mode 100644
index 0000000..8602ce1
--- /dev/null
+++ b/crawl-links.js
@@ -0,0 +1,33 @@
+'use strict'
+
+const fetch = require('node-fetch')
+const $ = require('cheerio')
+const url = require('url')
+
+const DEFAULT_EXTENSIONS = [
+  'mp3', 'wav'
+]
+
+function getHTMLLinks(text) {
+  // Never parse HTML with a regex!
+
+  return $(text).find('a').get().map(a => {
+    const $a = $(a)
+    return [$a.text(), $a.attr('href')]
+  })
+}
+
+module.exports.getHTMLLinks = getHTMLLinks
+
+if (require.main === module) {
+  const urlString = process.argv[2]
+  const exts = process.argv.length > 3 ? process.argv.slice(3) : DEFAULT_EXTENSIONS
+
+  fetch(urlString)
+    .then(res => res.text())
+    .then(text => getHTMLLinks(text))
+    .then(links => links.filter(l => exts.some(e => l[1].endsWith('.' + e))))
+    .then(links => links.map(l => [l[0], url.resolve(urlString, l[1])]))
+    .then(links => console.log(JSON.stringify(links, null, 2)))
+    .catch(err => console.error(err))
+}
--
cgit 1.3.0-6-gf8a5
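
Note (not part of the commit): the patch above adds a small command-line crawler that fetches a page, pulls every <a> element out of the HTML with cheerio, keeps only the links whose href ends in one of the requested extensions (mp3 and wav by default), and resolves them against the page URL before printing them as JSON. Below is a minimal sketch of calling the exported getHTMLLinks helper directly; the file name example.js, the sample markup, and the expected output are illustrative assumptions, and it presumes crawl-links.js and its cheerio dependency are installed alongside it. Requiring the module does not trigger the fetch, since the CLI block is guarded by require.main === module.

    // example.js -- illustrative sketch, not part of the commit above.
    // Assumes crawl-links.js (from the patch) sits in the same directory
    // and that cheerio is installed.
    const { getHTMLLinks } = require('./crawl-links')

    // Hypothetical sample markup, just to show the return shape.
    const html =
      '<ul>' +
      '<li><a href="/audio/intro.mp3">Intro</a></li>' +
      '<li><a href="notes.txt">Notes</a></li>' +
      '</ul>'

    // getHTMLLinks returns one [link text, href] pair per <a> element found.
    console.log(getHTMLLinks(html))
    // Roughly: [ [ 'Intro', '/audio/intro.mp3' ], [ 'Notes', 'notes.txt' ] ]

Run as a script, the crawler takes the page URL as its first argument and an optional list of extensions after it, e.g. node crawl-links.js https://example.com/music mp3 ogg (the URL here is a placeholder).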