« get me outta code hell

crawl-links.js - http-music - Command-line music player + utils (not a server!)
about summary refs log tree commit diff
path: root/crawl-links.js
blob: 8602ce1ac90fb18148b976d084d0538ee1ec3c52 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
'use strict'

const fetch = require('node-fetch')
const $ = require('cheerio')
const url = require('url')

// File extensions (without the leading dot) that the command-line entry
// point filters links by when no extensions are passed as arguments.
const DEFAULT_EXTENSIONS = ['mp3', 'wav']

/**
 * Extracts every anchor element from a string of HTML.
 *
 * @param {string} text - HTML markup to scan.
 * @returns {Array<Array>} One `[text content, href]` pair per `<a>` element,
 *   in document order. `href` is `undefined` for anchors that have no href
 *   attribute.
 */
function getHTMLLinks(text) {
	// Never parse HTML with a regex!

	// Load the markup as a document and select from its root. Wrapping the
	// raw text and calling .find('a') only searches descendants of the parsed
	// root nodes, so an anchor sitting at the top level of the markup would be
	// skipped; loading also guarantees the input is parsed as markup rather
	// than interpreted as a selector.
	const $doc = $.load(text)

	return $doc('a').get().map(a => {
		const $a = $doc(a)
		return [$a.text(), $a.attr('href')]
	})
}

module.exports.getHTMLLinks = getHTMLLinks

// Command-line entry point; runs only when this file is executed directly.
//
// Usage: crawl-links.js <url> [extension ...]
//
// Fetches <url>, collects the links whose href ends in one of the given
// extensions (DEFAULT_EXTENSIONS when none are given), resolves each href
// against <url>, and prints the resulting [text, absolute URL] pairs as
// pretty-printed JSON on stdout.
if (require.main === module) {
	const urlString = process.argv[2]
	const exts = process.argv.length > 3 ? process.argv.slice(3) : DEFAULT_EXTENSIONS

	if (!urlString) {
		// Without a URL there is nothing to fetch; explain the invocation
		// instead of handing undefined to fetch().
		console.error('Usage: crawl-links.js <url> [extension ...]')
		process.exitCode = 1
	} else {
		fetch(urlString)
			.then(res => res.text())
			.then(text => getHTMLLinks(text))
			// Anchors without an href attribute come back with an undefined
			// href; skip them so one such anchor cannot abort the whole run.
			.then(links => links.filter(([, href]) =>
				typeof href === 'string' && exts.some(e => href.endsWith('.' + e))))
			.then(links => links.map(([title, href]) => [title, url.resolve(urlString, href)]))
			.then(links => console.log(JSON.stringify(links, null, 2)))
			.catch(err => {
				console.error(err)
				// Signal the failure to whatever invoked the script.
				process.exitCode = 1
			})
	}
}