#!/usr/bin/env node

'use strict'

// Recursively crawl an HTTP directory listing and print a playlist-style
// JSON tree of [title, href-or-subtree] pairs to stdout.
//
// Usage: http-music-crawl-recursive http://.../example/path/ [--verbose]

const MAX_DOWNLOAD_ATTEMPTS = 5

const fetch = require('node-fetch')
const $ = require('cheerio')

// Hoisted once at module load; previously process.argv was re-scanned on
// every recursive crawl() call.
const verbose = process.argv.includes('--verbose')

// Internal sentinel thrown when a path is abandoned after exhausting all
// download attempts. An Error subclass (rather than a thrown bare string)
// preserves the stack trace and lets unrelated errors propagate untouched.
// It never escapes crawl() — the trailing .catch converts it to [].
class FailedDownloadError extends Error {
  constructor(url) {
    super('Failed to download: ' + url)
    this.name = 'FailedDownloadError'
  }
}

// Recursively crawls absURL, following every link to a deeper path and
// recording all links in a tree (the same format playlists use). Retries
// failed downloads up to MAX_DOWNLOAD_ATTEMPTS times; a path that still
// fails resolves to an empty array rather than rejecting the whole crawl.
//
// Returns a Promise resolving to an array of [title, value] pairs, where
// value is an absolute file URL string, or a nested array for directories.
function crawl(absURL, attempts = 0) {
  return fetch(absURL)
    .then(
      res => res.text().then(text => {
        // NOTE(review): non-2xx responses still resolve here, so the links
        // of an HTML error page would be crawled — consider checking res.ok
        // and routing such responses into the retry path.
        const links = getHTMLLinks(text)

        return Promise.all(links.map(([ title, href ]) => {
          if (href.endsWith('/')) {
            // It's a directory! Recurse into it and nest the result.
            if (verbose) console.log("[Dir] " + absURL + href)
            return crawl(absURL + href).then(contents => [title, contents])
          } else {
            // It's a file! Record its absolute URL.
            if (verbose) console.log("[File] " + absURL + href)
            return Promise.resolve([title, absURL + href])
          }
        }))
      }),

      err => {
        console.warn("Failed to download: " + absURL)

        if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
          console.warn(
            "Trying again. Attempt " + (attempts + 1) +
            "/" + MAX_DOWNLOAD_ATTEMPTS + "..."
          )
          return crawl(absURL, attempts + 1)
        }

        console.error(
          "We've hit the download attempt limit (" +
          MAX_DOWNLOAD_ATTEMPTS + "). Giving up on this path."
        )
        throw new FailedDownloadError(absURL)
      }
    )
    .catch(error => {
      if (error instanceof FailedDownloadError) {
        // Warnings for this path were already logged above; treat it as
        // empty so sibling paths can still be crawled.
        return []
      }
      throw error
    })
}

// Extracts [linkText, href] pairs for every <a> element in an HTML
// document string. Never parse HTML with a regex!
function getHTMLLinks(text) {
  return $(text).find('a').get().map(a => {
    const $a = $(a)
    return [$a.text(), $a.attr('href')]
  })
}

if (process.argv.length === 2) {
  console.log("Usage: http-music-crawl-recursive http://.../example/path/")
  console.log("..or, npm run crawl-recursive -- http://...")
} else {
  let url = process.argv[2]

  // Directory URLs must end with a slash so relative hrefs concatenate
  // correctly inside crawl().
  if (!url.endsWith('/')) {
    url = url + '/'
  }

  crawl(url)
    .then(res => console.log(JSON.stringify(res, null, 2)))
    .catch(err => console.error(err))
}