diff options
author | liam4 <towerofnix@gmail.com> | 2017-05-31 19:59:16 -0300 |
---|---|---|
committer | liam4 <towerofnix@gmail.com> | 2017-05-31 19:59:16 -0300 |
commit | 69cf7222241ee9ed1f2aae3fba48061f05dbd56f (patch) | |
tree | bf0a9cdb52a0d2a2a6087e7e07808c412acbc4f8 /src | |
parent | 50a9c1a0a3feca4412f1c4041f041e7faf45088f (diff) |
Generally improve how scripts and running works
Diffstat (limited to 'src')
-rwxr-xr-x | src/crawl-recursive.js | 91 | ||||
-rwxr-xr-x[-rw-r--r--] | src/play.js | 2 |
2 files changed, 93 insertions, 0 deletions
// src/crawl-recursive.js
//
// NOTE: in the commit this file is executable (mode 100755) and begins with a
// `#!/usr/bin/env node` line; keep that as the very first line of the file on
// disk. The same commit also makes src/play.js executable and adds the same
// shebang line above its 'use strict'.

'use strict'

const MAX_DOWNLOAD_ATTEMPTS = 5

const { URL } = require('url')
const fetch = require('node-fetch')
const cheerio = require('cheerio')

// The command line never changes while we run, so look the flag up once
// instead of once per crawled page.
const verbose = process.argv.includes('--verbose')

function crawl(absURL, attempts = 0) {
  // Recursively crawls a given URL, following every link to a deeper path and
  // recording all links in a tree (in the same format playlists use). Makes
  // multiple attempts to download failed paths.
  //
  // absURL   - absolute URL of the directory listing to crawl; it should end
  //            with a slash so that relative links resolve inside it.
  // attempts - number of failed downloads of absURL so far (callers normally
  //            leave this at its default).
  //
  // Returns a promise for an array of [title, value] pairs, where value is
  // the file's URL for a file link, or a nested array of the same shape for a
  // directory link. A path that cannot be downloaded resolves to [] rather
  // than rejecting, so one dead directory does not abort the whole crawl.

  return fetch(absURL).then(
    res => {
      if (!res.ok) {
        // The server answered, but with an error page. Parsing that page
        // would record its navigation links as if they were tracks.
        console.warn(
          "Failed to download: " + absURL + " (HTTP " + res.status + ")"
        )
        return []
      }

      return res.text().then(text => crawlLinks(absURL, getHTMLLinks(text)))
    },

    err => retryDownload(absURL, attempts, err)
  )
}

function retryDownload(absURL, attempts, err) {
  // Handles a failed fetch of absURL: crawls it again until the retry limit
  // is reached, then gives up on the path and yields an empty listing.
  // MAX_DOWNLOAD_ATTEMPTS counts retries, so a path is requested at most
  // MAX_DOWNLOAD_ATTEMPTS + 1 times in total.

  console.warn("Failed to download: " + absURL)
  if (verbose) console.warn(err)

  if (attempts >= MAX_DOWNLOAD_ATTEMPTS) {
    console.error(
      "We've hit the download attempt limit (" +
      MAX_DOWNLOAD_ATTEMPTS + "). Giving up on this path."
    )

    return []
  }

  console.warn(
    "Trying again. Attempt " + (attempts + 1) +
    "/" + MAX_DOWNLOAD_ATTEMPTS + "..."
  )

  return crawl(absURL, attempts + 1)
}

function crawlLinks(absURL, links) {
  // Turns the [title, href] pairs found on the page at absURL into tree
  // entries, recursing into directories. Links that do not lead deeper than
  // absURL are skipped. The order of the remaining links is preserved.

  const base = new URL(absURL)
  const entries = []

  for (const [ title, href ] of links) {
    const target = resolveDeeperLink(base, href)

    if (target === null) {
      if (verbose) console.log("[Skip] " + href)
      continue
    }

    if (target.pathname.endsWith('/')) {
      // It's a directory!

      if (verbose) console.log("[Dir] " + target.href)
      entries.push(crawl(target.href).then(children => [title, children]))
    } else {
      // It's a file!

      if (verbose) console.log("[File] " + target.href)
      entries.push([title, target.href])
    }
  }

  // Promise.all passes plain (non-promise) entries through unchanged.
  return Promise.all(entries)
}

function resolveDeeperLink(base, href) {
  // Resolves href against base (a URL object) and returns the resulting URL
  // object, or null when the link must not be followed or recorded:
  //
  // - the <a> element had no usable href;
  // - the link points at another origin (external sites, mailto:, etc.);
  // - the link points at the page itself, a parent, or a sibling -- e.g.
  //   "../", "#top", or the "?C=N;O=D" column-sort links of an index page.
  //
  // Without this check a "../" link would make the crawl climb out of the
  // starting directory and re-enter it forever.

  if (typeof href !== 'string' || href === '') {
    return null
  }

  let target
  try {
    target = new URL(href, base)
  } catch (err) {
    // Malformed href; nothing sensible to fetch.
    return null
  }

  const isDeeper =
    target.origin === base.origin &&
    target.pathname.length > base.pathname.length &&
    target.pathname.startsWith(base.pathname)

  return isDeeper ? target : null
}

function getHTMLLinks(text) {
  // Never parse HTML with a regex!
  //
  // Returns one [text, href] pair per <a> element in the document, in
  // document order. href is undefined for anchors without that attribute.
  // Loading the text as a document (rather than wrapping it and calling
  // .find) also picks up anchors that sit at the top level of the markup.

  const $ = cheerio.load(text)

  return $('a').get().map(a => {
    const $a = $(a)
    return [$a.text(), $a.attr('href')]
  })
}

// The URL is the first argument that is not a --flag, so that
// "--verbose http://..." works as well as "http://... --verbose".
const startURL = process.argv.slice(2).find(arg => !arg.startsWith('--'))

if (startURL === undefined) {
  console.log("Usage: http-music-crawl-recursive http://.../example/path/")
  console.log("..or, npm run crawl-recursive -- http://...")
} else {
  // Relative links only resolve inside the directory if its URL ends in "/".
  const url = startURL.endsWith('/') ? startURL : startURL + '/'

  crawl(url)
    .then(res => console.log(JSON.stringify(res, null, 2)))
    .catch(err => {
      console.error(err)
      // Let shell scripts and npm notice that the crawl failed.
      process.exitCode = 1
    })
}