From 9fcf30b57a7a8f0fc86f1187d2cad72e8eaaa37e Mon Sep 17 00:00:00 2001
From: liam4
Date: Sun, 4 Jun 2017 10:26:54 -0300
Subject: Local downloader

---
 package.json           |  6 ++--
 src/crawl-http.js      | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/crawl-local.js     | 41 +++++++++++++++++++++++
 src/crawl-recursive.js | 91 --------------------------------------------------
 todo.txt               |  1 +
 5 files changed, 137 insertions(+), 93 deletions(-)
 create mode 100755 src/crawl-http.js
 create mode 100644 src/crawl-local.js
 delete mode 100755 src/crawl-recursive.js

diff --git a/package.json b/package.json
index 86856d9..e0f2cae 100644
--- a/package.json
+++ b/package.json
@@ -4,11 +4,13 @@
   "main": "src/play.js",
   "scripts": {
     "play": "node src/play.js",
-    "crawl-recursive": "node src/crawl-recursive"
+    "crawl-http": "node src/crawl-http",
+    "crawl-local": "node src/crawl-local"
   },
   "bin": {
     "http-music": "./src/play.js",
-    "http-music-crawl-recursive": "./src/crawl-recursive.js"
+    "http-music-crawl-http": "./src/crawl-http.js",
+    "http-music-crawl-local": "./src/crawl-local.js"
   },
   "dependencies": {
     "cheerio": "^1.0.0-rc.1",
diff --git a/src/crawl-http.js b/src/crawl-http.js
new file mode 100755
index 0000000..189ba28
--- /dev/null
+++ b/src/crawl-http.js
@@ -0,0 +1,91 @@
+#!/usr/bin/env node
+
+'use strict'
+
+const MAX_DOWNLOAD_ATTEMPTS = 5
+
+const fetch = require('node-fetch')
+const $ = require('cheerio')
+
+function crawl(absURL, attempts = 0) {
+  // Recursively crawls a given URL, following every link to a deeper path and
+  // recording all links in a tree (in the same format playlists use). Makes
+  // multiple attempts to download failed paths.
+
+  return fetch(absURL)
+    .then(
+      res => res.text().then(text => {
+        const links = getHTMLLinks(text)
+        const verbose = process.argv.includes('--verbose')
+
+        return Promise.all(links.map(link => {
+          const [ title, href ] = link
+
+          if (href.endsWith('/')) {
+            // It's a directory!
+
+            if (verbose) console.log("[Dir] " + absURL + href)
+            return crawl(absURL + href)
+              .then(res => [title, res])
+          } else {
+            // It's a file!
+
+            if (verbose) console.log("[File] " + absURL + href)
+            return Promise.resolve([title, absURL + href])
+          }
+        }))
+      }),
+
+      err => {
+        console.warn("Failed to download: " + absURL)
+
+        if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
+          console.warn(
+            "Trying again. Attempt " + (attempts + 1) +
+            "/" + MAX_DOWNLOAD_ATTEMPTS + "..."
+          )
+
+          return crawl(absURL, attempts + 1)
+        } else {
+          console.error(
+            "We've hit the download attempt limit (" +
+            MAX_DOWNLOAD_ATTEMPTS + "). Giving up on this path."
+          )
+
+          throw 'FAILED_DOWNLOAD'
+        }
+      }
+    )
+    .catch(error => {
+      if (error === 'FAILED_DOWNLOAD') {
+        // Debug logging for this is already handled above.
+        return []
+      } else {
+        throw error
+      }
+    })
+}
+
+function getHTMLLinks(text) {
+  // Never parse HTML with a regex!
+
+  return $(text).find('a').get().map(a => {
+    const $a = $(a)
+    return [$a.text(), $a.attr('href')]
+  })
+}
+
+if (process.argv.length === 2) {
+  console.log("Usage: http-music-crawl-http http://.../example/path/")
+  console.log("..or, npm run crawl-http -- http://.../example/path/")
+} else {
+  let url = process.argv[2]
+
+  if (!(url.endsWith('/'))) {
+    url = url + '/'
+  }
+
+  crawl(url)
+    .then(res => console.log(JSON.stringify(res, null, 2)))
+    .catch(err => console.error(err))
+}
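Both crawlers print a playlist tree to stdout: directories become [title, contents] groups and files become [title, target] tracks, serialized with JSON.stringify(res, null, 2). As a rough sketch of what crawl-http might emit for a small two-level server index (host, file names, and URLs are hypothetical):

[
  ["Cool Album/", [
    ["track-one.mp3", "http://example.com/music/Cool%20Album/track-one.mp3"],
    ["track-two.mp3", "http://example.com/music/Cool%20Album/track-two.mp3"]
  ]],
  ["lone-file.mp3", "http://example.com/music/lone-file.mp3"]
]

Titles are taken verbatim from the index page's link text (getHTMLLinks records $a.text()), so directory names typically keep their trailing slash.
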
diff --git a/src/crawl-local.js b/src/crawl-local.js
new file mode 100644
index 0000000..d9a9a70
--- /dev/null
+++ b/src/crawl-local.js
@@ -0,0 +1,41 @@
+#!/usr/bin/env node
+
+'use strict'
+
+const fs = require('fs')
+const path = require('path')
+
+const { promisify } = require('util')
+const readDir = promisify(fs.readdir)
+const stat = promisify(fs.stat)
+
+function crawl(dirPath) {
+  return readDir(dirPath).then(
+    res => Promise.all(res.map(item => {
+      const itemPath = path.join(dirPath, item)
+
+      return stat(itemPath).then(stats => {
+        if (stats.isDirectory()) {
+          return crawl(itemPath).then(contents => {
+            const group = [item, contents]
+            return group
+          })
+        } else if (stats.isFile()) {
+          const track = [item, itemPath]
+          return track
+        }
+      })
+    })
+  ))
+}
+
+if (process.argv.length === 2) {
+  console.log("Usage: http-music-crawl-local /example/path..")
+  console.log("..or, npm run crawl-local /example/path")
+} else {
+  const path = process.argv[2]
+
+  crawl(path)
+    .then(res => console.log(JSON.stringify(res, null, 2)))
+    .catch(err => console.error(err))
+}
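For context, a plausible end-to-end workflow with the new local crawler, assuming play.js still requires a literal playlist.json in the working directory (as the todo.txt entry later in this patch suggests); the music path here is hypothetical:

  $ node src/crawl-local.js /media/music > playlist.json
  $ node src/play.js

The same redirection works for crawl-http, but its --verbose progress messages go through console.log and would land in the captured playlist, so leave --verbose off when redirecting stdout.
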
diff --git a/src/crawl-recursive.js b/src/crawl-recursive.js
deleted file mode 100755
index 2656279..0000000
--- a/src/crawl-recursive.js
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/env node
-
-'use strict'
-
-const MAX_DOWNLOAD_ATTEMPTS = 5
-
-const fetch = require('node-fetch')
-const $ = require('cheerio')
-
-function crawl(absURL, attempts = 0) {
-  // Recursively crawls a given URL, following every link to a deeper path and
-  // recording all links in a tree (in the same format playlists use). Makes
-  // multiple attempts to download failed paths.
-
-  return fetch(absURL)
-    .then(
-      res => res.text().then(text => {
-        const links = getHTMLLinks(text)
-        const verbose = process.argv.includes('--verbose')
-
-        return Promise.all(links.map(link => {
-          const [ title, href ] = link
-
-          if (href.endsWith('/')) {
-            // It's a directory!
-
-            if (verbose) console.log("[Dir] " + absURL + href)
-            return crawl(absURL + href)
-              .then(res => [title, res])
-          } else {
-            // It's a file!
-
-            if (verbose) console.log("[File] " + absURL + href)
-            return Promise.resolve([title, absURL + href])
-          }
-        }))
-      }),
-
-      err => {
-        console.warn("Failed to download: " + absURL)
-
-        if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
-          console.warn(
-            "Trying again. Attempt " + (attempts + 1) +
-            "/" + MAX_DOWNLOAD_ATTEMPTS + "..."
-          )
-
-          return crawl(absURL, attempts + 1)
-        } else {
-          console.error(
-            "We've hit the download attempt limit (" +
-            MAX_DOWNLOAD_ATTEMPTS + "). Giving up on this path."
-          )
-
-          throw 'FAILED_DOWNLOAD'
-        }
-      }
-    )
-    .catch(error => {
-      if (error === 'FAILED_DOWNLOAD') {
-        // Debug logging for this is already handled above.
-        return []
-      } else {
-        throw error
-      }
-    })
-}
-
-function getHTMLLinks(text) {
-  // Never parse HTML with a regex!
-
-  return $(text).find('a').get().map(a => {
-    const $a = $(a)
-    return [$a.text(), $a.attr('href')]
-  })
-}
-
-if (process.argv.length === 2) {
-  console.log("Usage: http-music-crawl-recursive http://.../example/path/")
-  console.log("..or, npm run crawl-recursive -- http://...")
-} else {
-  let url = process.argv[2]
-
-  if (!(url.endsWith('/'))) {
-    url = url + '/'
-  }
-
-  crawl(url)
-    .then(res => console.log(JSON.stringify(res, null, 2)))
-    .catch(err => console.error(err))
-}
diff --git a/todo.txt b/todo.txt
index 53a5991..3197f92 100644
--- a/todo.txt
+++ b/todo.txt
@@ -76,6 +76,7 @@ TODO: Use NOT the internet as its source, so that it's a bit more
       general (Done!)
 
 TODO: Recursive local file playlist crawler.
+      (Done!)
 
 TODO: *Requiring* a literal `playlist.json` file doesn't seem quite right,
       especially since there's the `--open` option.
--
cgit 1.3.0-6-gf8a5