From 9ab94d9a4ba2896a58db7a2f965808b3bb6ab262 Mon Sep 17 00:00:00 2001
From: liam4
Date: Tue, 30 May 2017 20:09:03 +0000
Subject: Better recursive crawler

---
 README.md          |  5 ++--
 crawl-itunes.js    | 85 ------------------------------------------------------
 crawl-recursive.js | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 play.js            |  2 ++
 4 files changed, 78 insertions(+), 88 deletions(-)
 delete mode 100644 crawl-itunes.js
 create mode 100644 crawl-recursive.js

diff --git a/README.md b/README.md
index db3c69a..7e7478b 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,12 @@ It's also decently powerful.
 
 ```bash
 # On the server; that is, the device that holds the media:
 $ cd my_music_folder
-$ python3 -m http.server 1233
+$ python3 -m http.server
 
 # On the client; that is, the device with http-music:
 $ cd http-music
 $ yarn # to install Node.js dependencies; you'll also need `avconv` and `play` (sox).
-$ node crawl-itunes.js > playlist.json # Bad script name, right?
-# I think you might need to configure crawl-itunes.js to get the right IP and port..
+$ node crawl-recursive.js > playlist.json
 $ node play.js # Go!
 ```
diff --git a/crawl-itunes.js b/crawl-itunes.js
deleted file mode 100644
index 3c0f3f7..0000000
--- a/crawl-itunes.js
+++ /dev/null
@@ -1,85 +0,0 @@
-const fetch = require('node-fetch')
-
-const MAX_DOWNLOAD_ATTEMPTS = 5
-
-function parseDirectoryListing(text) {
-  // Matches all links in a directory listing.
-  // Returns an array where each item is in the format [href, label].
-
-  if (!(text.includes('Directory listing for'))) {
-    throw 'NOT_DIRECTORY_LISTING'
-  }
-
-  const regex = /<a href="([^"]*)">([^>]*)<\/a>/g
-
-  let matches, output = []
-  while (matches = regex.exec(text)) {
-    output.push([matches[1], matches[2]])
-  }
-  return output
-}
-
-function crawl(absURL, attempts = 0) {
-  return fetch(absURL)
-    .then(res => res.text().then(text => playlistifyParse(text, absURL)), err => {
-      console.error('Failed to download: ' + absURL)
-
-      if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
-        console.error(
-          'Trying again. Attempt ' + (attempts + 1) +
-          '/' + MAX_DOWNLOAD_ATTEMPTS + '...'
-        )
-        return crawl(absURL, attempts + 1)
-      } else {
-        console.error(
-          'We\'ve hit the download attempt limit (' +
-          MAX_DOWNLOAD_ATTEMPTS + '). Giving up on ' +
-          'this path.'
-        )
-        throw 'FAILED_DOWNLOAD'
-      }
-    })
-    .catch(error => {
-      if (error === 'FAILED_DOWNLOAD') {
-        // Debug logging for this is already handled above.
-        return []
-      } else {
-        throw error
-      }
-    })
-}
-
-function playlistifyParse(text, absURL) {
-  const links = parseDirectoryListing(text)
-  return Promise.all(links.map(link => {
-    const [ href, title ] = link
-
-    const verbose = process.argv.includes('--verbose')
-
-    if (href.endsWith('/')) {
-      // It's a directory!
-
-      if (verbose) console.log('[Dir] ' + absURL + href)
-      return crawl(absURL + href)
-        .then(res => [title, res])
-        .catch(error => {
-          if (error === 'NOT_DIRECTORY_LISTING') {
-            console.error('Not a directory listing: ' + absURL)
-            return []
-          } else {
-            throw error
-          }
-        })
-    } else {
-      // It's a file!
-
-      if (verbose) console.log('[File] ' + absURL + href)
-      return Promise.resolve([title, absURL + href])
-    }
-  })).catch(error => {
-  })
-}
-
-crawl('http://192.168.2.19:1233/')
-  .then(res => console.log(JSON.stringify(res, null, 2)))
-  .catch(err => console.error(err))
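
A note on a dependency this patch doesn't show: the new crawl-recursive.js below imports getHTMLLinks from crawl-links.js, a file that already exists in the repo (the play.js TODO list mentions it) and isn't touched here. Going by the parseDirectoryListing function just deleted, it plausibly looks something like this sketch; the implementation below is an assumption for illustration, not the actual contents of crawl-links.js:

```js
// Hypothetical sketch of crawl-links.js's getHTMLLinks; assumed, not
// part of this patch. It scans an HTML page for anchor tags and
// collects [label, href] pairs.
function getHTMLLinks(text) {
  const regex = /<a href="([^"]*)">([^>]*)<\/a>/g

  const output = []
  let match
  while (match = regex.exec(text)) {
    const [ , href, label ] = match
    output.push([label, href])
  }
  return output
}

module.exports = { getHTMLLinks }
```

Whatever its real implementation, it must yield the label first: the new script destructures each link as [ title, href ], whereas the deleted parseDirectoryListing returned [href, label] pairs.
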
diff --git a/crawl-recursive.js b/crawl-recursive.js
new file mode 100644
index 0000000..2aa4041
--- /dev/null
+++ b/crawl-recursive.js
@@ -0,0 +1,74 @@
+'use strict'
+
+const MAX_DOWNLOAD_ATTEMPTS = 5
+
+const fetch = require('node-fetch')
+const { getHTMLLinks } = require('./crawl-links')
+
+function crawl(absURL, attempts = 0) {
+  return fetch(absURL)
+    .then(res => res.text().then(text => playlistifyParse(text, absURL)), err => {
+      console.error('Failed to download: ' + absURL)
+
+      if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
+        console.error(
+          'Trying again. Attempt ' + (attempts + 1) +
+          '/' + MAX_DOWNLOAD_ATTEMPTS + '...'
+        )
+        return crawl(absURL, attempts + 1)
+      } else {
+        console.error(
+          'We\'ve hit the download attempt limit (' +
+          MAX_DOWNLOAD_ATTEMPTS + '). Giving up on ' +
+          'this path.'
+        )
+        throw 'FAILED_DOWNLOAD'
+      }
+    })
+    .catch(error => {
+      if (error === 'FAILED_DOWNLOAD') {
+        // Debug logging for this is already handled above.
+        return []
+      } else {
+        throw error
+      }
+    })
+}
+
+function playlistifyParse(text, absURL) {
+  const links = getHTMLLinks(text)
+  const verbose = process.argv.includes('--verbose')
+
+  return Promise.all(links.map(link => {
+    const [ title, href ] = link
+
+    if (href.endsWith('/')) {
+      // It's a directory!
+
+      if (verbose) console.log('[Dir] ' + absURL + href)
+      return crawl(absURL + href)
+        .then(res => [title, res])
+    } else {
+      // It's a file!
+
+      if (verbose) console.log('[File] ' + absURL + href)
+      return Promise.resolve([title, absURL + href])
+    }
+  }))
+}
+
+if (process.argv.length === 2) {
+  console.log('Usage: crawl-recursive http://example.com/example/path')
+} else {
+  console.log('Crawling URL: ' + process.argv[2])
+
+  let url = process.argv[2]
+
+  if (!(url.endsWith('/'))) {
+    url = url + '/'
+  }
+
+  crawl(url)
+    .then(res => console.log(JSON.stringify(res, null, 2)))
+    .catch(err => console.error(err))
+}
diff --git a/play.js b/play.js
index f7604f8..8032378 100644
--- a/play.js
+++ b/play.js
@@ -40,6 +40,7 @@
 // itely true; 'Saucey Sounds'[0] === 'S', and 'Unofficial'[0]
 // === 'U', which are the two "files" it crashes on while playing
 // -g 'Jake Chudnow'.)
+// (Done?)
 //
 // TODO: A way to exclude a specific group path.
 // (Done!)
@@ -58,6 +59,7 @@
 // friendly (i.e. don't require editing the script itself), and
 // make it use the getHTMLLinks function defined in the new
 // crawl-links.js script.
+// (Done!)
 //
 // TODO: Play-in-order track picker.
 // (Done!)
--
cgit 1.3.0-6-gf8a5
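
For reference, the playlist that crawl-recursive.js prints (and play.js consumes) is a nested array: a directory crawls to a [title, contents] pair, a file to a [title, url] pair, and a path that repeatedly fails to download collapses to an empty array. Here's a sketch of that shape; every folder name, file name, and address below is invented for the example (python3 -m http.server serves on port 8000 by default):

```js
// Illustrative output of `node crawl-recursive.js > playlist.json`.
// All names and URLs are made up; real titles come straight from the
// server's directory listing.
const examplePlaylist = [
  ['Cool Album/', [
    ['track1.mp3', 'http://192.168.2.19:8000/Cool%20Album/track1.mp3'],
    ['track2.mp3', 'http://192.168.2.19:8000/Cool%20Album/track2.mp3']
  ]],
  ['loose-song.mp3', 'http://192.168.2.19:8000/loose-song.mp3']
]
```
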