diff options
author | liam4 <towerofnix@gmail.com> | 2017-05-30 20:09:03 +0000 |
---|---|---|
committer | liam4 <towerofnix@gmail.com> | 2017-05-30 20:09:03 +0000 |
commit | 9ab94d9a4ba2896a58db7a2f965808b3bb6ab262 (patch) | |
tree | 73a4602cd3322458d495ed4898ae5fd4082cf414 | |
parent | e8fa654bcb8a1b191e1c1ce5853c9f0cb529c11d (diff) |
Better recursive crawler
-rw-r--r-- | README.md | 5 |
-rw-r--r-- | crawl-recursive.js (renamed from crawl-itunes.js) | 57 |
-rw-r--r-- | play.js | 2 |
3 files changed, 27 insertions, 37 deletions
diff --git a/README.md b/README.md index db3c69a..7e7478b 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,12 @@ It's also decently powerful. ```bash # On the server; that is, the device that holds the media: $ cd my_music_folder -$ python3 -m http.server 1233 +$ python3 -m http.server <some_port> # On the client; that is, the device with http-music: $ cd http-music $ yarn # to install Node.js dependencies; you'll also need `avconv` and `play` (sox). -$ node crawl-itunes.js > playlist.json # Bad script name, right? -# I think you might need to configure crawl-itunes.js to get the right IP and port.. +$ node crawl-recursive.js <server_ip> > playlist.json $ node play.js # Go! ``` diff --git a/crawl-itunes.js b/crawl-recursive.js index 3c0f3f7..2aa4041 100644 --- a/crawl-itunes.js +++ b/crawl-recursive.js @@ -1,23 +1,9 @@ -const fetch = require('node-fetch') +'use strict' const MAX_DOWNLOAD_ATTEMPTS = 5 -function parseDirectoryListing(text) { - // Matches all links in a directory listing. - // Returns an array where each item is in the format [href, label]. - - if (!(text.includes('Directory listing for'))) { - throw 'NOT_DIRECTORY_LISTING' - } - - const regex = /<a href="([^"]*)">([^>]*)<\/a>/g - - let matches, output = [] - while (matches = regex.exec(text)) { - output.push([matches[1], matches[2]]) - } - return output -} +const fetch = require('node-fetch') +const { getHTMLLinks } = require('./crawl-links') function crawl(absURL, attempts = 0) { return fetch(absURL) @@ -50,11 +36,11 @@ function crawl(absURL, attempts = 0) { } function playlistifyParse(text, absURL) { - const links = parseDirectoryListing(text) - return Promise.all(links.map(link => { - const [ href, title ] = link + const links = getHTMLLinks(text) + const verbose = process.argv.includes('--verbose') - const verbose = process.argv.includes('--verbose') + return Promise.all(links.map(link => { + const [ title, href ] = link if (href.endsWith('/')) { // It's a directory! 
@@ -62,24 +48,27 @@ function playlistifyParse(text, absURL) { if (verbose) console.log('[Dir] ' + absURL + href) return crawl(absURL + href) .then(res => [title, res]) - .catch(error => { - if (error === 'NOT_DIRECTORY_LISTING') { - console.error('Not a directory listing: ' + absURL) - return [] - } else { - throw error - } - }) } else { // It's a file! if (verbose) console.log('[File] ' + absURL + href) return Promise.resolve([title, absURL + href]) } - })).catch(error => { - }) + })) } -crawl('http://192.168.2.19:1233/') - .then(res => console.log(JSON.stringify(res, null, 2))) - .catch(err => console.error(err)) +if (process.argv.length === 2) { + console.log('Usage: crawl-recursive http://example.com/example/path') +} else { + console.log('Crawling URL: ' + process.argv[2]) + + let url = process.argv[2] + + if (!(url.endsWith('/'))) { + url = url + '/' + } + + crawl(url) + .then(res => console.log(JSON.stringify(res, null, 2))) + .catch(err => console.error(err)) +} diff --git a/play.js b/play.js index f7604f8..8032378 100644 --- a/play.js +++ b/play.js @@ -40,6 +40,7 @@ // itely true; 'Saucey Sounds'[0] === 'S', and 'Unofficial'[0] // === 'U', which are the two "files" it crashes on while playing // -g 'Jake Chudnow'.) +// (Done?) // // TODO: A way to exclude a specific group path. // (Done!) @@ -58,6 +59,7 @@ // friendly (i.e. don't require editing the script itself), and // make it use the getHTMLLinks function defined in the new // crawl-links.js script. +// (Done!) // // TODO: Play-in-order track picker. // (Done!) |