Better recursive crawler

author: liam4 <towerofnix@gmail.com> 2017-05-30 20:09:03 +0000
committer: liam4 <towerofnix@gmail.com> 2017-05-30 20:09:03 +0000
commit: 9ab94d9a4ba2896a58db7a2f965808b3bb6ab262 (patch)
tree: 73a4602cd3322458d495ed4898ae5fd4082cf414
parent: e8fa654bcb8a1b191e1c1ce5853c9f0cb529c11d (diff)
3 files changed, 27 insertions, 37 deletions
diff --git a/README.md b/README.md
index db3c69a..7e7478b 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,12 @@ It's also decently powerful.
 ```bash
 # On the server; that is, the device that holds the media:
 $ cd my_music_folder
-$ python3 -m http.server 1233
+$ python3 -m http.server <some_port>
 
 # On the client; that is, the device with http-music:
 $ cd http-music
 $ yarn  # to install Node.js dependencies; you'll also need `avconv` and `play` (sox).
-$ node crawl-itunes.js > playlist.json  # Bad script name, right?
-# I think you might need to configure crawl-itunes.js to get the right IP and port..
+$ node crawl-recursive.js http://<server_ip>:<some_port>/ > playlist.json
 $ node play.js  # Go!
 ```
 
diff --git a/crawl-itunes.js b/crawl-recursive.js
index 3c0f3f7..2aa4041 100644
--- a/crawl-itunes.js
+++ b/crawl-recursive.js
@@ -1,23 +1,9 @@
-const fetch = require('node-fetch')
+'use strict'
 
 const MAX_DOWNLOAD_ATTEMPTS = 5
 
-function parseDirectoryListing(text) {
-	// Matches all links in a directory listing.
-	// Returns an array where each item is in the format [href, label].
-
-	if (!(text.includes('Directory listing for'))) {
-		throw 'NOT_DIRECTORY_LISTING'
-	}
-
-	const regex = /<a href="([^"]*)">([^>]*)<\/a>/g
-
-	let matches, output = []
-	while (matches = regex.exec(text)) {
-		output.push([matches[1], matches[2]])
-	}
-	return output
-}
+const fetch = require('node-fetch')
+const { getHTMLLinks } = require('./crawl-links')
 
 function crawl(absURL, attempts = 0) {
 	return fetch(absURL)
@@ -50,11 +36,11 @@ function crawl(absURL, attempts = 0) {
 }
 
 function playlistifyParse(text, absURL) {
-	const links = parseDirectoryListing(text)
-	return Promise.all(links.map(link => {
-		const [ href, title ] = link
+	const links = getHTMLLinks(text)
+	const verbose = process.argv.includes('--verbose')
 
-		const verbose = process.argv.includes('--verbose')
+	return Promise.all(links.map(link => {
+		const [ title, href ] = link
 
 		if (href.endsWith('/')) {
 			// It's a directory!
@@ -62,24 +48,27 @@ function playlistifyParse(text, absURL) {
 			if (verbose) console.log('[Dir] ' + absURL + href)
 			return crawl(absURL + href)
 				.then(res => [title, res])
-				.catch(error => {
-					if (error === 'NOT_DIRECTORY_LISTING') {
-						console.error('Not a directory listing: ' + absURL)
-						return []
-					} else {
-						throw error
-					}
-				})
 		} else {
 			// It's a file!
 
 			if (verbose) console.log('[File] ' + absURL + href)
 			return Promise.resolve([title, absURL + href])
 		}
-	})).catch(error => {
-	})
+	}))
 }
 
-crawl('http://192.168.2.19:1233/')
-	.then(res => console.log(JSON.stringify(res, null, 2)))
-	.catch(err => console.error(err))
+if (process.argv.length === 2) {
+	console.log('Usage: crawl-recursive http://example.com/example/path')
+} else {
+	console.log('Crawling URL: ' + process.argv[2])
+
+	let url = process.argv[2]
+
+	if (!(url.endsWith('/'))) {
+		url = url + '/'
+	}
+
+	crawl(url)
+		.then(res => console.log(JSON.stringify(res, null, 2)))
+		.catch(err => console.error(err))
+}
diff --git a/play.js b/play.js
index f7604f8..8032378 100644
--- a/play.js
+++ b/play.js
@@ -40,6 +40,7 @@
 //       itely true; 'Saucey Sounds'[0] === 'S', and 'Unofficial'[0]
 //       === 'U', which are the two "files" it crashes on while playing
 //       -g 'Jake Chudnow'.)
+//       (Done?)
 //
 // TODO: A way to exclude a specific group path.
 //       (Done!)
@@ -58,6 +59,7 @@
 //       friendly (i.e. don't require editing the script itself), and
 //       make it use the getHTMLLinks function defined in the new
 //       crawl-links.js script.
+//       (Done!)
 //
 // TODO: Play-in-order track picker.
 //       (Done!)
author	liam4 <towerofnix@gmail.com>	2017-05-30 20:09:03 +0000
committer	liam4 <towerofnix@gmail.com>	2017-05-30 20:09:03 +0000
commit	9ab94d9a4ba2896a58db7a2f965808b3bb6ab262 (patch)
tree	73a4602cd3322458d495ed4898ae5fd4082cf414
parent	e8fa654bcb8a1b191e1c1ce5853c9f0cb529c11d (diff)