Make max crawl-http download attempts settable by user

author: Liam <towerofnix@gmail.com> 2017-06-16 21:25:59 -0300
committer: Liam <towerofnix@gmail.com> 2017-06-16 21:25:59 -0300
commit: 8a95e5f61792748ab9bebe5fc7e551c66a178b2d (patch)
tree: 4a9345caf2ea7b6402a3c0848a1bb379915f672b
parent: 5f22ac6b2cc2a3946cbe754b06610e82ce004dff (diff)
3 files changed, 112 insertions, 16 deletions
diff --git a/man/http-music-crawl-http.1 b/man/http-music-crawl-http.1
new file mode 100644
index 0000000..949d682
--- /dev/null
+++ b/man/http-music-crawl-http.1
@@ -0,0 +1,78 @@
+.TH http-music-crawl-http 1
+
+
+.SH NAME
+http-music-crawl-http - create a playlist file using an HTTP-based directory listing
+
+
+.SH SYNOPSIS
+.B http-music-crawl-http
+\fIdownloadURL\fR
+[opts...]
+
+
+.SH DESCRIPTION
+\fBhttp-music-crawl-http\fR is a (convolutedly-named) command line utility used to generate playlist files for \fBhttp-music\fR by crawling the directory listing found at a given URL.
+http-music uses playlist files as its source of music; without a playlist file, the program simply does not know what to play!
+
+.PP
+The resulting playlist file is structured as a tree that represents the path that the crawler follows.
+For instance, if the links of a directory listing give the following tree:
+
+.PP
+.nf
+.RS
+http://example.com/
+  Rainbows/
+    Sunlight.mp3
+    Rainbows.mp3
+    Pineapples.mp3
+  Cool Author 72/
+    Good Album/
+      Hello world!.mp3
+  Bad News/
+    Bad News - Single/
+      Bad News.mp3
+    Irony/
+      Rahhhh!!.mp3
+.RE
+.fi
+
+.PP
+\[char46]\[char46]then the following playlist file is generated:
+
+.PP
+.nf
+.RS
+[
+  ['Rainbows', [
+    ['Sunlight', 'http://example.com/Rainbows/Sunlight.mp3'],
+    ['Rainbows', 'http://example.com/Rainbows/Rainbows.mp3'],
+    ['Pineapples', 'http://example.com/Rainbows/Pineapples.mp3']
+  ]],
+  ['Cool Author 72', [
+    ['Good Album', [
+      ['Hello world!', 'http://example.com/Cool%20Author%2072/Good%20Album/Hello%20world!.mp3']
+    ]]
+  ]],
+  ['Bad News', [
+    ['Bad News - Single', [
+      ['Bad News', 'http://example.com/Bad%20News/Bad%20News%20-%20Single/Bad%20News.mp3']
+    ]],
+    ['Irony', [
+      ['Rahhhh!!', 'http://example.com/Bad%20News/Irony/Rahhhh!!.mp3']
+    ]]
+  ]]
+]
+.RE
+.fi
+
+.PP
+As you can see, the resulting playlist file follows the same structure as the directory listing.
+
+
+.SH OPTIONS
+.TP
+.BR -m ", " --max-download-attempts
+Sets the maximum number of times the crawler will try to download any single directory listing when the HTTP download request fails.
+Defaults to 5.
diff --git a/src/crawl-http.js b/src/crawl-http.js
index fa078e3..05685e4 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -2,14 +2,13 @@
 
 'use strict'
 
-const MAX_DOWNLOAD_ATTEMPTS = 5
-
 const fetch = require('node-fetch')
 const $ = require('cheerio')
 const url = require('url')
 const path = require('path')
+const processArgv = require('./process-argv')
 
-function crawl(absURL, attempts = 0) {
+function crawl(absURL, maxAttempts = 5, attempts = 0) {
   // Recursively crawls a given URL, following every link to a deeper path and
   // recording all links in a tree (in the same format playlists use). Makes
   // multiple attempts to download failed paths.
@@ -28,7 +27,7 @@ function crawl(absURL, attempts = 0) {
             // It's a directory!
 
             if (verbose) console.log("[Dir] " + linkURL)
-            return crawl(linkURL)
+            return crawl(linkURL, maxAttempts)
               .then(res => [title, res])
           } else {
             // It's a file!
@@ -42,17 +41,16 @@ function crawl(absURL, attempts = 0) {
       err => {
         console.warn("Failed to download: " + absURL)
 
-        if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
+        if (attempts < maxAttempts) {
           console.warn(
-            "Trying again. Attempt " + (attempts + 1) +
-            "/" + MAX_DOWNLOAD_ATTEMPTS + "..."
+            `Trying again. Attempt ${attempts + 1}/${maxAttempts}...`
           )
 
-          return crawl(absURL, attempts + 1)
+          return crawl(absURL, maxAttempts, attempts + 1)
         } else {
           console.error(
-            "We've hit the download attempt limit (" +
-            MAX_DOWNLOAD_ATTEMPTS + "). Giving up on this path."
+            "We've hit the download attempt limit (" + maxAttempts + "). " +
+            "Giving up on this path."
           )
 
           throw 'FAILED_DOWNLOAD'
@@ -78,13 +76,32 @@ function getHTMLLinks(text) {
   })
 }
 
-if (process.argv.length === 2) {
-  console.log("Usage: http-music-crawl-http http://.../example/path/")
-  console.log("..or, npm run crawl-http -- http://.../example/path/")
-} else {
+async function main() {
   let url = process.argv[2]
 
-  crawl(url)
-    .then(res => console.log(JSON.stringify(res, null, 2)))
+  let maxDownloadAttempts = 5
+  // Options follow the URL argument; a handler below may override the default.
+  await processArgv(process.argv.slice(3), {
+    '-max-download-attempts': function(util) {
+      // --max-download-attempts <max>  (alias: -m)
+      // Sets the maximum number of times to attempt downloading the index for
+      // any one directory. Defaults to 5.
+
+      maxDownloadAttempts = util.nextArg()
+      // (Debug echo removed so it is not mixed into the printed playlist.)
+    },
+
+    'm': util => util.alias('-max-download-attempts')
+  })
+  // crawl() resolves with the tree of links found under the given URL.
+  const downloadedPlaylist = await crawl(url, maxDownloadAttempts)
+  // Print here: the caller of main() only attaches a .catch() handler.
+  console.log(JSON.stringify(downloadedPlaylist, null, 2))
+}
+
+if (process.argv.length === 2) {
+  console.log("Usage: http-music-crawl-http http://.../example/path/ [opts]")
+} else {
+  main()
     .catch(err => console.error(err))
 }
diff --git a/todo.txt b/todo.txt
index 7620dba..1ad2215 100644
--- a/todo.txt
+++ b/todo.txt
@@ -139,3 +139,4 @@ TODO: Figure out why written track files (when using HTTP downloader) are
 
 TODO: Make max download attempts variable by the user (without requiring
       source editing, obviously).
+      (Done!)
author	Liam <towerofnix@gmail.com>	2017-06-16 21:25:59 -0300
committer	Liam <towerofnix@gmail.com>	2017-06-16 21:25:59 -0300
commit	8a95e5f61792748ab9bebe5fc7e551c66a178b2d (patch)
tree	4a9345caf2ea7b6402a3c0848a1bb379915f672b
parent	5f22ac6b2cc2a3946cbe754b06610e82ce004dff (diff)