From 6f640a0b8e8e5b26a266f4680a626a629d3c7944 Mon Sep 17 00:00:00 2001
From: Florrie
Date: Fri, 5 Jan 2018 23:20:23 -0400
Subject: Support mpga in crawl-http

---
 src/crawl-http.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crawl-http.js b/src/crawl-http.js
index d3e1533..9c7608e 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -21,7 +21,7 @@ function crawl(absURL, opts = {}, internals = {}) {
     keepSeparateHosts = false,
 
     keepAnyFileType = false,
-    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov'],
+    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga'],
 
     filterRegex = null
   } = opts
--
cgit 1.3.0-6-gf8a5

From 64bcc2930392d70437dc5bc8b2f078840d8998a9 Mon Sep 17 00:00:00 2001
From: Florrie
Date: Sat, 27 Jan 2018 00:23:21 -0400
Subject: Various improvements to crawl-http

Names are now trimmed. You shouldn't see " Vim!" anymore - just "Vim!".

.MOD files are considered to be music.

The crawler will try to avoid going out of whatever directory was passed
to it.

---
 src/crawl-http.js | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/crawl-http.js b/src/crawl-http.js
index 9c7608e..ae38ca4 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -19,9 +19,10 @@ function crawl(absURL, opts = {}, internals = {}) {
     maxAttempts = 5,
 
     keepSeparateHosts = false,
+    stayInSameDirectory = true,
 
     keepAnyFileType = false,
-    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga'],
+    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
 
     filterRegex = null
   } = opts
@@ -56,7 +57,9 @@ function crawl(absURL, opts = {}, internals = {}) {
             name = name.slice(0, -1)
           }
 
-          const urlObj = new url.URL(href, absURL)
+          name = name.trim()
+
+          const urlObj = new url.URL(href, absURL + '/')
           const linkURL = url.format(urlObj)
 
           if (internals.allURLs.includes(linkURL)) {
@@ -79,6 +82,14 @@ function crawl(absURL, opts = {}, internals = {}) {
             return false
           }
 
+          if (stayInSameDirectory) {
+            const relative = path.relative(absURLObj.pathname, urlObj.pathname)
+            if (relative.startsWith('..') || path.isAbsolute(relative)) {
+              verboseLog("[Ignored] Outside of parent directory: " + linkURL)
+              return false
+            }
+          }
+
           if (href.endsWith('/')) {
             // It's a directory!
 
--
cgit 1.3.0-6-gf8a5
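The directory-containment test that patch introduces can be tried on its
own. Here is a minimal sketch, assuming Node's built-in url and path
modules; the staysInDirectory helper and the example URLs are made up for
illustration and are not part of crawl-http, which does this inline:

    // Sketch of the stayInSameDirectory check from the patch above.
    const url = require('url')
    const path = require('path')

    function staysInDirectory(absURL, href) {
      // Resolve the link against the base URL, adding a trailing slash
      // the same way the patch does in new url.URL(href, absURL + '/').
      const absURLObj = new url.URL(absURL + '/')
      const urlObj = new url.URL(href, absURL + '/')

      // If the resolved path is only reachable by going up ('..') or by
      // an absolute jump, the link points outside the starting directory.
      const relative = path.relative(absURLObj.pathname, urlObj.pathname)
      return !(relative.startsWith('..') || path.isAbsolute(relative))
    }

    console.log(staysInDirectory('http://example.com/music', 'album/track.mp3'))
    // true - /music/album/track.mp3 is under /music/
    console.log(staysInDirectory('http://example.com/music', '../other/track.mp3'))
    // false - the link escapes to /other/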
From 60d4ac4b28eee349070ad0930330654e2d67e27d Mon Sep 17 00:00:00 2001
From: Florrie
Date: Sat, 27 Jan 2018 00:39:22 -0400
Subject: Make crawl-http go through one directory at a time

Hopefully this makes the tool, like, less of an unintentional
denial-of-service.

---
 src/crawl-http.js | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/crawl-http.js b/src/crawl-http.js
index ae38ca4..b40ed02 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -44,10 +44,12 @@ function crawl(absURL, opts = {}, internals = {}) {
 
   return fetch(absURL)
     .then(
-      res => res.text().then(text => {
+      res => res.text().then(async text => {
         const links = getHTMLLinks(text)
 
-        return Promise.all(links.map(link => {
+        const items = []
+
+        for (const link of links) {
           let [ name, href ] = link
 
           // If the name (that's the content inside of <a>..</a>) ends with a
@@ -64,29 +66,26 @@ function crawl(absURL, opts = {}, internals = {}) {
 
           if (internals.allURLs.includes(linkURL)) {
             verboseLog("[Ignored] Already done this URL: " + linkURL)
-
-            return false
+            continue
           }
 
           internals.allURLs.push(linkURL)
 
           if (filterRegex && !(filterRegex.test(linkURL))) {
             verboseLog("[Ignored] Failed regex: " + linkURL)
-
-            return false
+            continue
           }
 
           if (!keepSeparateHosts && urlObj.host !== absURLObj.host) {
             verboseLog("[Ignored] Inconsistent host: " + linkURL)
-
-            return false
+            continue
           }
 
           if (stayInSameDirectory) {
             const relative = path.relative(absURLObj.pathname, urlObj.pathname)
             if (relative.startsWith('..') || path.isAbsolute(relative)) {
               verboseLog("[Ignored] Outside of parent directory: " + linkURL)
-              return false
+              continue
             }
           }
 
@@ -95,8 +94,10 @@ function crawl(absURL, opts = {}, internals = {}) {
 
             verboseLog("[Dir] " + linkURL)
 
-            return crawl(linkURL, opts, Object.assign({}, internals))
-              .then(({ items }) => ({name, items}))
+            items.push(await (
+              crawl(linkURL, opts, Object.assign({}, internals))
+                .then(({ items }) => ({name, items}))
+            ))
           } else {
             // It's a file!
 
@@ -107,14 +108,15 @@ function crawl(absURL, opts = {}, internals = {}) {
               !(extensions.includes(path.extname(href)))
             ) {
               verboseLog("[Ignored] Bad extension: " + linkURL)
-
-              return false
+              continue
             }
 
             verboseLog("[File] " + linkURL)
 
-            return Promise.resolve({name, downloaderArg: linkURL})
+            items.push({name, downloaderArg: linkURL})
           }
-        }).filter(Boolean)).then(items => ({items}))
+        }
+
+        return {items}
       }),
       err => {
--
cgit 1.3.0-6-gf8a5

From fe65f1777f130ec9d61c5ce06532a551b5dcc899 Mon Sep 17 00:00:00 2001
From: Florrie
Date: Mon, 12 Feb 2018 19:28:49 -0400
Subject: Make it reasonable to have crawl-http save to a file while verbosely logging

---
 src/crawl-http.js | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/crawl-http.js b/src/crawl-http.js
index b40ed02..5a4932d 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -36,7 +36,7 @@ function crawl(absURL, opts = {}, internals = {}) {
 
   const verboseLog = text => {
     if (verbose) {
-      console.log(text)
+      console.error(text)
     }
   }
 
@@ -203,7 +203,10 @@ async function main(args, shouldReturn = false) {
       // such. Defaults to false.
 
       verbose = true
-      console.log('Outputting verbosely.')
+      console.error(
+        'Outputting verbosely. (Log output goes to STDERR - ' +
+        'you can still pipe to a file to save your playlist.)'
+      )
     },
 
     'v': util => util.alias('-verbose'),
--
cgit 1.3.0-6-gf8a5
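That last change is easy to see in isolation. A minimal sketch of the
stdout/stderr split, assuming Node; sketch.js and playlist.json are
made-up names for illustration:

    // Sketch of why verboseLog switched to console.error: diagnostics
    // go to STDERR while the playlist goes to STDOUT, so a shell
    // redirect can separate the two streams. Data below is made up.
    const verboseLog = text => console.error(text) // -> STDERR

    verboseLog('[File] http://example.com/music/track.mp3')
    console.log(JSON.stringify({items: [{name: 'track.mp3'}]}, null, 2)) // -> STDOUT

Run as node sketch.js > playlist.json, the "[File]" line stays visible on
the terminal and only the JSON lands in playlist.json - the same split
that lets crawl-http log verbosely while its playlist is piped to a file.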