From 64bcc2930392d70437dc5bc8b2f078840d8998a9 Mon Sep 17 00:00:00 2001 From: Florrie Date: Sat, 27 Jan 2018 00:23:21 -0400 Subject: Various improvements to crawl-http Names are now trimmed. You shouldn't see " Vim!" anymore - just "Vim!". .MOD files are considered to be music. The crawler will try to avoid going out of whatever directory was passed to it. --- src/crawl-http.js | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/crawl-http.js b/src/crawl-http.js index 9c7608e..ae38ca4 100755 --- a/src/crawl-http.js +++ b/src/crawl-http.js @@ -19,9 +19,10 @@ function crawl(absURL, opts = {}, internals = {}) { maxAttempts = 5, keepSeparateHosts = false, + stayInSameDirectory = true, keepAnyFileType = false, - fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga'], + fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'], filterRegex = null } = opts @@ -56,7 +57,9 @@ function crawl(absURL, opts = {}, internals = {}) { name = name.slice(0, -1) } - const urlObj = new url.URL(href, absURL) + name = name.trim() + + const urlObj = new url.URL(href, absURL + '/') const linkURL = url.format(urlObj) if (internals.allURLs.includes(linkURL)) { @@ -79,6 +82,14 @@ function crawl(absURL, opts = {}, internals = {}) { return false } + if (stayInSameDirectory) { + const relative = path.relative(absURLObj.pathname, urlObj.pathname) + if (relative.startsWith('..') || path.isAbsolute(relative)) { + verboseLog("[Ignored] Outside of parent directory: " + linkURL) + return false + } + } + if (href.endsWith('/')) { // It's a directory! -- cgit 1.3.0-6-gf8a5