diff options
author | Florrie <towerofnix@gmail.com> | 2018-01-27 00:23:21 -0400 |
---|---|---|
committer | Florrie <towerofnix@gmail.com> | 2018-01-27 00:35:06 -0400 |
commit | 64bcc2930392d70437dc5bc8b2f078840d8998a9 (patch) | |
tree | 7e5250e6859a112c85a264c313a9e57818db7c21 | |
parent | a72500509a5a334bd8f0f7d490a4833c03201966 (diff) |
Various improvements to crawl-http
Names are now trimmed. You shouldn't see " Vim!" anymore - just "Vim!". .MOD files are considered to be music. The crawler will try to avoid going out of whatever directory was passed to it.
-rwxr-xr-x | src/crawl-http.js | 15 |
1 file changed, 13 insertions, 2 deletions
```diff
diff --git a/src/crawl-http.js b/src/crawl-http.js
index 9c7608e..ae38ca4 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -19,9 +19,10 @@ function crawl(absURL, opts = {}, internals = {}) {
     maxAttempts = 5,
 
     keepSeparateHosts = false,
+    stayInSameDirectory = true,
 
     keepAnyFileType = false,
-    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga'],
+    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
 
     filterRegex = null
   } = opts
@@ -56,7 +57,9 @@ function crawl(absURL, opts = {}, internals = {}) {
             name = name.slice(0, -1)
           }
 
-          const urlObj = new url.URL(href, absURL)
+          name = name.trim()
+
+          const urlObj = new url.URL(href, absURL + '/')
           const linkURL = url.format(urlObj)
 
           if (internals.allURLs.includes(linkURL)) {
@@ -79,6 +82,14 @@ function crawl(absURL, opts = {}, internals = {}) {
             return false
           }
 
+          if (stayInSameDirectory) {
+            const relative = path.relative(absURLObj.pathname, urlObj.pathname)
+            if (relative.startsWith('..') || path.isAbsolute(relative)) {
+              verboseLog("[Ignored] Outside of parent directory: " + linkURL)
+              return false
+            }
+          }
+
           if (href.endsWith('/')) {
             // It's a directory!
 
```