diff options
author | Florrie <towerofnix@gmail.com> | 2018-06-05 21:14:10 -0300 |
---|---|---|
committer | Florrie <towerofnix@gmail.com> | 2018-06-05 21:14:10 -0300 |
commit | 3c65d4ca6246c5723d9fc6a73371150d06c0b6c2 (patch) | |
tree | a60395e67eab694c1e5ac54f5ddcd1adc3ec54b9 | |
parent | a0757178bdfacb2f5fe8d6c3a02a09b5cc26bb28 (diff) |
Dumb http crawler changes
-rw-r--r-- | crawlers.js | 22 |
1 file changed, 18 insertions, 4 deletions
diff --git a/crawlers.js b/crawlers.js
index 0bf5c4e..8ba70f3 100644
--- a/crawlers.js
+++ b/crawlers.js
@@ -27,12 +27,13 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
     maxAttempts = 5,
-    keepSeparateHosts = false,
+    allowedExternalHostRegex = null,
     stayInSameDirectory = true,
     keepAnyFileType = false,
     fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
+    forceGroupRegex = null,
     filterRegex = null
   } = opts
@@ -55,12 +56,17 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
         .then(
           res => res.text().then(async text => {
             const links = getHTMLLinks(text)
+            console.log(links)
             const items = []
             for (const link of links) {
               let [ name, href ] = link
+              if (!href) {
+                continue
+              }
+
               // If the name (that's the content inside of <a>..</a>) ends with a
               // slash, that's probably just an artifact of a directory lister;
               // not actually part of the intended content. So we remove it!
@@ -85,12 +91,20 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
                 continue
               }
-              if (!keepSeparateHosts && urlObj.host !== absURLObj.host) {
+              if (urlObj.host !== absURLObj.host && !(
+                allowedExternalHostRegex && new RegExp(allowedExternalHostRegex)
+                  .test(urlObj.host))) {
                 verboseLog("[Ignored] Inconsistent host: " + linkURL)
                 continue
               }
-              if (stayInSameDirectory) {
+              if (stayInSameDirectory) sameDir: {
+                // Don't bother with staying in the same directory if it's on a
+                // different host.
+                if (urlObj.host !== absURLObj.host) {
+                  break sameDir
+                }
+
                 const relative = path.relative(absURLObj.pathname, urlObj.pathname)
                 if (relative.startsWith('..') || path.isAbsolute(relative)) {
                   verboseLog("[Ignored] Outside of parent directory: " + linkURL)
@@ -98,7 +112,7 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
                 }
               }
-              if (href.endsWith('/')) {
+              if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) {
                 // It's a directory!
                 verboseLog("[Dir] " + linkURL)