From 3c65d4ca6246c5723d9fc6a73371150d06c0b6c2 Mon Sep 17 00:00:00 2001 From: Florrie Date: Tue, 5 Jun 2018 21:14:10 -0300 Subject: Dumb http crawler changes --- crawlers.js | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'crawlers.js') diff --git a/crawlers.js b/crawlers.js index 0bf5c4e..8ba70f3 100644 --- a/crawlers.js +++ b/crawlers.js @@ -27,12 +27,13 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { maxAttempts = 5, - keepSeparateHosts = false, + allowedExternalHostRegex = null, stayInSameDirectory = true, keepAnyFileType = false, fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'], + forceGroupRegex = null, filterRegex = null } = opts @@ -55,12 +56,17 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { .then( res => res.text().then(async text => { const links = getHTMLLinks(text) + console.log(links) const items = [] for (const link of links) { let [ name, href ] = link + if (!href) { + continue + } + // If the name (that's the content inside of <a>..</a>) ends with a // slash, that's probably just an artifact of a directory lister; // not actually part of the intended content. So we remove it! @@ -85,12 +91,20 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { continue } - if (!keepSeparateHosts && urlObj.host !== absURLObj.host) { + if (urlObj.host !== absURLObj.host && !( + allowedExternalHostRegex && new RegExp(allowedExternalHostRegex) + .test(urlObj.host))) { verboseLog("[Ignored] Inconsistent host: " + linkURL) continue } - if (stayInSameDirectory) { + if (stayInSameDirectory) sameDir: { + // Don't bother with staying in the same directory if it's on a + // different host. 
+ if (urlObj.host !== absURLObj.host) { + break sameDir + } + const relative = path.relative(absURLObj.pathname, urlObj.pathname) if (relative.startsWith('..') || path.isAbsolute(relative)) { verboseLog("[Ignored] Outside of parent directory: " + linkURL) @@ -98,7 +112,7 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { } } - if (href.endsWith('/')) { + if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) { // It's a directory! verboseLog("[Dir] " + linkURL) -- cgit 1.3.0-6-gf8a5