From 72eff553df4fd4496172cf8d26b4585ac9f34b49 Mon Sep 17 00:00:00 2001 From: Florrie Date: Sat, 22 Dec 2018 13:45:33 -0400 Subject: HTTP crawler stuff Basically, support for cors-anywhere. --- crawlers.js | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/crawlers.js b/crawlers.js index 4c96c85..3a1436d 100644 --- a/crawlers.js +++ b/crawlers.js @@ -75,7 +75,15 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { name = name.trim() - const urlObj = new URL(href, absURL + '/') + let base + if (path.extname(absURL)) { + base = path.dirname(absURL) + '/' + console.log('extname:', path.extname(absURL), 'so base:', base) + } else { + base = absURL + } + + const urlObj = new URL(href, base) const linkURL = urlObj.toString() if (internals.allURLs.includes(linkURL)) { @@ -104,9 +112,9 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { break sameDir } - const relative = path.relative(absURLObj.pathname, urlObj.pathname) + const relative = path.relative((new URL(base)).pathname, urlObj.pathname) if (relative.startsWith('..') || path.isAbsolute(relative)) { - verboseLog("[Ignored] Outside of parent directory: " + linkURL) + verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base) continue } } -- cgit 1.3.0-6-gf8a5