From b0f256ea0352fd86f40c6e4bd18bee47c36c320b Mon Sep 17 00:00:00 2001 From: Florrie Date: Sat, 22 Dec 2018 13:45:33 -0400 Subject: HTTP crawler stuff Basically, support for cors-anywhere. (Cherry-picked from web-mtui branch) --- crawlers.js | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/crawlers.js b/crawlers.js index feeedf2..c12948f 100644 --- a/crawlers.js +++ b/crawlers.js @@ -86,7 +86,15 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { name = name.trim() - const urlObj = new url.URL(href, absURL + '/') + let base + if (path.extname(absURL)) { + base = path.dirname(absURL) + '/' + console.log('extname:', path.extname(absURL), 'so base:', base) + } else { + base = absURL + } + + const urlObj = new url.URL(href, base) const linkURL = url.format(urlObj) if (internals.allURLs.includes(linkURL)) { @@ -115,9 +123,9 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { break sameDir } - const relative = path.relative(absURLObj.pathname, urlObj.pathname) + const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname) if (relative.startsWith('..') || path.isAbsolute(relative)) { - verboseLog("[Ignored] Outside of parent directory: " + linkURL) + verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base) continue } } -- cgit 1.3.0-6-gf8a5