From 60d4ac4b28eee349070ad0930330654e2d67e27d Mon Sep 17 00:00:00 2001 From: Florrie Date: Sat, 27 Jan 2018 00:39:22 -0400 Subject: Make crawl-http go through one directory at a time Hopefully this makes the tool, like, less of an unintentional denial-of-service. --- src/crawl-http.js | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/crawl-http.js b/src/crawl-http.js index ae38ca4..b40ed02 100755 --- a/src/crawl-http.js +++ b/src/crawl-http.js @@ -44,10 +44,12 @@ function crawl(absURL, opts = {}, internals = {}) { return fetch(absURL) .then( - res => res.text().then(text => { + res => res.text().then(async text => { const links = getHTMLLinks(text) - return Promise.all(links.map(link => { + const items = [] + + for (const link of links) { let [ name, href ] = link // If the name (that's the content inside of ..) ends with a @@ -64,29 +66,26 @@ function crawl(absURL, opts = {}, internals = {}) { if (internals.allURLs.includes(linkURL)) { verboseLog("[Ignored] Already done this URL: " + linkURL) - - return false + continue } internals.allURLs.push(linkURL) if (filterRegex && !(filterRegex.test(linkURL))) { verboseLog("[Ignored] Failed regex: " + linkURL) - - return false + continue } if (!keepSeparateHosts && urlObj.host !== absURLObj.host) { verboseLog("[Ignored] Inconsistent host: " + linkURL) - - return false + continue } if (stayInSameDirectory) { const relative = path.relative(absURLObj.pathname, urlObj.pathname) if (relative.startsWith('..') || path.isAbsolute(relative)) { verboseLog("[Ignored] Outside of parent directory: " + linkURL) - return false + continue } } @@ -95,8 +94,10 @@ function crawl(absURL, opts = {}, internals = {}) { verboseLog("[Dir] " + linkURL) - return crawl(linkURL, opts, Object.assign({}, internals)) - .then(({ items }) => ({name, items})) + items.push(await ( + crawl(linkURL, opts, Object.assign({}, internals)) + .then(({ items }) => ({name, items})) + )) } else { // It's a file! @@ -107,14 +108,15 @@ function crawl(absURL, opts = {}, internals = {}) { !(extensions.includes(path.extname(href))) ) { verboseLog("[Ignored] Bad extension: " + linkURL) - - return false + continue } verboseLog("[File] " + linkURL) - return Promise.resolve({name, downloaderArg: linkURL}) + items.push({name, downloaderArg: linkURL}) } - }).filter(Boolean)).then(items => ({items})) + } + + return {items} }), err => { -- cgit 1.3.0-6-gf8a5