diff options
author | Florrie <towerofnix@gmail.com> | 2018-01-27 00:39:22 -0400 |
---|---|---|
committer | Florrie <towerofnix@gmail.com> | 2018-01-27 00:39:24 -0400 |
commit | 60d4ac4b28eee349070ad0930330654e2d67e27d (patch) | |
tree | 3f52a7447fa43fe3f809c79f798c4edd8d6dc288 | |
parent | 64bcc2930392d70437dc5bc8b2f078840d8998a9 (diff) |
Make crawl-http go through one directory at a time
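Hopefully this makes the tool less of an unintentional denial-of-service attack on the servers it crawls: subdirectories are now fetched sequentially, one at a time, instead of all being requested at once.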
-rwxr-xr-x | src/crawl-http.js | 32 |
1 file changed, 17 insertions, 15 deletions
diff --git a/src/crawl-http.js b/src/crawl-http.js index ae38ca4..b40ed02 100755 --- a/src/crawl-http.js +++ b/src/crawl-http.js @@ -44,10 +44,12 @@ function crawl(absURL, opts = {}, internals = {}) { return fetch(absURL) .then( - res => res.text().then(text => { + res => res.text().then(async text => { const links = getHTMLLinks(text) - return Promise.all(links.map(link => { + const items = [] + + for (const link of links) { let [ name, href ] = link // If the name (that's the content inside of <a>..</a>) ends with a @@ -64,29 +66,26 @@ function crawl(absURL, opts = {}, internals = {}) { if (internals.allURLs.includes(linkURL)) { verboseLog("[Ignored] Already done this URL: " + linkURL) - - return false + continue } internals.allURLs.push(linkURL) if (filterRegex && !(filterRegex.test(linkURL))) { verboseLog("[Ignored] Failed regex: " + linkURL) - - return false + continue } if (!keepSeparateHosts && urlObj.host !== absURLObj.host) { verboseLog("[Ignored] Inconsistent host: " + linkURL) - - return false + continue } if (stayInSameDirectory) { const relative = path.relative(absURLObj.pathname, urlObj.pathname) if (relative.startsWith('..') || path.isAbsolute(relative)) { verboseLog("[Ignored] Outside of parent directory: " + linkURL) - return false + continue } } @@ -95,8 +94,10 @@ function crawl(absURL, opts = {}, internals = {}) { verboseLog("[Dir] " + linkURL) - return crawl(linkURL, opts, Object.assign({}, internals)) - .then(({ items }) => ({name, items})) + items.push(await ( + crawl(linkURL, opts, Object.assign({}, internals)) + .then(({ items }) => ({name, items})) + )) } else { // It's a file! 
@@ -107,14 +108,15 @@ function crawl(absURL, opts = {}, internals = {}) { !(extensions.includes(path.extname(href))) ) { verboseLog("[Ignored] Bad extension: " + linkURL) - - return false + continue } verboseLog("[File] " + linkURL) - return Promise.resolve({name, downloaderArg: linkURL}) + items.push({name, downloaderArg: linkURL}) } - }).filter(Boolean)).then(items => ({items})) + } + + return {items} }), err => { |