diff options
author | liam4 <towerofnix@gmail.com> | 2017-06-04 17:08:19 -0300 |
---|---|---|
committer | liam4 <towerofnix@gmail.com> | 2017-06-04 17:08:25 -0300 |
commit | fbdbed0c46bfc947e66f1896799bfdc614b46524 (patch) | |
tree | b370cd399ab9c7976c79ba6bb49521f1fdd92f03 | |
parent | 816f7b2dbe43f6f50bfd0b75450a2dafa714c981 (diff) |
Make http-crawl work better
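Before this change, this URL wouldn't work: http://billwurtz.com/exerpt.html (links are now resolved against the page URL with `url.URL` instead of being appended to it, and a trailing slash is no longer forced onto the starting URL).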
-rwxr-xr-x | src/crawl-http.js | 15 |
1 file changed, 7 insertions, 8 deletions
diff --git a/src/crawl-http.js b/src/crawl-http.js index 189ba28..fa078e3 100755 --- a/src/crawl-http.js +++ b/src/crawl-http.js @@ -6,6 +6,8 @@ const MAX_DOWNLOAD_ATTEMPTS = 5 const fetch = require('node-fetch') const $ = require('cheerio') +const url = require('url') +const path = require('path') function crawl(absURL, attempts = 0) { // Recursively crawls a given URL, following every link to a deeper path and @@ -20,18 +22,19 @@ function crawl(absURL, attempts = 0) { return Promise.all(links.map(link => { const [ title, href ] = link + const linkURL = url.format(new url.URL(href, absURL)) if (href.endsWith('/')) { // It's a directory! - if (verbose) console.log("[Dir] " + absURL + href) - return crawl(absURL + href) + if (verbose) console.log("[Dir] " + linkURL) + return crawl(linkURL) .then(res => [title, res]) } else { // It's a file! - if (verbose) console.log("[File] " + absURL + href) - return Promise.resolve([title, absURL + href]) + if (verbose) console.log("[File] " + linkURL) + return Promise.resolve([title, linkURL]) } })) }), @@ -81,10 +84,6 @@ if (process.argv.length === 2) { } else { let url = process.argv[2] - if (!(url.endsWith('/'))) { - url = url + '/' - } - crawl(url) .then(res => console.log(JSON.stringify(res, null, 2))) .catch(err => console.error(err)) |