From fbdbed0c46bfc947e66f1896799bfdc614b46524 Mon Sep 17 00:00:00 2001 From: liam4 Date: Sun, 4 Jun 2017 17:08:19 -0300 Subject: Make http-crawl work better Before this change, this URL wouldn't work: http://billwurtz.com/exerpt.html --- src/crawl-http.js | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/crawl-http.js b/src/crawl-http.js index 189ba28..fa078e3 100755 --- a/src/crawl-http.js +++ b/src/crawl-http.js @@ -6,6 +6,8 @@ const MAX_DOWNLOAD_ATTEMPTS = 5 const fetch = require('node-fetch') const $ = require('cheerio') +const url = require('url') +const path = require('path') function crawl(absURL, attempts = 0) { // Recursively crawls a given URL, following every link to a deeper path and @@ -20,18 +22,19 @@ function crawl(absURL, attempts = 0) { return Promise.all(links.map(link => { const [ title, href ] = link + const linkURL = url.format(new url.URL(href, absURL)) if (href.endsWith('/')) { // It's a directory! - if (verbose) console.log("[Dir] " + absURL + href) - return crawl(absURL + href) + if (verbose) console.log("[Dir] " + linkURL) + return crawl(linkURL) .then(res => [title, res]) } else { // It's a file! - if (verbose) console.log("[File] " + absURL + href) - return Promise.resolve([title, absURL + href]) + if (verbose) console.log("[File] " + linkURL) + return Promise.resolve([title, linkURL]) } })) }), @@ -81,10 +84,6 @@ if (process.argv.length === 2) { } else { let url = process.argv[2] - if (!(url.endsWith('/'))) { - url = url + '/' - } - crawl(url) .then(res => console.log(JSON.stringify(res, null, 2))) .catch(err => console.error(err)) -- cgit 1.3.0-6-gf8a5