From fbdbed0c46bfc947e66f1896799bfdc614b46524 Mon Sep 17 00:00:00 2001
From: liam4 <towerofnix@gmail.com>
Date: Sun, 4 Jun 2017 17:08:19 -0300
Subject: Make http-crawl work better

Before this change, this URL wouldn't work: http://billwurtz.com/exerpt.html
---
 src/crawl-http.js | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/crawl-http.js b/src/crawl-http.js
index 189ba28..fa078e3 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -6,6 +6,8 @@ const MAX_DOWNLOAD_ATTEMPTS = 5
 
 const fetch = require('node-fetch')
 const $ = require('cheerio')
+const url = require('url')
+const path = require('path')
 
 function crawl(absURL, attempts = 0) {
   // Recursively crawls a given URL, following every link to a deeper path and
@@ -20,18 +22,19 @@ function crawl(absURL, attempts = 0) {
 
         return Promise.all(links.map(link => {
           const [ title, href ] = link
+          const linkURL = url.format(new url.URL(href, absURL))
 
           if (href.endsWith('/')) {
             // It's a directory!
 
-            if (verbose) console.log("[Dir] " + absURL + href)
-            return crawl(absURL + href)
+            if (verbose) console.log("[Dir] " + linkURL)
+            return crawl(linkURL)
               .then(res => [title, res])
           } else {
             // It's a file!
 
-            if (verbose) console.log("[File] " + absURL + href)
-            return Promise.resolve([title, absURL + href])
+            if (verbose) console.log("[File] " + linkURL)
+            return Promise.resolve([title, linkURL])
           }
         }))
       }),
@@ -81,10 +84,6 @@ if (process.argv.length === 2) {
 } else {
   let url = process.argv[2]
 
-  if (!(url.endsWith('/'))) {
-    url = url + '/'
-  }
-
   crawl(url)
     .then(res => console.log(JSON.stringify(res, null, 2)))
     .catch(err => console.error(err))
-- 
cgit 1.3.0-6-gf8a5