« get me outta code hell

Make http-crawl work better - http-music - Command-line music player + utils (not a server!)
about summary refs log tree commit diff
diff options
context:
space:
mode:
authorliam4 <towerofnix@gmail.com>2017-06-04 17:08:19 -0300
committerliam4 <towerofnix@gmail.com>2017-06-04 17:08:25 -0300
commitfbdbed0c46bfc947e66f1896799bfdc614b46524 (patch)
treeb370cd399ab9c7976c79ba6bb49521f1fdd92f03
parent816f7b2dbe43f6f50bfd0b75450a2dafa714c981 (diff)
Make http-crawl work better
Before this change, this URL wouldn't work: http://billwurtz.com/exerpt.html
-rwxr-xr-xsrc/crawl-http.js15
1 file changed, 7 insertions, 8 deletions
diff --git a/src/crawl-http.js b/src/crawl-http.js
index 189ba28..fa078e3 100755
--- a/src/crawl-http.js
+++ b/src/crawl-http.js
@@ -6,6 +6,8 @@ const MAX_DOWNLOAD_ATTEMPTS = 5
 
 const fetch = require('node-fetch')
 const $ = require('cheerio')
+const url = require('url')
+const path = require('path')
 
 function crawl(absURL, attempts = 0) {
   // Recursively crawls a given URL, following every link to a deeper path and
@@ -20,18 +22,19 @@ function crawl(absURL, attempts = 0) {
 
         return Promise.all(links.map(link => {
           const [ title, href ] = link
+          const linkURL = url.format(new url.URL(href, absURL))
 
           if (href.endsWith('/')) {
             // It's a directory!
 
-            if (verbose) console.log("[Dir] " + absURL + href)
-            return crawl(absURL + href)
+            if (verbose) console.log("[Dir] " + linkURL)
+            return crawl(linkURL)
               .then(res => [title, res])
           } else {
             // It's a file!
 
-            if (verbose) console.log("[File] " + absURL + href)
-            return Promise.resolve([title, absURL + href])
+            if (verbose) console.log("[File] " + linkURL)
+            return Promise.resolve([title, linkURL])
           }
         }))
       }),
@@ -81,10 +84,6 @@ if (process.argv.length === 2) {
 } else {
   let url = process.argv[2]
 
-  if (!(url.endsWith('/'))) {
-    url = url + '/'
-  }
-
   crawl(url)
     .then(res => console.log(JSON.stringify(res, null, 2)))
     .catch(err => console.error(err))