diff options
author | Liam <towerofnix@gmail.com> | 2017-05-31 18:58:08 -0300 |
---|---|---|
committer | Liam <towerofnix@gmail.com> | 2017-05-31 18:58:08 -0300 |
commit | 26663377fd7ea15a6c3d23a399d1266c8639d42e (patch) | |
tree | fa0532caaf5672501bb5797499a515da72a09038 /crawl-recursive.js | |
parent | 42ec01bb91c517067a9eba901272c1248ed52261 (diff) |
Progress
Diffstat (limited to 'crawl-recursive.js')
-rw-r--r-- | crawl-recursive.js | 118 |
1 file changed, 66 insertions, 52 deletions
diff --git a/crawl-recursive.js b/crawl-recursive.js index 8d33ded..d3b0127 100644 --- a/crawl-recursive.js +++ b/crawl-recursive.js @@ -3,70 +3,84 @@ const MAX_DOWNLOAD_ATTEMPTS = 5 const fetch = require('node-fetch') -const { getHTMLLinks } = require('./crawl-links') function crawl(absURL, attempts = 0) { - return fetch(absURL) - .then(res => res.text().then(text => playlistifyParse(text, absURL)), err => { - console.error('Failed to download: ' + absURL) + // Recursively crawls a given URL, following every link to a deeper path and + // recording all links in a tree (in the same format playlists use). Makes + // multiple attempts to download failed paths. - if (attempts < MAX_DOWNLOAD_ATTEMPTS) { - console.error( - 'Trying again. Attempt ' + (attempts + 1) + - '/' + MAX_DOWNLOAD_ATTEMPTS + '...' - ) - return crawl(absURL, attempts + 1) - } else { - console.error( - 'We\'ve hit the download attempt limit (' + - MAX_DOWNLOAD_ATTEMPTS + '). Giving up on ' + - 'this path.' - ) - throw 'FAILED_DOWNLOAD' - } - }) - .catch(error => { - if (error === 'FAILED_DOWNLOAD') { - // Debug logging for this is already handled above. - return [] - } else { - throw error - } - }) -} + return fetch(absURL) + .then( + res => res.text().then(text => { + const links = getHTMLLinks(text) + const verbose = process.argv.includes('--verbose') + + return Promise.all(links.map(link => { + const [ title, href ] = link + + if (href.endsWith('/')) { + // It's a directory! -function playlistifyParse(text, absURL) { - const links = getHTMLLinks(text) - const verbose = process.argv.includes('--verbose') + if (verbose) console.log('[Dir] ' + absURL + href) + return crawl(absURL + href) + .then(res => [title, res]) + } else { + // It's a file! - return Promise.all(links.map(link => { - const [ title, href ] = link + if (verbose) console.log('[File] ' + absURL + href) + return Promise.resolve([title, absURL + href]) + } + })) + }), - if (href.endsWith('/')) { - // It's a directory! 
+ err => { + console.error('Failed to download: ' + absURL) + + if (attempts < MAX_DOWNLOAD_ATTEMPTS) { + console.error( + 'Trying again. Attempt ' + (attempts + 1) + + '/' + MAX_DOWNLOAD_ATTEMPTS + '...' + ) + return crawl(absURL, attempts + 1) + } else { + console.error( + 'We\'ve hit the download attempt limit (' + + MAX_DOWNLOAD_ATTEMPTS + '). Giving up on ' + + 'this path.' + ) + throw 'FAILED_DOWNLOAD' + } + } + ) + .catch(error => { + if (error === 'FAILED_DOWNLOAD') { + // Debug logging for this is already handled above. + return [] + } else { + throw error + } + }) +} - if (verbose) console.log('[Dir] ' + absURL + href) - return crawl(absURL + href) - .then(res => [title, res]) - } else { - // It's a file! +function getHTMLLinks(text) { + // Never parse HTML with a regex! - if (verbose) console.log('[File] ' + absURL + href) - return Promise.resolve([title, absURL + href]) - } - })) + return $(text).find('a').get().map(a => { + const $a = $(a) + return [$a.text(), $a.attr('href')] + }) } if (process.argv.length === 2) { - console.log('Usage: crawl-recursive http://example.com/example/path') + console.log('Usage: crawl-recursive http://example.com/example/path') } else { - let url = process.argv[2] + let url = process.argv[2] - if (!(url.endsWith('/'))) { - url = url + '/' - } + if (!(url.endsWith('/'))) { + url = url + '/' + } - crawl(url) - .then(res => console.log(JSON.stringify(res, null, 2))) - .catch(err => console.error(err)) + crawl(url) + .then(res => console.log(JSON.stringify(res, null, 2))) + .catch(err => console.error(err)) } |