« get me outta code hell

Dumb http crawler changes - mtui - Music Text User Interface - user-friendly command line music player
about summary refs log tree commit diff
path: root/crawlers.js
diff options
context:
space:
mode:
author Florrie <towerofnix@gmail.com> 2018-06-05 21:14:10 -0300
committer Florrie <towerofnix@gmail.com> 2018-06-05 21:14:10 -0300
commit 3c65d4ca6246c5723d9fc6a73371150d06c0b6c2 (patch)
tree a60395e67eab694c1e5ac54f5ddcd1adc3ec54b9 /crawlers.js
parent a0757178bdfacb2f5fe8d6c3a02a09b5cc26bb28 (diff)
Dumb http crawler changes
Diffstat (limited to 'crawlers.js')
-rw-r--r-- crawlers.js 22
1 files changed, 18 insertions, 4 deletions
diff --git a/crawlers.js b/crawlers.js
index 0bf5c4e..8ba70f3 100644
--- a/crawlers.js
+++ b/crawlers.js
@@ -27,12 +27,13 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
 
     maxAttempts = 5,
 
-    keepSeparateHosts = false,
+    allowedExternalHostRegex = null,
     stayInSameDirectory = true,
 
     keepAnyFileType = false,
     fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
 
+    forceGroupRegex = null,
     filterRegex = null
   } = opts
 
@@ -55,12 +56,17 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
     .then(
       res => res.text().then(async text => {
         const links = getHTMLLinks(text)
+        console.log(links)
 
         const items = []
 
         for (const link of links) {
           let [ name, href ] = link
 
+          if (!href) {
+            continue
+          }
+
           // If the name (that's the content inside of <a>..</a>) ends with a
           // slash, that's probably just an artifact of a directory lister;
           // not actually part of the intended content. So we remove it!
@@ -85,12 +91,20 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
             continue
           }
 
-          if (!keepSeparateHosts && urlObj.host !== absURLObj.host) {
+          if (urlObj.host !== absURLObj.host && !(
+            allowedExternalHostRegex && new RegExp(allowedExternalHostRegex)
+              .test(urlObj.host))) {
             verboseLog("[Ignored] Inconsistent host: " + linkURL)
             continue
           }
 
-          if (stayInSameDirectory) {
+          if (stayInSameDirectory) sameDir: {
+            // Don't bother with staying in the same directory if it's on a
+            // different host.
+            if (urlObj.host !== absURLObj.host) {
+              break sameDir
+            }
+
             const relative = path.relative(absURLObj.pathname, urlObj.pathname)
             if (relative.startsWith('..') || path.isAbsolute(relative)) {
               verboseLog("[Ignored] Outside of parent directory: " + linkURL)
@@ -98,7 +112,7 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
             }
           }
 
-          if (href.endsWith('/')) {
+          if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) {
             // It's a directory!
 
             verboseLog("[Dir] " + linkURL)