mtui - Music Text User Interface - user-friendly command line music player

Diffstat (limited to 'crawlers.js')
-rw-r--r--  crawlers.js  199
1 file changed, 15 insertions, 184 deletions
diff --git a/crawlers.js b/crawlers.js
index c12948f..fc1cccf 100644
--- a/crawlers.js
+++ b/crawlers.js
@@ -1,15 +1,5 @@
-const fs = require('fs')
 const path = require('path')
-const naturalSort = require('node-natural-sort')
-const expandHomeDir = require('expand-home-dir')
-const fetch = require('node-fetch')
-const url = require('url')
 const { downloadPlaylistFromOptionValue, promisifyProcess } = require('./general-util')
-const { spawn } = require('child_process')
-
-const { promisify } = require('util')
-const readDir = promisify(fs.readdir)
-const stat = promisify(fs.stat)
 
 // Each value is a function with these additional properties:
 // * crawlerName: The name of the crawler, such as "crawl-http". Used by
@@ -26,7 +16,6 @@ function sortIgnoreCase(sortFunction) {
   }
 }
 
-/* TODO: Removed cheerio, so crawl-http no longer works.
 function crawlHTTP(absURL, opts = {}, internals = {}) {
   // Recursively crawls a given URL, following every link to a deeper path and
   // recording all links in a tree (in the same format playlists use). Makes
@@ -60,13 +49,12 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
     }
   }
 
-  const absURLObj = new url.URL(absURL)
+  const absURLObj = new URL(absURL)
 
   return fetch(absURL)
     .then(
       res => res.text().then(async text => {
         const links = getHTMLLinks(text)
-        console.log(links)
 
         const items = []
 
@@ -87,15 +75,14 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
           name = name.trim()
 
           let base
-          if (path.extname(absURL)) {
-            base = path.dirname(absURL) + '/'
-            console.log('extname:', path.extname(absURL), 'so base:', base)
+          if (path.extname(absURLObj.pathname)) {
+            base = absURLObj.origin + path.dirname(absURLObj.pathname) + '/'
           } else {
             base = absURL
           }
 
-          const urlObj = new url.URL(href, base)
-          const linkURL = url.format(urlObj)
+          const urlObj = new URL(href, base)
+          const linkURL = urlObj.toString()
 
           if (internals.allURLs.includes(linkURL)) {
             verboseLog("[Ignored] Already done this URL: " + linkURL)
@@ -123,7 +110,7 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
               break sameDir
             }
 
-            const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname)
+            const relative = path.relative((new URL(base)).pathname, urlObj.pathname)
             if (relative.startsWith('..') || path.isAbsolute(relative)) {
               verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base)
               continue
@@ -191,6 +178,15 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
     })
 }
 
+function getHTMLLinks(text) {
+  // Never parse HTML with a regex!
+  const doc = (new DOMParser()).parseFromString(text, 'text/html')
+
+  return Array.from(doc.getElementsByTagName('a')).map(el => {
+    return [el.innerText, el.getAttribute('href')]
+  })
+}
+
 crawlHTTP.crawlerName = 'crawl-http'
 
 crawlHTTP.isAppropriateForArg = function(arg) {
@@ -213,176 +209,11 @@ crawlHTTP.isAppropriateForArg = function(arg) {
     return false
   }
 
-  // Just to avoid conflict with crawl-youtube, assume crawl-http is not used
-  // for URLs on YouTube:
-  if (crawlYouTube.isAppropriateForArg(arg)) {
-    return false
-  }
-
   return true
 }
 
 allCrawlers.crawlHTTP = crawlHTTP
 
-function getHTMLLinks(text) {
-  // Never parse HTML with a regex!
-  // const $ = cheerio.load(text)
-
-  return $('a').get().map(el => {
-    const $el = $(el)
-    return [$el.text(), $el.attr('href')]
-  })
-}
-*/
-
-function crawlLocal(dirPath, extensions = [
-  'ogg', 'oga',
-  'wav', 'mp3', 'mp4', 'm4a', 'aac',
-  'mod'
-], isTop = true) {
-  // If the passed path is a file:// URL, try to decode it:
-  try {
-    const url = new URL(dirPath)
-    if (url.protocol === 'file:') {
-      dirPath = decodeURIComponent(url.pathname)
-    }
-  } catch (error) {
-    // If it's not a URL, it's (assumedly) an ordinary path ("/path/to/the directory").
-    // In this case we'll expand any ~ in the path (e.g. ~/Music -> /home/.../Music).
-    dirPath = expandHomeDir(dirPath)
-  }
-
-  return readDir(dirPath).then(items => {
-    items.sort(sortIgnoreCase(naturalSort()))
-
-    return Promise.all(items.map(item => {
-      const itemPath = path.join(dirPath, item)
-
-      return stat(itemPath).then(stats => {
-        if (stats.isDirectory()) {
-          return crawlLocal(itemPath, extensions, false)
-            .then(group => Object.assign({name: item}, group))
-        } else if (stats.isFile()) {
-          // Extname returns a string starting with a dot; we don't want the
-          // dot, so we slice it off of the front.
-          const ext = path.extname(item).slice(1)
-
-          if (extensions.includes(ext)) {
-            // The name of the track doesn't include the file extension; a user
-            // probably wouldn't add the file extensions to a hand-written
-            // playlist, or want them in an auto-generated one.
-            const basename = path.basename(item, path.extname(item))
-
-            const track = {name: basename, downloaderArg: itemPath}
-            return track
-          } else {
-            return null
-          }
-        }
-      }, statErr => null)
-    }))
-  }, err => {
-    if (err.code === 'ENOENT') {
-      if (isTop) {
-        throw 'That directory path does not exist!'
-      } else {
-        return []
-      }
-    } else if (err.code === 'EACCES') {
-      if (isTop) {
-        throw 'You do not have permission to open that directory.'
-      } else {
-        return []
-      }
-    } else {
-      throw err
-    }
-  }).then(items => items.filter(Boolean))
-    .then(filteredItems => ({items: filteredItems}))
-}
-
-crawlLocal.crawlerName = 'crawl-local'
-
-crawlLocal.isAppropriateForArg = function(arg) {
-  // When the passed argument is a valid URL, it is only used for file://
-  // URLs:
-  try {
-    const url = new URL(arg)
-    if (url.protocol !== 'file:') {
-      return false
-    }
-  } catch (error) {}
-
-  // If the passed argument ends with .json, it is probably not a directory.
-  if (path.extname(arg) === '.json') {
-    return false
-  }
-
-  return true
-}
-
-allCrawlers.crawlLocal = crawlLocal
-
-async function crawlYouTube(url) {
-  const ytdl = spawn('youtube-dl', [
-    '-j', // Output as JSON
-    '--flat-playlist',
-    url
-  ])
-
-  const items = []
-
-  ytdl.stdout.on('data', data => {
-    const lines = data.toString().trim().split('\n')
-
-    items.push(...lines.map(JSON.parse))
-  })
-
-  // Pass false so it doesn't show logging.
-  try {
-    await promisifyProcess(ytdl, false)
-  } catch (error) {
-    // Yeow.
-    throw 'Youtube-dl failed.'
-  }
-
-  return {
-    name: 'A YouTube playlist',
-    items: items.map(item => {
-      return {
-        name: item.title,
-        downloaderArg: 'https://youtube.com/watch?v=' + item.id
-      }
-    })
-  }
-}
-
-crawlYouTube.crawlerName = 'crawl-youtube'
-
-crawlYouTube.isAppropriateForArg = function(arg) {
-  // It is definitely not used for arguments that are not URLs:
-  let url
-  try {
-    url = new URL(arg)
-  } catch (error) {
-    return false
-  }
-
-  // It is only used for URLs on the YouTube domain:
-  if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) {
-    return false
-  }
-
-  // It is only used for playlist pages:
-  if (url.pathname !== '/playlist') {
-    return false
-  }
-
-  return true
-}
-
-allCrawlers.crawlYouTube = crawlYouTube
-
 async function openFile(input) {
   return JSON.parse(await downloadPlaylistFromOptionValue(input))
 }
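
A quick illustration of the link handling introduced above: the new getHTMLLinks parses the fetched page with DOMParser instead of cheerio, and relative hrefs are resolved with the WHATWG URL constructor rather than Node's legacy url module. The sketch below restates that approach on its own; it assumes a browser-like environment where DOMParser and innerText are available (under plain Node a separate DOM implementation would be needed, since DOMParser is not a Node API), and the example URL is purely hypothetical.

const path = require('path')

function getHTMLLinks(text) {
  // Never parse HTML with a regex! DOMParser is assumed to exist in the
  // host environment (it is a DOM API, not part of Node itself).
  const doc = (new DOMParser()).parseFromString(text, 'text/html')
  return Array.from(doc.getElementsByTagName('a')).map(el => {
    return [el.innerText, el.getAttribute('href')]
  })
}

function resolveLink(href, absURL) {
  const absURLObj = new URL(absURL)

  // If the crawled URL points at a file (its pathname has an extension),
  // resolve hrefs against its containing directory; otherwise against the
  // URL itself -- mirroring the base computation in crawlHTTP above.
  let base
  if (path.extname(absURLObj.pathname)) {
    base = absURLObj.origin + path.dirname(absURLObj.pathname) + '/'
  } else {
    base = absURL
  }

  // The two-argument URL constructor resolves relative hrefs for us.
  return (new URL(href, base)).toString()
}

// Hypothetical usage:
// resolveLink('02%20Track.mp3', 'https://example.com/album/index.html')
//   -> 'https://example.com/album/02%20Track.mp3'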