Diffstat (limited to 'crawlers.js')
-rw-r--r--  crawlers.js  199
1 files changed, 15 insertions, 184 deletions
diff --git a/crawlers.js b/crawlers.js
index c12948f..fc1cccf 100644
--- a/crawlers.js
+++ b/crawlers.js
@@ -1,15 +1,5 @@
-const fs = require('fs')
 const path = require('path')
-const naturalSort = require('node-natural-sort')
-const expandHomeDir = require('expand-home-dir')
-const fetch = require('node-fetch')
-const url = require('url')
 const { downloadPlaylistFromOptionValue, promisifyProcess } = require('./general-util')
-const { spawn } = require('child_process')
-
-const { promisify } = require('util')
-const readDir = promisify(fs.readdir)
-const stat = promisify(fs.stat)
 
 // Each value is a function with these additional properties:
 // * crawlerName: The name of the crawler, such as "crawl-http". Used by
@@ -26,7 +16,6 @@ function sortIgnoreCase(sortFunction) {
   }
 }
 
-/* TODO: Removed cheerio, so crawl-http no longer works.
 function crawlHTTP(absURL, opts = {}, internals = {}) {
   // Recursively crawls a given URL, following every link to a deeper path and
   // recording all links in a tree (in the same format playlists use). Makes
@@ -60,13 +49,12 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
     }
   }
 
-  const absURLObj = new url.URL(absURL)
+  const absURLObj = new URL(absURL)
 
   return fetch(absURL)
     .then(
       res => res.text().then(async text => {
         const links = getHTMLLinks(text)
-        console.log(links)
 
         const items = []
 
@@ -87,15 +75,14 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
           name = name.trim()
 
           let base
-          if (path.extname(absURL)) {
-            base = path.dirname(absURL) + '/'
-            console.log('extname:', path.extname(absURL), 'so base:', base)
+          if (path.extname(absURLObj.pathname)) {
+            base = absURLObj.origin + path.dirname(absURLObj.pathname) + '/'
           } else {
             base = absURL
           }
 
-          const urlObj = new url.URL(href, base)
-          const linkURL = url.format(urlObj)
+          const urlObj = new URL(href, base)
+          const linkURL = urlObj.toString()
 
           if (internals.allURLs.includes(linkURL)) {
             verboseLog("[Ignored] Already done this URL: " + linkURL)
@@ -123,7 +110,7 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
               break sameDir
             }
 
-            const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname)
+            const relative = path.relative((new URL(base)).pathname, urlObj.pathname)
             if (relative.startsWith('..') || path.isAbsolute(relative)) {
               verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base)
               continue
@@ -191,6 +178,15 @@ function crawlHTTP(absURL, opts = {}, internals = {}) {
   })
 }
 
+function getHTMLLinks(text) {
+  // Never parse HTML with a regex!
+  const doc = (new DOMParser()).parseFromString(text, 'text/html')
+
+  return Array.from(doc.getElementsByTagName('a')).map(el => {
+    return [el.innerText, el.getAttribute('href')]
+  })
+}
+
 crawlHTTP.crawlerName = 'crawl-http'
 
 crawlHTTP.isAppropriateForArg = function(arg) {
@@ -213,176 +209,11 @@ crawlHTTP.isAppropriateForArg = function(arg) {
     return false
   }
 
-  // Just to avoid conflict with crawl-youtube, assume crawl-http is not used
-  // for URLs on YouTube:
-  if (crawlYouTube.isAppropriateForArg(arg)) {
-    return false
-  }
-
   return true
 }
 
 allCrawlers.crawlHTTP = crawlHTTP
 
-function getHTMLLinks(text) {
-  // Never parse HTML with a regex!
-  // const $ = cheerio.load(text)
-
-  return $('a').get().map(el => {
-    const $el = $(el)
-    return [$el.text(), $el.attr('href')]
-  })
-}
-*/
-
-function crawlLocal(dirPath, extensions = [
-  'ogg', 'oga',
-  'wav', 'mp3', 'mp4', 'm4a', 'aac',
-  'mod'
-], isTop = true) {
-  // If the passed path is a file:// URL, try to decode it:
-  try {
-    const url = new URL(dirPath)
-    if (url.protocol === 'file:') {
-      dirPath = decodeURIComponent(url.pathname)
-    }
-  } catch (error) {
-    // If it's not a URL, it's (assumedly) an ordinary path ("/path/to/the directory").
-    // In this case we'll expand any ~ in the path (e.g. ~/Music -> /home/.../Music).
-    dirPath = expandHomeDir(dirPath)
-  }
-
-  return readDir(dirPath).then(items => {
-    items.sort(sortIgnoreCase(naturalSort()))
-
-    return Promise.all(items.map(item => {
-      const itemPath = path.join(dirPath, item)
-
-      return stat(itemPath).then(stats => {
-        if (stats.isDirectory()) {
-          return crawlLocal(itemPath, extensions, false)
-            .then(group => Object.assign({name: item}, group))
-        } else if (stats.isFile()) {
-          // Extname returns a string starting with a dot; we don't want the
-          // dot, so we slice it off of the front.
-          const ext = path.extname(item).slice(1)
-
-          if (extensions.includes(ext)) {
-            // The name of the track doesn't include the file extension; a user
-            // probably wouldn't add the file extensions to a hand-written
-            // playlist, or want them in an auto-generated one.
-            const basename = path.basename(item, path.extname(item))
-
-            const track = {name: basename, downloaderArg: itemPath}
-            return track
-          } else {
-            return null
-          }
-        }
-      }, statErr => null)
-    }))
-  }, err => {
-    if (err.code === 'ENOENT') {
-      if (isTop) {
-        throw 'That directory path does not exist!'
-      } else {
-        return []
-      }
-    } else if (err.code === 'EACCES') {
-      if (isTop) {
-        throw 'You do not have permission to open that directory.'
-      } else {
-        return []
-      }
-    } else {
-      throw err
-    }
-  }).then(items => items.filter(Boolean))
-    .then(filteredItems => ({items: filteredItems}))
-}
-
-crawlLocal.crawlerName = 'crawl-local'
-
-crawlLocal.isAppropriateForArg = function(arg) {
-  // When the passed argument is a valid URL, it is only used for file://
-  // URLs:
-  try {
-    const url = new URL(arg)
-    if (url.protocol !== 'file:') {
-      return false
-    }
-  } catch (error) {}
-
-  // If the passed argument ends with .json, it is probably not a directory.
-  if (path.extname(arg) === '.json') {
-    return false
-  }
-
-  return true
-}
-
-allCrawlers.crawlLocal = crawlLocal
-
-async function crawlYouTube(url) {
-  const ytdl = spawn('youtube-dl', [
-    '-j', // Output as JSON
-    '--flat-playlist',
-    url
-  ])
-
-  const items = []
-
-  ytdl.stdout.on('data', data => {
-    const lines = data.toString().trim().split('\n')
-
-    items.push(...lines.map(JSON.parse))
-  })
-
-  // Pass false so it doesn't show logging.
-  try {
-    await promisifyProcess(ytdl, false)
-  } catch (error) {
-    // Yeow.
-    throw 'Youtube-dl failed.'
-  }
-
-  return {
-    name: 'A YouTube playlist',
-    items: items.map(item => {
-      return {
-        name: item.title,
-        downloaderArg: 'https://youtube.com/watch?v=' + item.id
-      }
-    })
-  }
-}
-
-crawlYouTube.crawlerName = 'crawl-youtube'
-
-crawlYouTube.isAppropriateForArg = function(arg) {
-  // It is definitely not used for arguments that are not URLs:
-  let url
-  try {
-    url = new URL(arg)
-  } catch (error) {
-    return false
-  }
-
-  // It is only used for URLs on the YouTube domain:
-  if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) {
-    return false
-  }
-
-  // It is only used for playlist pages:
-  if (url.pathname !== '/playlist') {
-    return false
-  }
-
-  return true
-}
-
-allCrawlers.crawlYouTube = crawlYouTube
-
 async function openFile(input) {
   return JSON.parse(await downloadPlaylistFromOptionValue(input))
 }
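
The headline change is the new DOMParser-based getHTMLLinks. DOMParser is a Web API global rather than a Node module, so this version of the crawler assumes a browser or Electron renderer context (consistent with the patch also dropping require('node-fetch') and relying on a global fetch). A minimal sketch of how its output feeds crawlHTTP's link resolution; the listing HTML and URL here are invented for illustration:

// Sketch only: runs where DOMParser is a global (browser/Electron renderer).
function getHTMLLinks(text) {
  // Never parse HTML with a regex!
  const doc = (new DOMParser()).parseFromString(text, 'text/html')

  return Array.from(doc.getElementsByTagName('a')).map(el => {
    return [el.innerText, el.getAttribute('href')]
  })
}

// Hypothetical directory-listing page:
const html = '<a href="track%20one.mp3">track one</a><a href="sub/">sub/</a>'
const listingURL = 'https://example.com/music/'

for (const [name, href] of getHTMLLinks(html)) {
  // Same resolution rule as crawlHTTP: the listing URL's pathname has no
  // extension, so the page URL itself is the base.
  console.log(name, '->', new URL(href, listingURL).toString())
}
// track one -> https://example.com/music/track%20one.mp3
// sub/ -> https://example.com/music/sub/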
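
The base-computation hunk is a genuine fix, not just an API swap: the old code applied path.extname and path.dirname to the whole URL string, which misfires whenever the significant dots and slashes sit outside the pathname, while the new code inspects absURLObj.pathname alone. A worked example with invented URLs:

const path = require('path')

// Old check, on the full URL string:
//   path.extname('https://example.com')        => '.com'    (treated as a file!)
//   path.dirname('https://example.com') + '/'  => 'https://' (broken base)
// New check, on the pathname alone:
const absURLObj = new URL('https://example.com')
path.extname(absURLObj.pathname)  // => '' -- correctly treated as a directory

// When the page really is a file, links resolve against its directory:
const pageURL = new URL('https://example.com/music/index.html')
const base = pageURL.origin + path.dirname(pageURL.pathname) + '/'
console.log(base)                                   // https://example.com/music/
console.log(new URL('track.mp3', base).toString())  // https://example.com/music/track.mp3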