From 992911a021e2f2cccfbc63e554a3f34bd997cd91 Mon Sep 17 00:00:00 2001 From: Florrie Date: Tue, 26 Jun 2018 12:01:43 -0300 Subject: Pass YouTube playlist URLs directly to mtui Or any other crawler argument, and it'll (try to) guess which crawler you want to use automatically. Handy! --- crawlers.js | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 17 deletions(-) (limited to 'crawlers.js') diff --git a/crawlers.js b/crawlers.js index 82ef78e..caf3c0e 100644 --- a/crawlers.js +++ b/crawlers.js @@ -11,6 +11,15 @@ const { promisify } = require('util') const readDir = promisify(fs.readdir) const stat = promisify(fs.stat) +// Each value is a function with these additional properties: +// * crawlerName: The name of the crawler, such as "crawl-http". Used by +// getCrawlerByName. +// * isAppropriateForArg: A function returning whether an argument is valid for +// the crawler. For example, crawlHTTP.isAppropriateForArg returns whether or +// not the passed argument is a valid URL of the HTTP/HTTPS protocol. Used by +// getAllCrawlersForArg. +const allCrawlers = {} + function sortIgnoreCase(sortFunction) { return function(a, b) { return sortFunction(a.toLowerCase(), b.toLowerCase()) @@ -173,6 +182,39 @@ function crawlHTTP(absURL, opts = {}, internals = {}) { }) } +crawlHTTP.crawlerName = 'crawl-http' + +crawlHTTP.isAppropriateForArg = function(arg) { + // It is only used for HTTP(S) servers: + if (!(arg.startsWith('http://') || arg.startsWith('https://'))) { + return false + } + + // It will definitely only work for valid URLs: + let url + try { + url = new URL(arg) + } catch (error) { + return false + } + + // If the URL ends with a .json, it is probably meant to be used for a direct + // playlist download, not to be crawled. + if (path.extname(url.pathname) === '.json') { + return false + } + + // Just to avoid conflict with crawl-youtube, assume crawl-http is not used + // for URLs on YouTube: + if (crawlYouTube.isAppropriateForArg(arg)) { + return false + } + + return true +} + +allCrawlers.crawlHTTP = crawlHTTP + function getHTMLLinks(text) { // Never parse HTML with a regex! const $ = cheerio.load(text) @@ -183,8 +225,6 @@ function getHTMLLinks(text) { }) } - - function crawlLocal(dirPath, extensions = [ 'ogg', 'oga', 'wav', 'mp3', 'mp4', 'm4a', 'aac', @@ -238,6 +278,28 @@ function crawlLocal(dirPath, extensions = [ .then(filteredItems => ({items: filteredItems})) } +crawlLocal.crawlerName = 'crawl-local' + +crawlLocal.isAppropriateForArg = function(arg) { + // When the passed argument is a valid URL, it is only used for file:// + // URLs: + try { + const url = new URL(arg) + if (url.protocol !== 'file:') { + return false + } + } catch (error) {} + + // If the passed argument ends with .json, it is probably not a directory. + if (path.extname(arg) === '.json') { + return false + } + + return true +} + +allCrawlers.crawlLocal = crawlLocal + async function crawlYouTube(url) { const ytdl = spawn('youtube-dl', [ '-j', // Output as JSON @@ -266,23 +328,53 @@ async function crawlYouTube(url) { } } +crawlYouTube.crawlerName = 'crawl-youtube' + +crawlYouTube.isAppropriateForArg = function(arg) { + // It is definitely not used for arguments that are not URLs: + let url + try { + url = new URL(arg) + } catch (error) { + return false + } + + // It is only used for URLs on the YouTube domain: + if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) { + return false + } + + // It is only used for playlist pages: + if (url.pathname !== '/playlist') { + return false + } + + return true +} + +allCrawlers.crawlYouTube = crawlYouTube + async function openFile(input) { return JSON.parse(await downloadPlaylistFromOptionValue(input)) } -module.exports = { - crawlHTTP, - crawlLocal, - crawlYouTube, - openFile, - - getCrawlerByName: function(name) { - switch (name) { - case 'crawl-http': return crawlHTTP - case 'crawl-local': return crawlLocal - case 'crawl-youtube': return crawlYouTube - case 'open-file': return openFile - default: return null - } - } +openFile.crawlerName = 'open-file' + +openFile.isAppropriateForArg = function(arg) { + // It is only valid for arguments that end with .json: + return path.extname(arg) === '.json' +} + +allCrawlers.openFile = openFile + +// Actual module.exports stuff: + +Object.assign(module.exports, allCrawlers) + +module.exports.getCrawlerByName = function(name) { + return Object.values(allCrawlers).find(fn => fn.crawlerName === name) +} + +module.exports.getAllCrawlersForArg = function(arg) { + return Object.values(allCrawlers).filter(fn => fn.isAppropriateForArg(arg)) } -- cgit 1.3.0-6-gf8a5