From 6055638558a345904b41467839191a7143862d25 Mon Sep 17 00:00:00 2001 From: Florrie Date: Mon, 4 Jun 2018 21:27:18 -0300 Subject: Smart playlists Basically directly pulled from http-music. Want to make a nice UI for this eventually ("opening playlist..." popup dialog), but not for now. --- crawlers.js | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 crawlers.js (limited to 'crawlers.js') diff --git a/crawlers.js b/crawlers.js new file mode 100644 index 0000000..0bf5c4e --- /dev/null +++ b/crawlers.js @@ -0,0 +1,259 @@ +const fs = require('fs') +const path = require('path') +const naturalSort = require('node-natural-sort') +const fetch = require('node-fetch') +const cheerio = require('cheerio') +const url = require('url') +const { downloadPlaylistFromOptionValue, promisifyProcess } = require('./general-util') +const { spawn } = require('child_process') + +const { promisify } = require('util') +const readDir = promisify(fs.readdir) +const stat = promisify(fs.stat) + +function sortIgnoreCase(sortFunction) { + return function(a, b) { + return sortFunction(a.toLowerCase(), b.toLowerCase()) + } +} + +function crawlHTTP(absURL, opts = {}, internals = {}) { + // Recursively crawls a given URL, following every link to a deeper path and + // recording all links in a tree (in the same format playlists use). Makes + // multiple attempts to download failed paths. + + const { + verbose = false, + + maxAttempts = 5, + + keepSeparateHosts = false, + stayInSameDirectory = true, + + keepAnyFileType = false, + fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'], + + filterRegex = null + } = opts + + if (!internals.attempts) internals.attempts = 0 + + // TODO: Should absURL initially be added into this array? I'd like to + // re-program this entire crawl function to make more sense - "internal" + // dictionaries aren't quite easy to reason about! + if (!internals.allURLs) internals.allURLs = [] + + const verboseLog = text => { + if (verbose) { + console.error(text) + } + } + + const absURLObj = new url.URL(absURL) + + return fetch(absURL) + .then( + res => res.text().then(async text => { + const links = getHTMLLinks(text) + + const items = [] + + for (const link of links) { + let [ name, href ] = link + + // If the name (that's the content inside of ..) ends with a + // slash, that's probably just an artifact of a directory lister; + // not actually part of the intended content. So we remove it! + if (name.endsWith('/')) { + name = name.slice(0, -1) + } + + name = name.trim() + + const urlObj = new url.URL(href, absURL + '/') + const linkURL = url.format(urlObj) + + if (internals.allURLs.includes(linkURL)) { + verboseLog("[Ignored] Already done this URL: " + linkURL) + continue + } + + internals.allURLs.push(linkURL) + + if (filterRegex && !(filterRegex.test(linkURL))) { + verboseLog("[Ignored] Failed regex: " + linkURL) + continue + } + + if (!keepSeparateHosts && urlObj.host !== absURLObj.host) { + verboseLog("[Ignored] Inconsistent host: " + linkURL) + continue + } + + if (stayInSameDirectory) { + const relative = path.relative(absURLObj.pathname, urlObj.pathname) + if (relative.startsWith('..') || path.isAbsolute(relative)) { + verboseLog("[Ignored] Outside of parent directory: " + linkURL) + continue + } + } + + if (href.endsWith('/')) { + // It's a directory! + + verboseLog("[Dir] " + linkURL) + + items.push(await ( + crawlHTTP(linkURL, opts, Object.assign({}, internals)) + .then(({ items }) => ({name, items})) + )) + } else { + // It's a file! + + const extensions = fileTypes.map(t => '.' + t) + + if ( + !keepAnyFileType && + !(extensions.includes(path.extname(href))) + ) { + verboseLog("[Ignored] Bad extension: " + linkURL) + continue + } + + verboseLog("[File] " + linkURL) + items.push({name, downloaderArg: linkURL}) + } + } + + return {items} + }), + + err => { + console.warn("Failed to download: " + absURL) + + if (internals.attempts < maxAttempts) { + console.warn( + `Trying again. Attempt ${internals.attempts + 1}/${maxAttempts}...` + ) + + return crawlHTTP(absURL, opts, Object.assign({}, internals, { + attempts: internals.attempts + 1 + })) + } else { + console.error( + "We've hit the download attempt limit (" + maxAttempts + "). " + + "Giving up on this path." + ) + + throw 'FAILED_DOWNLOAD' + } + } + ) + .catch(error => { + if (error === 'FAILED_DOWNLOAD') { + // Debug logging for this is already handled above. + return [] + } else { + throw error + } + }) +} + +function getHTMLLinks(text) { + // Never parse HTML with a regex! + const $ = cheerio.load(text) + + return $('a').get().map(el => { + const $el = $(el) + return [$el.text(), $el.attr('href')] + }) +} + + + +function crawlLocal(dirPath, extensions = [ + 'ogg', 'oga', + 'wav', 'mp3', 'mp4', 'm4a', 'aac', + 'mod' +]) { + return readDir(dirPath).then(items => { + items.sort(sortIgnoreCase(naturalSort())) + + return Promise.all(items.map(item => { + const itemPath = path.join(dirPath, item) + + return stat(itemPath).then(stats => { + if (stats.isDirectory()) { + return crawlLocal(itemPath, extensions) + .then(group => Object.assign({name: item}, group)) + } else if (stats.isFile()) { + // Extname returns a string starting with a dot; we don't want the + // dot, so we slice it off of the front. + const ext = path.extname(item).slice(1) + + if (extensions.includes(ext)) { + // The name of the track doesn't include the file extension; a user + // probably wouldn't add the file extensions to a hand-written + // playlist, or want them in an auto-generated one. + const basename = path.basename(item, path.extname(item)) + + const track = {name: basename, downloaderArg: itemPath} + return track + } else { + return null + } + } + }) + })) + }).then(items => items.filter(Boolean)) + .then(filteredItems => ({items: filteredItems})) +} + +async function crawlYouTube(url) { + const ytdl = spawn('youtube-dl', [ + '-j', // Output as JSON + '--flat-playlist', + url + ]) + + const items = [] + + ytdl.stdout.on('data', data => { + const lines = data.toString().trim().split('\n') + + items.push(...lines.map(JSON.parse)) + }) + + // Pass false so it doesn't show logging. + await promisifyProcess(ytdl, false) + + return { + items: items.map(item => { + return { + name: item.title, + downloaderArg: 'https://youtube.com/watch?v=' + item.id + } + }) + } +} + +async function openFile(input) { + return JSON.parse(await downloadPlaylistFromOptionValue(input)) +} + +module.exports = { + crawlHTTP, + crawlLocal, + crawlYouTube, + openFile, + + getCrawlerByName: function(name) { + switch (name) { + case 'crawl-http': return crawlHTTP + case 'crawl-local': return crawlLocal + case 'crawl-youtube': return crawlYouTube + case 'open-file': return openFile + default: return null + } + } +} -- cgit 1.3.0-6-gf8a5