From c96672fc6d7ad37c15a43c542c9e6d692ddfa3c6 Mon Sep 17 00:00:00 2001 From: liam4 Date: Tue, 30 May 2017 00:53:03 +0000 Subject: Crawl links --- crawl-links.js | 33 ++++++++++++++ package.json | 1 + play.js | 10 +++++ yarn.lock | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+) create mode 100644 crawl-links.js diff --git a/crawl-links.js b/crawl-links.js new file mode 100644 index 0000000..8602ce1 --- /dev/null +++ b/crawl-links.js @@ -0,0 +1,33 @@ +'use strict' + +const fetch = require('node-fetch') +const $ = require('cheerio') +const url = require('url') + +const DEFAULT_EXTENSIONS = [ + 'mp3', 'wav' +] + +function getHTMLLinks(text) { + // Never parse HTML with a regex! + + return $(text).find('a').get().map(a => { + const $a = $(a) + return [$a.text(), $a.attr('href')] + }) +} + +module.exports.getHTMLLinks = getHTMLLinks + +if (require.main === module) { + const urlString = process.argv[2] + const exts = process.argv.length > 3 ? process.argv.slice(3) : DEFAULT_EXTENSIONS + + fetch(urlString) + .then(res => res.text()) + .then(text => getHTMLLinks(text)) + .then(links => links.filter(l => exts.some(e => l[1].endsWith('.' + e)))) + .then(links => links.map(l => [l[0], url.resolve(urlString, l[1])])) + .then(links => console.log(JSON.stringify(links, null, 2))) + .catch(err => console.error(err)) +} diff --git a/package.json b/package.json index c1e8450..0d2a017 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,6 @@ { "dependencies": { + "cheerio": "^1.0.0-rc.1", "fs-promise": "^2.0.3", "node-fetch": "^1.7.0", "sanitize-filename": "^1.6.1" diff --git a/play.js b/play.js index b5a85e6..6013821 100644 --- a/play.js +++ b/play.js @@ -47,6 +47,16 @@ // (Done!) // // TODO: Option to include a specific path from the source playlist. +// (Done!) +// +// TODO: Make a playlist generator that parses http://billwurtz.com +// instrumentals.html. +// (Done!) +// +// TODO: Make crawl-itunes.js a bit more general, more command-line +// friendly (i.e. don't require editing the script itself), and +// make it use the getHTMLLinks function defined in the new +// crawl-links.js script. 'use strict' diff --git a/yarn.lock b/yarn.lock index db1fddd..d610ff6 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2,16 +2,88 @@ # yarn lockfile v1 +"@types/node@^6.0.46": + version "6.0.73" + resolved "https://registry.yarnpkg.com/@types/node/-/node-6.0.73.tgz#85dc4bb6f125377c75ddd2519a1eeb63f0a4ed70" + any-promise@^1.0.0, any-promise@^1.3.0: version "1.3.0" resolved "https://registry.yarnpkg.com/any-promise/-/any-promise-1.3.0.tgz#abc6afeedcea52e809cdc0376aed3ce39635d17f" +boolbase@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" + +buffer-shims@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/buffer-shims/-/buffer-shims-1.0.0.tgz#9978ce317388c649ad8793028c3477ef044a8b51" + +cheerio@^1.0.0-rc.1: + version "1.0.0-rc.1" + resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.1.tgz#2af37339eab713ef6b72cde98cefa672b87641fe" + dependencies: + css-select "~1.2.0" + dom-serializer "~0.1.0" + entities "~1.1.1" + htmlparser2 "^3.9.1" + lodash "^4.15.0" + parse5 "^3.0.1" + +core-util-is@~1.0.0: + version "1.0.2" + resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7" + +css-select@~1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/css-select/-/css-select-1.2.0.tgz#2b3a110539c5355f1cd8d314623e870b121ec858" + dependencies: + boolbase "~1.0.0" + css-what "2.1" + domutils "1.5.1" + nth-check "~1.0.1" + +css-what@2.1: + version "2.1.0" + resolved "https://registry.yarnpkg.com/css-what/-/css-what-2.1.0.tgz#9467d032c38cfaefb9f2d79501253062f87fa1bd" + +dom-serializer@0, dom-serializer@~0.1.0: + version "0.1.0" + resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.1.0.tgz#073c697546ce0780ce23be4a28e293e40bc30c82" + dependencies: + domelementtype "~1.1.1" + entities "~1.1.1" + +domelementtype@1, domelementtype@^1.3.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.0.tgz#b17aed82e8ab59e52dd9c19b1756e0fc187204c2" + +domelementtype@~1.1.1: + version "1.1.3" + resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.1.3.tgz#bd28773e2642881aec51544924299c5cd822185b" + +domhandler@^2.3.0: + version "2.4.1" + resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-2.4.1.tgz#892e47000a99be55bbf3774ffea0561d8879c259" + dependencies: + domelementtype "1" + +domutils@1.5.1, domutils@^1.5.1: + version "1.5.1" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf" + dependencies: + dom-serializer "0" + domelementtype "1" + encoding@^0.1.11: version "0.1.12" resolved "https://registry.yarnpkg.com/encoding/-/encoding-0.1.12.tgz#538b66f3ee62cd1ab51ec323829d1f9480c74beb" dependencies: iconv-lite "~0.4.13" +entities@^1.1.1, entities@~1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.1.tgz#6e5c2d0a5621b5dadaecef80b90edfb5cd7772f0" + fs-extra@^2.0.0: version "2.1.2" resolved "https://registry.yarnpkg.com/fs-extra/-/fs-extra-2.1.2.tgz#046c70163cef9aad46b0e4a7fa467fb22d71de35" @@ -32,20 +104,43 @@ graceful-fs@^4.1.2, graceful-fs@^4.1.6: version "4.1.11" resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.1.11.tgz#0e8bdfe4d1ddb8854d64e04ea7c00e2a026e5658" +htmlparser2@^3.9.1: + version "3.9.2" + resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-3.9.2.tgz#1bdf87acca0f3f9e53fa4fcceb0f4b4cbb00b338" + dependencies: + domelementtype "^1.3.0" + domhandler "^2.3.0" + domutils "^1.5.1" + entities "^1.1.1" + inherits "^2.0.1" + readable-stream "^2.0.2" + iconv-lite@~0.4.13: version "0.4.17" resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.17.tgz#4fdaa3b38acbc2c031b045d0edcdfe1ecab18c8d" +inherits@^2.0.1, inherits@~2.0.1: + version "2.0.3" + resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.3.tgz#633c2c83e3da42a502f52466022480f4208261de" + is-stream@^1.0.1: version "1.1.0" resolved "https://registry.yarnpkg.com/is-stream/-/is-stream-1.1.0.tgz#12d4a3dd4e68e0b79ceb8dbc84173ae80d91ca44" +isarray@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" + jsonfile@^2.1.0: version "2.4.0" resolved "https://registry.yarnpkg.com/jsonfile/-/jsonfile-2.4.0.tgz#3736a2b428b87bbda0cc83b53fa3d633a35c2ae8" optionalDependencies: graceful-fs "^4.1.6" +lodash@^4.15.0: + version "4.17.4" + resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.4.tgz#78203a4d1c328ae1d86dca6460e369b57f4055ae" + mz@^2.6.0: version "2.6.0" resolved "https://registry.yarnpkg.com/mz/-/mz-2.6.0.tgz#c8b8521d958df0a4f2768025db69c719ee4ef1ce" @@ -61,16 +156,54 @@ node-fetch@^1.7.0: encoding "^0.1.11" is-stream "^1.0.1" +nth-check@~1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-1.0.1.tgz#9929acdf628fc2c41098deab82ac580cf149aae4" + dependencies: + boolbase "~1.0.0" + object-assign@^4.0.1: version "4.1.1" resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863" +parse5@^3.0.1: + version "3.0.2" + resolved "https://registry.yarnpkg.com/parse5/-/parse5-3.0.2.tgz#05eff57f0ef4577fb144a79f8b9a967a6cc44510" + dependencies: + "@types/node" "^6.0.46" + +process-nextick-args@~1.0.6: + version "1.0.7" + resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-1.0.7.tgz#150e20b756590ad3f91093f25a4f2ad8bff30ba3" + +readable-stream@^2.0.2: + version "2.2.9" + resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.2.9.tgz#cf78ec6f4a6d1eb43d26488cac97f042e74b7fc8" + dependencies: + buffer-shims "~1.0.0" + core-util-is "~1.0.0" + inherits "~2.0.1" + isarray "~1.0.0" + process-nextick-args "~1.0.6" + string_decoder "~1.0.0" + util-deprecate "~1.0.1" + +safe-buffer@^5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.0.1.tgz#d263ca54696cd8a306b5ca6551e92de57918fbe7" + sanitize-filename@^1.6.1: version "1.6.1" resolved "https://registry.yarnpkg.com/sanitize-filename/-/sanitize-filename-1.6.1.tgz#612da1c96473fa02dccda92dcd5b4ab164a6772a" dependencies: truncate-utf8-bytes "^1.0.0" +string_decoder@~1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.0.1.tgz#62e200f039955a6810d8df0a33ffc0f013662d98" + dependencies: + safe-buffer "^5.0.1" + thenify-all@^1.0.0, thenify-all@^1.6.0: version "1.6.0" resolved "https://registry.yarnpkg.com/thenify-all/-/thenify-all-1.6.0.tgz#1a1918d402d8fc3f98fbf234db0bcc8cc10e9726" @@ -92,3 +225,7 @@ truncate-utf8-bytes@^1.0.0: utf8-byte-length@^1.0.1: version "1.0.4" resolved "https://registry.yarnpkg.com/utf8-byte-length/-/utf8-byte-length-1.0.4.tgz#f45f150c4c66eee968186505ab93fcbb8ad6bf61" + +util-deprecate@~1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" -- cgit 1.3.0-6-gf8a5