#!/usr/bin/env node
'use strict'
const MAX_DOWNLOAD_ATTEMPTS = 5
const fetch = require('node-fetch')
const $ = require('cheerio')
const url = require('url')
const path = require('path')
function crawl(absURL, attempts = 0) {
  // Recursively crawls a given URL, following every link to a deeper path and
  // recording all links in a tree (in the same format playlists use). Makes
  // multiple attempts to download failed paths.
  //
  // Returns a promise resolving to an array of [title, urlOrSubtree] pairs:
  // directory links recurse into a nested array, file links resolve to their
  // absolute URL. A path that still fails after MAX_DOWNLOAD_ATTEMPTS
  // resolves to [].
  return fetch(absURL)
    .then(
      res => res.text().then(text => {
        const links = getHTMLLinks(text)
        const verbose = process.argv.includes('--verbose')

        return Promise.all(links
          // Anchors without an href attribute (e.g. named anchors) have
          // href === undefined; previously `href.endsWith` would throw on
          // them, so skip them instead.
          .filter(([ , href ]) => typeof href === 'string')
          .map(link => {
            const [ title, href ] = link
            // Resolve the (possibly relative) href against the page URL.
            const linkURL = new url.URL(href, absURL).href

            if (href.endsWith('/')) {
              // It's a directory!
              if (verbose) console.log("[Dir] " + linkURL)
              return crawl(linkURL)
                .then(res => [title, res])
            } else {
              // It's a file!
              if (verbose) console.log("[File] " + linkURL)
              return Promise.resolve([title, linkURL])
            }
          }))
      }),

      err => {
        console.warn("Failed to download: " + absURL)
        if (attempts < MAX_DOWNLOAD_ATTEMPTS) {
          console.warn(
            "Trying again. Attempt " + (attempts + 1) +
            "/" + MAX_DOWNLOAD_ATTEMPTS + "..."
          )
          return crawl(absURL, attempts + 1)
        } else {
          console.error(
            "We've hit the download attempt limit (" +
            MAX_DOWNLOAD_ATTEMPTS + "). Giving up on this path."
          )
          // Throw a real Error (never a bare string) tagged with a code so
          // the catch below can distinguish it from unexpected failures.
          const giveUp = new Error('Download attempt limit reached: ' + absURL)
          giveUp.code = 'FAILED_DOWNLOAD'
          throw giveUp
        }
      }
    )
    .catch(error => {
      if (error && error.code === 'FAILED_DOWNLOAD') {
        // Debug logging for this is already handled above.
        return []
      } else {
        throw error
      }
    })
}
function getHTMLLinks(text) {
  // Parses the given HTML text and collects a [linkText, href] pair for
  // every anchor element it contains. (Never parse HTML with a regex!)
  const anchors = $(text).find('a').get()
  const links = []
  for (const anchor of anchors) {
    const $anchor = $(anchor)
    links.push([$anchor.text(), $anchor.attr('href')])
  }
  return links
}
if (process.argv.length === 2) {
  // No URL was passed on the command line; print usage instead of crawling.
  console.log("Usage: http-music-crawl-http http://.../example/path/")
  console.log("..or, npm run crawl-http -- http://.../example/path/")
} else {
  // Named `startURL` (and made const) so it no longer shadows the
  // module-level `url` module binding.
  const startURL = process.argv[2]
  crawl(startURL)
    .then(res => console.log(JSON.stringify(res, null, 2)))
    .catch(err => console.error(err))
}