diff options
author | Liam <towerofnix@gmail.com> | 2017-05-31 18:58:08 -0300 |
---|---|---|
committer | Liam <towerofnix@gmail.com> | 2017-05-31 18:58:08 -0300 |
commit | 26663377fd7ea15a6c3d23a399d1266c8639d42e (patch) | |
tree | fa0532caaf5672501bb5797499a515da72a09038 /crawl-recursive.js | |
parent | 42ec01bb91c517067a9eba901272c1248ed52261 (diff) |
Progress
Diffstat (limited to 'crawl-recursive.js')
-rw-r--r-- | crawl-recursive.js | 118 |
1 file changed, 66 insertions, 52 deletions
diff --git a/crawl-recursive.js b/crawl-recursive.js index 8d33ded..d3b0127 100644 --- a/crawl-recursive.js +++ b/crawl-recursive.js @@ -3,70 +3,84 @@ const MAX_DOWNLOAD_ATTEMPTS = 5 const fetch = require('node-fetch') -const { getHTMLLinks } = require('./crawl-links') function crawl(absURL, attempts = 0) { - return fetch(absURL) - .then(res => res.text().then(text => playlistifyParse(text, absURL)), err => { - console.error('Failed to download: ' + absURL) + // Recursively crawls a given URL, following every link to a deeper path and + // recording all links in a tree (in the same format playlists use). Makes + // multiple attempts to download failed paths. - if (attempts < MAX_DOWNLOAD_ATTEMPTS) { - console.error( - 'Trying again. Attempt ' + (attempts + 1) + - '/' + MAX_DOWNLOAD_ATTEMPTS + '...' - ) - return crawl(absURL, attempts + 1) - } else { - console.error( - 'We\'ve hit the download attempt limit (' + - MAX_DOWNLOAD_ATTEMPTS + '). Giving up on ' + - 'this path.' - ) - throw 'FAILED_DOWNLOAD' - } - }) - .catch(error => { - if (error === 'FAILED_DOWNLOAD') { - // Debug logging for this is already handled above. - return [] - } else { - throw error - } - }) -} + return fetch(absURL) + .then( + res => res.text().then(text => { + const links = getHTMLLinks(text) + const verbose = process.argv.includes('--verbose') + + return Promise.all(links.map(link => { + const [ title, href ] = link + + if (href.endsWith('/')) { + // It's a directory! -function playlistifyParse(text, absURL) { - const links = getHTMLLinks(text) - const verbose = process.argv.includes('--verbose') + if (verbose) console.log('[Dir] ' + absURL + href) + return crawl(absURL + href) + .then(res => [title, res]) + } else { + // It's a file! - return Promise.all(links.map(link => { - const [ title, href ] = link + if (verbose) console.log('[File] ' + absURL + href) + return Promise.resolve([title, absURL + href]) + } + })) + }), - if (href.endsWith('/')) { - // It's a directory! 
+ err => { + console.error('Failed to download: ' + absURL) + + if (attempts < MAX_DOWNLOAD_ATTEMPTS) { + console.error( + 'Trying again. Attempt ' + (attempts + 1) + + '/' + MAX_DOWNLOAD_ATTEMPTS + '...' + ) + return crawl(absURL, attempts + 1) + } else { + console.error( + 'We\'ve hit the download attempt limit (' + + MAX_DOWNLOAD_ATTEMPTS + '). Giving up on ' + + 'this path.' + ) + throw 'FAILED_DOWNLOAD' + } + } + ) + .catch(error => { + if (error === 'FAILED_DOWNLOAD') { + // Debug logging for this is already handled above. + return [] + } else { + throw error + } + }) +} - if (verbose) console.log('[Dir] ' + absURL + href) - return crawl(absURL + href) - .then(res => [title, res]) - } else { - // It's a file! +function getHTMLLinks(text) { + // Never parse HTML with a regex! - if (verbose) console.log('[File] ' + absURL + href) - return Promise.resolve([title, absURL + href]) - } - })) + return $(text).find('a').get().map(a => { + const $a = $(a) + return [$a.text(), $a.attr('href')] + }) } if (process.argv.length === 2) { - console.log('Usage: crawl-recursive http://example.com/example/path') + console.log('Usage: crawl-recursive http://example.com/example/path') } else { - let url = process.argv[2] + let url = process.argv[2] - if (!(url.endsWith('/'))) { - url = url + '/' - } + if (!(url.endsWith('/'))) { + url = url + '/' + } - crawl(url) - .then(res => console.log(JSON.stringify(res, null, 2))) - .catch(err => console.error(err)) + crawl(url) + .then(res => console.log(JSON.stringify(res, null, 2))) + .catch(err => console.error(err)) } |