chore: add script to ensure corresponding pages/redirects exist (nuxt…
danielroe authored Jul 21, 2021
1 parent a2d01cf commit 609eb7d
Showing 3 changed files with 549 additions and 14 deletions.
3 changes: 3 additions & 0 deletions package.json
@@ -9,8 +9,11 @@
"devDependencies": {
"@nuxt/typescript-build": "^2.1.0",
"@snackbar/core": "^1.7.0",
"@types/crawler": "^1.2.2",
"crawler": "^1.3.0",
"docus": "0.8.15",
"ohmyfetch": "^0.2.0",
"ufo": "^0.7.7",
"vue-plausible": "^1.2.1"
}
}
111 changes: 111 additions & 0 deletions scripts/crawl.ts
@@ -0,0 +1,111 @@
#!/usr/bin/env node -r jiti/register

import Crawler from 'crawler'
import consola from 'consola'
import { withoutTrailingSlash } from 'ufo'
import chalk from 'chalk'

const logger = consola.withTag('crawler')

const excludedExtensions = ['svg', 'png', 'jpg', 'sketch', 'ico', 'gif']
const urlsToOmit = ['http://localhost:3000']

const sourceSite = 'https://nuxtjs.org'
const baseURL = withoutTrailingSlash(process.env.BASE_URL || 'https://preview.nuxtjs.org')

const startingURL = sourceSite + '/'

// GLOBALS
const urls = new Set([startingURL])
const erroredUrls: string[] = []

let crawler: Crawler

const credentials = Buffer.from('nuxt:nuxt').toString('base64')

// Checks each page's counterpart on the (basic-auth protected) preview deployment
// and records any URL that does not respond with a 200, 301 or 302 status.
const verifier = new Crawler({
  maxConnections: 100,
  headers: {
    Authorization: `Basic ${credentials}`
  },
  callback (error, res, done) {
    const { $ } = res
    const { uri } = res.options
    // @ts-ignore
    const { statusCode } = res.request.response

    if (error || ![200, 301, 302].includes(statusCode)) {
      const message = chalk.red(`${chalk.bold('✗')} ${uri} (${statusCode})`)
      logger.log(message)
      erroredUrls.push(uri)
      return done()
    }

    if (!$) {
      logger.error('Could not parse', uri)
      return done()
    }

    logger.success(chalk.green(uri))
    logger.debug(uri, `[${verifier.queueSize} / ${urls.size}]`)
    done()
  },
})

// Normalise a link found while crawling and enqueue it twice: the original URL on the
// main crawler, and the equivalent path on the preview deployment on the verifier.
function queue (path: string, referrer?: string) {
  if (urlsToOmit.some(url => path.startsWith(url))) return

  const { pathname, origin } = new URL(path, referrer)

  // Don't crawl the same page more than once
  const url = `${origin}${pathname}`
  if (!url || urls.has(url) || !crawler) return

  // Don't try to visit linked assets (e.g. SVGs)
  const extension = url.split('.').pop()
  if (excludedExtensions.includes(extension)) return

  // Don't crawl external URLs
  if (origin !== sourceSite) return

  urls.add(url)

  crawler.queue(url)
  verifier.queue(`${baseURL}${pathname}`)
}

// Walks the source site, queueing every internal link found on each crawled page.
crawler = new Crawler({
  maxConnections: 100,
  callback (error, res, done) {
    const { $ } = res
    const { uri } = res.options
    // @ts-ignore
    const { statusCode } = res.request.response

    if (error || ![200, 301, 302].includes(statusCode) || !$) {
      return done()
    }

    $(`a:not([href*=mailto])`).each((_, el) => 'attribs' in el && queue(el.attribs.href, uri))

    // When only this page remains in the queue the crawl is finished, so report the results.
    if (crawler.queueSize === 1) {
      logger.log('')
      logger.info(`Checked \`${urls.size}\` pages.`)
      // Tasks to run at the end.
      if (erroredUrls.length) {
        const message = `Could not find ${chalk.bold(erroredUrls.length)} equivalent URLs on ${chalk.bold(baseURL)}.`
        const error = new Error(`\n\n${message}\n`)
        error.message = message
        error.stack = ''
        throw error
      }
    }
    done()
  },
})

logger.log('')
logger.info(`Checking \`${sourceSite}\`.`)
logger.info(`Ignoring file extensions: \`${excludedExtensions.join(', ')}.\`\n`)

crawler.queue(startingURL)
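
For illustration only (not part of the commit): a minimal sketch of the URL normalisation that queue() performs, reusing the same ufo helper; the linked path is hypothetical.

import { withoutTrailingSlash } from 'ufo'

const source = 'https://nuxtjs.org'
const preview = withoutTrailingSlash('https://preview.nuxtjs.org')

// A relative link found on the homepage…
const { pathname, origin } = new URL('/docs/get-started/installation/#install', `${source}/`)

// …loses its hash and query, and the same pathname is queued on both crawlers:
console.log(`${origin}${pathname}`)  // https://nuxtjs.org/docs/get-started/installation/
console.log(`${preview}${pathname}`) // https://preview.nuxtjs.org/docs/get-started/installation/

Going by the shebang, the script is presumably run with `node -r jiti/register scripts/crawl.ts` (assuming jiti is already available in the repository), with BASE_URL optionally pointing at a different preview deployment.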