-
Notifications
You must be signed in to change notification settings - Fork 247
Improve link checker #14871
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
asafashirov
wants to merge
1
commit into
master
Choose a base branch
from
clean-link-checker-improvements
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Improve link checker #14871
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,11 +1,10 @@ | ||||||||||||
const { HtmlUrlChecker } = require("broken-link-checker"); | ||||||||||||
const { WebClient, LogLevel } = require('@slack/web-api'); | ||||||||||||
const httpServer = require("http-server"); | ||||||||||||
const Sitemapper = require("sitemapper"); | ||||||||||||
const sitemap = new Sitemapper(); | ||||||||||||
const path = require("path"); | ||||||||||||
const fs = require("fs"); | ||||||||||||
|
||||||||||||
const axios = require("axios"); | ||||||||||||
|
||||||||||||
// Additional routes to check that are not included in the sitemap. | ||||||||||||
const additionalRoutes = [ | ||||||||||||
|
@@ -15,18 +14,25 @@ const additionalRoutes = [ | |||||||||||
"https://www.pulumi.com/registry/sitemap.xml", | ||||||||||||
] | ||||||||||||
|
||||||||||||
|
||||||||||||
/** | ||||||||||||
* This script uses the programmatic API of https://github.com/stevenvachon/broken-link-checker | ||||||||||||
to check the links (including images, iframes, and client-side redirects) for either an individual page | ||||||||||||
or for a whole site. Usage: | ||||||||||||
|
||||||||||||
# Log successes as well as failures. | ||||||||||||
$ DEBUG=1 node scripts/check-links.js "https://www.pulumi.com" | ||||||||||||
* to check the links (including images, iframes, and client-side redirects) for either an individual page | ||||||||||||
* or for a whole site. Usage: | ||||||||||||
* | ||||||||||||
* # Standard check (default: no section filter) | ||||||||||||
* $ node scripts/link-checker/check-links.js "https://www.pulumi.com" 2 | ||||||||||||
* | ||||||||||||
* # Check specific section (e.g., only check /docs/ URLs) | ||||||||||||
* $ node scripts/link-checker/check-links.js "https://www.pulumi.com" 2 "/docs/" | ||||||||||||
* | ||||||||||||
* # Log successes as well as failures | ||||||||||||
* $ DEBUG=1 node scripts/link-checker/check-links.js "https://www.pulumi.com" 2 | ||||||||||||
*/ | ||||||||||||
|
||||||||||||
let [ baseURL, maxRetries ] = process.argv.slice(2); | ||||||||||||
let [ baseURL, maxRetries, sectionFilter ] = process.argv.slice(2); | ||||||||||||
let retryCount = 0; | ||||||||||||
let totalCheckedLinks = 0; | ||||||||||||
let startTime = Date.now(); | ||||||||||||
|
||||||||||||
if (!baseURL) { | ||||||||||||
throw new Error("A baseURL (e.g., 'https://pulumi.com') is required."); | ||||||||||||
|
@@ -61,14 +67,30 @@ checkLinks(); | |||||||||||
|
||||||||||||
// Runs the checker. | ||||||||||||
async function checkLinks() { | ||||||||||||
const checker = getChecker([]); | ||||||||||||
|
||||||||||||
// Load all URLs. | ||||||||||||
const urls = await getURLsToCheck(baseURL); | ||||||||||||
const brokenLinks = []; | ||||||||||||
const checker = getChecker(brokenLinks); | ||||||||||||
|
||||||||||||
// Start the checker. | ||||||||||||
checker.enqueue(baseURL); | ||||||||||||
urls.forEach(url => checker.enqueue(url)); | ||||||||||||
console.log("=== Link Checker Started ==="); | ||||||||||||
console.log(`Base URL: ${baseURL}`); | ||||||||||||
if (sectionFilter) { | ||||||||||||
console.log(`Section filter: ${sectionFilter}`); | ||||||||||||
} | ||||||||||||
console.log(`Max retries: ${maxRetries}`); | ||||||||||||
|
||||||||||||
try { | ||||||||||||
// Get all URLs from the main sitemap AND section sitemaps | ||||||||||||
console.log("Fetching URLs from sitemaps..."); | ||||||||||||
const urls = await getAllUrlsToCheck(baseURL); | ||||||||||||
|
||||||||||||
console.log(`Found ${urls.length} URLs to check`); | ||||||||||||
|
||||||||||||
// Start the checker with the base URL and all URLs | ||||||||||||
checker.enqueue(baseURL); | ||||||||||||
urls.forEach(url => checker.enqueue(url)); | ||||||||||||
} catch (error) { | ||||||||||||
console.error(`Error fetching URLs: ${error.message}`); | ||||||||||||
process.exit(1); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Returns an instance of HtmlUrlChecker. | ||||||||||||
|
@@ -101,29 +123,38 @@ function getDefaultHandlers(brokenLinks) { | |||||||||||
return { | ||||||||||||
link: (result) => { | ||||||||||||
try { | ||||||||||||
totalCheckedLinks++; | ||||||||||||
|
||||||||||||
// Show progress periodically | ||||||||||||
if (totalCheckedLinks % 500 === 0) { | ||||||||||||
const elapsedMinutes = ((Date.now() - startTime) / 1000 / 60).toFixed(1); | ||||||||||||
console.log(`Progress: Checked ${totalCheckedLinks} links in ${elapsedMinutes} minutes, found ${brokenLinks.length} broken links`); | ||||||||||||
} | ||||||||||||
|
||||||||||||
onLink(result, brokenLinks); | ||||||||||||
} | ||||||||||||
catch (error) { | ||||||||||||
fail(error); | ||||||||||||
console.error(`Error in link handler: ${error.message}`); | ||||||||||||
} | ||||||||||||
}, | ||||||||||||
error: (error) => { | ||||||||||||
fail(error); | ||||||||||||
console.error(`Checker error: ${error.message}`); | ||||||||||||
}, | ||||||||||||
page: (error, pageURL) => { | ||||||||||||
try { | ||||||||||||
onPage(error, pageURL, brokenLinks); | ||||||||||||
} | ||||||||||||
catch(error) { | ||||||||||||
fail(error); | ||||||||||||
console.error(`Error in page handler: ${error.message}`); | ||||||||||||
} | ||||||||||||
}, | ||||||||||||
end: async () => { | ||||||||||||
try { | ||||||||||||
await onComplete(brokenLinks); | ||||||||||||
} | ||||||||||||
catch (error) { | ||||||||||||
fail(error); | ||||||||||||
console.error(`Error in end handler: ${error.message}`); | ||||||||||||
process.exit(1); | ||||||||||||
} | ||||||||||||
}, | ||||||||||||
}; | ||||||||||||
|
@@ -143,9 +174,8 @@ function onLink(result, brokenLinks) { | |||||||||||
logLink(source, destination, reason); | ||||||||||||
|
||||||||||||
} else if (process.env.DEBUG) { | ||||||||||||
|
||||||||||||
// Log successes when DEBUG is truthy. | ||||||||||||
logLink(source, destination, result.http.response.statusCode); | ||||||||||||
logLink(source, destination, result.http?.response?.statusCode || "SUCCESS"); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
|
@@ -163,26 +193,63 @@ function onPage(error, pageURL, brokenLinks) { | |||||||||||
// Handles the BLC 'complete' event, which is raised at the end of a run. | ||||||||||||
async function onComplete(brokenLinks) { | ||||||||||||
const filtered = excludeAcceptable(brokenLinks); | ||||||||||||
const elapsedTime = ((Date.now() - startTime) / 1000).toFixed(1); | ||||||||||||
|
||||||||||||
if (filtered.length > 0) { | ||||||||||||
console.log("=== Link Check Completed ==="); | ||||||||||||
console.log(`Total time: ${elapsedTime} seconds`); | ||||||||||||
console.log(`Total links checked: ${totalCheckedLinks}`); | ||||||||||||
console.log(`Total broken links found: ${filtered.length}`); | ||||||||||||
|
||||||||||||
if (filtered.length > 0) { | ||||||||||||
// If we failed and a retry count was provided, retry. Note that retry count !== | ||||||||||||
// run count, so a retry count of 1 means run once, then retry once, which means a | ||||||||||||
// total run count of two. | ||||||||||||
if (maxRetries > 0 && retryCount < maxRetries) { | ||||||||||||
retryCount += 1; | ||||||||||||
console.log(`Retrying (${retryCount} of ${maxRetries})...`); | ||||||||||||
|
||||||||||||
// Reset counters | ||||||||||||
totalCheckedLinks = 0; | ||||||||||||
startTime = Date.now(); | ||||||||||||
|
||||||||||||
checkLinks(); | ||||||||||||
return; | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Group broken links by reason | ||||||||||||
const groupedByReason = {}; | ||||||||||||
filtered.forEach(link => { | ||||||||||||
if (!groupedByReason[link.reason]) { | ||||||||||||
groupedByReason[link.reason] = []; | ||||||||||||
} | ||||||||||||
groupedByReason[link.reason].push(link); | ||||||||||||
}); | ||||||||||||
|
||||||||||||
// Display summary by reason | ||||||||||||
console.log(`Broken links by reason:`); | ||||||||||||
Object.keys(groupedByReason).forEach(reason => { | ||||||||||||
console.log(`${reason}: ${groupedByReason[reason].length} links`); | ||||||||||||
}); | ||||||||||||
|
||||||||||||
// List all broken links | ||||||||||||
console.log("\nList of all broken links:"); | ||||||||||||
filtered.forEach(link => { | ||||||||||||
console.log(`${link.source} -> ${link.destination} (${link.reason})`); | ||||||||||||
}); | ||||||||||||
|
||||||||||||
// Format for Slack | ||||||||||||
const list = filtered | ||||||||||||
.map(link => `:link: <${link.source}|${new URL(link.source).pathname}> → ${link.destination} (${link.reason})`) | ||||||||||||
.join("\n"); | ||||||||||||
|
||||||||||||
// Post the results to Slack. | ||||||||||||
console.warn("Posting to slack: " + list); | ||||||||||||
await postToSlack("docs-ops", list); | ||||||||||||
console.log(`Posting ${filtered.length} broken links to Slack...`); | ||||||||||||
await postToSlack("docs-ops", `Found ${filtered.length} broken links:\n${list}`); | ||||||||||||
|
||||||||||||
// Exit with error code | ||||||||||||
process.exit(1); | ||||||||||||
} else { | ||||||||||||
console.log(`All links are valid!`); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
|
@@ -339,14 +406,18 @@ async function postToSlack(channel, text) { | |||||||||||
return; | ||||||||||||
} | ||||||||||||
|
||||||||||||
const client = new WebClient(token, { logLevel: LogLevel.ERROR }); | ||||||||||||
return await client.chat.postMessage({ | ||||||||||||
text, | ||||||||||||
channel: `#${channel}`, | ||||||||||||
as_user: true, | ||||||||||||
mrkdwn: true, | ||||||||||||
unfurl_links: false, | ||||||||||||
}); | ||||||||||||
try { | ||||||||||||
const client = new WebClient(token, { logLevel: LogLevel.ERROR }); | ||||||||||||
return await client.chat.postMessage({ | ||||||||||||
text, | ||||||||||||
channel: `#${channel}`, | ||||||||||||
as_user: true, | ||||||||||||
mrkdwn: true, | ||||||||||||
unfurl_links: false, | ||||||||||||
}); | ||||||||||||
} catch (error) { | ||||||||||||
console.error(`Error posting to Slack: ${error.message}`); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Adds a broken link to the running list. | ||||||||||||
|
@@ -360,46 +431,104 @@ function addLink(source, destination, reason, links) { | |||||||||||
|
||||||||||||
// Logs a link result to the console. | ||||||||||||
function logLink(source, destination, reason) { | ||||||||||||
console.log(source); | ||||||||||||
console.log(` -> ${destination}`); | ||||||||||||
console.log(` -> ${reason}`); | ||||||||||||
console.log(); | ||||||||||||
if (reason && (reason.toString().startsWith('4') || reason.toString().startsWith('5') || | ||||||||||||
typeof reason === 'string' && !reason.match(/^2\d\d$/))) { | ||||||||||||
console.log(`BROKEN: ${source} -> ${destination} (${reason})`); | ||||||||||||
} else if (process.env.DEBUG) { | ||||||||||||
console.log(`OK: ${source} -> ${destination} (${reason})`); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Logs and exits immediately. | ||||||||||||
function fail(error) { | ||||||||||||
console.error(error.message); | ||||||||||||
process.exit(1); | ||||||||||||
// Get all URLs to check from multiple sitemaps | ||||||||||||
async function getAllUrlsToCheck(base) { | ||||||||||||
try { | ||||||||||||
// Set of URLs to check to avoid duplicates | ||||||||||||
const allUrls = new Set(); | ||||||||||||
|
||||||||||||
// Add the known section sitemaps for Pulumi docs site | ||||||||||||
const sitemaps = [ | ||||||||||||
// Main sitemap | ||||||||||||
`${base}/sitemap.xml`, | ||||||||||||
|
||||||||||||
// Section sitemaps (based on examining the repo) | ||||||||||||
`${base}/static/sitemaps/sitemap-blog.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-docs.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-tutorials.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-templates.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-registry.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-case-studies.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-product.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-compliance.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-what-is.xml`, | ||||||||||||
`${base}/static/sitemaps/sitemap-other.xml`, | ||||||||||||
]; | ||||||||||||
|
||||||||||||
// Try each sitemap | ||||||||||||
for (const sitemapUrl of sitemaps) { | ||||||||||||
try { | ||||||||||||
console.log(`Processing sitemap: ${sitemapUrl}`); | ||||||||||||
const urls = await processSitemap(sitemapUrl, base); | ||||||||||||
|
||||||||||||
// Add to our set | ||||||||||||
urls.forEach(url => allUrls.add(url)); | ||||||||||||
|
||||||||||||
console.log(`Found ${urls.length} URLs in ${sitemapUrl}`); | ||||||||||||
} catch (error) { | ||||||||||||
console.log(`Could not process ${sitemapUrl}: ${error.message}`); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Convert to array | ||||||||||||
let urls = [...allUrls]; | ||||||||||||
|
||||||||||||
// Apply section filter if provided | ||||||||||||
if (sectionFilter) { | ||||||||||||
urls = urls.filter(url => url.includes(sectionFilter)); | ||||||||||||
console.log(`Applied section filter "${sectionFilter}": ${urls.length} URLs remaining`); | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Add the additional routes | ||||||||||||
urls = urls.concat(additionalRoutes); | ||||||||||||
|
||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [nitpick] The list of URLs is returned without sorting, which may lead to non-deterministic order. Consider sorting the URLs (e.g., alphabetically) before returning to maintain consistency.
Suggested change
Copilot uses AI. Check for mistakes. Positive Feedback / Negative Feedback |
||||||||||||
return urls; | ||||||||||||
} catch (error) { | ||||||||||||
console.error(`Error processing sitemaps: ${error.message}`); | ||||||||||||
throw error; | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
// Fetches `${base}/sitemap.xml` and returns the sorted list of URLs to crawl.
//
// Sitemap entries that point at resource docs, SDK docs, or CLI download pages are
// dropped. Every remaining entry is rewritten to use the protocol and hostname of
// `base`, so the check always runs against the supplied site. The module-level
// `additionalRoutes` are appended unchanged before sorting.
//
// Rejects if the sitemap fetch rejects or if `base` / a sitemap entry is not a
// parseable absolute URL.
async function getURLsToCheck(base) {
    const map = await sitemap.fetch(`${base}/sitemap.xml`);

    // Parse the base once instead of once per sitemap entry.
    const baseURLObj = new URL(base);

    // The SDK-docs pattern groups its alternation: without the group, `a|b` splits
    // the whole expression and any URL containing "python/" would be excluded.
    const excludedPatterns = [
        /\/registry\/packages\/.+\/api-docs\//,
        /\/docs\/reference\/pkg\/(?:nodejs|python)\//,
        /\/docs\/install\/versions\//,
    ];

    return map.sites
        // Exclude resource docs, SDK docs, and CLI download pages.
        .filter(page => !excludedPatterns.some(pattern => page.match(pattern)))

        // Always check using the supplied baseURL.
        .map(url => {
            const newURL = new URL(url);
            newURL.hostname = baseURLObj.hostname;
            newURL.protocol = baseURLObj.protocol;
            return newURL.toString();
        })

        // Tack on any additional pages we'd like to check.
        .concat(additionalRoutes)

        // Sort everything alphabetically.
        .sort();
}
// Processes a single sitemap and returns the page URLs from it that should be checked.
//
// Entries pointing at resource docs, SDK docs, or CLI download pages are dropped.
// Each remaining entry is rewritten to use the protocol and hostname of `base`;
// entries that cannot be parsed as URLs are skipped with a warning.
//
// This function is best-effort: if fetching or processing fails, the error is logged
// and an empty array is returned, so one bad sitemap does not abort the whole run.
async function processSitemap(sitemapUrl, base) {
    try {
        const result = await sitemap.fetch(sitemapUrl);

        // Nothing usable came back (missing result or no `sites` array).
        if (!result || !Array.isArray(result.sites)) {
            return [];
        }

        // Parse the base once instead of once per sitemap entry.
        const baseURLObj = new URL(base);

        // The SDK-docs pattern groups its alternation: without the group, `a|b` splits
        // the whole expression and any URL containing "python/" would be excluded.
        const excludedPatterns = [
            /\/registry\/packages\/.+\/api-docs\//,
            /\/docs\/reference\/pkg\/(?:nodejs|python)\//,
            /\/docs\/install\/versions\//,
        ];

        const urlsToCheck = [];
        for (const page of result.sites) {
            // Exclude resource docs, SDK docs, and CLI download pages.
            if (excludedPatterns.some(pattern => pattern.test(page))) {
                continue;
            }

            // Always check using the supplied baseURL.
            try {
                const newURL = new URL(page);
                newURL.hostname = baseURLObj.hostname;
                newURL.protocol = baseURLObj.protocol;
                urlsToCheck.push(newURL.toString());
            } catch (e) {
                console.warn(`Skipping invalid URL: ${page}`);
            }
        }

        return urlsToCheck;
    } catch (error) {
        console.error(`Error fetching sitemap ${sitemapUrl}: ${error.message}`);
        return [];
    }
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
axios is imported but never used; consider removing this import and the axios dependency to keep the codebase clean.
Copilot uses AI. Check for mistakes.