From 0c8ed6fc7ff30d43bd337ff9f3e88b86ddcbbb89 Mon Sep 17 00:00:00 2001
From: asaf
Date: Mon, 28 Apr 2025 11:39:41 -0400
Subject: [PATCH] Improve link checker to process all sitemaps

- Add support for processing all sitemap files (not just the main sitemap)
- Implement section filtering capability
- Enhance error handling and progress tracking
- Improve reporting with a complete list of broken links
---
 Makefile                            |  13 +-
 scripts/link-checker/check-links.js | 273 ++++++++++++++++++++--------
 scripts/link-checker/check-links.sh |  19 +-
 3 files changed, 231 insertions(+), 74 deletions(-)

diff --git a/Makefile b/Makefile
index 23f8d5d3b93a..39638eec028e 100644
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,18 @@ build:
 check_links:
 	$(MAKE) banner
 	$(MAKE) ensure
-	./scripts/link-checker/check-links.sh "https://www.pulumi.com"
+	./scripts/link-checker/check-links.sh "https://www.pulumi.com" "$(SECTION)"
+
+# Usage: make check_section SECTION=/docs/
+.PHONY: check_section
+check_section:
+	@if [ -z "$(SECTION)" ]; then \
+		echo "Error: SECTION variable is required. Example: make check_section SECTION=/docs/"; \
+		exit 1; \
+	fi
+	$(MAKE) banner
+	$(MAKE) ensure
+	./scripts/link-checker/check-links.sh "https://www.pulumi.com" "$(SECTION)"
 
 .PHONY: check_search_urls
 check_search_urls:
diff --git a/scripts/link-checker/check-links.js b/scripts/link-checker/check-links.js
index 000d4d20cce8..191a4fd088e5 100644
--- a/scripts/link-checker/check-links.js
+++ b/scripts/link-checker/check-links.js
@@ -1,11 +1,10 @@
 const { HtmlUrlChecker } = require("broken-link-checker");
 const { WebClient, LogLevel } = require('@slack/web-api');
-const httpServer = require("http-server");
 const Sitemapper = require("sitemapper");
 const sitemap = new Sitemapper();
 const path = require("path");
 const fs = require("fs");
-
+const axios = require("axios");
 
 // Additional routes to check that are not included in the sitemap.
 const additionalRoutes = [
@@ -15,18 +14,25 @@ const additionalRoutes = [
     "https://www.pulumi.com/registry/sitemap.xml",
 ]
 
-
 /**
  * This script uses the programmatic API of https://github.com/stevenvachon/broken-link-checker
- to check the links (including images, iframes, and client-side redirects) for either an individual page
- or for a whole site. Usage:
-
-   # Log successes as well as failures.
-   $ DEBUG=1 node scripts/check-links.js "https://www.pulumi.com"
+ * to check the links (including images, iframes, and client-side redirects) for either an individual page
+ * or for a whole site. Usage:
+ *
+ *   # Standard check (default: no section filter)
+ *   $ node scripts/link-checker/check-links.js "https://www.pulumi.com" 2
+ *
+ *   # Check a specific section (e.g., only check /docs/ URLs)
+ *   $ node scripts/link-checker/check-links.js "https://www.pulumi.com" 2 "/docs/"
+ *
+ *   # Log successes as well as failures
+ *   $ DEBUG=1 node scripts/link-checker/check-links.js "https://www.pulumi.com" 2
  */
 
-let [ baseURL, maxRetries ] = process.argv.slice(2);
+let [ baseURL, maxRetries, sectionFilter ] = process.argv.slice(2);
 let retryCount = 0;
+let totalCheckedLinks = 0;
+let startTime = Date.now();
 
 if (!baseURL) {
     throw new Error("A baseURL (e.g., 'https://pulumi.com') is required.");
 }
@@ -61,14 +67,30 @@ checkLinks();
 
 // Runs the checker.
 async function checkLinks() {
-    const checker = getChecker([]);
-
-    // Load all URLs.
-    const urls = await getURLsToCheck(baseURL);
+    const brokenLinks = [];
+    const checker = getChecker(brokenLinks);
 
-    // Start the checker.
-    checker.enqueue(baseURL);
-    urls.forEach(url => checker.enqueue(url));
+    console.log("=== Link Checker Started ===");
+    console.log(`Base URL: ${baseURL}`);
+    if (sectionFilter) {
+        console.log(`Section filter: ${sectionFilter}`);
+    }
+    console.log(`Max retries: ${maxRetries}`);
+
+    try {
+        // Get all URLs from the main sitemap AND section sitemaps
+        console.log("Fetching URLs from sitemaps...");
+        const urls = await getAllUrlsToCheck(baseURL);
+
+        console.log(`Found ${urls.length} URLs to check`);
+
+        // Start the checker with the base URL and all URLs
+        checker.enqueue(baseURL);
+        urls.forEach(url => checker.enqueue(url));
+    } catch (error) {
+        console.error(`Error fetching URLs: ${error.message}`);
+        process.exit(1);
+    }
 }
 
 // Returns an instance of either HtmlUrlChecker.
@@ -101,21 +123,29 @@ function getDefaultHandlers(brokenLinks) {
     return {
         link: (result) => {
             try {
+                totalCheckedLinks++;
+
+                // Show progress periodically
+                if (totalCheckedLinks % 500 === 0) {
+                    const elapsedMinutes = ((Date.now() - startTime) / 1000 / 60).toFixed(1);
+                    console.log(`Progress: Checked ${totalCheckedLinks} links in ${elapsedMinutes} minutes, found ${brokenLinks.length} broken links`);
+                }
+
                 onLink(result, brokenLinks);
             } catch (error) {
-                fail(error);
+                console.error(`Error in link handler: ${error.message}`);
             }
         },
         error: (error) => {
-            fail(error);
+            console.error(`Checker error: ${error.message}`);
        },
         page: (error, pageURL) => {
             try {
                 onPage(error, pageURL, brokenLinks);
             } catch(error) {
-                fail(error);
+                console.error(`Error in page handler: ${error.message}`);
             }
         },
         end: async () => {
@@ -123,7 +153,8 @@
             try {
                 await onComplete(brokenLinks);
             } catch (error) {
-                fail(error);
+                console.error(`Error in end handler: ${error.message}`);
+                process.exit(1);
             }
         },
     };
 }
@@ -143,9 +174,8 @@ function onLink(result, brokenLinks) {
         logLink(source, destination, reason);
     } else if (process.env.DEBUG) {
-        // Log successes when DEBUG is truthy.
-        logLink(source, destination, result.http.response.statusCode);
+        logLink(source, destination, result.http?.response?.statusCode || "SUCCESS");
     }
 }
@@ -163,26 +193,63 @@
 // Handles the BLC 'complete' event, which is raised at the end of a run.
 async function onComplete(brokenLinks) {
     const filtered = excludeAcceptable(brokenLinks);
+    const elapsedTime = ((Date.now() - startTime) / 1000).toFixed(1);
 
-    if (filtered.length > 0) {
+    console.log("=== Link Check Completed ===");
+    console.log(`Total time: ${elapsedTime} seconds`);
+    console.log(`Total links checked: ${totalCheckedLinks}`);
+    console.log(`Total broken links found: ${filtered.length}`);
 
+    if (filtered.length > 0) {
         // If we failed and a retry count was provided, retry. Note that retry count !==
         // run count, so a retry count of 1 means run once, then retry once, which means a
         // total run count of two.
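+        // (With the shell script's default of maxRetries=2, that means up to
+        // three runs in total before failures are reported.)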
         if (maxRetries > 0 && retryCount < maxRetries) {
             retryCount += 1;
             console.log(`Retrying (${retryCount} of ${maxRetries})...`);
+
+            // Reset counters
+            totalCheckedLinks = 0;
+            startTime = Date.now();
+
             checkLinks();
             return;
         }
 
+        // Group broken links by reason
+        const groupedByReason = {};
+        filtered.forEach(link => {
+            if (!groupedByReason[link.reason]) {
+                groupedByReason[link.reason] = [];
+            }
+            groupedByReason[link.reason].push(link);
+        });
+
+        // Display summary by reason
+        console.log(`Broken links by reason:`);
+        Object.keys(groupedByReason).forEach(reason => {
+            console.log(`${reason}: ${groupedByReason[reason].length} links`);
+        });
+
+        // List all broken links
+        console.log("\nList of all broken links:");
+        filtered.forEach(link => {
+            console.log(`${link.source} -> ${link.destination} (${link.reason})`);
+        });
+
+        // Format for Slack
         const list = filtered
             .map(link => `:link: <${link.source}|${new URL(link.source).pathname}> → ${link.destination} (${link.reason})`)
             .join("\n");
 
         // Post the results to Slack.
-        console.warn("Posting to slack: " + list);
-        await postToSlack("docs-ops", list);
+        console.log(`Posting ${filtered.length} broken links to Slack...`);
+        await postToSlack("docs-ops", `Found ${filtered.length} broken links:\n${list}`);
+
+        // Exit with error code
+        process.exit(1);
+    } else {
+        console.log(`All links are valid!`);
     }
 }
@@ -339,14 +406,18 @@ async function postToSlack(channel, text) {
         return;
     }
 
-    const client = new WebClient(token, { logLevel: LogLevel.ERROR });
-    return await client.chat.postMessage({
-        text,
-        channel: `#${channel}`,
-        as_user: true,
-        mrkdwn: true,
-        unfurl_links: false,
-    });
+    try {
+        const client = new WebClient(token, { logLevel: LogLevel.ERROR });
+        return await client.chat.postMessage({
+            text,
+            channel: `#${channel}`,
+            as_user: true,
+            mrkdwn: true,
+            unfurl_links: false,
+        });
+    } catch (error) {
+        console.error(`Error posting to Slack: ${error.message}`);
+    }
 }
 
 // Adds a broken link to the running list.
 function addLink(source, destination, reason, links) {
@@ -360,46 +431,104 @@
 
 // Logs a link result to the console.
 function logLink(source, destination, reason) {
-    console.log(source);
-    console.log(`  -> ${destination}`);
-    console.log(`  -> ${reason}`);
-    console.log();
+    if (reason && (reason.toString().startsWith('4') || reason.toString().startsWith('5') ||
+        typeof reason === 'string' && !reason.match(/^2\d\d$/))) {
+        console.log(`BROKEN: ${source} -> ${destination} (${reason})`);
+    } else if (process.env.DEBUG) {
+        console.log(`OK: ${source} -> ${destination} (${reason})`);
+    }
 }
 
-// Logs and exits immediately.
-function fail(error) {
-    console.error(error.message);
-    process.exit(1);
+// Get all URLs to check from multiple sitemaps
+async function getAllUrlsToCheck(base) {
+    try {
+        // Set of URLs to check to avoid duplicates
+        const allUrls = new Set();
+
+        // Add the known section sitemaps for the Pulumi docs site
+        const sitemaps = [
+            // Main sitemap
+            `${base}/sitemap.xml`,
+
+            // Section sitemaps (based on examining the repo)
+            `${base}/static/sitemaps/sitemap-blog.xml`,
+            `${base}/static/sitemaps/sitemap-docs.xml`,
+            `${base}/static/sitemaps/sitemap-tutorials.xml`,
+            `${base}/static/sitemaps/sitemap-templates.xml`,
+            `${base}/static/sitemaps/sitemap-registry.xml`,
+            `${base}/static/sitemaps/sitemap-case-studies.xml`,
+            `${base}/static/sitemaps/sitemap-product.xml`,
+            `${base}/static/sitemaps/sitemap-compliance.xml`,
+            `${base}/static/sitemaps/sitemap-what-is.xml`,
+            `${base}/static/sitemaps/sitemap-other.xml`,
+        ];
+
+        // Try each sitemap
+        for (const sitemapUrl of sitemaps) {
+            try {
+                console.log(`Processing sitemap: ${sitemapUrl}`);
+                const urls = await processSitemap(sitemapUrl, base);
+
+                // Add to our set
+                urls.forEach(url => allUrls.add(url));
+
+                console.log(`Found ${urls.length} URLs in ${sitemapUrl}`);
+            } catch (error) {
+                console.log(`Could not process ${sitemapUrl}: ${error.message}`);
+            }
+        }
+
+        // Convert to array
+        let urls = [...allUrls];
+
+        // Apply section filter if provided
+        if (sectionFilter) {
+            urls = urls.filter(url => url.includes(sectionFilter));
+            console.log(`Applied section filter "${sectionFilter}": ${urls.length} URLs remaining`);
+        }
+
+        // Add the additional routes
+        urls = urls.concat(additionalRoutes);
+
+        return urls;
+    } catch (error) {
+        console.error(`Error processing sitemaps: ${error.message}`);
+        throw error;
+    }
 }
 
-// Start by fetching the sitemap from `baseURL`.
-async function getURLsToCheck(base) {
-    return await sitemap
-        .fetch(`${base}/sitemap.xml`)
-        .then(map => {
-            const urls = map.sites
-
-                // Exclude resource docs, SDK docs, and CLI download pages.
-                .filter(page => !page.match(/\/registry\/packages\/.+\/api-docs\//))
-                .filter(page => !page.match(/\/docs\/reference\/pkg\/nodejs|python\//))
-                .filter(page => !page.match(/\/docs\/install\/versions\//))
-
-                // Always check using the supplied baseURL.
-                .map(url => {
-                    const newURL = new URL(url);
-                    const baseURLObj = new URL(base);
-                    newURL.hostname = baseURLObj.hostname;
-                    newURL.protocol = baseURLObj.protocol;
-                    return newURL.toString();
-                })
-
-                // Tack on any additional pages we'd like to check.
-                .concat(additionalRoutes)
-
-                // Sort everything alphabetically.
-                .sort();
-
-            // Return the list of URLs to be crawled.
-            return urls;
-        });
-}
+// Process a single sitemap
+async function processSitemap(sitemapUrl, base) {
+    try {
+        const result = await sitemap.fetch(sitemapUrl);
+
+        if (!result || !result.sites || !Array.isArray(result.sites)) {
+            return [];
+        }
+
+        const urls = result.sites;
+
+        // Exclude resource docs, SDK docs, and CLI download pages.
+        const filtered = urls
+            .filter(page => !page.match(/\/registry\/packages\/.+\/api-docs\//))
+            .filter(page => !page.match(/\/docs\/reference\/pkg\/nodejs|python\//))
+            .filter(page => !page.match(/\/docs\/install\/versions\//));
+
+        // Always check using the supplied baseURL.
+        return filtered.map(url => {
+            try {
+                const newURL = new URL(url);
+                const baseURLObj = new URL(base);
+                newURL.hostname = baseURLObj.hostname;
+                newURL.protocol = baseURLObj.protocol;
+                return newURL.toString();
+            } catch (e) {
+                console.warn(`Skipping invalid URL: ${url}`);
+                return null;
+            }
+        }).filter(url => url !== null);
+    } catch (error) {
+        console.error(`Error fetching sitemap ${sitemapUrl}: ${error.message}`);
+        return [];
+    }
+}
\ No newline at end of file
diff --git a/scripts/link-checker/check-links.sh b/scripts/link-checker/check-links.sh
index 06a31e5510e6..6e1f0f53c16a 100755
--- a/scripts/link-checker/check-links.sh
+++ b/scripts/link-checker/check-links.sh
@@ -5,5 +5,22 @@
 source ./scripts/common.sh
 
 echo "Checking links..."
 
+# Required argument: the base URL to check.
 base_url="$1"
-node "./scripts/link-checker/check-links.js" "$base_url" 2
+max_retries=2
+
+# Optional section filter (defaults to empty, which checks all sections)
+section_filter=""
+if [ "$#" -gt 1 ]; then
+    section_filter="$2"
+fi
+
+# Print information about what we're checking
+echo "Base URL: $base_url"
+if [ -n "$section_filter" ]; then
+    echo "Section filter: $section_filter"
+fi
+echo "Max retries: $max_retries"
+
+# Run the link checker
+node "./scripts/link-checker/check-links.js" "$base_url" "$max_retries" "$section_filter"
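--
Example invocations after this patch (a sketch based only on the targets and
arguments defined above; DEBUG is read from the environment by check-links.js):

    make check_links                       # full-site check, no section filter
    make check_section SECTION=/docs/      # check only URLs containing /docs/
    DEBUG=1 ./scripts/link-checker/check-links.sh "https://www.pulumi.com" "/docs/"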