From 3ad0d791c7436b9b18a5eff0185fa89aed222777 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Mon, 26 Jan 2026 11:08:04 -0500 Subject: [PATCH] check-links proposal --- .github/workflows/check-links.yml | 24 +++ package.json | 5 +- scripts/check-internal-links.js | 308 ++++++++++++++++++++++++++++++ 3 files changed, 335 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/check-links.yml create mode 100644 scripts/check-internal-links.js diff --git a/.github/workflows/check-links.yml b/.github/workflows/check-links.yml new file mode 100644 index 000000000..57d5a4608 --- /dev/null +++ b/.github/workflows/check-links.yml @@ -0,0 +1,24 @@ +name: Daily Link Check + +on: + schedule: + - cron: '0 6 * * *' # 6am UTC daily + workflow_dispatch: # Manual trigger + +jobs: + check-links: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies + run: npm install glob + + - name: Check internal links + run: node scripts/check-internal-links.js + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/package.json b/package.json index 54b3e7a60..32fd2836c 100644 --- a/package.json +++ b/package.json @@ -7,12 +7,13 @@ "build": "next build", "start": "next start", "lint": "next lint", - "check-links": "node dev/check-links.mjs", "check-filenames": "node dev/check-filenames.mjs", "check-images": "node dev/check-images.mjs", + "check-internal-links": "node scripts/check-internal-links.js", "baseai": "baseai", "sync": "npx baseai@latest deploy -m memory-sg-docs-live", - "format": "prettier --config ./prettier.config.js --cache --cache-strategy metadata --write=true '**/{*.{js?(on),ts?(x),md,mdx,s?css},.*.js?(on)}'" + "format": "prettier --config ./prettier.config.js --cache --cache-strategy metadata --write=true '**/{*.{js?(on),ts?(x),md,mdx,s?css},.*.js?(on)}'", + "llmstxt": "node scripts/llms-txt-generator.js" }, "browserslist": "defaults, not ie <= 11", "dependencies": { diff --git a/scripts/check-internal-links.js b/scripts/check-internal-links.js new file mode 100644 index 000000000..8151cb6a9 --- /dev/null +++ b/scripts/check-internal-links.js @@ -0,0 +1,308 @@ +const path = require('path'); +const fs = require('fs'); +const { sync: glob } = require('glob'); + +const DOCS_DIR = './docs'; +const SLACK_WEBHOOK_URL = process.env.SLACK_WEBHOOK_URL; + +const ASSET_EXTENSIONS = new Set([ + 'png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'ico', + 'pdf', + 'zip', 'tar', 'gz', 'tgz', + 'mp4', 'mov', 'webm', + 'css', 'js', 'map', +]); + +function normalizeRoute(p) { + if (!p) return '/'; + + p = String(p).trim(); + + // Convert Windows separators to URL separators + p = p.replace(/\\/g, '/'); + + // Ensure leading slash + if (!p.startsWith('/')) p = '/' + p; + + // Collapse duplicate slashes (keep a single leading slash) + p = p.replace(/\/{2,}/g, '/'); + + // Remove trailing slash (except for root) + if (p.length > 1 && p.endsWith('/')) p = p.slice(0, -1); + + return p; +} + +function buildValidRoutes(files) { + const validRoutes = new Set(); + const canonicalByLower = new Map(); // lower -> canonical (for case-mismatch detection) + + for (const file of files) { + // docs/cody/capabilities/chat.mdx → /cody/capabilities/chat + // docs/cody/index.mdx → /cody + // docs/index.mdx → / + let route = file + .replace(/(^|\/)index\.mdx?$/, '') // handles "index.mdx" and "foo/index.mdx" + .replace(/\.mdx?$/, ''); + + route = normalizeRoute(route); + validRoutes.add(route); + + const lower = route.toLowerCase(); + if (!canonicalByLower.has(lower)) { + canonicalByLower.set(lower, route); + } + } + + return { validRoutes, canonicalByLower }; +} + +function extractLinks(content) { + const links = []; + + // Match various link patterns in MDX + const patterns = [ + /\]\((\/[^)\s"'`]+)(?:\s+["'][^"']*["'])?\)/g, // Markdown: [text](/path) or [text](/path "title") + /\bhref\s*=\s*["'](\/[^"']+)["']/g, // JSX: href="/path" + /\bhref\s*=\s*\{\s*["'](\/[^"']+)["']\s*\}/g, // JSX: href={'/path'} + /\bto\s*=\s*["'](\/[^"']+)["']/g, // + /\bto\s*=\s*\{\s*["'](\/[^"']+)["']\s*\}/g, // + ]; + + // Reference-style link definitions: [ref]: /path + const refLinkPattern = /^\s*\[[^\]]+\]:\s*(\/\S+)/gm; + + const lines = content.split('\n'); + let inFence = false; + + lines.forEach((line, lineIndex) => { + // Skip code fences + if (/^\s*```/.test(line)) { + inFence = !inFence; + return; + } + if (inFence) return; + + // Check for reference-style link definitions + let refMatch; + refLinkPattern.lastIndex = 0; + while ((refMatch = refLinkPattern.exec(line)) !== null) { + const fullLink = refMatch[1]; + const cleanLink = normalizeRoute(fullLink.split('#')[0].split('?')[0]); + if (!isAssetLink(cleanLink) && !cleanLink.startsWith('/api/')) { + links.push({ link: cleanLink, line: lineIndex + 1 }); + } + } + + for (const pattern of patterns) { + pattern.lastIndex = 0; // Reset regex state + let match; + while ((match = pattern.exec(line)) !== null) { + const fullLink = match[1]; + // Strip anchor and query params for validation + const cleanLink = normalizeRoute(fullLink.split('#')[0].split('?')[0]); + + // Skip API routes + if (cleanLink.startsWith('/api/')) { + continue; + } + + // Skip asset files (using allowlist) + if (isAssetLink(cleanLink)) { + continue; + } + + links.push({ + link: cleanLink, + line: lineIndex + 1 + }); + } + } + }); + + return links; +} + +function isAssetLink(cleanLink) { + const lastSegment = cleanLink.split('/').pop() || ''; + const m = lastSegment.match(/\.([a-z0-9]+)$/i); + return m && ASSET_EXTENSIONS.has(m[1].toLowerCase()); +} + +function findBrokenLinks(files, validRoutes, canonicalByLower) { + const brokenLinks = []; + + for (const file of files) { + const filePath = path.join(DOCS_DIR, file); + const content = fs.readFileSync(filePath, 'utf-8'); + const links = extractLinks(content); + + for (const { link, line } of links) { + // Exact match - link is valid + if (validRoutes.has(link)) continue; + + // Check for case mismatch (works on macOS/Windows but fails on Linux CI) + const canonical = canonicalByLower.get(link.toLowerCase()); + if (canonical) { + brokenLinks.push({ + file: filePath, + line, + link, + suggestion: canonical, + isCaseMismatch: true + }); + continue; + } + + // Truly broken link + brokenLinks.push({ file: filePath, line, link }); + } + } + + return brokenLinks; +} + +async function sendToSlack(brokenLinks, totalFiles) { + if (!SLACK_WEBHOOK_URL) { + console.log('No SLACK_WEBHOOK_URL provided, printing results locally:'); + console.log(`Total docs scanned: ${totalFiles}`); + console.log(`Broken links found: ${brokenLinks.length}`); + if (brokenLinks.length > 0) { + console.log('\nBroken links:'); + brokenLinks.forEach(({ file, line, link, suggestion, isCaseMismatch }) => { + if (isCaseMismatch) { + console.log(` ${file}:${line} → ${link} (case mismatch, did you mean ${suggestion}?)`); + } else { + console.log(` ${file}:${line} → ${link}`); + } + }); + } + return; + } + + // Guard for older Node versions + if (typeof fetch !== 'function') { + throw new Error('Global fetch is not available. Use Node 18+ or add a fetch polyfill.'); + } + + const date = new Date().toISOString().split('T')[0]; + + const caseMismatches = brokenLinks.filter(b => b.isCaseMismatch); + const trulyBroken = brokenLinks.filter(b => !b.isCaseMismatch); + + const blocks = [ + { + type: 'header', + text: { + type: 'plain_text', + text: `📊 Daily Docs Link Check - ${date}`, + emoji: true + } + }, + { + type: 'section', + text: { + type: 'mrkdwn', + text: `*Total docs scanned:* ${totalFiles}\n*Broken links:* ${trulyBroken.length}\n*Case mismatches:* ${caseMismatches.length}` + } + } + ]; + + if (trulyBroken.length > 0) { + // Group by file for cleaner output + const grouped = {}; + for (const { file, line, link } of trulyBroken) { + if (!grouped[file]) grouped[file] = []; + grouped[file].push({ line, link }); + } + + let detailText = '*Broken links:*\n'; + const entries = Object.entries(grouped).slice(0, 10); // Limit to 10 files + + for (const [file, links] of entries) { + const shortFile = file.replace('./docs/', ''); + detailText += `*${shortFile}*\n`; + for (const { line, link } of links.slice(0, 5)) { // Limit to 5 links per file + detailText += ` • Line ${line}: \`${link}\`\n`; + } + if (links.length > 5) { + detailText += ` • ... and ${links.length - 5} more\n`; + } + } + + if (Object.keys(grouped).length > 10) { + detailText += `\n_... and ${Object.keys(grouped).length - 10} more files with broken links_`; + } + + blocks.push({ + type: 'section', + text: { + type: 'mrkdwn', + text: detailText + } + }); + } + + if (caseMismatches.length > 0 && caseMismatches.length <= 20) { + let caseText = '*Case mismatches (will fail on Linux CI):*\n'; + for (const { file, line, link, suggestion } of caseMismatches.slice(0, 10)) { + const shortFile = file.replace('./docs/', ''); + caseText += `• \`${shortFile}:${line}\`: \`${link}\` → \`${suggestion}\`\n`; + } + if (caseMismatches.length > 10) { + caseText += `_... and ${caseMismatches.length - 10} more_`; + } + + blocks.push({ + type: 'section', + text: { + type: 'mrkdwn', + text: caseText + } + }); + } + + if (brokenLinks.length === 0) { + blocks.push({ + type: 'section', + text: { + type: 'mrkdwn', + text: '✅ No broken internal links found!' + } + }); + } + + const response = await fetch(SLACK_WEBHOOK_URL, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ blocks }) + }); + + if (!response.ok) { + throw new Error(`Slack webhook failed: ${response.status}`); + } + + console.log('Report sent to Slack'); +} + +async function main() { + // Glob once and reuse + const files = glob('**/*.{md,mdx}', { + cwd: DOCS_DIR, + ignore: ['node_modules/**'] + }); + + console.log('Building valid routes...'); + const { validRoutes, canonicalByLower } = buildValidRoutes(files); + console.log(`Found ${validRoutes.size} valid routes`); + + console.log('Scanning for broken links...'); + const brokenLinks = findBrokenLinks(files, validRoutes, canonicalByLower); + + await sendToSlack(brokenLinks, files.length); +} + +main().catch(err => { + console.error('Error:', err.message); + process.exit(1); +});