/**
 * Internal documentation link checker
 *
 * Scans markdown files in docs/ and verifies:
 * - All relative links point to existing files
 * - All anchor links (#section) point to valid headings
 * - No duplicate/conflicting paths (planned; no such issues are reported yet)
 *
 * Exits with code 1 if broken links are found (fails the build).
 */

const { readFileSync, existsSync } = require('node:fs');
const { resolve, dirname, join, normalize, relative } = require('node:path');
const { glob } = require('glob');

// Root directory that is scanned: `docs/` under the current working directory.
const DOCS_DIR = resolve(process.cwd(), 'docs');

// Regexes to match markdown links. In both patterns capture group 2 is the link target.
// They carry the `g` flag, so `lastIndex` is stateful when used with `exec`.
const LINK_PATTERNS = [
  /\[([^\]]*)\]\(([^)]+)\)/g, // [text](path)
  // [text]: path — the `(?!\^)` lookahead skips footnote definitions such as `[^1]: Some note`,
  // whose first word would otherwise be checked as if it were a file path.
  /\[(?!\^)([^\]]+)\]:\s*(\S+)/g,
];

// Regex to extract ATX headings (`#` to `######`) for anchor validation; group 1 is the heading text.
const HEADING_PATTERN = /^#{1,6}\s+(.+)$/gm;

/**
 * Determines whether a link should be ignored during validation.
 *
 * Only relative targets are validated by this script; anything that names a URI scheme or an
 * absolute path is skipped.
 *
 * @param {string} link - The link URL or path to test.
 * @returns {boolean} `true` if the link starts with a URI scheme (`http:`, `https:`, `mailto:`,
 *   `tel:`, `ftp:`, `data:`, ... in any letter case) or is an absolute path starting with `/`,
 *   `false` otherwise.
 */
function shouldIgnore(link) {
  // A scheme is a letter followed by letters, digits, `+`, `.` or `-`, then a colon.
  // At least two characters are required so a Windows drive letter (`C:\...`) is not taken for one.
  const hasScheme = /^[a-z][a-z\d+.-]+:/i.test(link);

  // Absolute paths are handled by Astro routing.
  return hasScheme || link.startsWith('/');
}

/**
 * Convert a markdown heading into the anchor slug used by common Markdown processors.
 *
 * Produces a lowercase slug with emojis and punctuation removed, whitespace collapsed to single
 * hyphens, consecutive hyphens collapsed, and leading/trailing hyphens trimmed. Letters, combining
 * marks and digits of any script are kept, so non-ASCII headings keep their text
 * (`Café Setup` -> `café-setup`); for ASCII-only headings the result is the same as with `\w`.
 *
 * @param {string} heading - The heading text to convert.
 * @returns {string} The resulting anchor slug (may be empty if nothing sluggable remains).
 */
function headingToAnchor(heading) {
  return heading
    .toLowerCase()
    // Remove emojis, plus the variation selectors that follow some of them: those selectors are
    // combining marks and would otherwise survive the next step as invisible characters.
    .replaceAll(/[\u{1F300}-\u{1F9FF}\u{FE00}-\u{FE0F}]/gu, '')
    .replaceAll(/[^\p{L}\p{M}\p{N}_\s-]/gu, '') // Remove special chars except hyphens
    .replaceAll(/\s+/g, '-') // Spaces to hyphens
    .replaceAll(/-+/g, '-') // Collapse multiple hyphens
    .replaceAll(/^-+|-+$/g, ''); // Trim leading/trailing hyphens
}

/**
 * Extracts anchor slugs from Markdown content by converting headings to their anchor form.
 *
 * Every match of `HEADING_PATTERN` (heading levels 1–6) has its inline formatting markers removed
 * (code spans, bold, emphasis and inline links keep their text, only the markup is dropped) and is
 * then passed through `headingToAnchor`.
 *
 * When the same slug occurs more than once, later occurrences are registered as `slug-1`,
 * `slug-2`, ... in document order, which is how GitHub-style sluggers keep heading ids unique.
 *
 * @param {string} content - The Markdown text to scan for headings.
 * @returns {Set<string>} A set of anchor slugs derived from the headings in `content`.
 */
function extractAnchors(content) {
  const anchors = new Set();
  const occurrencesBySlug = new Map();

  // matchAll iterates over a clone of the shared /g regex, starting at its current lastIndex;
  // reset it so the scan always begins at the start of `content`.
  HEADING_PATTERN.lastIndex = 0;
  for (const [, rawHeading] of content.matchAll(HEADING_PATTERN)) {
    const cleanHeading = rawHeading
      .trim()
      .replaceAll(/`([^`]+)`/g, '$1') // inline code: keep the text, it is part of the heading id
      .replaceAll(/\*\*([^*]+)\*\*/g, '$1') // bold
      .replaceAll(/\*([^*]+)\*/g, '$1') // emphasis
      .replaceAll(/\[([^\]]+)\]\([^)]+\)/g, '$1') // inline link: keep the link text
      .trim();

    const slug = headingToAnchor(cleanHeading);
    const seenBefore = occurrencesBySlug.get(slug) ?? 0;
    occurrencesBySlug.set(slug, seenBefore + 1);
    anchors.add(seenBefore === 0 ? slug : `${slug}-${seenBefore}`);
  }

  return anchors;
}

/**
 * Remove fenced and inline code segments from Markdown content.
 *
 * Fenced blocks are detected line by line rather than by pairing fence markers with a regex:
 * - a fence opens on a line that starts (after optional indentation, `>` quote markers or a list
 *   marker) with three or more backticks or tildes;
 * - it closes only on a line made of the same fence character, at least as long as the opener,
 *   so a four-backtick fence can wrap three-backtick fences;
 * - an unterminated fence runs to the end of the content.
 *
 * @param {string} content - Markdown text to sanitize.
 * @returns {string} The input content with the lines of fenced code blocks (``` or ~~~, fence
 *   lines included) dropped and inline code (backtick-enclosed, single line) removed.
 */
function stripCodeBlocks(content) {
  const fenceLine = /^[\s>]*(?:(?:[-*+]|\d+[.)])\s+)?(`{3,}|~{3,})([\s\S]*)$/;
  const proseLines = [];
  let openFence = null; // { marker, length } while inside a fenced block, otherwise null

  for (const line of content.split('\n')) {
    const fence = fenceLine.exec(line);

    if (openFence === null) {
      // Backticks after a backtick fence mean the line is inline code
      // (e.g. "```code``` text"), not an opening fence.
      const opensFence = fence !== null && !(fence[1].startsWith('`') && fence[2].includes('`'));
      if (opensFence) {
        openFence = { marker: fence[1][0], length: fence[1].length };
      } else {
        proseLines.push(line);
      }
      continue;
    }

    const closesFence =
      fence !== null &&
      fence[1][0] === openFence.marker &&
      fence[1].length >= openFence.length &&
      fence[2].trim() === '';
    if (closesFence) {
      openFence = null;
    }
  }

  return proseLines.join('\n').replaceAll(/`[^`\n]+`/g, ''); // Also remove inline code
}

/**
 * Extracts all non-external link targets from markdown content, ignoring links inside code blocks.
 *
 * Code is removed with `stripCodeBlocks` first, then every pattern in `LINK_PATTERNS` is applied
 * and capture group 2 is taken as the written destination. The destination is reduced to the
 * actual target before filtering:
 * - `<path with spaces.md>` is unwrapped to `path with spaces.md`;
 * - otherwise only the first whitespace-delimited token is kept, which drops an optional link
 *   title such as `"Title"` in `[text](page.md "Title")`.
 *
 * @param {string} content - Markdown source to scan for link targets.
 * @returns {string[]} Array of link targets (paths and optional anchors) found in the content, one
 *   entry per occurrence; empty targets and links rejected by `shouldIgnore` are excluded.
 */
function extractLinks(content) {
  const strippedContent = stripCodeBlocks(content);
  const links = [];

  for (const pattern of LINK_PATTERNS) {
    // matchAll iterates over a clone of the /g regex, starting at its current lastIndex;
    // reset it so each scan begins at the start of the text.
    pattern.lastIndex = 0;

    for (const match of strippedContent.matchAll(pattern)) {
      const written = match[2].trim();
      const closingBracket = written.startsWith('<') ? written.indexOf('>') : -1;
      const target =
        closingBracket === -1 ? written.split(/\s+/)[0] : written.slice(1, closingBracket);

      if (target !== '' && !shouldIgnore(target)) {
        links.push(target);
      }
    }
  }

  return links;
}

/**
 * Percent-decode one component of a link, leaving it untouched when it is not valid
 * percent-encoding (e.g. a literal `%` in a file name).
 * @param {string} part - Raw path or anchor text as written in the markdown link.
 * @returns {string} The decoded text, or `part` itself if decoding fails.
 */
function decodeLinkPart(part) {
  try {
    return decodeURIComponent(part);
  } catch {
    return part;
  }
}

/**
 * Split a link into its path and anchor components.
 *
 * Both components are percent-decoded, so `my%20file.md` is looked up on disk as `my file.md`.
 *
 * @param {string} link - The link string to parse; may include a `#` followed by an anchor.
 * @returns {{path: string|null, anchor: string|null}} An object where `path` is the portion before
 *   the first `#` (or `null` when empty, indicating a same-file anchor), and `anchor` is the
 *   portion after it (or `null` when no `#` is present). Note: `anchor` may be an empty string if
 *   the link ends with `#`.
 */
function parseLink(link) {
  const hashIndex = link.indexOf('#');
  if (hashIndex === -1) {
    return { path: decodeLinkPart(link), anchor: null };
  }

  const rawPath = link.slice(0, hashIndex);
  return {
    path: rawPath === '' ? null : decodeLinkPart(rawPath), // Empty path means same file
    anchor: decodeLinkPart(link.slice(hashIndex + 1)),
  };
}

/**
 * Resolve a relative markdown link path from a source file to a concrete absolute file path.
 *
 * Resolution rules, in order:
 * 1. A null/empty `linkPath` is a same-file anchor: `fromFile` is returned.
 * 2. A resolved path ending in `.md` is returned as is (existence is checked by the caller).
 * 3. If nothing exists at the resolved path, `<path>.md` and then `<path>.mdx` are tried.
 * 4. If something exists at the resolved path and it contains `index.md` (or `index.mdx`), i.e. it
 *    is a page directory, that index file is returned so anchors can be read from it.
 * 5. Otherwise the normalized resolved path is returned unchanged.
 *
 * @param {string} fromFile - Absolute path of the file containing the link.
 * @param {string|null} linkPath - Link target as written in markdown; may be `null` or empty for same-file anchors.
 * @returns {string} The resolved absolute path, following the rules above.
 */
function resolveLink(fromFile, linkPath) {
  if (!linkPath) return fromFile; // Same file anchor

  const resolved = normalize(resolve(dirname(fromFile), linkPath));
  if (resolved.endsWith('.md')) return resolved;

  if (!existsSync(resolved)) {
    // Extensionless link to a page: try the markdown extensions this checker scans.
    const withExtension = ['.md', '.mdx']
      .map((extension) => `${resolved}${extension}`)
      .find((candidate) => existsSync(candidate));
    return withExtension ?? resolved;
  }

  // The target exists. An index file can only be found inside it when it is a directory,
  // so this lookup doubles as the directory test.
  const indexPage = ['index.md', 'index.mdx']
    .map((name) => join(resolved, name))
    .find((candidate) => existsSync(candidate));
  return indexPage ?? resolved;
}

// Cache for file anchors to avoid re-reading files (file path -> Set of anchor slugs)
const anchorCache = new Map();

/**
 * Retrieve and cache the set of markdown anchor slugs for a given file.
 *
 * Reads the file at the provided path, extracts heading-based anchor slugs with `extractAnchors`,
 * stores them in `anchorCache`, and returns them. If the path is a directory (a link such as
 * `some-dir/#section` whose directory has no index page), an empty set is cached and returned so
 * the anchor is reported as missing instead of the whole run aborting with `EISDIR`.
 *
 * @param {string} filePath - Absolute or relative path to the markdown file.
 * @returns {Set<string>} The set of anchor slugs present in the file.
 * @throws {Error} Any read error other than `EISDIR` (e.g. `ENOENT`, `EACCES`) is propagated.
 */
function getAnchorsForFile(filePath) {
  const cached = anchorCache.get(filePath);
  if (cached !== undefined) {
    return cached;
  }

  let content = null;
  try {
    content = readFileSync(filePath, 'utf-8');
  } catch (error) {
    if (error.code !== 'EISDIR') throw error;
    // A directory has no headings; fall through with no content.
  }

  const anchors = content === null ? new Set() : extractAnchors(content);
  anchorCache.set(filePath, anchors);
  return anchors;
}

/**
 * Validate Markdown files in docs/ for broken relative links and anchor targets.
 *
 * Scans all `.md` and `.mdx` files under DOCS_DIR (skipping directories whose name starts with
 * `_`), checks that relative links resolve to existing files and that any `#anchor` references
 * point to existing headings. Prints a grouped, colored report of issues to stdout and terminates
 * the process with exit code `0` if no issues were found or `1` if any broken links or anchors are
 * detected.
 *
 * Duplicate/conflicting path detection is not performed: the previous implementation collected
 * resolved targets in a registry that was never read, so that bookkeeping has been dropped.
 *
 * @returns {Promise<void>} In practice never settles normally, because every path ends in
 *   `process.exit`; rejects if globbing or reading a file fails.
 */
async function main() {
  console.log(' → Scanning for broken links and anchors...');

  const files = await glob('**/*.{md,mdx}', {
    cwd: DOCS_DIR,
    absolute: true,
    ignore: ['**/_*/**'], // Ignore underscore directories (archive, planning, etc.)
  });
  const errors = [];

  for (const file of files) {
    const links = extractLinks(readFileSync(file, 'utf-8'));
    const relativePath = relative(DOCS_DIR, file);

    for (const rawLink of links) {
      const { path: linkPath, anchor } = parseLink(rawLink);
      const targetFile = resolveLink(file, linkPath);

      // Check if file exists (skipped for same-file anchors, where linkPath is null)
      if (linkPath && !existsSync(targetFile)) {
        errors.push({
          type: 'broken-link',
          file: relativePath,
          link: rawLink,
          message: `File not found: ${linkPath}`,
        });
        continue; // No point checking an anchor inside a missing file
      }

      // Check anchor if present (a bare trailing `#` yields an empty anchor and is not checked)
      if (anchor && !getAnchorsForFile(targetFile).has(anchor)) {
        errors.push({
          type: 'broken-anchor',
          file: relativePath,
          link: rawLink,
          message: `Anchor "#${anchor}" not found in ${linkPath || 'same file'}`,
        });
      }
    }
  }

  // Report results
  if (errors.length === 0) {
    console.log(` \u001B[32m✓\u001B[0m Checked ${files.length} files - no broken links found.`);
    process.exit(0);
  }

  console.log(`\n \u001B[31m✗\u001B[0m Found ${errors.length} issue(s):\n`);

  // Group by file. A Map keeps first-seen order and is safe for arbitrary file names as keys.
  const errorsByFile = new Map();
  for (const error of errors) {
    const fileErrors = errorsByFile.get(error.file) ?? [];
    fileErrors.push(error);
    errorsByFile.set(error.file, fileErrors);
  }

  for (const [file, fileErrors] of errorsByFile) {
    console.log(` \u001B[36m${file}\u001B[0m`);
    for (const error of fileErrors) {
      const icon = error.type === 'broken-link' ? '🔗' : '⚓';
      console.log(` ${icon} ${error.link}`);
      console.log(` └─ ${error.message}`);
    }
    console.log();
  }

  process.exit(1);
}

// Entry point: run the checker; any unexpected failure (glob error, unreadable file, ...) is
// printed and fails the build with exit code 1.
main().catch((error) => {
  // Rejections are not guaranteed to be Error instances; print the value itself in that case.
  console.error('Error:', error instanceof Error ? error.message : error);
  process.exit(1);
});