#!/usr/bin/python3
"""
Script to check internal links and section references in MDX files.
"""
import re
from pathlib import Path
def extract_links(content):
    """
    Extract all internal links from the file.

    A link is any inline Markdown link ``[text](target)`` whose target does
    not begin with the ``http:`` or ``https:`` URL scheme. Relative targets
    that merely start with the letters "http" (for example
    ``http-guide.mdx``) are internal links and are returned.

    :param content: The content of the MDX file.
    :return: A list of internal link targets, in order of appearance.
    """
    # The negative lookahead rejects only real http/https URLs; a bare
    # "http" prefix would also reject local files such as "http-guide.mdx".
    link_pattern = re.compile(r'\[.*?\]\((?!https?:)(.*?)\)')
    return link_pattern.findall(content)
def extract_headers(content):
    """
    Extract all H1-H6 headers for hash reference checks.

    A header is a line made of up to three leading spaces, one to six ``#``
    characters, at least one space or tab, and the title text. Each title is
    stripped of surrounding whitespace, lower-cased, and its spaces are
    replaced by hyphens to form the anchor.

    :param content: The content of the MDX file.
    :return: A set of headers formatted as hash links (e.g. ``"#my-title"``).
    """
    # [ \t]+ (not \s*) keeps the match on a single line: with \s* a lone "#"
    # line would swallow the newline and capture the following line as a title.
    header_pattern = re.compile(r'^ {0,3}#{1,6}[ \t]+(.*)$', re.MULTILINE)
    anchors = set()
    for raw_title in header_pattern.findall(content):
        # Trailing whitespace must not turn into trailing hyphens.
        title = raw_title.strip()
        if title:
            anchors.add(f"#{title.lower().replace(' ', '-')}")
    return anchors
def check_internal_links(directory):
    """
    Check the existence of files and hash sections for internal links.

    Every ``*.mdx`` file below ``directory`` is read once. Each internal link
    is resolved relative to the file that contains it. A link whose file part
    does not point at one of the discovered MDX files is reported as a broken
    file link; a link whose ``#fragment`` is not among the target file's
    headers is reported as a broken section link. Findings are printed to
    standard output.

    :param directory: The directory containing MDX files.
    """
    root = Path(directory)

    # Headers are keyed by the *resolved* path of each MDX file so that link
    # targets, which are resolved the same way, compare equal to them.
    file_headers = {}
    pages = []
    for path in sorted(root.rglob("*.mdx")):
        content = path.read_text(encoding="utf-8")
        file_headers[path.resolve()] = extract_headers(content)
        pages.append((path, extract_links(content)))

    for path, links in pages:
        # Path shown in messages, relative to the scanned directory.
        mdx_file = path.relative_to(root)
        for link in links:
            file_part, _, fragment = link.partition("#")
            hash_part = f"#{fragment}" if fragment else None
            if file_part:
                # Relative links are relative to the linking file's folder.
                file_target = (path.parent / file_part).resolve()
            else:
                # A bare "#section" link points into the current file.
                file_target = path.resolve()

            if file_target not in file_headers:
                print(f"❌ Broken file link in {mdx_file}: {link}")
            elif hash_part and hash_part not in file_headers[file_target]:
                print(f"⚠️ Broken section link in {mdx_file}: {link}")
if __name__ == "__main__":
    # When run as a script, check the documentation tree under content/docs.
    check_internal_links(directory="content/docs")