#!/bin/python3
"""Link checkers for a tree of ``.mdx`` documentation files.

This module holds the two checkers that commit f697f03b ("new file:
scripts/url-interni-test.py, new file: scripts/url-test.py", Lukas Krupcik,
2025-03-03) added as separate scripts:

* ``check_internal_links`` (from ``scripts/url-interni-test.py``) verifies
  links that point at other files / sections of the documentation tree.
* ``check_links_in_mdx`` (from ``scripts/url-test.py``) verifies external
  ``http(s)`` links by requesting them.

Both functions print one line per problem and additionally return the
printed messages as a list.
"""

import os
import re
from pathlib import Path

# Markdown inline link whose target has no URI scheme (``http:``, ``mailto:``,
# ...) and is not protocol-relative (``//host``).  Only a real scheme is
# rejected, so a relative file such as ``http-notes.mdx`` is still captured.
_INTERNAL_LINK_RE = re.compile(
    r'\[.*?\]\((?![a-zA-Z][a-zA-Z0-9+.\-]*:|//)(.*?)\)'
)

# Markdown inline link whose target is an absolute http/https URL.
_EXTERNAL_LINK_RE = re.compile(r'\[.*?\]\((https?://.*?)\)')

# ATX heading: one to six '#' followed by at least one space or tab.
# Requiring the blank keeps shebangs ("#!/bin/bash") and "#tag" lines out,
# and [ \t] (instead of \s) stops the match from running onto the next line.
_HEADER_RE = re.compile(r'^(#{1,6})[ \t]+(.*)', re.MULTILINE)


def _strip_title(target):
    """Return the link destination without an optional ``"title"`` suffix."""
    pieces = target.split(None, 1)
    return pieces[0] if pieces else ""


def extract_links(content):
    """Return the targets of all internal (non-URL) links in *content*.

    A link counts as internal when its target carries no URI scheme, so
    relative paths, root-relative paths and bare ``#anchor`` links are
    returned while ``https://...`` or ``mailto:...`` are not.  An optional
    link title (``[text](path "title")``) is dropped from the result.
    """
    return [_strip_title(raw) for raw in _INTERNAL_LINK_RE.findall(content)]


def extract_headers(content):
    """Map ``#slug`` anchors to heading titles for all H1-H6 in *content*.

    The slug is the stripped title, lower-cased, with spaces replaced by
    hyphens.
    """
    # NOTE(review): punctuation is kept in the slug and headings inside
    # fenced code blocks are not excluded -- confirm against the slugger the
    # site generator actually uses.
    headers = {}
    for _, raw_title in _HEADER_RE.findall(content):
        title = raw_title.strip()
        if title:
            headers[f"#{title.lower().replace(' ', '-')}"] = title
    return headers


def check_internal_links(directory):
    """Report internal links that point at missing files or sections.

    Every ``*.mdx`` file below *directory* is scanned.  For each internal
    link the target file must exist, and when the link carries a
    ``#fragment`` and the target is one of the scanned ``.mdx`` files, that
    file must contain a heading whose slug equals the fragment.

    Args:
        directory: Root of the documentation tree (``str`` or ``Path``).

    Returns:
        The list of problem messages, in the order they were printed.
    """
    root = Path(directory)
    mdx_files = {path.relative_to(root): path for path in root.rglob("*.mdx")}

    # Read every file once and keep both what it offers (headings) and what
    # it references (links).
    file_headers = {}
    file_links = {}
    for rel_path, path in mdx_files.items():
        content = path.read_text(encoding="utf-8")
        file_headers[rel_path] = extract_headers(content)
        file_links[rel_path] = extract_links(content)

    problems = []
    # Sorted so the report does not depend on filesystem iteration order.
    for rel_path in sorted(mdx_files):
        for link in file_links[rel_path]:
            file_part, _, anchor = link.partition("#")

            if file_part.startswith("/"):
                # NOTE(review): root-relative links are taken to be relative
                # to the docs root -- confirm this matches the site routing.
                candidate = Path(file_part.lstrip("/"))
            elif file_part:
                candidate = rel_path.parent / file_part
            else:
                # Bare "#anchor" (or empty target): refers to this file.
                candidate = rel_path

            # Collapse "." / ".." lexically so the result lives in the same
            # root-relative coordinate system as the keys of mdx_files.
            target = Path(os.path.normpath(candidate))

            if target not in mdx_files:
                # Not a scanned page; accept it if it exists on disk (e.g.
                # an image).  Its sections cannot be verified.
                if not (root / target).exists():
                    message = f"❌ Broken file link in {rel_path}: {link}"
                    print(message)
                    problems.append(message)
                continue

            if anchor and f"#{anchor}" not in file_headers[target]:
                message = f"⚠️ Broken section link in {rel_path}: {link}"
                print(message)
                problems.append(message)

    return problems


def _probe_url(session, url):
    """Return the final HTTP status code for *url*.

    HEAD is tried first; servers that do not implement HEAD (405 / 501) are
    retried with a streamed GET so the body is never downloaded.
    """
    response = session.head(url, allow_redirects=True, timeout=5)
    if response.status_code in (405, 501):
        response = session.get(
            url, allow_redirects=True, timeout=5, stream=True
        )
        response.close()
    return response.status_code


def check_links_in_mdx(directory):
    """Report external http(s) links that fail or answer with status >= 400.

    Every ``*.mdx`` file below *directory* is scanned.  Each distinct URL is
    requested only once; the outcome is reused for further occurrences.

    Args:
        directory: Root of the documentation tree (``str`` or ``Path``).

    Returns:
        The list of problem messages, in the order they were printed.
    """
    # Third-party dependency, imported here so that the internal checker
    # stays usable when ``requests`` is not installed.
    import requests

    outcomes = {}  # url -> status code, or None when the request failed
    problems = []
    with requests.Session() as session:
        for mdx_file in sorted(Path(directory).rglob("*.mdx")):
            content = mdx_file.read_text(encoding="utf-8")
            for raw in _EXTERNAL_LINK_RE.findall(content):
                link = _strip_title(raw)
                if link not in outcomes:
                    try:
                        outcomes[link] = _probe_url(session, link)
                    except requests.RequestException:
                        outcomes[link] = None

                status = outcomes[link]
                if status is None:
                    message = f"Error checking {link} in {mdx_file}"
                elif status >= 400:
                    message = (
                        f"Broken link in {mdx_file}: {link} "
                        f"(Status: {status})"
                    )
                else:
                    continue
                print(message)
                problems.append(message)

    return problems


if __name__ == "__main__":
    check_internal_links("content/docs")
    check_links_in_mdx("content/docs")