feat: add link checker script for markdown files

- Introduced a new Python script to check the availability of URLs in markdown files within the project.
- The script supports options for fixing internal links, specifying output formats, and setting request timeouts.
- Generates detailed reports on working, broken, and skipped links, with support for text, JSON, and markdown formats.
- Updated markdown files to remove trailing punctuation from tutorial links for consistency.
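For reference, a minimal sketch of driving the new checker programmatically instead of via its CLI (assumes the script below is importable as check_links, e.g. placed on the import path; main() wraps the same calls):

# Sketch, not part of the commit: programmatic use of the LinkChecker class added below.
from check_links import LinkChecker  # hypothetical import path for .github/check_links.py

checker = LinkChecker(root_dir=".", timeout=10)
checker.collect_urls()                      # find markdown files and collect unique URLs
checker.check_all_urls(max_workers=10)      # probe each URL concurrently (HEAD, then GET on 4xx/5xx)
print(checker.generate_report("markdown"))  # "text", "json", or "markdown"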
github-actions[bot]
2025-08-24 04:10:51 +00:00
parent 64817cc722
commit 3b79879fcf
4 changed files with 355 additions and 3 deletions

.github/check_links.py (new file, 352 lines)

@@ -0,0 +1,352 @@
#!/usr/bin/env python3
"""
Link Checker for eunomia.dev Documentation
This script checks all URLs in markdown files for availability and generates a report.
It can be run locally or as part of CI/CD pipeline.
Usage:
python check_links.py [--fix-internal] [--output-format <format>] [--timeout <seconds>]
Options:
--fix-internal Attempt to fix internal eunomia.dev links
--output-format Output format: text, json, or markdown (default: text)
--timeout Request timeout in seconds (default: 10)
"""
import re
import os
import sys
import glob
import json
import argparse
import requests
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from collections import defaultdict
from datetime import datetime
# Configuration
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (eunomia.dev Link Checker) AppleWebKit/537.36'
}

# Regex patterns
URL_PATTERN = re.compile(r'https?://[^\s\)\]]+')
MARKDOWN_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')

# Known issues and replacements
KNOWN_REPLACEMENTS = {
    'https://eunomia.dev/tutorials/': 'https://eunomia.dev/tutorials/',
    'https://eunomia.dev/blogs/': 'https://eunomia.dev/blog/',
    'https://eunomia.dev/zh/tutorials/': 'https://eunomia.dev/tutorials/',
    'https://eunomia.dev/zh/blogs/': 'https://eunomia.dev/blog/',
}

# URLs to skip checking
SKIP_URLS = [
    'http://localhost',
    'http://127.0.0.1',
    'http://0.0.0.0',
    'https://chat.openai.com',  # Often returns 403
]

class LinkChecker:
    def __init__(self, root_dir, timeout=10):
        self.root_dir = root_dir
        self.timeout = timeout
        self.url_to_files = defaultdict(list)
        self.results = {
            'working': [],
            'broken': [],
            'skipped': [],
            'total': 0
        }

    def find_markdown_files(self):
        """Find all markdown files in the project"""
        md_files = []
        for pattern in ['**/*.md', '**/*.MD']:
            md_files.extend(glob.glob(os.path.join(self.root_dir, pattern), recursive=True))
        # Filter out node_modules, .git, and other irrelevant directories
        md_files = [f for f in md_files if not any(skip in f for skip in
                    ['node_modules', '.git', 'site/', 'build/', '_build/'])]
        return sorted(md_files)

    def extract_urls_from_file(self, filepath):
        """Extract all URLs from a markdown file"""
        urls = set()
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            # Find URLs in markdown links
            for match in MARKDOWN_LINK_PATTERN.finditer(content):
                url = match.group(2).strip()
                if url.startswith('http'):
                    urls.add(url)
            # Find standalone URLs
            for match in URL_PATTERN.finditer(content):
                url = match.group(0).rstrip('.,;:!?"\'>]')
                urls.add(url)
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
        return urls

    def should_skip_url(self, url):
        """Check if URL should be skipped"""
        return any(skip in url for skip in SKIP_URLS)

    def check_url(self, url):
        """Check if a URL is accessible"""
        if self.should_skip_url(url):
            return url, None, "Skipped"
        try:
            # Remove fragments for checking
            url_without_fragment = url.split('#')[0]
            # Try HEAD request first
            response = requests.head(
                url_without_fragment,
                headers=HEADERS,
                timeout=self.timeout,
                allow_redirects=True
            )
            # If HEAD fails with 4xx/5xx, try GET
            if response.status_code >= 400:
                response = requests.get(
                    url_without_fragment,
                    headers=HEADERS,
                    timeout=self.timeout,
                    allow_redirects=True
                )
            return url, response.status_code, None
        except requests.exceptions.Timeout:
            return url, None, "Timeout"
        except requests.exceptions.ConnectionError:
            return url, None, "Connection Error"
        except Exception as e:
            return url, None, str(e)
    def collect_urls(self):
        """Collect all URLs from markdown files"""
        print("Finding markdown files...")
        md_files = self.find_markdown_files()
        print(f"Found {len(md_files)} markdown files")

        print("\nExtracting URLs...")
        for md_file in md_files:
            urls = self.extract_urls_from_file(md_file)
            for url in urls:
                relative_path = os.path.relpath(md_file, self.root_dir)
                self.url_to_files[url].append(relative_path)

        self.results['total'] = len(self.url_to_files)
        print(f"Found {self.results['total']} unique URLs to check")

    def check_all_urls(self, max_workers=10):
        """Check all collected URLs concurrently"""
        print(f"\nChecking URL availability with {max_workers} workers...")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(self.check_url, url): url
                for url in self.url_to_files.keys()
            }

            checked = 0
            for future in as_completed(future_to_url):
                url, status_code, error = future.result()
                checked += 1

                if error == "Skipped":
                    self.results['skipped'].append({
                        'url': url,
                        'files': self.url_to_files[url]
                    })
                elif status_code and 200 <= status_code < 400:
                    self.results['working'].append({
                        'url': url,
                        'status': status_code,
                        'files': self.url_to_files[url]
                    })
                else:
                    self.results['broken'].append({
                        'url': url,
                        'status': status_code,
                        'error': error,
                        'files': self.url_to_files[url]
                    })

                if checked % 50 == 0:
                    print(f"Progress: {checked}/{self.results['total']} URLs checked...")
    def generate_report(self, output_format='text'):
        """Generate report in specified format"""
        if output_format == 'json':
            return self._generate_json_report()
        elif output_format == 'markdown':
            return self._generate_markdown_report()
        else:
            return self._generate_text_report()

    def _generate_text_report(self):
        """Generate plain text report"""
        report = []
        report.append("=" * 80)
        report.append("LINK CHECK REPORT")
        report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("=" * 80)

        report.append(f"\nSUMMARY:")
        report.append(f"Total URLs checked: {self.results['total']}")
        report.append(f"Working links: {len(self.results['working'])}")
        report.append(f"Broken links: {len(self.results['broken'])}")
        report.append(f"Skipped links: {len(self.results['skipped'])}")
        report.append(f"Success rate: {len(self.results['working']) / max(1, self.results['total'] - len(self.results['skipped'])) * 100:.1f}%")

        if self.results['broken']:
            report.append("\n" + "=" * 80)
            report.append("BROKEN LINKS (sorted by frequency)")
            report.append("=" * 80)

            # Sort by number of occurrences
            sorted_broken = sorted(
                self.results['broken'],
                key=lambda x: len(x['files']),
                reverse=True
            )

            for item in sorted_broken[:50]:  # Show top 50
                report.append(f"\nURL: {item['url']}")
                if item['status']:
                    report.append(f"Status: HTTP {item['status']}")
                else:
                    report.append(f"Error: {item['error']}")
                report.append(f"Found in {len(item['files'])} file(s):")
                for f in item['files'][:5]:
                    report.append(f"  - {f}")
                if len(item['files']) > 5:
                    report.append(f"  ... and {len(item['files']) - 5} more files")

        return "\n".join(report)

    def _generate_markdown_report(self):
        """Generate markdown report suitable for GitHub issues"""
        report = []
        report.append("# Link Check Report")
        report.append(f"\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("\n## Summary\n")
        report.append(f"- **Total URLs checked:** {self.results['total']}")
        report.append(f"- **Working links:** {len(self.results['working'])}")
        report.append(f"- **Broken links:** {len(self.results['broken'])}")
        report.append(f"- **Skipped links:** {len(self.results['skipped'])} ⏭️")
        report.append(f"- **Success rate:** {len(self.results['working']) / max(1, self.results['total'] - len(self.results['skipped'])) * 100:.1f}%")

        if self.results['broken']:
            report.append("\n## Broken Links\n")
            report.append("| URL | Status/Error | Files | Count |")
            report.append("|-----|--------------|-------|-------|")

            sorted_broken = sorted(
                self.results['broken'],
                key=lambda x: len(x['files']),
                reverse=True
            )

            for item in sorted_broken[:30]:
                status = f"HTTP {item['status']}" if item['status'] else item['error']
                files = f"{item['files'][0]}" if len(item['files']) == 1 else f"{item['files'][0]} (+{len(item['files'])-1} more)"
                report.append(f"| {item['url']} | {status} | {files} | {len(item['files'])} |")

        return "\n".join(report)

    def _generate_json_report(self):
        """Generate JSON report for programmatic use"""
        return json.dumps({
            'metadata': {
                'generated': datetime.now().isoformat(),
                'root_dir': self.root_dir,
                'total_urls': self.results['total']
            },
            'summary': {
                'working': len(self.results['working']),
                'broken': len(self.results['broken']),
                'skipped': len(self.results['skipped']),
                'success_rate': len(self.results['working']) / max(1, self.results['total'] - len(self.results['skipped']))
            },
            'results': self.results
        }, indent=2)
    def fix_internal_links(self):
        """Attempt to fix known internal link issues"""
        fixed_count = 0
        for broken in self.results['broken']:
            url = broken['url']
            if 'eunomia.dev' in url:
                for old_pattern, new_pattern in KNOWN_REPLACEMENTS.items():
                    if old_pattern in url:
                        new_url = url.replace(old_pattern, new_pattern)
                        print(f"Would fix: {url} -> {new_url}")
                        fixed_count += 1
                        # TODO: Actually update the files
        print(f"\nIdentified {fixed_count} internal links that could be fixed")

def main():
    parser = argparse.ArgumentParser(description='Check links in markdown files')
    parser.add_argument('--root-dir', default='.',
                        help='Root directory to search for markdown files')
    parser.add_argument('--fix-internal', action='store_true',
                        help='Attempt to fix internal eunomia.dev links')
    parser.add_argument('--output-format', choices=['text', 'json', 'markdown'],
                        default='text', help='Output format for the report')
    parser.add_argument('--timeout', type=int, default=10,
                        help='Request timeout in seconds')
    parser.add_argument('--max-workers', type=int, default=10,
                        help='Maximum number of concurrent requests')
    parser.add_argument('--output-file', help='Save report to file')
    args = parser.parse_args()

    # Create checker instance
    checker = LinkChecker(args.root_dir, args.timeout)

    # Collect and check URLs
    checker.collect_urls()
    checker.check_all_urls(args.max_workers)

    # Generate report
    report = checker.generate_report(args.output_format)

    # Output report
    if args.output_file:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"\nReport saved to: {args.output_file}")
    else:
        print("\n" + report)

    # Fix internal links if requested
    if args.fix_internal:
        checker.fix_internal_links()

    # Exit with error code if broken links found
    if checker.results['broken']:
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()