diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8b79f0f..610e40a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ on: jobs: build-en: - name: Build (English) + name: Build (English mdBook) runs-on: ubuntu-22.04 steps: @@ -17,30 +17,28 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.10' - cache: 'pip' - - name: Install pandoc + - name: Install Rust toolchain run: | - wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb - sudo dpkg -i pandoc-2.19.2-1-amd64.deb + curl -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal + echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - - name: Install d2lbook + - name: Install mdBook + run: cargo install mdbook --locked + + - name: Run mdBook regression tests run: | - git clone https://github.com/openmlsys/d2l-book.git - cd d2l-book - # Fix Python 3.10+ incompatibility: bibtex<2.0.0 depends on oset which - # uses collections.MutableSet removed in Python 3.10. - sed -i "s/'sphinxcontrib-bibtex<2.0.0'/'sphinxcontrib-bibtex>=2.5.0'/" setup.py - python3 -m pip install . + python3 -m unittest discover -s tests -p 'test_prepare_mdbook.py' + python3 -m unittest discover -s tests -p 'test_prepare_mdbook_zh.py' + python3 -m unittest discover -s tests -p 'test_assemble_docs_publish_tree.py' + python3 -m unittest discover -s tests -p 'test_ensure_book_resources.py' + python3 -m unittest discover -s tests -p 'test_mdbook_mathjax.py' - - name: Install Python dependencies - run: python3 -m pip install -r requirements.txt - - - name: Build English HTML - run: bash build_html.sh + - name: Build English HTML with mdBook + run: bash build_mdbook.sh build-zh: - name: Build (Chinese) + name: Build (Chinese mdBook) runs-on: ubuntu-22.04 steps: @@ -50,25 +48,17 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.10' - cache: 'pip' - - - name: Install pandoc + + - name: Install Rust toolchain run: | - wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb - sudo dpkg -i pandoc-2.19.2-1-amd64.deb + curl -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal + echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - - name: Install d2lbook - run: | - git clone https://github.com/openmlsys/d2l-book.git - cd d2l-book - sed -i "s/'sphinxcontrib-bibtex<2.0.0'/'sphinxcontrib-bibtex>=2.5.0'/" setup.py - python3 -m pip install . + - name: Install mdBook + run: cargo install mdbook --locked - - name: Install Python dependencies - run: python3 -m pip install -r requirements.txt - - - name: Build Chinese HTML - run: bash build_html_zh.sh + - name: Build Chinese HTML with mdBook + run: bash build_mdbook_zh.sh build: name: build diff --git a/.github/workflows/update_docs.yml b/.github/workflows/update_docs.yml index ac9694a..7f19a6b 100644 --- a/.github/workflows/update_docs.yml +++ b/.github/workflows/update_docs.yml @@ -16,30 +16,28 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.10' - cache: 'pip' - - name: Install pandoc + - name: Install Rust toolchain run: | - wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb - sudo dpkg -i pandoc-2.19.2-1-amd64.deb + curl -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal + echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - - name: Install d2lbook + - name: Install mdBook + run: cargo install mdbook --locked + + - name: Run mdBook regression tests run: | - git clone https://github.com/openmlsys/d2l-book.git - cd d2l-book - # Fix Python 3.10+ incompatibility: bibtex<2.0.0 depends on oset which - # uses collections.MutableSet removed in Python 3.10. - sed -i "s/'sphinxcontrib-bibtex<2.0.0'/'sphinxcontrib-bibtex>=2.5.0'/" setup.py - python3 -m pip install . + python3 -m unittest discover -s tests -p 'test_prepare_mdbook.py' + python3 -m unittest discover -s tests -p 'test_prepare_mdbook_zh.py' + python3 -m unittest discover -s tests -p 'test_assemble_docs_publish_tree.py' + python3 -m unittest discover -s tests -p 'test_ensure_book_resources.py' + python3 -m unittest discover -s tests -p 'test_mdbook_mathjax.py' - - name: Install Python dependencies - run: python3 -m pip install -r requirements.txt sphinx-mathjax-offline + - name: Build English HTML with mdBook + run: bash build_mdbook.sh - - name: Build English HTML - run: bash build_html.sh - - - name: Build Chinese HTML - run: bash build_html_zh.sh + - name: Build Chinese HTML with mdBook + run: bash build_mdbook_zh.sh - name: Deploy to GitHub Pages env: @@ -47,12 +45,11 @@ jobs: run: | git clone https://x-access-token:${DEPLOY_TOKEN}@github.com/openmlsys/openmlsys.github.io.git - # English → root (default language) - cp -r en_chapters/_build/html/* openmlsys.github.io/docs/ - - # Chinese → /cn/ subdirectory - mkdir -p openmlsys.github.io/docs/cn - cp -r zh_chapters/_build/html/* openmlsys.github.io/docs/cn/ + python3 tools/assemble_docs_publish_tree.py \ + --destination-root openmlsys.github.io \ + --docs-subdir docs \ + --en-source .mdbook/book \ + --zh-source .mdbook-zh/book cd openmlsys.github.io git config user.name "github-actions[bot]" diff --git a/.gitignore b/.gitignore index fa65c61..2f39fab 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ test*.md run.sh .idea env +.mdbook/ .mdbook-zh/ .mdbook-zh-test/ task_plan.md @@ -19,7 +20,6 @@ findings.md progress.md d2l-book/ docs/ -tests/ en_chapters/img en_chapters/references en_chapters/static diff --git a/book.toml b/book.toml index a9fa90f..6a5ddc9 100644 --- a/book.toml +++ b/book.toml @@ -1,15 +1,15 @@ [book] authors = ["OpenMLSys Contributors"] -language = "zh-CN" -src = "zh_chapters" -title = "机器学习系统:设计和实现" +language = "en" +src = "en_chapters" +title = "Machine Learning Systems: Design and Implementation" [build] -build-dir = ".mdbook-zh/book" +build-dir = ".mdbook/book" create-missing = false -[preprocessor.openmlsys-zh] -command = "python3 tools/mdbook_zh_preprocessor.py" +[preprocessor.openmlsys] +command = "python3 tools/mdbook_preprocessor.py" [output.html] git-repository-url = "https://github.com/openmlsys/openmlsys-zh" diff --git a/books/zh/book.toml b/books/zh/book.toml new file mode 100644 index 0000000..532f20a --- /dev/null +++ b/books/zh/book.toml @@ -0,0 +1,18 @@ +[book] +authors = ["OpenMLSys Contributors"] +language = "zh-CN" +src = "../../zh_chapters" +title = "机器学习系统:设计和实现" + +[build] +build-dir = "../../.mdbook-zh/book" +create-missing = false + +[preprocessor.openmlsys-zh] +command = "python3 ../../tools/mdbook_zh_preprocessor.py" + +[output.html] +git-repository-url = "https://github.com/openmlsys/openmlsys-zh" +mathjax-support = true +preferred-dark-theme = "navy" +additional-css = ["theme/dark-mode-images.css"] diff --git a/books/zh/theme/dark-mode-images.css b/books/zh/theme/dark-mode-images.css new file mode 100644 index 0000000..d0b209a --- /dev/null +++ b/books/zh/theme/dark-mode-images.css @@ -0,0 +1,6 @@ +html.light img[src$=".png"], +html.light img[src$=".jpg"], +html.light img[src$=".jpeg"], +html.light img[src$=".gif"] { + background-color: #fff; +} diff --git a/books/zh/theme/head.hbs b/books/zh/theme/head.hbs new file mode 100644 index 0000000..793a459 --- /dev/null +++ b/books/zh/theme/head.hbs @@ -0,0 +1,10 @@ + diff --git a/build_html.sh b/build_html.sh index 6e47429..904791c 100644 --- a/build_html.sh +++ b/build_html.sh @@ -10,15 +10,7 @@ set -e ROOT="$(cd "$(dirname "$0")" && pwd)" # ── Create resource symlinks ────────────────────────────────────────────────── -for target in img references static mlsys.bib; do - link="$ROOT/en_chapters/$target" - rel_target="../$target" - if [ -e "$link" ] && [ ! -L "$link" ]; then - echo "Refusing to replace non-symlink path: $link" >&2 - exit 1 - fi - ln -sfn "$rel_target" "$link" -done +python3 "$ROOT/tools/ensure_book_resources.py" --chapter-dir "$ROOT/en_chapters" # ── Build ───────────────────────────────────────────────────────────────────── cd "$ROOT/en_chapters" diff --git a/build_html_zh.sh b/build_html_zh.sh index 2949005..ccefd76 100755 --- a/build_html_zh.sh +++ b/build_html_zh.sh @@ -10,15 +10,7 @@ set -e ROOT="$(cd "$(dirname "$0")" && pwd)" # ── Create resource symlinks ────────────────────────────────────────────────── -for target in img references static mlsys.bib; do - link="$ROOT/zh_chapters/$target" - rel_target="../$target" - if [ -e "$link" ] && [ ! -L "$link" ]; then - echo "Refusing to replace non-symlink path: $link" >&2 - exit 1 - fi - ln -sfn "$rel_target" "$link" -done +python3 "$ROOT/tools/ensure_book_resources.py" --chapter-dir "$ROOT/zh_chapters" # ── Build ───────────────────────────────────────────────────────────────────── cd "$ROOT/zh_chapters" diff --git a/build_mdbook.sh b/build_mdbook.sh new file mode 100644 index 0000000..f814264 --- /dev/null +++ b/build_mdbook.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PYTHON_BIN="$(command -v python3 || command -v python || true)" + +if [[ -z "${PYTHON_BIN}" ]]; then + echo "Python is required to prepare the mdBook staging tree." >&2 + exit 1 +fi + +if ! command -v mdbook >/dev/null 2>&1; then + echo "mdbook is not installed. Install it first, for example with: cargo install mdbook" >&2 + exit 1 +fi + +"${PYTHON_BIN}" "${ROOT}/tools/ensure_book_resources.py" --chapter-dir "${ROOT}/en_chapters" +"${PYTHON_BIN}" "${ROOT}/tools/prepare_mdbook.py" \ + --source "${ROOT}/en_chapters" \ + --summary-output "${ROOT}/en_chapters/SUMMARY.md" \ + --placeholder-prefix "[TODO: src = zh_chapters/" + +mdbook build "${ROOT}" diff --git a/build_mdbook_zh.sh b/build_mdbook_zh.sh index 6028915..fff91c4 100755 --- a/build_mdbook_zh.sh +++ b/build_mdbook_zh.sh @@ -14,22 +14,12 @@ if ! command -v mdbook >/dev/null 2>&1; then exit 1 fi -# ── Create resource symlinks ────────────────────────────────────────────────── -# Resources (img/, references/, static/, mlsys.bib) live at the repo root and -# are symlinked into zh_chapters/ so mdbook can find them at relative paths. -for target in img references static mlsys.bib; do - link="${ROOT}/zh_chapters/${target}" - rel_target="../${target}" - if [[ -e "${link}" ]] && [[ ! -L "${link}" ]]; then - echo "Refusing to replace non-symlink path: ${link}" >&2 - exit 1 - fi - ln -sfn "${rel_target}" "${link}" -done +# ── Create resource links ───────────────────────────────────────────────────── +"${PYTHON_BIN}" "${ROOT}/tools/ensure_book_resources.py" --chapter-dir "${ROOT}/zh_chapters" # ── Build ───────────────────────────────────────────────────────────────────── "${PYTHON_BIN}" "${ROOT}/tools/prepare_mdbook_zh.py" \ --source "${ROOT}/zh_chapters" \ --summary-output "${ROOT}/zh_chapters/SUMMARY.md" -mdbook build "${ROOT}" +mdbook build "${ROOT}/books/zh" diff --git a/en_chapters/SUMMARY.md b/en_chapters/SUMMARY.md new file mode 100644 index 0000000..9436c5d --- /dev/null +++ b/en_chapters/SUMMARY.md @@ -0,0 +1,3 @@ +# Summary + +[Machine Learning Systems: Design and Implementation](index.md) diff --git a/tests/test_assemble_docs_publish_tree.py b/tests/test_assemble_docs_publish_tree.py new file mode 100644 index 0000000..04e9bf6 --- /dev/null +++ b/tests/test_assemble_docs_publish_tree.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path + +from tools.assemble_docs_publish_tree import assemble_publish_tree + + +class AssembleDocsPublishTreeTests(unittest.TestCase): + def test_assemble_publish_tree_uses_legacy_docs_layout(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + pages_repo = root / "pages" + en_source = root / "en-html" + zh_source = root / "zh-html" + + pages_repo.mkdir() + en_source.mkdir() + zh_source.mkdir() + + (en_source / "index.html").write_text("english home", encoding="utf-8") + (en_source / "guide.html").write_text("english guide", encoding="utf-8") + (zh_source / "index.html").write_text("chinese home", encoding="utf-8") + (zh_source / "searchindex.js").write_text("zh search", encoding="utf-8") + + assemble_publish_tree( + destination_root=pages_repo, + docs_subdir="docs", + en_source=en_source, + zh_source=zh_source, + ) + + self.assertEqual( + (pages_repo / "docs" / "index.html").read_text(encoding="utf-8"), + "english home", + ) + self.assertEqual( + (pages_repo / "docs" / "guide.html").read_text(encoding="utf-8"), + "english guide", + ) + self.assertEqual( + (pages_repo / "docs" / "cn" / "index.html").read_text(encoding="utf-8"), + "chinese home", + ) + self.assertEqual( + (pages_repo / "docs" / "cn" / "searchindex.js").read_text(encoding="utf-8"), + "zh search", + ) + + def test_assemble_publish_tree_replaces_stale_docs_content(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + pages_repo = root / "pages" + en_source = root / "en-html" + zh_source = root / "zh-html" + + (pages_repo / "docs" / "cn").mkdir(parents=True) + (pages_repo / "docs" / "old.html").write_text("stale en", encoding="utf-8") + (pages_repo / "docs" / "cn" / "old.html").write_text("stale zh", encoding="utf-8") + + en_source.mkdir() + zh_source.mkdir() + (en_source / "index.html").write_text("fresh en", encoding="utf-8") + (zh_source / "index.html").write_text("fresh zh", encoding="utf-8") + + assemble_publish_tree( + destination_root=pages_repo, + docs_subdir="docs", + en_source=en_source, + zh_source=zh_source, + ) + + self.assertFalse((pages_repo / "docs" / "old.html").exists()) + self.assertFalse((pages_repo / "docs" / "cn" / "old.html").exists()) + self.assertEqual( + (pages_repo / "docs" / "index.html").read_text(encoding="utf-8"), + "fresh en", + ) + self.assertEqual( + (pages_repo / "docs" / "cn" / "index.html").read_text(encoding="utf-8"), + "fresh zh", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ensure_book_resources.py b/tests/test_ensure_book_resources.py new file mode 100644 index 0000000..98304e3 --- /dev/null +++ b/tests/test_ensure_book_resources.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path + +from tools.ensure_book_resources import ensure_resource_views + + +class EnsureBookResourcesTests(unittest.TestCase): + def test_ensure_resource_views_creates_missing_symlinks(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + chapter_dir = root / "en_chapters" + chapter_dir.mkdir() + + for directory in ("img", "references", "static"): + (root / directory).mkdir() + (root / "mlsys.bib").write_text("bib", encoding="utf-8") + + ensure_resource_views(chapter_dir, root) + + for name in ("img", "references", "static", "mlsys.bib"): + path = chapter_dir / name + self.assertTrue(path.is_symlink(), f"{name} should be a symlink") + self.assertEqual(path.resolve(), (root / name).resolve()) + + def test_ensure_resource_views_keeps_existing_non_symlink_paths(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + chapter_dir = root / "en_chapters" + chapter_dir.mkdir() + + for directory in ("img", "references", "static"): + (root / directory).mkdir() + (root / "mlsys.bib").write_text("root bib", encoding="utf-8") + + local_bib = chapter_dir / "mlsys.bib" + local_bib.write_text("local bib", encoding="utf-8") + local_static = chapter_dir / "static" + local_static.mkdir() + (local_static / "frontpage.html").write_text("local static", encoding="utf-8") + + ensure_resource_views(chapter_dir, root) + + self.assertFalse(local_bib.is_symlink()) + self.assertEqual(local_bib.read_text(encoding="utf-8"), "local bib") + self.assertFalse(local_static.is_symlink()) + self.assertTrue((local_static / "frontpage.html").exists()) + self.assertTrue((chapter_dir / "img").is_symlink()) + self.assertTrue((chapter_dir / "references").is_symlink()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_prepare_mdbook.py b/tests/test_prepare_mdbook.py new file mode 100644 index 0000000..bfce92e --- /dev/null +++ b/tests/test_prepare_mdbook.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path + +from tools.prepare_mdbook import build_title_cache, rewrite_markdown, write_summary + + +class PrepareMdBookTests(unittest.TestCase): + def test_write_summary_skips_placeholder_pages(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + source = root / "en_chapters" + source.mkdir() + + (source / "index.md").write_text( + """Machine Learning Systems +======================== + +```toc +:maxdepth: 2 + +chapter_preface/index +chapter_introduction/index +``` + +```toc +:maxdepth: 1 + +appendix/index +``` +""", + encoding="utf-8", + ) + + chapter_preface = source / "chapter_preface" + chapter_preface.mkdir() + (chapter_preface / "index.md").write_text( + "[TODO: src = zh_chapters/chapter_preface/index.md]\n", + encoding="utf-8", + ) + + chapter_intro = source / "chapter_introduction" + chapter_intro.mkdir() + (chapter_intro / "index.md").write_text("# Introduction\n", encoding="utf-8") + + appendix = source / "appendix" + appendix.mkdir() + (appendix / "index.md").write_text("# Appendix\n", encoding="utf-8") + + summary_path = write_summary( + source, + placeholder_prefix="[TODO: src = zh_chapters/", + ) + summary = summary_path.read_text(encoding="utf-8") + + self.assertEqual( + summary, + """# Summary + +[Machine Learning Systems](index.md) +[Introduction](chapter_introduction/index.md) +[Appendix](appendix/index.md) +""", + ) + + title_cache = build_title_cache( + source, + placeholder_prefix="[TODO: src = zh_chapters/", + ) + rewritten = rewrite_markdown( + (source / "index.md").read_text(encoding="utf-8"), + (source / "index.md").resolve(), + title_cache, + ) + + self.assertIn("- [Introduction](chapter_introduction/index.md)", rewritten) + self.assertIn("- [Appendix](appendix/index.md)", rewritten) + self.assertNotIn("chapter_preface/index.md", rewritten) + + def test_rewrite_markdown_uses_configured_bibliography_title(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + page = root / "chapter.md" + page.write_text( + """# Introduction + +Reference :cite:`smith2024`. +""", + encoding="utf-8", + ) + + rewritten = rewrite_markdown( + page.read_text(encoding="utf-8"), + page.resolve(), + {page.resolve(): "Introduction"}, + bib_db={ + "smith2024": { + "author": "Smith, Alice and Doe, Bob", + "title": "Systems Paper", + "year": "2024", + "journal": "ML Systems Journal", + } + }, + bibliography_title="References", + ) + + self.assertIn("## References", rewritten) + self.assertNotIn("## 参考文献", rewritten) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_prepare_mdbook_zh.py b/tests/test_prepare_mdbook_zh.py new file mode 100644 index 0000000..7182438 --- /dev/null +++ b/tests/test_prepare_mdbook_zh.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path + +from tools.prepare_mdbook_zh import extract_title, rewrite_markdown, write_summary + + +class PrepareMdBookZhTests(unittest.TestCase): + def test_extract_title_supports_atx_and_setext_headings(self) -> None: + self.assertEqual(extract_title("# 导论\n"), "导论") + self.assertEqual(extract_title("前言文字\n\n## 机器学习应用\n"), "机器学习应用") + self.assertEqual(extract_title("机器学习系统:设计和实现\n=========================\n"), "机器学习系统:设计和实现") + + def test_write_summary_generates_nested_navigation(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + source = root / "zh_chapters" + source.mkdir() + + (source / "index.md").write_text( + """机器学习系统:设计和实现 +========================= + +```eval_rst +.. raw:: html + :file: frontpage.html +``` + +```toc +:maxdepth: 2 + +[前言](chapter_preface/index) + +# 基础篇 +chapter_introduction/index + +# 附录 +[机器学习基础附录](appendix_machine_learning_introduction/index) +``` +""", + encoding="utf-8", + ) + + chapter_preface = source / "chapter_preface" + chapter_preface.mkdir() + (chapter_preface / "index.md").write_text("# 前言\n", encoding="utf-8") + static_dir = source / "static" + static_dir.mkdir() + (static_dir / "frontpage.html").write_text( + "
frontpage
\n", + encoding="utf-8", + ) + + chapter_intro = source / "chapter_introduction" + chapter_intro.mkdir() + (chapter_intro / "index.md").write_text( + """# 导论 + +```toc +:maxdepth: 2 + +applications +design +``` +""", + encoding="utf-8", + ) + (chapter_intro / "applications.md").write_text("# 机器学习应用\n", encoding="utf-8") + (chapter_intro / "design.md").write_text("# 设计目标\n", encoding="utf-8") + + appendix = source / "appendix_machine_learning_introduction" + appendix.mkdir() + (appendix / "index.md").write_text("# 机器学习基础附录\n", encoding="utf-8") + + for name in ("img", "static", "references"): + (root / name).mkdir() + (root / "mlsys.bib").write_text("% bibliography\n", encoding="utf-8") + + summary_path = write_summary(source) + summary = summary_path.read_text(encoding="utf-8") + self.assertEqual( + summary, + """# Summary + +[机器学习系统:设计和实现](index.md) +[前言](chapter_preface/index.md) + +# 基础篇 + +- [导论](chapter_introduction/index.md) + - [机器学习应用](chapter_introduction/applications.md) + - [设计目标](chapter_introduction/design.md) + +# 附录 + +- [机器学习基础附录](appendix_machine_learning_introduction/index.md) +""", + ) + + title_cache = { + (source / "chapter_preface" / "index.md").resolve(): "前言", + (source / "chapter_introduction" / "index.md").resolve(): "导论", + (source / "chapter_introduction" / "applications.md").resolve(): "机器学习应用", + (source / "chapter_introduction" / "design.md").resolve(): "设计目标", + (source / "appendix_machine_learning_introduction" / "index.md").resolve(): "机器学习基础附录", + } + root_index = rewrite_markdown((source / "index.md").read_text(encoding="utf-8"), (source / "index.md").resolve(), title_cache) + self.assertNotIn("```eval_rst", root_index) + self.assertNotIn("```toc", root_index) + self.assertIn("- [前言](chapter_preface/index.md)", root_index) + self.assertIn("- 基础篇", root_index) + self.assertIn(" - [导论](chapter_introduction/index.md)", root_index) + self.assertIn("- 附录", root_index) + self.assertIn(" - [机器学习基础附录](appendix_machine_learning_introduction/index.md)", root_index) + + intro_index = rewrite_markdown( + (source / "chapter_introduction" / "index.md").read_text(encoding="utf-8"), + (source / "chapter_introduction" / "index.md").resolve(), + title_cache, + ) + self.assertNotIn("```toc", intro_index) + self.assertIn("- [机器学习应用](applications.md)", intro_index) + self.assertIn("- [设计目标](design.md)", intro_index) + + def test_write_summary_raises_for_missing_toc_entries(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + source = root / "zh_chapters" + source.mkdir() + + (source / "index.md").write_text( + """# 首页 + +```toc +:maxdepth: 2 + +existing +missing +``` +""", + encoding="utf-8", + ) + (source / "existing.md").write_text("# 现有章节\n", encoding="utf-8") + + with self.assertRaises(FileNotFoundError): + write_summary(source) + + def test_rewrite_markdown_normalizes_common_d2l_directives(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + source = root / "zh_chapters" + source.mkdir() + + page = source / "chapter.md" + page.write_text( + """# 标题 + +![配图](../img/example.png) +:width:`800px` +:label:`fig_example` + +参见 :numref:`fig_example` 和公式 :eqref:`eq_example`,引用 :cite:`foo2024`。 +""", + encoding="utf-8", + ) + + rewritten = rewrite_markdown(page.read_text(encoding="utf-8"), page.resolve(), {page.resolve(): "标题"}) + self.assertNotIn(":width:", rewritten) + self.assertNotIn(":label:", rewritten) + self.assertNotIn(":numref:", rewritten) + self.assertNotIn(":eqref:", rewritten) + self.assertNotIn(":cite:", rewritten) + self.assertIn("`fig_example`", rewritten) + self.assertIn("`eq_example`", rewritten) + self.assertIn("[foo2024]", rewritten) + + def test_rewrite_markdown_inlines_frontpage_html_include(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + source = root / "zh_chapters" + static_dir = source / "static" + static_dir.mkdir(parents=True) + + index = source / "index.md" + index.write_text( + """# 首页 + +```eval_rst +.. raw:: html + :file: frontpage.html +``` +""", + encoding="utf-8", + ) + (static_dir / "frontpage.html").write_text( + """ + + +
+ + +
+ +""", + encoding="utf-8", + ) + + rewritten = rewrite_markdown(index.read_text(encoding="utf-8"), index.resolve(), {index.resolve(): "首页"}) + self.assertNotIn("```eval_rst", rewritten) + self.assertNotIn("", rewritten) + self.assertIn('class="openmlsys-frontpage"', rewritten) + self.assertIn('
', rewritten) + self.assertIn('", re.IGNORECASE | re.DOTALL) +DEFAULT_BIBLIOGRAPHY_TITLE = "References" +FRONTPAGE_LAYOUT_CSS = """ + +""".strip() + + +@dataclass(frozen=True) +class TocItem: + kind: str + label: str + target: str | None = None + + +def is_placeholder_markdown(markdown: str, placeholder_prefix: str | None = None) -> bool: + if not placeholder_prefix: + return False + + stripped = markdown.strip() + return stripped.startswith(placeholder_prefix) and stripped.endswith("]") + + +def extract_title(markdown: str, fallback: str = "Untitled") -> str: + lines = markdown.splitlines() + + for index, line in enumerate(lines): + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("#"): + heading = stripped.lstrip("#").strip() + if heading: + return heading + + next_index = index + 1 + if next_index < len(lines): + underline = lines[next_index].strip() + if underline and set(underline) <= {"=", "-"}: + return stripped + + return fallback + + +def parse_toc_entries(block_lines: list[str]) -> list[TocItem]: + entries: list[TocItem] = [] + for line in block_lines: + stripped = line.strip() + if not stripped or stripped.startswith(":"): + continue + part_match = TOC_PART_RE.match(stripped) + if part_match: + entries.append(TocItem(kind="part", label=part_match.group(1).strip())) + continue + link_match = TOC_LINK_RE.match(stripped) + if link_match: + entries.append( + TocItem( + kind="chapter", + label=link_match.group(1).strip(), + target=link_match.group(2).strip(), + ) + ) + continue + entries.append(TocItem(kind="chapter", label="", target=stripped)) + return entries + + +def parse_toc_blocks(markdown: str) -> list[list[TocItem]]: + blocks: list[list[TocItem]] = [] + lines = markdown.splitlines() + index = 0 + + while index < len(lines): + if lines[index].strip() == f"```{TOC_FENCE}": + index += 1 + block_lines: list[str] = [] + while index < len(lines) and lines[index].strip() != "```": + block_lines.append(lines[index]) + index += 1 + entries = parse_toc_entries(block_lines) + blocks.append(entries) + index += 1 + + return blocks + + +def resolve_toc_target(current_file: Path, entry: str) -> Path: + target_name = entry if entry.endswith(".md") else f"{entry}.md" + target = (current_file.parent / target_name).resolve() + if not target.exists(): + raise FileNotFoundError(f"TOC entry '{entry}' from '{current_file}' does not exist") + return target + + +def relative_link(from_file: Path, target_file: Path) -> str: + return Path(os.path.relpath(target_file, start=from_file.parent)).as_posix() + + +def _strip_latex_escapes_outside_math(line: str) -> str: + parts = line.split("$") + for i in range(0, len(parts), 2): + parts[i] = LATEX_ESCAPE_RE.sub(r"\1", parts[i]) + return "$".join(parts) + + +def normalize_directives(markdown: str) -> str: + normalized = OPTION_LINE_RE.sub("", markdown) + normalized = NUMREF_RE.sub(lambda match: f"`{match.group(1)}`", normalized) + normalized = EQREF_RE.sub(lambda match: f"`{match.group(1)}`", normalized) + + lines = [_strip_latex_escapes_outside_math(line.rstrip()) for line in normalized.splitlines()] + collapsed: list[str] = [] + previous_blank = False + for line in lines: + is_blank = line == "" + if is_blank and previous_blank: + continue + collapsed.append(line) + previous_blank = is_blank + + while collapsed and collapsed[-1] == "": + collapsed.pop() + + return "\n".join(collapsed) + "\n" + + +def clean_bibtex(value: str) -> str: + value = re.sub(r"\{\\[`'^\"~=.](\w)\}", r"\1", value) + value = re.sub(r"\\[`'^\"~=.](\w)", r"\1", value) + value = value.replace("{", "").replace("}", "") + return value.strip() + + +def _parse_bib_fields(body: str) -> dict[str, str]: + fields: dict[str, str] = {} + i = 0 + while i < len(body): + while i < len(body) and body[i] in " \t\n\r,": + i += 1 + if i >= len(body): + break + start = i + while i < len(body) and body[i] not in "= \t\n\r": + i += 1 + name = body[start:i].strip().lower() + while i < len(body) and body[i] != "=": + i += 1 + if i >= len(body): + break + i += 1 + while i < len(body) and body[i] in " \t\n\r": + i += 1 + if i >= len(body): + break + if body[i] == "{": + depth = 1 + i += 1 + vstart = i + while i < len(body) and depth > 0: + if body[i] == "{": + depth += 1 + elif body[i] == "}": + depth -= 1 + i += 1 + value = body[vstart : i - 1] + elif body[i] == '"': + i += 1 + vstart = i + while i < len(body) and body[i] != '"': + i += 1 + value = body[vstart:i] + i += 1 + else: + vstart = i + while i < len(body) and body[i] not in ", \t\n\r}": + i += 1 + value = body[vstart:i] + if name: + fields[name] = value.strip() + return fields + + +def parse_bib(bib_path: Path) -> dict[str, dict[str, str]]: + text = bib_path.read_text(encoding="utf-8") + entries: dict[str, dict[str, str]] = {} + for match in BIB_ENTRY_RE.finditer(text): + key = match.group(2).strip() + start = match.end() + depth = 1 + pos = start + while pos < len(text) and depth > 0: + if text[pos] == "{": + depth += 1 + elif text[pos] == "}": + depth -= 1 + pos += 1 + fields = _parse_bib_fields(text[start : pos - 1]) + fields["_type"] = match.group(1).lower() + entries[key] = fields + return entries + + +def _render_bibliography( + cited_keys: list[str], + bib_db: dict[str, dict[str, str]], + bibliography_title: str, +) -> list[str]: + lines: list[str] = ["---", "", f"## {bibliography_title}", "", "
    "] + for key in cited_keys: + entry = bib_db.get(key) + if not entry: + lines.append(f'
  1. {key}.
  2. ') + continue + author = clean_bibtex(entry.get("author", "")) + title = clean_bibtex(entry.get("title", "")) + year = entry.get("year", "") + venue = clean_bibtex(entry.get("journal", "") or entry.get("booktitle", "")) + parts: list[str] = [] + if author: + parts.append(author) + if title: + parts.append(f"{title}") + if venue: + parts.append(venue) + if year: + parts.append(year) + text = ". ".join(parts) + "." if parts else f"{key}." + lines.append(f'
  3. {text}
  4. ') + lines.append("
") + return lines + + +def process_citations( + markdown: str, + bib_db: dict[str, dict[str, str]], + bibliography_title: str = DEFAULT_BIBLIOGRAPHY_TITLE, +) -> str: + cited_keys: list[str] = [] + + def _replace_cite(match: re.Match[str]) -> str: + keys = [k.strip() for k in match.group(1).split(",")] + for key in keys: + if key not in cited_keys: + cited_keys.append(key) + if not bib_db: + return "[" + ", ".join(keys) + "]" + nums: list[str] = [] + for key in keys: + idx = cited_keys.index(key) + 1 + nums.append(f'[{idx}]') + return "".join(nums) + + processed = CITE_RE.sub(_replace_cite, markdown) + if cited_keys and bib_db: + bib_lines = _render_bibliography(cited_keys, bib_db, bibliography_title) + processed = processed.rstrip("\n") + "\n\n" + "\n".join(bib_lines) + "\n" + return processed + + +def resolve_raw_html_file(current_file: Path, filename: str) -> Path: + direct = (current_file.parent / filename).resolve() + if direct.exists(): + return direct + + static_fallback = (current_file.parent / "static" / filename).resolve() + if static_fallback.exists(): + return static_fallback + + repo_static = (Path(__file__).resolve().parent.parent / "static" / filename) + if repo_static.exists(): + return repo_static + + raise FileNotFoundError(f"Raw HTML include '{filename}' from '{current_file}' does not exist") + + +def rewrite_frontpage_assets(html: str) -> str: + rewritten = html.replace("./_images/", "static/image/") + rewritten = rewritten.replace("_images/", "static/image/") + rewritten = HEAD_TAG_RE.sub("", rewritten) + rewritten = STYLE_BLOCK_RE.sub(_minify_style_block, rewritten) + return rewritten + + +def _minify_style_block(match: re.Match[str]) -> str: + content = match.group(1) + parts = [line.strip() for line in content.splitlines() if line.strip()] + return f"" + + +def wrap_frontpage_html(html: str) -> str: + return "\n".join([FRONTPAGE_LAYOUT_CSS, '
', html.strip(), "
"]) + + +def inline_raw_html(block_lines: list[str], current_file: Path) -> str | None: + stripped = [line.strip() for line in block_lines if line.strip()] + if not stripped or stripped[0] != ".. raw:: html": + return None + + filename: str | None = None + for line in stripped[1:]: + match = RAW_HTML_FILE_RE.match(line) + if match: + filename = match.group(1) + break + + if filename is None: + return None + + html_path = resolve_raw_html_file(current_file, filename) + html = rewrite_frontpage_assets(html_path.read_text(encoding="utf-8")).strip() + if Path(filename).name == "frontpage.html": + return wrap_frontpage_html(html) + return html + + +def chapter_label(item: TocItem, target: Path, title_cache: dict[Path, str]) -> str: + return item.label or title_cache[target] + + +def render_toc_list(entries: list[TocItem], current_file: Path, title_cache: dict[Path, str]) -> list[str]: + rendered: list[str] = [] + current_indent = 0 + for entry in entries: + if entry.kind == "part": + rendered.append(f"- {entry.label}") + current_indent = 1 + continue + + if entry.target is None: + continue + + target = resolve_toc_target(current_file, entry.target) + if target not in title_cache: + continue + + label = chapter_label(entry, target, title_cache) + rendered.append(f"{' ' * current_indent}- [{label}]({relative_link(current_file, target)})") + return rendered + + +def rewrite_markdown( + markdown: str, + current_file: Path, + title_cache: dict[Path, str], + bib_db: dict[str, dict[str, str]] | None = None, + bibliography_title: str = DEFAULT_BIBLIOGRAPHY_TITLE, +) -> str: + output: list[str] = [] + lines = markdown.splitlines() + index = 0 + + while index < len(lines): + stripped = lines[index].strip() + if stripped in (f"```{TOC_FENCE}", f"```{EVAL_RST_FENCE}"): + fence = stripped[3:] + index += 1 + block_lines: list[str] = [] + while index < len(lines) and lines[index].strip() != "```": + block_lines.append(lines[index]) + index += 1 + + if fence == TOC_FENCE: + entries = parse_toc_entries(block_lines) + if entries: + if output and output[-1] != "": + output.append("") + rendered = render_toc_list(entries, current_file, title_cache) + output.extend(rendered) + if rendered and output and output[-1] != "": + output.append("") + elif fence == EVAL_RST_FENCE: + raw_html = inline_raw_html(block_lines, current_file) + if raw_html: + if output and output[-1] != "": + output.append("") + output.extend(raw_html.splitlines()) + if output and output[-1] != "": + output.append("") + index += 1 + continue + + output.append(lines[index]) + index += 1 + + while output and output[-1] == "": + output.pop() + + result = normalize_directives("\n".join(output) + "\n") + result = process_citations(result, bib_db or {}, bibliography_title=bibliography_title) + return result + + +def build_title_cache( + source_dir: Path, + placeholder_prefix: str | None = None, +) -> dict[Path, str]: + cache: dict[Path, str] = {} + for markdown_file in sorted(source_dir.rglob("*.md")): + if "_build" in markdown_file.parts or markdown_file.name == "SUMMARY.md": + continue + text = markdown_file.read_text(encoding="utf-8") + if is_placeholder_markdown(text, placeholder_prefix): + continue + cache[markdown_file.resolve()] = extract_title(text, fallback=markdown_file.stem) + return cache + + +def build_summary(source_dir: Path, title_cache: dict[Path, str]) -> str: + root_index = (source_dir / "index.md").resolve() + root_markdown = root_index.read_text(encoding="utf-8") + + lines = ["# Summary", "", f"[{title_cache[root_index]}](index.md)"] + seen: set[Path] = {root_index} + + def append_entry(target: Path, indent: int, label: str | None = None) -> None: + target = target.resolve() + if target in seen or target not in title_cache: + return + seen.add(target) + rel = target.relative_to(source_dir.resolve()).as_posix() + title = label or title_cache[target] + lines.append(f"{' ' * indent}- [{title}]({rel})") + + child_markdown = target.read_text(encoding="utf-8") + for block in parse_toc_blocks(child_markdown): + for entry in block: + if entry.kind != "chapter" or entry.target is None: + continue + append_entry(resolve_toc_target(target, entry.target), indent + 1, entry.label or None) + + def append_prefix_chapter(target: Path, label: str | None = None) -> None: + target = target.resolve() + if target in seen or target not in title_cache: + return + seen.add(target) + rel = target.relative_to(source_dir.resolve()).as_posix() + title = label or title_cache[target] + lines.append(f"[{title}]({rel})") + + numbered_started = False + for block in parse_toc_blocks(root_markdown): + for entry in block: + if entry.kind == "part": + if lines and lines[-1] != "": + lines.append("") + lines.append(f"# {entry.label}") + lines.append("") + numbered_started = True + continue + + if entry.target is None: + continue + + target = resolve_toc_target(root_index, entry.target) + if numbered_started: + append_entry(target, 0, entry.label or None) + else: + append_prefix_chapter(target, entry.label or None) + + return "\n".join(lines) + "\n" + + +def write_summary( + source_dir: Path, + summary_path: Path | None = None, + placeholder_prefix: str | None = None, +) -> Path: + source_dir = source_dir.resolve() + summary_path = summary_path.resolve() if summary_path else (source_dir / "SUMMARY.md") + title_cache = build_title_cache(source_dir, placeholder_prefix=placeholder_prefix) + summary_path.write_text(build_summary(source_dir, title_cache), encoding="utf-8") + return summary_path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate mdBook SUMMARY.md for a chapter directory.") + parser.add_argument("--source", type=Path, required=True, help="Source chapter directory") + parser.add_argument("--summary-output", type=Path, required=True, help="Where to write the generated SUMMARY.md") + parser.add_argument( + "--placeholder-prefix", + default=None, + help="If set, files whose entire contents start with this prefix are skipped from mdBook output.", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + summary_path = write_summary( + args.source, + summary_path=args.summary_output, + placeholder_prefix=args.placeholder_prefix, + ) + print(f"Wrote mdBook summary to {summary_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/prepare_mdbook_zh.py b/tools/prepare_mdbook_zh.py index 7da3755..a5e17c5 100644 --- a/tools/prepare_mdbook_zh.py +++ b/tools/prepare_mdbook_zh.py @@ -1,580 +1,24 @@ from __future__ import annotations import argparse -import os -import re -from dataclasses import dataclass from pathlib import Path - -TOC_FENCE = "toc" -EVAL_RST_FENCE = "eval_rst" -OPTION_LINE_RE = re.compile(r"^:(width|label):`[^`]+`\s*$", re.MULTILINE) -NUMREF_RE = re.compile(r":numref:`([^`]+)`") -EQREF_RE = re.compile(r":eqref:`([^`]+)`") -CITE_RE = re.compile(r":cite:`([^`]+)`") -BIB_ENTRY_RE = re.compile(r"@(\w+)\{([^,]+),") -LATEX_ESCAPE_RE = re.compile(r"\\([_%#&])") -RAW_HTML_FILE_RE = re.compile(r"^\s*:file:\s*([^\s]+)\s*$") -TOC_LINK_RE = re.compile(r"^\[([^\]]+)\]\(([^)]+)\)\s*$") -TOC_PART_RE = re.compile(r"^#+\s+(.+?)\s*$") -HEAD_TAG_RE = re.compile(r"", re.IGNORECASE) -STYLE_BLOCK_RE = re.compile(r"", re.IGNORECASE | re.DOTALL) -FRONTPAGE_LAYOUT_CSS = """ - -""".strip() - - -@dataclass(frozen=True) -class TocItem: - kind: str - label: str - target: str | None = None - - -def extract_title(markdown: str, fallback: str = "Untitled") -> str: - lines = markdown.splitlines() - - for index, line in enumerate(lines): - stripped = line.strip() - if not stripped: - continue - if stripped.startswith("#"): - heading = stripped.lstrip("#").strip() - if heading: - return heading - - next_index = index + 1 - if next_index < len(lines): - underline = lines[next_index].strip() - if underline and set(underline) <= {"=", "-"}: - return stripped - - return fallback - - -def parse_toc_entries(block_lines: list[str]) -> list[TocItem]: - entries: list[TocItem] = [] - for line in block_lines: - stripped = line.strip() - if not stripped or stripped.startswith(":"): - continue - part_match = TOC_PART_RE.match(stripped) - if part_match: - entries.append(TocItem(kind="part", label=part_match.group(1).strip())) - continue - link_match = TOC_LINK_RE.match(stripped) - if link_match: - entries.append(TocItem(kind="chapter", label=link_match.group(1).strip(), target=link_match.group(2).strip())) - continue - entries.append(TocItem(kind="chapter", label="", target=stripped)) - return entries - - -def parse_toc_blocks(markdown: str) -> list[list[TocItem]]: - blocks: list[list[TocItem]] = [] - lines = markdown.splitlines() - index = 0 - - while index < len(lines): - if lines[index].strip() == f"```{TOC_FENCE}": - index += 1 - block_lines: list[str] = [] - while index < len(lines) and lines[index].strip() != "```": - block_lines.append(lines[index]) - index += 1 - entries = parse_toc_entries(block_lines) - blocks.append(entries) - index += 1 - - return blocks - - -def resolve_toc_target(current_file: Path, entry: str) -> Path: - target_name = entry if entry.endswith(".md") else f"{entry}.md" - target = (current_file.parent / target_name).resolve() - if not target.exists(): - raise FileNotFoundError(f"TOC entry '{entry}' from '{current_file}' does not exist") - return target - - -def relative_link(from_file: Path, target_file: Path) -> str: - return Path(os.path.relpath(target_file, start=from_file.parent)).as_posix() - - -def _strip_latex_escapes_outside_math(line: str) -> str: - """Remove LaTeX escapes (\\_, \\%, \\#, \\&) from text outside $...$ math spans.""" - parts = line.split("$") - for i in range(0, len(parts), 2): # even indices are outside math - parts[i] = LATEX_ESCAPE_RE.sub(r"\1", parts[i]) - return "$".join(parts) - - -def normalize_directives(markdown: str) -> str: - normalized = OPTION_LINE_RE.sub("", markdown) - normalized = NUMREF_RE.sub(lambda match: f"`{match.group(1)}`", normalized) - normalized = EQREF_RE.sub(lambda match: f"`{match.group(1)}`", normalized) - - lines = [_strip_latex_escapes_outside_math(line.rstrip()) for line in normalized.splitlines()] - collapsed: list[str] = [] - previous_blank = False - for line in lines: - is_blank = line == "" - if is_blank and previous_blank: - continue - collapsed.append(line) - previous_blank = is_blank - - while collapsed and collapsed[-1] == "": - collapsed.pop() - - return "\n".join(collapsed) + "\n" - - -# ── BibTeX parsing ──────────────────────────────────────────────────────────── - - -def clean_bibtex(value: str) -> str: - """Remove BibTeX formatting (braces, LaTeX accents) from a string.""" - value = re.sub(r"\{\\[`'^\"~=.](\w)\}", r"\1", value) - value = re.sub(r"\\[`'^\"~=.](\w)", r"\1", value) - value = value.replace("{", "").replace("}", "") - return value.strip() - - -def _parse_bib_fields(body: str) -> dict[str, str]: - """Parse field=value pairs inside a BibTeX entry body.""" - fields: dict[str, str] = {} - i = 0 - while i < len(body): - while i < len(body) and body[i] in " \t\n\r,": - i += 1 - if i >= len(body): - break - start = i - while i < len(body) and body[i] not in "= \t\n\r": - i += 1 - name = body[start:i].strip().lower() - while i < len(body) and body[i] != "=": - i += 1 - if i >= len(body): - break - i += 1 - while i < len(body) and body[i] in " \t\n\r": - i += 1 - if i >= len(body): - break - if body[i] == "{": - depth = 1 - i += 1 - vstart = i - while i < len(body) and depth > 0: - if body[i] == "{": - depth += 1 - elif body[i] == "}": - depth -= 1 - i += 1 - value = body[vstart : i - 1] - elif body[i] == '"': - i += 1 - vstart = i - while i < len(body) and body[i] != '"': - i += 1 - value = body[vstart:i] - i += 1 - else: - vstart = i - while i < len(body) and body[i] not in ", \t\n\r}": - i += 1 - value = body[vstart:i] - if name: - fields[name] = value.strip() - return fields - - -def parse_bib(bib_path: Path) -> dict[str, dict[str, str]]: - """Parse a BibTeX file and return a dict keyed by citation key.""" - text = bib_path.read_text(encoding="utf-8") - entries: dict[str, dict[str, str]] = {} - for match in BIB_ENTRY_RE.finditer(text): - key = match.group(2).strip() - start = match.end() - depth = 1 - pos = start - while pos < len(text) and depth > 0: - if text[pos] == "{": - depth += 1 - elif text[pos] == "}": - depth -= 1 - pos += 1 - fields = _parse_bib_fields(text[start : pos - 1]) - fields["_type"] = match.group(1).lower() - entries[key] = fields - return entries - - -# ── Citation formatting ─────────────────────────────────────────────────────── - - -def _first_author_surname(author_str: str) -> str: - """Extract the first author's surname from a BibTeX author string.""" - author_str = clean_bibtex(author_str) - authors = [a.strip() for a in author_str.split(" and ")] - if not authors or not authors[0]: - return "" - first = authors[0] - if "," in first: - return first.split(",")[0].strip() - parts = first.split() - return parts[-1] if parts else first - - -def _format_cite_label(author: str, year: str) -> str: - """Format an inline citation label like 'Surname et al., Year'.""" - surname = _first_author_surname(author) - if not surname: - return year or "?" - authors = [a.strip() for a in clean_bibtex(author).split(" and ")] - if len(authors) > 2: - name_part = f"{surname} et al." - elif len(authors) == 2: - second = authors[1] - if second.lower() == "others": - name_part = f"{surname} et al." - else: - if "," in second: - surname2 = second.split(",")[0].strip() - else: - parts = second.split() - surname2 = parts[-1] if parts else second - name_part = f"{surname} and {surname2}" - else: - name_part = surname - if year: - return f"{name_part}, {year}" - return name_part - - -def _render_bibliography( - cited_keys: list[str], bib_db: dict[str, dict[str, str]] -) -> list[str]: - """Render a footnote-style bibliography section for the cited keys.""" - lines: list[str] = ["---", "", "## 参考文献", "", "
    "] - for idx, key in enumerate(cited_keys, 1): - entry = bib_db.get(key) - if not entry: - lines.append(f'
  1. {key}.
  2. ') - continue - author = clean_bibtex(entry.get("author", "")) - title = clean_bibtex(entry.get("title", "")) - year = entry.get("year", "") - venue = clean_bibtex(entry.get("journal", "") or entry.get("booktitle", "")) - parts: list[str] = [] - if author: - parts.append(author) - if title: - parts.append(f"{title}") - if venue: - parts.append(venue) - if year: - parts.append(year) - text = ". ".join(parts) + "." if parts else f"{key}." - lines.append(f'
  3. {text}
  4. ') - lines.append("
") - return lines - - -def process_citations( - markdown: str, bib_db: dict[str, dict[str, str]] -) -> str: - """Replace :cite: references with footnote-style numbered citations.""" - cited_keys: list[str] = [] - - def _replace_cite(match: re.Match[str]) -> str: - keys = [k.strip() for k in match.group(1).split(",")] - for key in keys: - if key not in cited_keys: - cited_keys.append(key) - if not bib_db: - return "[" + ", ".join(keys) + "]" - nums: list[str] = [] - for key in keys: - idx = cited_keys.index(key) + 1 - nums.append( - f'[{idx}]' - ) - return "".join(nums) - - processed = CITE_RE.sub(_replace_cite, markdown) - if cited_keys and bib_db: - bib_lines = _render_bibliography(cited_keys, bib_db) - processed = processed.rstrip("\n") + "\n\n" + "\n".join(bib_lines) + "\n" - return processed - - -def resolve_raw_html_file(current_file: Path, filename: str) -> Path: - direct = (current_file.parent / filename).resolve() - if direct.exists(): - return direct - - static_fallback = (current_file.parent / "static" / filename).resolve() - if static_fallback.exists(): - return static_fallback - - repo_static = (Path(__file__).resolve().parent.parent / "static" / filename) - if repo_static.exists(): - return repo_static - - raise FileNotFoundError(f"Raw HTML include '{filename}' from '{current_file}' does not exist") - - -def rewrite_frontpage_assets(html: str) -> str: - rewritten = html.replace('./_images/', 'static/image/') - rewritten = rewritten.replace('_images/', 'static/image/') - rewritten = HEAD_TAG_RE.sub("", rewritten) - rewritten = STYLE_BLOCK_RE.sub(_minify_style_block, rewritten) - return rewritten - - -def _minify_style_block(match: re.Match[str]) -> str: - content = match.group(1) - parts = [line.strip() for line in content.splitlines() if line.strip()] - return f"" - - -def wrap_frontpage_html(html: str) -> str: - return "\n".join([FRONTPAGE_LAYOUT_CSS, '
', html.strip(), '
']) - - -def inline_raw_html(block_lines: list[str], current_file: Path) -> str | None: - stripped = [line.strip() for line in block_lines if line.strip()] - if not stripped or stripped[0] != ".. raw:: html": - return None - - filename: str | None = None - for line in stripped[1:]: - match = RAW_HTML_FILE_RE.match(line) - if match: - filename = match.group(1) - break - - if filename is None: - return None - - html_path = resolve_raw_html_file(current_file, filename) - html = rewrite_frontpage_assets(html_path.read_text(encoding="utf-8")).strip() - if Path(filename).name == "frontpage.html": - return wrap_frontpage_html(html) - return html - - -def chapter_label(item: TocItem, target: Path, title_cache: dict[Path, str]) -> str: - return item.label or title_cache[target] - - -def render_toc_list(entries: list[TocItem], current_file: Path, title_cache: dict[Path, str]) -> list[str]: - rendered: list[str] = [] - current_indent = 0 - for entry in entries: - if entry.kind == "part": - rendered.append(f"- {entry.label}") - current_indent = 1 - continue - - if entry.target is None: - continue - - target = resolve_toc_target(current_file, entry.target) - label = chapter_label(entry, target, title_cache) - rendered.append(f"{' ' * current_indent}- [{label}]({relative_link(current_file, target)})") - return rendered - - -def rewrite_markdown( - markdown: str, - current_file: Path, - title_cache: dict[Path, str], - bib_db: dict[str, dict[str, str]] | None = None, -) -> str: - output: list[str] = [] - lines = markdown.splitlines() - index = 0 - - while index < len(lines): - stripped = lines[index].strip() - if stripped in (f"```{TOC_FENCE}", f"```{EVAL_RST_FENCE}"): - fence = stripped[3:] - index += 1 - block_lines: list[str] = [] - while index < len(lines) and lines[index].strip() != "```": - block_lines.append(lines[index]) - index += 1 - - if fence == TOC_FENCE: - entries = parse_toc_entries(block_lines) - if entries: - if output and output[-1] != "": - output.append("") - output.extend(render_toc_list(entries, current_file, title_cache)) - if output and output[-1] != "": - output.append("") - elif fence == EVAL_RST_FENCE: - raw_html = inline_raw_html(block_lines, current_file) - if raw_html: - if output and output[-1] != "": - output.append("") - output.extend(raw_html.splitlines()) - if output and output[-1] != "": - output.append("") - index += 1 - continue - - output.append(lines[index]) - index += 1 - - while output and output[-1] == "": - output.pop() - - result = normalize_directives("\n".join(output) + "\n") - result = process_citations(result, bib_db or {}) - return result - - -def build_title_cache(source_dir: Path) -> dict[Path, str]: - cache: dict[Path, str] = {} - for markdown_file in sorted(source_dir.rglob("*.md")): - if "_build" in markdown_file.parts or markdown_file.name == "SUMMARY.md": - continue - cache[markdown_file.resolve()] = extract_title(markdown_file.read_text(encoding="utf-8"), fallback=markdown_file.stem) - return cache - - -def build_summary(source_dir: Path, title_cache: dict[Path, str]) -> str: - root_index = (source_dir / "index.md").resolve() - root_markdown = root_index.read_text(encoding="utf-8") - - lines = ["# Summary", "", f"[{title_cache[root_index]}](index.md)"] - seen: set[Path] = {root_index} - - def append_entry(target: Path, indent: int, label: str | None = None) -> None: - target = target.resolve() - if target in seen: - return - seen.add(target) - rel = target.relative_to(source_dir.resolve()).as_posix() - title = label or title_cache[target] - lines.append(f"{' ' * indent}- [{title}]({rel})") - - child_markdown = target.read_text(encoding="utf-8") - for block in parse_toc_blocks(child_markdown): - for entry in block: - if entry.kind != "chapter" or entry.target is None: - continue - append_entry(resolve_toc_target(target, entry.target), indent + 1, entry.label or None) - - def append_prefix_chapter(target: Path, label: str | None = None) -> None: - target = target.resolve() - if target in seen: - return - seen.add(target) - rel = target.relative_to(source_dir.resolve()).as_posix() - title = label or title_cache[target] - lines.append(f"[{title}]({rel})") - - numbered_started = False - for block in parse_toc_blocks(root_markdown): - for entry in block: - if entry.kind == "part": - if lines and lines[-1] != "": - lines.append("") - lines.append(f"# {entry.label}") - lines.append("") - numbered_started = True - continue - - if entry.target is None: - continue - - target = resolve_toc_target(root_index, entry.target) - if numbered_started: - append_entry(target, 0, entry.label or None) - else: - append_prefix_chapter(target, entry.label or None) - - return "\n".join(lines) + "\n" - - -def write_summary(source_dir: Path, summary_path: Path | None = None) -> Path: - source_dir = source_dir.resolve() - summary_path = summary_path.resolve() if summary_path else (source_dir / "SUMMARY.md") - title_cache = build_title_cache(source_dir) - summary_path.write_text(build_summary(source_dir, title_cache), encoding="utf-8") - return summary_path +try: + from tools.prepare_mdbook import ( + build_title_cache, + extract_title, + parse_bib, + rewrite_markdown, + write_summary, + ) +except ModuleNotFoundError: + from prepare_mdbook import ( + build_title_cache, + extract_title, + parse_bib, + rewrite_markdown, + write_summary, + ) def parse_args() -> argparse.Namespace: @@ -591,7 +35,7 @@ def parse_args() -> argparse.Namespace: def main() -> int: args = parse_args() - summary_path = write_summary(args.source, args.summary_output) + summary_path = write_summary(args.source, summary_path=args.summary_output) print(f"Wrote mdBook summary to {summary_path}") return 0