from __future__ import annotations

import argparse
import os
import re
from dataclasses import dataclass
from pathlib import Path, PurePosixPath

# NOTE(review): the copy of this module that was reviewed had lost every
# embedded HTML fragment and every ``\(`` / ``\)`` escape.  Regexes whose
# intent is unambiguous were restored.  HTML literals were rebuilt minimally
# and are individually marked ``NOTE(review)`` -- confirm them against the
# canonical source before relying on the exact markup.

TOC_FENCE = "toc"
EVAL_RST_FENCE = "eval_rst"
WIDTH_LINE_RE = re.compile(r"^:width:`[^`]+`\s*$", re.MULTILINE)
LABEL_RE = re.compile(r":label:`([^`]+)`")
NUMREF_RE = re.compile(r":numref:`([^`]+)`")
# Markdown image on a line of its own: ![caption](source)
IMAGE_LINE_RE = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$")
LABEL_LINE_RE = re.compile(r"^:label:`([^`]+)`\s*$")
EQREF_RE = re.compile(r":eqref:`([^`]+)`")
EQLABEL_LINE_RE = re.compile(r"^:eqlabel:`([^`]+)`\s*$")
CITE_RE = re.compile(r":cite:`([^`]+)`")
BIB_ENTRY_RE = re.compile(r"@(\w+)\{([^,]+),")
# A backslash followed by one of the LaTeX text-mode specials; group 1 is the
# bare character that replaces the escape.
LATEX_ESCAPE_RE = re.compile(r"\\([_%#&])")
RAW_HTML_FILE_RE = re.compile(r"^\s*:file:\s*([^\s]+)\s*$")
# Markdown link on a line of its own: [label](target)
TOC_LINK_RE = re.compile(r"^\[([^\]]+)\]\(([^)]+)\)\s*$")
TOC_PART_RE = re.compile(r"^#+\s+(.+?)\s*$")
# NOTE(review): reconstructed -- the reviewed copy had empty patterns here.
HEAD_TAG_RE = re.compile(r"</?head>", re.IGNORECASE)
STYLE_BLOCK_RE = re.compile(r"<style>(.*?)</style>", re.IGNORECASE | re.DOTALL)
DEFAULT_BIBLIOGRAPHY_TITLE = "References"
# NOTE(review): reconstructed marker name; the reviewed copy had "".
FRONTPAGE_SWITCH_PLACEHOLDER = "<!-- OPENMLSYS_LANGUAGE_SWITCH -->"
# NOTE(review): the stylesheet that belongs here could not be recovered.
FRONTPAGE_LAYOUT_CSS = ""

# One nesting level in generated Markdown lists.  Four spaces nests reliably
# under a "- " item and is the indentation mdBook documents for SUMMARY.md.
_LIST_INDENT = "    "


@dataclass(frozen=True)
class TocItem:
    """One entry of a ``toc`` fenced block."""

    kind: str  # "part" for a heading line, "chapter" for a document entry
    label: str  # explicit label, or "" when the chapter title should be used
    target: str | None = None  # document path as written in the toc block


def is_placeholder_markdown(markdown: str, placeholder_prefix: str | None = None) -> bool:
    """Return True when *markdown* is nothing but a ``<prefix>...]`` placeholder."""
    if not placeholder_prefix:
        return False
    stripped = markdown.strip()
    return stripped.startswith(placeholder_prefix) and stripped.endswith("]")


def extract_title(markdown: str, fallback: str = "Untitled") -> str:
    """Return the first ATX (``# x``) or setext (underlined) heading text.

    Returns *fallback* when the document contains no heading.
    """
    lines = markdown.splitlines()
    for index, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            heading = stripped.lstrip("#").strip()
            if heading:
                return heading
        next_index = index + 1
        if next_index < len(lines):
            underline = lines[next_index].strip()
            if underline and set(underline) <= {"=", "-"}:
                return stripped
    return fallback


def parse_toc_entries(block_lines: list[str]) -> list[TocItem]:
    """Parse the body of a ``toc`` fence into parts and chapters.

    Blank lines and ``:option:`` lines are skipped.  ``# text`` becomes a
    part, ``[label](target)`` a labelled chapter, anything else a chapter
    whose target is the whole line.
    """
    entries: list[TocItem] = []
    for line in block_lines:
        stripped = line.strip()
        if not stripped or stripped.startswith(":"):
            continue
        part_match = TOC_PART_RE.match(stripped)
        if part_match:
            entries.append(TocItem(kind="part", label=part_match.group(1).strip()))
            continue
        link_match = TOC_LINK_RE.match(stripped)
        if link_match:
            entries.append(
                TocItem(
                    kind="chapter",
                    label=link_match.group(1).strip(),
                    target=link_match.group(2).strip(),
                )
            )
            continue
        entries.append(TocItem(kind="chapter", label="", target=stripped))
    return entries


def parse_toc_blocks(markdown: str) -> list[list[TocItem]]:
    """Return the parsed entries of every ``toc`` fence in *markdown*."""
    blocks: list[list[TocItem]] = []
    lines = markdown.splitlines()
    index = 0
    while index < len(lines):
        if lines[index].strip() == f"```{TOC_FENCE}":
            index += 1
            block_lines: list[str] = []
            while index < len(lines) and lines[index].strip() != "```":
                block_lines.append(lines[index])
                index += 1
            blocks.append(parse_toc_entries(block_lines))
        # Skips either an ordinary line or the closing fence.
        index += 1
    return blocks


def resolve_toc_target(current_file: Path, entry: str) -> Path | None:
    """Resolve a toc target relative to *current_file*; None if it is missing."""
    target_name = entry if entry.endswith(".md") else f"{entry}.md"
    target = (current_file.parent / target_name).resolve()
    if not target.exists():
        return None
    return target


def relative_link(from_file: Path, target_file: Path) -> str:
    """Return a POSIX-style link from *from_file*'s directory to *target_file*."""
    return Path(os.path.relpath(target_file, start=from_file.parent)).as_posix()


def _strip_latex_escapes_outside_math(text: str) -> str:
    """Remove LaTeX text-mode escapes (``\\_``, ``\\#``, etc.) outside math
    spans, fenced code blocks, and inline code.

    Operates on the full text (not per-line) to correctly handle multi-line
    display math ``$$...$$`` blocks.
    """
    # 1. Find all protected regions where escapes must NOT be stripped.
    protected: list[tuple[int, int]] = []  # (start, end), in document order
    n = len(text)
    i = 0
    in_fence: str | None = None
    fence_start = 0
    while i < n:
        # Fenced code blocks (``` or ~~~) at start of line
        if (i == 0 or text[i - 1] == "\n") and text[i] in ("`", "~"):
            m = _FENCE_RE.match(text, i)
            if m:
                if in_fence is None:
                    in_fence = m.group()[0]
                    fence_start = i
                elif m.group()[0] == in_fence:
                    eol = text.find("\n", m.end())
                    end = eol + 1 if eol != -1 else n
                    protected.append((fence_start, end))
                    in_fence = None
                    i = end
                    continue
                # Opening fence, or a different fence marker inside a fence:
                # skip the rest of this line.
                eol = text.find("\n", i)
                i = eol + 1 if eol != -1 else n
                continue
        if in_fence is not None:
            i += 1
            continue
        # Inline code `...`
        if text[i] == "`":
            close = text.find("`", i + 1)
            if close != -1:
                protected.append((i, close + 1))
                i = close + 1
                continue
        # Display math $$...$$
        if text[i:i + 2] == "$$":
            close = text.find("$$", i + 2)
            if close != -1:
                protected.append((i, close + 2))
                i = close + 2
                continue
        # Inline math $...$ (must be non-empty and stay on one line)
        if text[i] == "$":
            j = i + 1
            while j < n and text[j] != "$" and text[j] != "\n":
                j += 1
            if j < n and text[j] == "$" and j > i + 1:
                protected.append((i, j + 1))
                i = j + 1
                continue
        i += 1
    # Unclosed fence -> protect everything from fence_start to end
    if in_fence is not None:
        protected.append((fence_start, n))

    # 2. Apply substitution only to unprotected gaps.
    parts: list[str] = []
    prev = 0
    for start, end in protected:
        if start > prev:
            parts.append(LATEX_ESCAPE_RE.sub(r"\1", text[prev:start]))
        parts.append(text[start:end])
        prev = end
    if prev < n:
        parts.append(LATEX_ESCAPE_RE.sub(r"\1", text[prev:]))
    return "".join(parts)


def process_equation_labels(markdown: str) -> tuple[str, dict[str, int]]:
    """Convert :eqlabel: directives to MathJax \\tag + \\label in preceding equations.

    Args:
        markdown: The markdown content to process.

    Returns:
        A tuple of (processed_markdown, label_map) where label_map maps
        label names to their equation numbers.
    """
    lines = markdown.split("\n")
    result: list[str] = []
    eq_counter = 0
    label_map: dict[str, int] = {}
    for line in lines:
        match = EQLABEL_LINE_RE.match(line.strip())
        if not match:
            result.append(line)
            continue
        label_name = match.group(1)
        eq_counter += 1
        label_map[label_name] = eq_counter
        tag = f"\\tag{{{eq_counter}}}\\label{{{label_name}}}"
        # Search backward for the closing $$ of the preceding equation
        inserted = False
        for j in range(len(result) - 1, -1, -1):
            stripped = result[j].rstrip()
            if not stripped:
                continue  # skip blank lines
            if stripped == "$$":
                # Multi-line equation: $$ on its own line
                result.insert(j, tag)
                inserted = True
            elif stripped.endswith("$$"):
                # Single-line or end-of-content $$
                result[j] = stripped[:-2] + tag + "$$"
                inserted = True
            # The first non-blank line decides: either it closed an equation
            # or there is no equation to tag.
            break
        if not inserted:
            # Fallback: keep original line if no equation found
            result.append(line)
    return "\n".join(result), label_map


def collect_labels(markdown: str) -> list[str]:
    """Extract all label names from :label: directives."""
    return LABEL_RE.findall(markdown)


def _scan_figure_label(lines: list[str], start: int) -> tuple[str | None, int]:
    """Look past blank and ``:width:`` lines from *start* for a ``:label:`` line.

    Returns ``(label, index_after_label)`` or ``(None, index_where_scan_stopped)``.
    """
    j = start
    while j < len(lines):
        s = lines[j].strip()
        if not s or WIDTH_LINE_RE.match(s):
            j += 1
            continue
        m = LABEL_LINE_RE.match(s)
        if m:
            return m.group(1), j + 1
        break
    return None, j


def collect_figure_labels(markdown: str) -> list[str]:
    """Return label names for figures (image lines followed by :label:)."""
    labels: list[str] = []
    lines = markdown.splitlines()
    for i, line in enumerate(lines):
        if not IMAGE_LINE_RE.match(line.strip()):
            continue
        label, _ = _scan_figure_label(lines, i + 1)
        if label is not None:
            labels.append(label)
    return labels


def process_figure_captions(
    markdown: str,
    fig_number_map: dict[str, str] | None = None,
) -> str:
    """Convert image+label blocks into figures with anchors and captions.

    An image line followed (past blank/``:width:`` lines) by ``:label:`` is
    replaced by an anchor, the image, and a caption line.  The consumed
    ``:width:`` and ``:label:`` lines are dropped.  Images without a label
    are left untouched.
    """
    numbers = fig_number_map or {}
    lines = markdown.splitlines()
    result: list[str] = []
    i = 0
    while i < len(lines):
        img_match = IMAGE_LINE_RE.match(lines[i].strip())
        if img_match:
            caption = img_match.group(1)
            label, after = _scan_figure_label(lines, i + 1)
            if label:
                fig_num = numbers.get(label)
                # NOTE(review): anchor/caption tags reconstructed.  The id
                # must equal the label because :numref: links use "#<label>".
                result.append(f'<a id="{label}"></a>')
                result.append("")
                result.append(lines[i])
                if fig_num and caption:
                    caption_text = f"图{fig_num} {caption}"
                elif fig_num:
                    caption_text = f"图{fig_num}"
                else:
                    caption_text = caption
                if caption_text:
                    result.append("")
                    result.append(f'<p align="center">{caption_text}</p>')
                i = after
                continue
        result.append(lines[i])
        i += 1
    return "\n".join(result)


def _relative_chapter_path(from_path: str, to_path: str) -> str:
    """Compute relative path between two mdbook source_paths."""
    if from_path == to_path:
        return ""
    from_dir = str(PurePosixPath(from_path).parent)
    return PurePosixPath(os.path.relpath(to_path, start=from_dir)).as_posix()


def normalize_directives(
    markdown: str,
    label_map: dict[str, int] | None = None,
    ref_label_map: dict[str, str] | None = None,
    current_source_path: str | None = None,
    fig_number_map: dict[str, str] | None = None,
) -> str:
    """Rewrite d2l-style inline directives into plain Markdown/HTML.

    ``:width:`` lines are removed, ``:label:`` becomes an anchor,
    ``:numref:``/``:eqref:`` become links or numbers when the maps know the
    name, LaTeX text escapes outside math/code are stripped, trailing
    whitespace is removed and runs of blank lines collapse to one.  The
    result always ends with exactly one newline.
    """
    normalized = WIDTH_LINE_RE.sub("", markdown)
    # NOTE(review): anchor tag reconstructed; id must match the label.
    normalized = LABEL_RE.sub(lambda m: f'<a id="{m.group(1)}"></a>', normalized)

    def _numref_replace(match: re.Match[str]) -> str:
        name = match.group(1)
        if ref_label_map and current_source_path and name in ref_label_map:
            rel = _relative_chapter_path(current_source_path, ref_label_map[name])
            if fig_number_map and name in fig_number_map:
                display = f"图{fig_number_map[name]}"
            else:
                display = name
            if rel:
                return f"[{display}]({rel}#{name})"
            return f"[{display}](#{name})"
        return f"`{name}`"

    def _eqref_replace(match: re.Match[str]) -> str:
        name = match.group(1)
        if label_map and name in label_map:
            return f"({label_map[name]})"
        return f"$\\eqref{{{name}}}$"

    normalized = NUMREF_RE.sub(_numref_replace, normalized)
    normalized = EQREF_RE.sub(_eqref_replace, normalized)
    normalized = _strip_latex_escapes_outside_math(normalized)

    collapsed: list[str] = []
    previous_blank = False
    for line in (raw_line.rstrip() for raw_line in normalized.splitlines()):
        is_blank = line == ""
        if is_blank and previous_blank:
            continue
        collapsed.append(line)
        previous_blank = is_blank
    while collapsed and collapsed[-1] == "":
        collapsed.pop()
    return "\n".join(collapsed) + "\n"


def clean_bibtex(value: str) -> str:
    """Drop LaTeX accent commands and braces from a BibTeX field value."""
    value = re.sub(r"\{\\[`'^\"~=.](\w)\}", r"\1", value)
    value = re.sub(r"\\[`'^\"~=.](\w)", r"\1", value)
    value = value.replace("{", "").replace("}", "")
    return value.strip()


def _scan_brace_group(text: str, start: int) -> tuple[int, int]:
    """Scan a brace group whose opening ``{`` sits just before *start*.

    Returns ``(content_end, next_pos)``.  For an unterminated group both are
    ``len(text)`` so the trailing character is kept rather than cut off.
    """
    depth = 1
    pos = start
    while pos < len(text):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return pos, pos + 1
        pos += 1
    return len(text), len(text)


def _parse_bib_fields(body: str) -> dict[str, str]:
    """Parse ``name = value`` pairs from the body of one BibTeX entry.

    Values may be brace-delimited (nesting allowed), double-quoted, or bare.
    Field names are lower-cased.
    """
    fields: dict[str, str] = {}
    i = 0
    while i < len(body):
        while i < len(body) and body[i] in " \t\n\r,":
            i += 1
        if i >= len(body):
            break
        start = i
        while i < len(body) and body[i] not in "= \t\n\r":
            i += 1
        name = body[start:i].strip().lower()
        while i < len(body) and body[i] != "=":
            i += 1
        if i >= len(body):
            break
        i += 1
        while i < len(body) and body[i] in " \t\n\r":
            i += 1
        if i >= len(body):
            break
        if body[i] == "{":
            vstart = i + 1
            vend, i = _scan_brace_group(body, vstart)
            value = body[vstart:vend]
        elif body[i] == '"':
            i += 1
            vstart = i
            while i < len(body) and body[i] != '"':
                i += 1
            value = body[vstart:i]
            i += 1
        else:
            vstart = i
            while i < len(body) and body[i] not in ", \t\n\r}":
                i += 1
            value = body[vstart:i]
        if name:
            fields[name] = value.strip()
    return fields


def parse_bib(bib_path: Path) -> dict[str, dict[str, str]]:
    """Read a ``.bib`` file into ``{cite_key: fields}``.

    Each fields dict also carries ``"_type"`` (the lower-cased entry type).
    """
    text = bib_path.read_text(encoding="utf-8")
    entries: dict[str, dict[str, str]] = {}
    for match in BIB_ENTRY_RE.finditer(text):
        key = match.group(2).strip()
        start = match.end()
        body_end, _ = _scan_brace_group(text, start)
        fields = _parse_bib_fields(text[start:body_end])
        fields["_type"] = match.group(1).lower()
        entries[key] = fields
    return entries


def _render_bibliography(
    cited_keys: list[str],
    bib_db: dict[str, dict[str, str]],
    bibliography_title: str,
) -> list[str]:
    """Render the cited entries as a numbered list under a rule and heading.

    NOTE(review): everything after the heading was lost in the reviewed copy
    (only ``{title}`` and ``{text} ↩`` survived).  The entry layout and the
    ``ref-``/``cite-`` ids below are a reconstruction -- confirm upstream.
    """
    lines: list[str] = ["---", "", f"## {bibliography_title}", "", "<ol>"]
    for key in cited_keys:
        entry = bib_db[key]
        author = clean_bibtex(entry.get("author", ""))
        title = clean_bibtex(entry.get("title", ""))
        venue = clean_bibtex(entry.get("journal") or entry.get("booktitle") or "")
        year = clean_bibtex(entry.get("year", ""))
        pieces = [author, f"<em>{title}</em>" if title else "", venue, year]
        text = ". ".join(piece for piece in pieces if piece)
        lines.append(f'<li id="ref-{key}">{text} <a href="#cite-{key}">↩</a></li>')
    lines.append("</ol>")
    return lines


def process_citations(
    markdown: str,
    bib_db: dict[str, dict[str, str]],
    bibliography_title: str = DEFAULT_BIBLIOGRAPHY_TITLE,
) -> str:
    """Replace ``:cite:`` directives with numbered references.

    With an empty *bib_db* each directive becomes ``[key1, key2]``.
    Otherwise keys known to *bib_db* are numbered in order of first
    appearance (unknown keys are dropped) and a bibliography is appended
    when at least one key was cited.
    """
    cite_numbers: dict[str, int] = {}  # insertion order == citation order
    anchored: set[str] = set()

    def _replace_cite(match: re.Match[str]) -> str:
        keys = [k.strip() for k in match.group(1).split(",")]
        if not bib_db:
            return "[" + ", ".join(keys) + "]"
        known = [key for key in keys if key in bib_db]
        for key in known:
            cite_numbers.setdefault(key, len(cite_numbers) + 1)
        nums: list[str] = []
        for key in known:
            idx = cite_numbers[key]
            # NOTE(review): superscript link reconstructed (the reviewed copy
            # showed only "^[{idx}]").  The id is emitted once per key so the
            # bibliography back-link has a unique target.
            if key in anchored:
                nums.append(f'<sup><a href="#ref-{key}">[{idx}]</a></sup>')
            else:
                anchored.add(key)
                nums.append(f'<sup id="cite-{key}"><a href="#ref-{key}">[{idx}]</a></sup>')
        return "".join(nums)

    processed = CITE_RE.sub(_replace_cite, markdown)
    if cite_numbers and bib_db:
        bib_lines = _render_bibliography(list(cite_numbers), bib_db, bibliography_title)
        processed = processed.rstrip("\n") + "\n\n" + "\n".join(bib_lines) + "\n"
    return processed


_FENCE_RE = re.compile(r"^(`{3,}|~{3,})", re.MULTILINE)
_CJK_RE = re.compile(r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]")


def _iter_math_spans(content: str):
    """Yield ``(start, end, is_display)`` for every math span.

    ``start``/``end`` delimit the math body, excluding the ``$`` delimiters.
    Skips spans inside fenced code blocks and inline code.
    """
    n = len(content)
    i = 0
    in_fence: str | None = None  # fence marker char when inside a code block
    while i < n:
        # Track fenced code blocks
        if content[i] == "`" or content[i] == "~":
            m = _FENCE_RE.match(content, i)
            if m and (i == 0 or content[i - 1] == "\n"):
                marker = m.group(1)
                if in_fence is None:
                    in_fence = marker[0]  # opening: skip the info string
                    eol = content.find("\n", i)
                    i = eol + 1 if eol != -1 else n
                    continue
                elif marker[0] == in_fence:
                    in_fence = None  # closing
                    i = m.end()
                    continue
        if in_fence:
            i += 1
            continue
        # Skip inline code
        if content[i] == "`":
            end_tick = content.find("`", i + 1)
            if end_tick != -1:
                i = end_tick + 1
                continue
        # Display math $$...$$
        if content[i:i + 2] == "$$":
            close = content.find("$$", i + 2)
            if close != -1:
                yield (i + 2, close, True)
                i = close + 2
                continue
        # Inline math $...$
        if content[i] == "$":
            j = i + 1
            while j < n:
                if content[j] == "$":
                    if j > i + 1:  # non-empty
                        yield (i + 1, j, False)
                    j += 1
                    break
                if content[j] == "\n" and not content[i + 1:j].strip():
                    break  # nothing but whitespace before the newline -> not math
                j += 1
            i = j
            continue
        i += 1


def convert_math_to_mathjax(content: str) -> str:
    """Replace ``$``/``$$`` delimited math with MathJax ``\\(…\\)``/``\\[…\\]``.

    Inside math content, ``\\\\`` (LaTeX newline) is doubled and ``*``/``_``
    are backslash-escaped so that mdBook's markdown processing (which
    consumes one level of backslash escaping) delivers the original LaTeX to
    MathJax.  The emitted delimiters carry a doubled backslash for the same
    reason.
    """
    spans = list(_iter_math_spans(content))
    if not spans:
        return content
    parts: list[str] = []
    prev = 0
    for start, end, is_display in spans:
        delim_len = 2 if is_display else 1
        parts.append(content[prev:start - delim_len])
        prev = end + delim_len
        math = content[start:end]
        # Spans containing CJK characters are almost certainly mismatched $.
        # Strip the $ delimiters and emit the raw text.
        if _CJK_RE.search(math):
            parts.append(math)
            continue
        math = math.replace("\\\\", "\\\\\\\\")
        math = math.replace("*", "\\*")
        math = math.replace("_", "\\_")
        if is_display:
            parts.append(f"\\\\[{math}\\\\]")
        else:
            parts.append(f"\\\\({math}\\\\)")
    parts.append(content[prev:])
    return "".join(parts)


def resolve_raw_html_file(current_file: Path, filename: str) -> Path:
    """Locate a raw-HTML include.

    Tries, in order: next to *current_file*, its ``static/`` subdirectory,
    and ``static/`` one level above this module's directory.

    Raises:
        FileNotFoundError: if none of the candidates exists.
    """
    direct = (current_file.parent / filename).resolve()
    if direct.exists():
        return direct
    static_fallback = (current_file.parent / "static" / filename).resolve()
    if static_fallback.exists():
        return static_fallback
    repo_static = Path(__file__).resolve().parent.parent / "static" / filename
    if repo_static.exists():
        return repo_static
    raise FileNotFoundError(f"Raw HTML include '{filename}' from '{current_file}' does not exist")


def rewrite_frontpage_assets(html: str) -> str:
    """Point ``_images/`` URLs at ``static/image/``, drop head tags, minify styles."""
    rewritten = html.replace("./_images/", "static/image/")
    rewritten = rewritten.replace("_images/", "static/image/")
    rewritten = HEAD_TAG_RE.sub("", rewritten)
    rewritten = STYLE_BLOCK_RE.sub(_minify_style_block, rewritten)
    return rewritten


def _minify_style_block(match: re.Match[str]) -> str:
    """Collapse a style block onto one line, dropping blank lines.

    NOTE(review): the emitted wrapper and the single-space join are
    reconstructed; the reviewed copy returned an empty f-string.
    """
    content = match.group(1)
    parts = [line.strip() for line in content.splitlines() if line.strip()]
    return f"<style>{' '.join(parts)}</style>"


def render_frontpage_switch(label: str, href: str) -> str:
    """Return the HTML for the front-page switch link.

    NOTE(review): tag names and classes are reconstructed; only ``{label}``
    survived in the reviewed copy.
    """
    return (
        '<p class="openmlsys-frontpage-switch-row">'
        f'<a class="openmlsys-frontpage-switch" href="{href}">{label}</a>'
        "</p>"
    )


def wrap_frontpage_html(
    html: str,
    frontpage_switch_label: str | None = None,
    frontpage_switch_href: str | None = None,
) -> str:
    """Wrap front-page HTML in the layout container, inserting the switch link.

    The switch replaces ``FRONTPAGE_SWITCH_PLACEHOLDER`` when the page
    contains it and is otherwise prepended.
    """
    rendered_html = html.strip()
    if frontpage_switch_label and frontpage_switch_href:
        switch_html = render_frontpage_switch(frontpage_switch_label, frontpage_switch_href)
        # Guard against an empty placeholder: "" is "in" every string and
        # str.replace("", x) would insert x between every character.
        if FRONTPAGE_SWITCH_PLACEHOLDER and FRONTPAGE_SWITCH_PLACEHOLDER in rendered_html:
            rendered_html = rendered_html.replace(FRONTPAGE_SWITCH_PLACEHOLDER, switch_html)
        else:
            rendered_html = "\n".join([switch_html, rendered_html])
    # NOTE(review): container markup reconstructed.
    parts = [FRONTPAGE_LAYOUT_CSS, '<div class="openmlsys-frontpage">', rendered_html, "</div>"]
    return "\n".join(parts)


def inline_raw_html(
    block_lines: list[str],
    current_file: Path,
    frontpage_switch_label: str | None = None,
    frontpage_switch_href: str | None = None,
) -> str | None:
    """Expand an ``eval_rst`` block of the form ``.. raw:: html`` + ``:file:``.

    Returns the included HTML (wrapped when the file is ``frontpage.html``),
    or None when the block is not a raw-HTML file include.
    """
    stripped = [line.strip() for line in block_lines if line.strip()]
    if not stripped or stripped[0] != ".. raw:: html":
        return None
    filename: str | None = None
    for line in stripped[1:]:
        match = RAW_HTML_FILE_RE.match(line)
        if match:
            filename = match.group(1)
            break
    if filename is None:
        return None
    html_path = resolve_raw_html_file(current_file, filename)
    html = rewrite_frontpage_assets(html_path.read_text(encoding="utf-8")).strip()
    if Path(filename).name == "frontpage.html":
        return wrap_frontpage_html(
            html,
            frontpage_switch_label=frontpage_switch_label,
            frontpage_switch_href=frontpage_switch_href,
        )
    return html


def chapter_label(item: TocItem, target: Path, title_cache: dict[Path, str]) -> str:
    """Return the entry's explicit label, else the cached title of *target*."""
    return item.label or title_cache[target]


def render_toc_list(entries: list[TocItem], current_file: Path, title_cache: dict[Path, str]) -> list[str]:
    """Render toc entries as a Markdown list; chapters nest under the last part.

    Chapters whose target is missing or absent from *title_cache* are skipped.
    """
    rendered: list[str] = []
    current_indent = 0
    for entry in entries:
        if entry.kind == "part":
            rendered.append(f"- {entry.label}")
            current_indent = 1
            continue
        if entry.target is None:
            continue
        target = resolve_toc_target(current_file, entry.target)
        if target is None or target not in title_cache:
            continue
        label = chapter_label(entry, target, title_cache)
        rendered.append(f"{_LIST_INDENT * current_indent}- [{label}]({relative_link(current_file, target)})")
    return rendered


def rewrite_markdown(
    markdown: str,
    current_file: Path,
    title_cache: dict[Path, str],
    bib_db: dict[str, dict[str, str]] | None = None,
    bibliography_title: str = DEFAULT_BIBLIOGRAPHY_TITLE,
    frontpage_switch_label: str | None = None,
    frontpage_switch_href: str | None = None,
    ref_label_map: dict[str, str] | None = None,
    current_source_path: str | None = None,
    fig_number_map: dict[str, str] | None = None,
) -> str:
    """Rewrite one chapter for mdBook.

    ``toc`` fences become link lists and ``eval_rst`` raw-HTML includes are
    inlined; then equation labels, figure captions, inline directives and
    citations are processed, in that order.
    """
    output: list[str] = []

    def _blank_line_separator() -> None:
        if output and output[-1] != "":
            output.append("")

    lines = markdown.splitlines()
    index = 0
    while index < len(lines):
        stripped = lines[index].strip()
        if stripped not in (f"```{TOC_FENCE}", f"```{EVAL_RST_FENCE}"):
            output.append(lines[index])
            index += 1
            continue
        fence = stripped[3:]
        index += 1
        block_lines: list[str] = []
        while index < len(lines) and lines[index].strip() != "```":
            block_lines.append(lines[index])
            index += 1
        if fence == TOC_FENCE:
            entries = parse_toc_entries(block_lines)
            if entries:
                _blank_line_separator()
                rendered = render_toc_list(entries, current_file, title_cache)
                output.extend(rendered)
                if rendered:
                    _blank_line_separator()
        elif fence == EVAL_RST_FENCE:
            raw_html = inline_raw_html(
                block_lines,
                current_file,
                frontpage_switch_label=frontpage_switch_label,
                frontpage_switch_href=frontpage_switch_href,
            )
            if raw_html:
                _blank_line_separator()
                output.extend(raw_html.splitlines())
                _blank_line_separator()
        index += 1  # step over the closing fence
    while output and output[-1] == "":
        output.pop()
    raw = "\n".join(output) + "\n"
    result, label_map = process_equation_labels(raw)
    result = process_figure_captions(result, fig_number_map=fig_number_map)
    result = normalize_directives(
        result,
        label_map=label_map,
        ref_label_map=ref_label_map,
        current_source_path=current_source_path,
        fig_number_map=fig_number_map,
    )
    result = process_citations(result, bib_db or {}, bibliography_title=bibliography_title)
    return result


def build_title_cache(
    source_dir: Path,
    placeholder_prefix: str | None = None,
) -> dict[Path, str]:
    """Map every publishable ``*.md`` under *source_dir* to its title.

    Files under ``_build``, files named ``SUMMARY.md`` and placeholder files
    are left out.  Keys are resolved paths.
    """
    cache: dict[Path, str] = {}
    for markdown_file in sorted(source_dir.rglob("*.md")):
        if "_build" in markdown_file.parts or markdown_file.name == "SUMMARY.md":
            continue
        text = markdown_file.read_text(encoding="utf-8")
        if is_placeholder_markdown(text, placeholder_prefix):
            continue
        cache[markdown_file.resolve()] = extract_title(text, fallback=markdown_file.stem)
    return cache


def build_summary(source_dir: Path, title_cache: dict[Path, str]) -> str:
    """Build mdBook ``SUMMARY.md`` text by following toc blocks from ``index.md``.

    Chapters listed before the first part heading become unnumbered prefix
    chapters; later ones become list items with their own toc entries nested
    beneath them.  Each file is emitted at most once.

    Raises:
        KeyError: if ``index.md`` is not in *title_cache*.
        OSError: if ``index.md`` cannot be read.
    """
    source_root = source_dir.resolve()
    root_index = (source_dir / "index.md").resolve()
    root_markdown = root_index.read_text(encoding="utf-8")
    lines = ["# Summary", "", f"[{title_cache[root_index]}](index.md)"]
    seen: set[Path] = {root_index}

    def _claim(target: Path, label: str | None) -> tuple[str, str] | None:
        """Mark *target* as emitted; return ``(title, rel)`` or None to skip it."""
        if target in seen or target not in title_cache:
            return None
        seen.add(target)
        return label or title_cache[target], target.relative_to(source_root).as_posix()

    def append_entry(target: Path, indent: int, label: str | None = None) -> None:
        target = target.resolve()
        claimed = _claim(target, label)
        if claimed is None:
            return
        title, rel = claimed
        lines.append(f"{_LIST_INDENT * indent}- [{title}]({rel})")
        child_markdown = target.read_text(encoding="utf-8")
        for block in parse_toc_blocks(child_markdown):
            for entry in block:
                if entry.kind != "chapter" or entry.target is None:
                    continue
                child_target = resolve_toc_target(target, entry.target)
                if child_target is not None:
                    append_entry(child_target, indent + 1, entry.label or None)

    def append_prefix_chapter(target: Path, label: str | None = None) -> None:
        claimed = _claim(target.resolve(), label)
        if claimed is None:
            return
        title, rel = claimed
        lines.append(f"[{title}]({rel})")

    numbered_started = False
    for block in parse_toc_blocks(root_markdown):
        for entry in block:
            if entry.kind == "part":
                if lines and lines[-1] != "":
                    lines.append("")
                lines.append(f"# {entry.label}")
                lines.append("")
                numbered_started = True
                continue
            if entry.target is None:
                continue
            target = resolve_toc_target(root_index, entry.target)
            if target is None:
                continue
            if numbered_started:
                append_entry(target, 0, entry.label or None)
            else:
                append_prefix_chapter(target, entry.label or None)
    return "\n".join(lines) + "\n"


def write_summary(
    source_dir: Path,
    summary_path: Path | None = None,
    placeholder_prefix: str | None = None,
) -> Path:
    """Generate SUMMARY.md for *source_dir* and return the path written.

    Defaults to ``<source_dir>/SUMMARY.md`` when *summary_path* is not given.
    """
    source_dir = source_dir.resolve()
    summary_path = summary_path.resolve() if summary_path else (source_dir / "SUMMARY.md")
    title_cache = build_title_cache(source_dir, placeholder_prefix=placeholder_prefix)
    summary_path.write_text(build_summary(source_dir, title_cache), encoding="utf-8")
    return summary_path


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the summary generator."""
    parser = argparse.ArgumentParser(description="Generate mdBook SUMMARY.md for a chapter directory.")
    parser.add_argument("--source", type=Path, required=True, help="Source chapter directory")
    parser.add_argument("--summary-output", type=Path, required=True, help="Where to write the generated SUMMARY.md")
    parser.add_argument(
        "--placeholder-prefix",
        default=None,
        help="If set, files whose entire contents start with this prefix are skipped from mdBook output.",
    )
    return parser.parse_args()


def main() -> int:
    """CLI entry point: write the summary and report where it went."""
    args = parse_args()
    summary_path = write_summary(
        args.source,
        summary_path=args.summary_output,
        placeholder_prefix=args.placeholder_prefix,
    )
    print(f"Wrote mdBook summary to {summary_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())