from __future__ import annotations import argparse import os import re from dataclasses import dataclass from pathlib import Path, PurePosixPath TOC_FENCE = "toc" EVAL_RST_FENCE = "eval_rst" WIDTH_LINE_RE = re.compile(r"^:width:`[^`]+`\s*$", re.MULTILINE) LABEL_RE = re.compile(r":label:`([^`]+)`") NUMREF_RE = re.compile(r":numref:`([^`]+)`") IMAGE_LINE_RE = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$") LABEL_LINE_RE = re.compile(r"^:label:`([^`]+)`\s*$") EQREF_RE = re.compile(r":eqref:`([^`]+)`") EQLABEL_LINE_RE = re.compile(r"^:eqlabel:`([^`]+)`\s*$") CITE_RE = re.compile(r":cite:`([^`]+)`") BIB_ENTRY_RE = re.compile(r"@(\w+)\{([^,]+),") LATEX_ESCAPE_RE = re.compile(r"\\([_%#&])") RAW_HTML_FILE_RE = re.compile(r"^\s*:file:\s*([^\s]+)\s*$") TOC_LINK_RE = re.compile(r"^\[([^\]]+)\]\(([^)]+)\)\s*$") TOC_PART_RE = re.compile(r"^#+\s+(.+?)\s*$") HEAD_TAG_RE = re.compile(r"?head>", re.IGNORECASE) STYLE_BLOCK_RE = re.compile(r"", re.IGNORECASE | re.DOTALL) DEFAULT_BIBLIOGRAPHY_TITLE = "References" FRONTPAGE_SWITCH_PLACEHOLDER = "" FRONTPAGE_LAYOUT_CSS = """ """.strip() @dataclass(frozen=True) class TocItem: kind: str label: str target: str | None = None def is_placeholder_markdown(markdown: str, placeholder_prefix: str | None = None) -> bool: if not placeholder_prefix: return False stripped = markdown.strip() return stripped.startswith(placeholder_prefix) and stripped.endswith("]") def extract_title(markdown: str, fallback: str = "Untitled") -> str: lines = markdown.splitlines() for index, line in enumerate(lines): stripped = line.strip() if not stripped: continue if stripped.startswith("#"): heading = stripped.lstrip("#").strip() if heading: return heading next_index = index + 1 if next_index < len(lines): underline = lines[next_index].strip() if underline and set(underline) <= {"=", "-"}: return stripped return fallback def parse_toc_entries(block_lines: list[str]) -> list[TocItem]: entries: list[TocItem] = [] for line in block_lines: stripped = line.strip() if not stripped or stripped.startswith(":"): continue part_match = TOC_PART_RE.match(stripped) if part_match: entries.append(TocItem(kind="part", label=part_match.group(1).strip())) continue link_match = TOC_LINK_RE.match(stripped) if link_match: entries.append( TocItem( kind="chapter", label=link_match.group(1).strip(), target=link_match.group(2).strip(), ) ) continue entries.append(TocItem(kind="chapter", label="", target=stripped)) return entries def parse_toc_blocks(markdown: str) -> list[list[TocItem]]: blocks: list[list[TocItem]] = [] lines = markdown.splitlines() index = 0 while index < len(lines): if lines[index].strip() == f"```{TOC_FENCE}": index += 1 block_lines: list[str] = [] while index < len(lines) and lines[index].strip() != "```": block_lines.append(lines[index]) index += 1 entries = parse_toc_entries(block_lines) blocks.append(entries) index += 1 return blocks def resolve_toc_target(current_file: Path, entry: str) -> Path | None: target_name = entry if entry.endswith(".md") else f"{entry}.md" target = (current_file.parent / target_name).resolve() if not target.exists(): return None return target def relative_link(from_file: Path, target_file: Path) -> str: return Path(os.path.relpath(target_file, start=from_file.parent)).as_posix() def _strip_latex_escapes_outside_math(text: str) -> str: """Remove LaTeX text-mode escapes (``\\_``, ``\\#``, etc.) outside math spans, fenced code blocks, and inline code. Operates on the full text (not per-line) to correctly handle multi-line display math ``$$...$$`` blocks. """ # 1. Find all protected regions where escapes must NOT be stripped. protected: list[tuple[int, int]] = [] # (start, end) n = len(text) i = 0 in_fence: str | None = None fence_start = 0 while i < n: # Fenced code blocks (``` or ~~~) at start of line if (i == 0 or text[i - 1] == "\n") and text[i] in ("`", "~"): m = re.match(r"`{3,}|~{3,}", text[i:]) if m: if in_fence is None: in_fence = m.group()[0] fence_start = i elif m.group()[0] == in_fence: eol = text.find("\n", m.end() + i) end = eol + 1 if eol != -1 else n protected.append((fence_start, end)) in_fence = None i = end continue eol = text.find("\n", i) i = eol + 1 if eol != -1 else n continue if in_fence is not None: i += 1 continue # Inline code `...` if text[i] == "`": close = text.find("`", i + 1) if close != -1: protected.append((i, close + 1)) i = close + 1 continue # Display math $$...$$ if text[i:i + 2] == "$$": close = text.find("$$", i + 2) if close != -1: protected.append((i, close + 2)) i = close + 2 continue # Inline math $...$ if text[i] == "$": j = i + 1 while j < n and text[j] != "$" and text[j] != "\n": j += 1 if j < n and text[j] == "$" and j > i + 1: protected.append((i, j + 1)) i = j + 1 continue i += 1 # Unclosed fence → protect everything from fence_start to end if in_fence is not None: protected.append((fence_start, n)) # 2. Apply substitution only to unprotected gaps. parts: list[str] = [] prev = 0 for start, end in protected: if start > prev: parts.append(LATEX_ESCAPE_RE.sub(r"\1", text[prev:start])) parts.append(text[start:end]) prev = end if prev < n: parts.append(LATEX_ESCAPE_RE.sub(r"\1", text[prev:])) return "".join(parts) def process_equation_labels(markdown: str) -> tuple[str, dict[str, int]]: """Convert :eqlabel: directives to MathJax \\tag + \\label in preceding equations. Args: markdown: The markdown content to process. Returns: A tuple of (processed_markdown, label_map) where label_map maps label names to their equation numbers. """ lines = markdown.split("\n") result: list[str] = [] eq_counter = 0 label_map: dict[str, int] = {} for line in lines: match = EQLABEL_LINE_RE.match(line.strip()) if not match: result.append(line) continue label_name = match.group(1) eq_counter += 1 label_map[label_name] = eq_counter tag = f"\\tag{{{eq_counter}}}\\label{{{label_name}}}" # Search backward for the closing $$ of the preceding equation inserted = False for j in range(len(result) - 1, -1, -1): stripped = result[j].rstrip() if not stripped: continue # skip blank lines if stripped == "$$": # Multi-line equation: $$ on its own line result.insert(j, tag) inserted = True break if stripped.endswith("$$"): # Single-line or end-of-content $$ result[j] = stripped[:-2] + tag + "$$" inserted = True break break # non-blank, non-$$ line: no equation found if not inserted: # Fallback: keep original line if no equation found result.append(line) return "\n".join(result), label_map def collect_labels(markdown: str) -> list[str]: """Extract all label names from :label: directives.""" return LABEL_RE.findall(markdown) def collect_figure_labels(markdown: str) -> list[str]: """Return label names for figures (image lines followed by :label:).""" labels: list[str] = [] lines = markdown.splitlines() for i, line in enumerate(lines): if not IMAGE_LINE_RE.match(line.strip()): continue j = i + 1 while j < len(lines): s = lines[j].strip() if not s or WIDTH_LINE_RE.match(s): j += 1 continue m = LABEL_LINE_RE.match(s) if m: labels.append(m.group(1)) break return labels def process_figure_captions( markdown: str, fig_number_map: dict[str, str] | None = None, ) -> str: """Convert image+label blocks into figures with anchors and captions.""" lines = markdown.splitlines() result: list[str] = [] i = 0 while i < len(lines): img_match = IMAGE_LINE_RE.match(lines[i].strip()) if img_match: caption = img_match.group(1) img_line = lines[i] # Look ahead for :width: and :label: j = i + 1 label = None while j < len(lines): s = lines[j].strip() if not s or WIDTH_LINE_RE.match(s): j += 1 continue m = LABEL_LINE_RE.match(s) if m: label = m.group(1) j += 1 break if label: fig_num = (fig_number_map or {}).get(label) result.append(f'') result.append("") result.append(img_line) if fig_num and caption: result.append("") result.append(f'
图{fig_num} {caption}
') elif fig_num: result.append("") result.append(f'图{fig_num}
') elif caption: result.append("") result.append(f'{caption}
') i = j continue result.append(lines[i]) i += 1 return "\n".join(result) def _relative_chapter_path(from_path: str, to_path: str) -> str: """Compute relative path between two mdbook source_paths.""" if from_path == to_path: return "" from_dir = str(PurePosixPath(from_path).parent) return PurePosixPath(os.path.relpath(to_path, start=from_dir)).as_posix() def normalize_directives( markdown: str, label_map: dict[str, int] | None = None, ref_label_map: dict[str, str] | None = None, current_source_path: str | None = None, fig_number_map: dict[str, str] | None = None, ) -> str: normalized = WIDTH_LINE_RE.sub("", markdown) normalized = LABEL_RE.sub(lambda m: f'', normalized) def _numref_replace(match: re.Match[str]) -> str: name = match.group(1) if ref_label_map and current_source_path and name in ref_label_map: target_path = ref_label_map[name] rel = _relative_chapter_path(current_source_path, target_path) display = f"图{fig_number_map[name]}" if fig_number_map and name in fig_number_map else name if rel: return f"[{display}]({rel}#{name})" return f"[{display}](#{name})" return f"`{name}`" normalized = NUMREF_RE.sub(_numref_replace, normalized) if label_map: normalized = EQREF_RE.sub( lambda m: f"({label_map[m.group(1)]})" if m.group(1) in label_map else f"$\\eqref{{{m.group(1)}}}$", normalized, ) else: normalized = EQREF_RE.sub(lambda match: f"$\\eqref{{{match.group(1)}}}$", normalized) normalized = _strip_latex_escapes_outside_math(normalized) lines = [line.rstrip() for line in normalized.splitlines()] collapsed: list[str] = [] previous_blank = False for line in lines: is_blank = line == "" if is_blank and previous_blank: continue collapsed.append(line) previous_blank = is_blank while collapsed and collapsed[-1] == "": collapsed.pop() return "\n".join(collapsed) + "\n" def clean_bibtex(value: str) -> str: value = re.sub(r"\{\\[`'^\"~=.](\w)\}", r"\1", value) value = re.sub(r"\\[`'^\"~=.](\w)", r"\1", value) value = value.replace("{", "").replace("}", "") return value.strip() def _parse_bib_fields(body: str) -> dict[str, str]: fields: dict[str, str] = {} i = 0 while i < len(body): while i < len(body) and body[i] in " \t\n\r,": i += 1 if i >= len(body): break start = i while i < len(body) and body[i] not in "= \t\n\r": i += 1 name = body[start:i].strip().lower() while i < len(body) and body[i] != "=": i += 1 if i >= len(body): break i += 1 while i < len(body) and body[i] in " \t\n\r": i += 1 if i >= len(body): break if body[i] == "{": depth = 1 i += 1 vstart = i while i < len(body) and depth > 0: if body[i] == "{": depth += 1 elif body[i] == "}": depth -= 1 i += 1 value = body[vstart : i - 1] elif body[i] == '"': i += 1 vstart = i while i < len(body) and body[i] != '"': i += 1 value = body[vstart:i] i += 1 else: vstart = i while i < len(body) and body[i] not in ", \t\n\r}": i += 1 value = body[vstart:i] if name: fields[name] = value.strip() return fields def parse_bib(bib_path: Path) -> dict[str, dict[str, str]]: text = bib_path.read_text(encoding="utf-8") entries: dict[str, dict[str, str]] = {} for match in BIB_ENTRY_RE.finditer(text): key = match.group(2).strip() start = match.end() depth = 1 pos = start while pos < len(text) and depth > 0: if text[pos] == "{": depth += 1 elif text[pos] == "}": depth -= 1 pos += 1 fields = _parse_bib_fields(text[start : pos - 1]) fields["_type"] = match.group(1).lower() entries[key] = fields return entries def _render_bibliography( cited_keys: list[str], bib_db: dict[str, dict[str, str]], bibliography_title: str, ) -> list[str]: lines: list[str] = ["---", "", f"## {bibliography_title}", "", "' f'{label}' "
" ) def wrap_frontpage_html( html: str, frontpage_switch_label: str | None = None, frontpage_switch_href: str | None = None, ) -> str: rendered_html = html.strip() if frontpage_switch_label and frontpage_switch_href: switch_html = render_frontpage_switch(frontpage_switch_label, frontpage_switch_href) if FRONTPAGE_SWITCH_PLACEHOLDER in rendered_html: rendered_html = rendered_html.replace(FRONTPAGE_SWITCH_PLACEHOLDER, switch_html) else: rendered_html = "\n".join([switch_html, rendered_html]) parts = [FRONTPAGE_LAYOUT_CSS, '