mirror of
https://github.com/openmlsys/openmlsys-zh.git
synced 2026-03-21 04:27:33 +08:00
* feat: remove bilingual button on the front page * misc: clean repo * test: fix test suite for v1/v2 restructure and removed language switch
1022 lines · 32 KiB · Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path, PurePosixPath
|
|
|
|
|
|
# Fence language tags recognised in the source markdown.
TOC_FENCE = "toc"
EVAL_RST_FENCE = "eval_rst"

# d2l-book style directives embedded in the markdown sources.
WIDTH_LINE_RE = re.compile(r"^:width:`[^`]+`\s*$", re.MULTILINE)  # figure width hint (dropped on output)
LABEL_RE = re.compile(r":label:`([^`]+)`")                        # anchor definition
NUMREF_RE = re.compile(r":numref:`([^`]+)`")                      # numbered cross-reference
IMAGE_LINE_RE = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$")      # standalone image line
LABEL_LINE_RE = re.compile(r"^:label:`([^`]+)`\s*$")              # :label: on its own line
EQREF_RE = re.compile(r":eqref:`([^`]+)`")                        # equation reference
EQLABEL_LINE_RE = re.compile(r"^:eqlabel:`([^`]+)`\s*$")          # equation label on its own line
CITE_RE = re.compile(r":cite:`([^`]+)`")                          # bibliography citation
BIB_ENTRY_RE = re.compile(r"@(\w+)\{([^,]+),")                    # BibTeX entry header: @type{key,
LATEX_ESCAPE_RE = re.compile(r"\\([_%#&])")                       # LaTeX text-mode escapes
RAW_HTML_FILE_RE = re.compile(r"^\s*:file:\s*([^\s]+)\s*$")       # :file: option of .. raw:: html
TOC_LINK_RE = re.compile(r"^\[([^\]]+)\]\(([^)]+)\)\s*$")         # explicit [label](target) toc entry
TOC_PART_RE = re.compile(r"^#+\s+(.+?)\s*$")                      # part heading inside a toc block

# Tags stripped/minified when inlining exported front-page HTML.
HEAD_TAG_RE = re.compile(r"</?head>", re.IGNORECASE)
STYLE_BLOCK_RE = re.compile(r"<style>(.*?)</style>", re.IGNORECASE | re.DOTALL)

# Heading used for the reference list appended by process_citations().
DEFAULT_BIBLIOGRAPHY_TITLE = "References"

# Marker in frontpage.html where the (now removed) language switch was injected.
FRONTPAGE_SWITCH_PLACEHOLDER = "<!-- OPENMLSYS_LANGUAGE_SWITCH -->"
|
|
FRONTPAGE_LAYOUT_CSS = """
|
|
<style>
|
|
.openmlsys-frontpage {
|
|
width: 100%;
|
|
margin: 0 auto 3rem;
|
|
margin-inline: auto;
|
|
}
|
|
.openmlsys-frontpage-switch-row {
|
|
margin: 12px 0 0;
|
|
display: flex;
|
|
justify-content: center;
|
|
}
|
|
.openmlsys-frontpage-switch {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
justify-content: center;
|
|
min-width: 82px;
|
|
height: 28px;
|
|
padding: 0 14px;
|
|
border-radius: 6px;
|
|
border: 1px solid rgba(31, 35, 40, 0.15);
|
|
background: #f6f8fa;
|
|
color: #24292f;
|
|
font-size: 13px;
|
|
font-weight: 600;
|
|
text-decoration: none;
|
|
box-shadow: 0 1px 0 rgba(31, 35, 40, 0.04);
|
|
}
|
|
.openmlsys-frontpage-switch:hover {
|
|
background: #f3f4f6;
|
|
border-color: rgba(31, 35, 40, 0.2);
|
|
}
|
|
.openmlsys-frontpage .mdl-grid {
|
|
display: flex;
|
|
flex-wrap: wrap;
|
|
gap: 24px;
|
|
width: 100%;
|
|
box-sizing: border-box;
|
|
}
|
|
.openmlsys-frontpage .mdl-cell {
|
|
box-sizing: border-box;
|
|
flex: 1 1 220px;
|
|
min-width: 0;
|
|
}
|
|
.openmlsys-frontpage .mdl-cell--1-col {
|
|
flex: 0 0 48px;
|
|
}
|
|
.openmlsys-frontpage .mdl-cell--3-col {
|
|
flex: 0 1 calc(16.666% - 20px);
|
|
max-width: calc(16.666% - 20px);
|
|
}
|
|
.openmlsys-frontpage .authors.mdl-grid {
|
|
justify-content: center;
|
|
}
|
|
.openmlsys-frontpage .mdl-cell--5-col {
|
|
flex: 1 1 calc(41.666% - 24px);
|
|
max-width: calc(41.666% - 18px);
|
|
}
|
|
.openmlsys-frontpage .mdl-cell--12-col {
|
|
flex: 1 1 100%;
|
|
max-width: 100%;
|
|
}
|
|
.openmlsys-frontpage .mdl-cell--middle {
|
|
align-self: center;
|
|
}
|
|
.openmlsys-frontpage .mdl-color-text--primary {
|
|
color: var(--links, #0b6bcb);
|
|
}
|
|
.openmlsys-frontpage img {
|
|
max-width: 100%;
|
|
height: auto;
|
|
background: transparent !important;
|
|
padding: 0 !important;
|
|
}
|
|
.openmlsys-frontpage + ul,
|
|
.openmlsys-frontpage + ul ul {
|
|
max-width: 960px;
|
|
margin-inline: auto;
|
|
}
|
|
.content main {
|
|
max-width: min(100%, max(65%, var(--content-max-width)));
|
|
}
|
|
@media (max-width: 1000px) {
|
|
.openmlsys-frontpage .mdl-cell,
|
|
.openmlsys-frontpage .mdl-cell--1-col,
|
|
.openmlsys-frontpage .mdl-cell--3-col,
|
|
.openmlsys-frontpage .mdl-cell--5-col {
|
|
flex: 1 1 100%;
|
|
max-width: 100%;
|
|
}
|
|
}
|
|
</style>
|
|
""".strip()
|
|
|
|
|
|
@dataclass(frozen=True)
class TocItem:
    """One entry parsed from a ```toc fenced block."""

    # "part" for a heading row, "chapter" for a document entry.
    kind: str
    # Display label; empty string means "use the target file's own title".
    label: str
    # Chapter file target (with or without .md); None for part headings.
    target: str | None = None
|
|
|
|
|
|
def is_placeholder_markdown(markdown: str, placeholder_prefix: str | None = None) -> bool:
    """Tell whether *markdown* is a placeholder stub rather than real content.

    A placeholder consists (after trimming whitespace) of text that begins
    with *placeholder_prefix* and ends with ``]``.  With no prefix
    configured, nothing is treated as a placeholder.
    """
    if not placeholder_prefix:
        return False

    body = markdown.strip()
    if not body.startswith(placeholder_prefix):
        return False
    return body.endswith("]")
|
|
|
|
|
|
def _strip_pandoc_heading_id(heading: str) -> str:
    """Drop a trailing pandoc attribute block (`` {#anchor}``) from a heading."""
    trailer = re.search(r"\s+\{#[^}]+\}\s*$", heading)
    if trailer is None:
        return heading
    return heading[: trailer.start()]
|
|
|
|
|
|
def extract_title(markdown: str, fallback: str = "Untitled") -> str:
    """Return the first heading of *markdown*, or *fallback* when none exists.

    Recognises both ATX (``# Title``) and setext (``Title`` underlined with
    ``=``/``-``) headings; any trailing pandoc ``{#id}`` attribute is removed.
    """

    def _clean(heading: str) -> str:
        # Inline of _strip_pandoc_heading_id: drop a trailing {#anchor}.
        return re.sub(r"\s+\{#[^}]+\}\s*$", "", heading)

    lines = markdown.splitlines()
    for pos, raw in enumerate(lines):
        text = raw.strip()
        if not text:
            continue
        if text.startswith("#"):
            title = text.lstrip("#").strip()
            if title:
                return _clean(title)

        # Setext heading: non-blank line underlined by = or - characters.
        after = pos + 1
        if after < len(lines):
            underline = lines[after].strip()
            if underline and set(underline) <= {"=", "-"}:
                return _clean(text)

    return fallback
|
|
|
|
|
|
def parse_toc_entries(block_lines: list[str]) -> list[TocItem]:
    """Turn the body of a ```toc fence into a flat list of TocItem rows.

    ``#`` headings become ``part`` items, ``[label](target)`` lines become
    labelled chapters, and any other non-option line is a bare chapter
    target.  Blank lines and ``:option:`` lines are ignored.
    """
    items: list[TocItem] = []
    for raw in block_lines:
        text = raw.strip()
        if not text or text.startswith(":"):
            continue
        part = TOC_PART_RE.match(text)
        if part is not None:
            items.append(TocItem(kind="part", label=part.group(1).strip()))
        elif (link := TOC_LINK_RE.match(text)) is not None:
            items.append(
                TocItem(
                    kind="chapter",
                    label=link.group(1).strip(),
                    target=link.group(2).strip(),
                )
            )
        else:
            items.append(TocItem(kind="chapter", label="", target=text))
    return items
|
|
|
|
|
|
def parse_toc_blocks(markdown: str) -> list[list[TocItem]]:
    """Collect the parsed entries of every ```toc fenced block in *markdown*."""
    lines = markdown.splitlines()
    total = len(lines)
    blocks: list[list[TocItem]] = []
    pos = 0

    while pos < total:
        if lines[pos].strip() != f"```{TOC_FENCE}":
            pos += 1
            continue
        pos += 1
        body: list[str] = []
        while pos < total and lines[pos].strip() != "```":
            body.append(lines[pos])
            pos += 1
        blocks.append(parse_toc_entries(body))
        pos += 1  # step past the closing fence (or EOF)

    return blocks
|
|
|
|
|
|
def resolve_toc_target(current_file: Path, entry: str) -> Path | None:
    """Resolve a toc entry (with or without ``.md``) next to *current_file*.

    Returns the resolved path, or ``None`` when no such file exists.
    """
    name = entry if entry.endswith(".md") else entry + ".md"
    candidate = (current_file.parent / name).resolve()
    return candidate if candidate.exists() else None
|
|
|
|
|
|
def relative_link(from_file: Path, target_file: Path) -> str:
    """POSIX-style relative path from *from_file*'s directory to *target_file*."""
    rel = os.path.relpath(target_file, start=from_file.parent)
    return Path(rel).as_posix()
|
|
|
|
|
|
def _strip_latex_escapes_outside_math(text: str) -> str:
    """Remove LaTeX text-mode escapes (``\\_``, ``\\#``, etc.) outside math
    spans, fenced code blocks, and inline code.

    Operates on the full text (not per-line) to correctly handle multi-line
    display math ``$$...$$`` blocks.
    """
    # 1. Find all protected regions where escapes must NOT be stripped.
    protected: list[tuple[int, int]] = []  # (start, end)
    n = len(text)
    i = 0
    in_fence: str | None = None  # fence char ('`' or '~') while inside a block
    fence_start = 0

    while i < n:
        # Fenced code blocks (``` or ~~~) at start of line
        if (i == 0 or text[i - 1] == "\n") and text[i] in ("`", "~"):
            m = re.match(r"`{3,}|~{3,}", text[i:])
            if m:
                if in_fence is None:
                    # Opening fence: remember where protection must begin.
                    in_fence = m.group()[0]
                    fence_start = i
                elif m.group()[0] == in_fence:
                    # Matching closing fence: protect through the end of its line.
                    eol = text.find("\n", m.end() + i)
                    end = eol + 1 if eol != -1 else n
                    protected.append((fence_start, end))
                    in_fence = None
                    i = end
                    continue
                # Opening (or mismatched-char) fence: skip the rest of the line.
                eol = text.find("\n", i)
                i = eol + 1 if eol != -1 else n
                continue

        if in_fence is not None:
            # Inside a fence: content becomes protected when the fence closes.
            i += 1
            continue

        # Inline code `...`
        if text[i] == "`":
            close = text.find("`", i + 1)
            if close != -1:
                protected.append((i, close + 1))
                i = close + 1
                continue

        # Display math $$...$$
        if text[i:i + 2] == "$$":
            close = text.find("$$", i + 2)
            if close != -1:
                protected.append((i, close + 2))
                i = close + 2
                continue

        # Inline math $...$ — must close before the next newline, non-empty.
        if text[i] == "$":
            j = i + 1
            while j < n and text[j] != "$" and text[j] != "\n":
                j += 1
            if j < n and text[j] == "$" and j > i + 1:
                protected.append((i, j + 1))
                i = j + 1
                continue

        i += 1

    # Unclosed fence → protect everything from fence_start to end
    if in_fence is not None:
        protected.append((fence_start, n))

    # 2. Apply substitution only to unprotected gaps.
    parts: list[str] = []
    prev = 0
    for start, end in protected:
        if start > prev:
            parts.append(LATEX_ESCAPE_RE.sub(r"\1", text[prev:start]))
        parts.append(text[start:end])
        prev = end
    if prev < n:
        parts.append(LATEX_ESCAPE_RE.sub(r"\1", text[prev:]))

    return "".join(parts)
|
|
|
|
|
|
def process_equation_labels(markdown: str) -> tuple[str, dict[str, int]]:
    """Number ``:eqlabel:`` directives and fold them into the preceding math.

    Each ``:eqlabel:`name``` line is consumed and a ``\\tag{n}\\label{name}``
    is spliced into the ``$$`` equation immediately above it; equations are
    numbered sequentially from 1.

    Returns:
        ``(processed_markdown, label_map)`` where ``label_map`` maps label
        names to their assigned equation numbers.
    """
    eqlabel_line = re.compile(r"^:eqlabel:`([^`]+)`\s*$")  # inline of EQLABEL_LINE_RE
    out: list[str] = []
    numbers: dict[str, int] = {}
    counter = 0

    for line in markdown.split("\n"):
        hit = eqlabel_line.match(line.strip())
        if hit is None:
            out.append(line)
            continue

        counter += 1
        name = hit.group(1)
        numbers[name] = counter
        tag = f"\\tag{{{counter}}}\\label{{{name}}}"

        # Walk backwards over blank lines to the closing $$ of the equation.
        placed = False
        for back in range(len(out) - 1, -1, -1):
            candidate = out[back].rstrip()
            if candidate == "":
                continue
            if candidate == "$$":
                # Multi-line equation: $$ on its own line — tag goes before it.
                out.insert(back, tag)
                placed = True
            elif candidate.endswith("$$"):
                # Single-line equation: splice the tag before the closing $$.
                out[back] = candidate[:-2] + tag + "$$"
                placed = True
            break  # first non-blank line decides either way

        if not placed:
            # No equation found above: keep the directive line verbatim.
            out.append(line)

    return "\n".join(out), numbers
|
|
|
|
|
|
def collect_labels(markdown: str) -> list[str]:
    """Return every name defined by a ``:label:`...``` directive, in order."""
    return re.findall(r":label:`([^`]+)`", markdown)
|
|
|
|
|
|
def collect_figure_labels(markdown: str) -> list[str]:
    """Return label names for figures: standalone image lines whose next
    significant line (skipping blanks and ``:width:`` hints) is ``:label:``."""
    image_line = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$")
    width_line = re.compile(r"^:width:`[^`]+`\s*$", re.MULTILINE)
    label_line = re.compile(r"^:label:`([^`]+)`\s*$")

    lines = markdown.splitlines()
    found: list[str] = []
    for pos, raw in enumerate(lines):
        if image_line.match(raw.strip()) is None:
            continue
        for follower in lines[pos + 1:]:
            s = follower.strip()
            if not s or width_line.match(s):
                continue  # skip blank lines and width hints
            tag = label_line.match(s)
            if tag:
                found.append(tag.group(1))
            break  # first significant line decides
    return found
|
|
|
|
|
|
def process_figure_captions(
    markdown: str,
    fig_number_map: dict[str, str] | None = None,
) -> str:
    """Rewrite image + ``:label:`` blocks as anchored figures with captions.

    A standalone image line followed (past blanks / ``:width:`` hints) by a
    ``:label:`` line becomes an ``<a id>`` anchor, the image itself, and a
    centred caption paragraph.  *fig_number_map* supplies the display number
    per label; images without a label are passed through untouched.
    """
    image_line = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$")
    width_line = re.compile(r"^:width:`[^`]+`\s*$", re.MULTILINE)
    label_line = re.compile(r"^:label:`([^`]+)`\s*$")
    numbers = fig_number_map or {}

    lines = markdown.splitlines()
    out: list[str] = []
    pos = 0
    while pos < len(lines):
        img = image_line.match(lines[pos].strip())
        if img is not None:
            caption = img.group(1)
            # Scan ahead for a :label: line, skipping blanks and width hints.
            scan = pos + 1
            label = None
            while scan < len(lines):
                s = lines[scan].strip()
                if not s or width_line.match(s):
                    scan += 1
                    continue
                hit = label_line.match(s)
                if hit:
                    label = hit.group(1)
                    scan += 1
                break

            if label:
                num = numbers.get(label)
                out.append(f'<a id="{label}"></a>')
                out.append("")
                out.append(lines[pos])
                if num and caption:
                    out.append("")
                    out.append(f'<p align="center">图{num} {caption}</p>')
                elif num:
                    out.append("")
                    out.append(f'<p align="center">图{num}</p>')
                elif caption:
                    out.append("")
                    out.append(f'<p align="center">{caption}</p>')
                pos = scan  # lookahead lines are consumed
                continue

        out.append(lines[pos])
        pos += 1
    return "\n".join(out)
|
|
|
|
|
|
def _relative_chapter_path(from_path: str, to_path: str) -> str:
    """Relative POSIX path between two mdbook source_paths (`""` if equal)."""
    if from_path == to_path:
        return ""
    base = str(PurePosixPath(from_path).parent)
    return PurePosixPath(os.path.relpath(to_path, start=base)).as_posix()
|
|
|
|
|
|
def normalize_directives(
    markdown: str,
    label_map: dict[str, int] | None = None,
    ref_label_map: dict[str, str] | None = None,
    current_source_path: str | None = None,
    fig_number_map: dict[str, str] | None = None,
) -> str:
    """Rewrite the remaining d2l-book directives into plain markdown/HTML.

    - drops ``:width:`` hint lines,
    - turns ``:label:`` into ``<a id>`` anchors,
    - resolves ``:numref:`` to a relative link (displayed as 图N when
      *fig_number_map* knows the label) or to inline code when unresolved,
    - resolves ``:eqref:`` via *label_map*, else emits MathJax ``\\eqref``,
    - strips LaTeX text-mode escapes outside math/code,
    - collapses blank-line runs and trailing blanks.
    """
    normalized = WIDTH_LINE_RE.sub("", markdown)
    normalized = LABEL_RE.sub(lambda m: f'<a id="{m.group(1)}"></a>', normalized)

    def _numref_replace(match: re.Match[str]) -> str:
        # Figure cross-reference → markdown link to the anchor, possibly in
        # another chapter (relative path computed from source_paths).
        name = match.group(1)
        if ref_label_map and current_source_path and name in ref_label_map:
            target_path = ref_label_map[name]
            rel = _relative_chapter_path(current_source_path, target_path)
            display = f"图{fig_number_map[name]}" if fig_number_map and name in fig_number_map else name
            if rel:
                return f"[{display}]({rel}#{name})"
            return f"[{display}](#{name})"
        # Unknown label: keep it visible as inline code rather than dropping it.
        return f"`{name}`"

    normalized = NUMREF_RE.sub(_numref_replace, normalized)
    if label_map:
        normalized = EQREF_RE.sub(
            lambda m: f"({label_map[m.group(1)]})" if m.group(1) in label_map else f"$\\eqref{{{m.group(1)}}}$",
            normalized,
        )
    else:
        normalized = EQREF_RE.sub(lambda match: f"$\\eqref{{{match.group(1)}}}$", normalized)

    normalized = _strip_latex_escapes_outside_math(normalized)

    # Collapse consecutive blank lines into one.
    lines = [line.rstrip() for line in normalized.splitlines()]
    collapsed: list[str] = []
    previous_blank = False
    for line in lines:
        is_blank = line == ""
        if is_blank and previous_blank:
            continue
        collapsed.append(line)
        previous_blank = is_blank

    # Trim trailing blanks; output always ends with exactly one newline.
    while collapsed and collapsed[-1] == "":
        collapsed.pop()

    return "\n".join(collapsed) + "\n"
|
|
|
|
|
|
def clean_bibtex(value: str) -> str:
    """Flatten common BibTeX markup: accent commands and grouping braces."""
    # Accents written as {\'e} or \'e collapse to the bare letter.
    value = re.sub(r"\{\\[`'^\"~=.](\w)\}", r"\1", value)
    value = re.sub(r"\\[`'^\"~=.](\w)", r"\1", value)
    # Remaining braces only protect capitalisation; drop them.
    return value.replace("{", "").replace("}", "").strip()
|
|
|
|
|
|
def _parse_bib_fields(body: str) -> dict[str, str]:
    """Parse the ``name = value`` fields of one BibTeX entry body.

    Supports brace-delimited values (with nesting), double-quoted values,
    and bare words.  Field names are lower-cased; malformed trailing input
    is silently dropped.
    """
    fields: dict[str, str] = {}
    i = 0
    while i < len(body):
        # Skip whitespace/comma separators before the next field name.
        while i < len(body) and body[i] in " \t\n\r,":
            i += 1
        if i >= len(body):
            break
        start = i
        # Field name runs up to '=' or whitespace.
        while i < len(body) and body[i] not in "= \t\n\r":
            i += 1
        name = body[start:i].strip().lower()
        # Advance to the '=' introducing the value.
        while i < len(body) and body[i] != "=":
            i += 1
        if i >= len(body):
            break
        i += 1
        # Skip whitespace between '=' and the value.
        while i < len(body) and body[i] in " \t\n\r":
            i += 1
        if i >= len(body):
            break
        if body[i] == "{":
            # Brace-delimited value; track nesting depth to find the close.
            depth = 1
            i += 1
            vstart = i
            while i < len(body) and depth > 0:
                if body[i] == "{":
                    depth += 1
                elif body[i] == "}":
                    depth -= 1
                i += 1
            value = body[vstart : i - 1]
        elif body[i] == '"':
            # Double-quoted value (no escape handling).
            i += 1
            vstart = i
            while i < len(body) and body[i] != '"':
                i += 1
            value = body[vstart:i]
            i += 1
        else:
            # Bare value: read up to the next delimiter.
            vstart = i
            while i < len(body) and body[i] not in ", \t\n\r}":
                i += 1
            value = body[vstart:i]
        if name:
            fields[name] = value.strip()
    return fields
|
|
|
|
|
|
def parse_bib(bib_path: Path) -> dict[str, dict[str, str]]:
    """Parse a ``.bib`` file into ``{cite_key: {field: value, "_type": t}}``.

    Entry bodies are delimited by balancing braces from the ``@type{key,``
    header; field parsing is delegated to ``_parse_bib_fields``.
    """
    text = bib_path.read_text(encoding="utf-8")
    parsed: dict[str, dict[str, str]] = {}

    for header in BIB_ENTRY_RE.finditer(text):
        body_start = header.end()
        cursor = body_start
        depth = 1  # the brace opened by the @type{ header
        while cursor < len(text) and depth > 0:
            ch = text[cursor]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            cursor += 1

        fields = _parse_bib_fields(text[body_start : cursor - 1])
        fields["_type"] = header.group(1).lower()
        parsed[header.group(2).strip()] = fields

    return parsed
|
|
|
|
|
|
def _render_bibliography(
    cited_keys: list[str],
    bib_db: dict[str, dict[str, str]],
    bibliography_title: str,
) -> list[str]:
    """Render the cited entries as an ordered HTML list with back-links."""
    out: list[str] = ["---", "", f"## {bibliography_title}", "", "<ol>"]
    for key in cited_keys:
        entry = bib_db.get(key)
        if not entry:
            continue

        pieces: list[str] = []
        author = clean_bibtex(entry.get("author", ""))
        if author:
            pieces.append(author)
        title = clean_bibtex(entry.get("title", ""))
        if title:
            pieces.append(f"<em>{title}</em>")
        venue = clean_bibtex(entry.get("journal", "") or entry.get("booktitle", ""))
        if venue:
            pieces.append(venue)
        year = entry.get("year", "")
        if year:
            pieces.append(year)

        body = ". ".join(pieces) + "." if pieces else f"{key}."
        out.append(f'<li id="ref-{key}">{body} <a href="#cite-{key}">↩</a></li>')
    out.append("</ol>")
    return out
|
|
|
|
|
|
def process_citations(
    markdown: str,
    bib_db: dict[str, dict[str, str]],
    bibliography_title: str = DEFAULT_BIBLIOGRAPHY_TITLE,
) -> str:
    """Replace ``:cite:`` directives with numbered superscript links and
    append a rendered bibliography for every cited key found in *bib_db*.

    With an empty *bib_db* the citation keys are kept as plain ``[a, b]``.
    """
    order: list[str] = []  # cited keys in first-use order → citation numbers

    def _to_superscript(match: re.Match[str]) -> str:
        keys = [part.strip() for part in match.group(1).split(",")]
        for key in keys:
            if key not in order and key in bib_db:
                order.append(key)
        if not bib_db:
            return "[" + ", ".join(keys) + "]"
        rendered = [
            f'<sup id="cite-{key}"><a href="#ref-{key}">[{order.index(key) + 1}]</a></sup>'
            for key in keys
            if key in bib_db
        ]
        return "".join(rendered)

    result = CITE_RE.sub(_to_superscript, markdown)
    if order and bib_db:
        references = _render_bibliography(order, bib_db, bibliography_title)
        result = result.rstrip("\n") + "\n\n" + "\n".join(references) + "\n"
    return result
|
|
|
|
|
|
# Opening/closing code-fence marker (``` or ~~~) anchored at line starts.
_FENCE_RE = re.compile(r"^(`{3,}|~{3,})", re.MULTILINE)
# Any CJK ideograph — used to detect mismatched $ spans in Chinese prose.
_CJK_RE = re.compile(r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]")
|
|
|
|
|
|
def _iter_math_spans(content: str):
    """Yield ``(start, end, is_display)`` for every math span.

    ``start``/``end`` delimit the span's interior (``$`` delimiters
    excluded).  Skips spans inside fenced code blocks and inline code.
    """
    n = len(content)
    i = 0
    in_fence: str | None = None  # fence marker when inside a code block

    while i < n:
        # Track fenced code blocks
        if content[i] == "`" or content[i] == "~":
            m = _FENCE_RE.match(content, i)
            if m and (i == 0 or content[i - 1] == "\n"):
                marker = m.group(1)
                if in_fence is None:
                    in_fence = marker[0]  # opening
                    # Jump to the next line (fence info strings are ignored).
                    i = content.index("\n", i) + 1 if "\n" in content[i:] else n
                    continue
                elif marker[0] == in_fence:
                    in_fence = None  # closing
                    i = m.end()
                    continue

        if in_fence:
            i += 1
            continue

        # Skip inline code
        if content[i] == "`":
            end_tick = content.find("`", i + 1)
            if end_tick != -1:
                i = end_tick + 1
                continue

        # Display math $$...$$
        if content[i:i + 2] == "$$":
            start = i
            close = content.find("$$", i + 2)
            if close != -1:
                yield (start + 2, close, True)
                i = close + 2
                continue

        # Inline math $...$
        if content[i] == "$":
            start = i
            j = i + 1
            while j < n:
                if content[j] == "$":
                    if j > i + 1:  # non-empty
                        yield (start + 1, j, False)
                    j += 1
                    break
                if content[j] == "\n" and not content[i + 1:j].strip():
                    break  # empty line → not math
                j += 1
            i = j
            continue

        i += 1
|
|
|
|
|
|
def convert_math_to_mathjax(content: str) -> str:
    """Replace ``$``/``$$`` delimited math with MathJax ``\\(…\\)``/``\\[…\\]``.

    Inside math content, ``\\`` (LaTeX newline) is doubled to ``\\\\`` so that
    mdBook's markdown processing (which consumes one level of backslash
    escaping) delivers the correct ``\\`` to MathJax.
    """
    spans = list(_iter_math_spans(content))
    if not spans:
        return content

    parts: list[str] = []
    prev = 0
    for start, end, is_display in spans:
        delim = "$$" if is_display else "$"
        delim_len = len(delim)
        delim_start = start - delim_len  # index of the opening delimiter

        math = content[start:end]

        # Spans containing CJK characters are almost certainly mismatched $.
        # Strip the $ delimiters and emit the raw text.
        if _CJK_RE.search(math):
            parts.append(content[prev:delim_start])
            parts.append(math)
            prev = end + delim_len
            continue

        parts.append(content[prev:delim_start])

        # Double backslashes inside math so that after mdBook markdown
        # processing (which eats one backslash layer) MathJax sees the
        # original LaTeX.
        math = math.replace("\\\\", "\\\\\\\\")
        # Escape markdown emphasis characters so * and _ survive rendering.
        math = math.replace("*", "\\*")
        math = math.replace("_", "\\_")

        if is_display:
            parts.append(f"\\\\[{math}\\\\]")
        else:
            parts.append(f"\\\\({math}\\\\)")

        prev = end + delim_len

    parts.append(content[prev:])
    return "".join(parts)
|
|
|
|
|
|
def resolve_raw_html_file(current_file: Path, filename: str) -> Path:
    """Locate *filename* referenced by a ``.. raw:: html`` include.

    Tries, in order: relative to *current_file*'s directory, that
    directory's ``static/`` subfolder, then the repository-level ``static/``
    next to this script's parent directory.

    Raises:
        FileNotFoundError: if no candidate location exists; the message
            names both the missing file and the including document.
    """
    direct = (current_file.parent / filename).resolve()
    if direct.exists():
        return direct

    static_fallback = (current_file.parent / "static" / filename).resolve()
    if static_fallback.exists():
        return static_fallback

    repo_static = Path(__file__).resolve().parent.parent / "static" / filename
    if repo_static.exists():
        return repo_static

    # Bug fix: the message previously hard-coded the literal '(unknown)'
    # instead of interpolating the missing filename, making it useless for
    # diagnosing broken includes.
    raise FileNotFoundError(f"Raw HTML include '{filename}' from '{current_file}' does not exist")
|
|
|
|
|
|
def rewrite_frontpage_assets(html: str) -> str:
    """Adapt exported front-page HTML for mdBook: repoint image paths,
    drop stray ``<head>`` tags, and minify embedded ``<style>`` blocks."""
    out = html.replace("./_images/", "static/image/")
    out = out.replace("_images/", "static/image/")
    out = HEAD_TAG_RE.sub("", out)
    return STYLE_BLOCK_RE.sub(_minify_style_block, out)
|
|
|
|
|
|
def _minify_style_block(match: re.Match[str]) -> str:
    """Collapse a matched ``<style>…</style>`` block onto a single line."""
    rules = match.group(1).splitlines()
    compact = " ".join(rule.strip() for rule in rules if rule.strip())
    return f"<style>{compact}</style>"
|
|
|
|
|
|
def render_frontpage_switch(label: str, href: str) -> str:
    """Intentionally render nothing: the front-page language switch was
    removed, but this hook keeps its signature so existing call sites
    (see wrap_frontpage_html) continue to work unchanged."""
    return ""
|
|
|
|
|
|
def wrap_frontpage_html(
    html: str,
    frontpage_switch_label: str | None = None,
    frontpage_switch_href: str | None = None,
) -> str:
    """Wrap front-page HTML in the scoped layout CSS and container div.

    When both switch label and href are given, the switch markup (currently
    empty) replaces FRONTPAGE_SWITCH_PLACEHOLDER — or is prepended when the
    placeholder is absent.
    """
    body = html.strip()
    if frontpage_switch_label and frontpage_switch_href:
        switch = render_frontpage_switch(frontpage_switch_label, frontpage_switch_href)
        if FRONTPAGE_SWITCH_PLACEHOLDER in body:
            body = body.replace(FRONTPAGE_SWITCH_PLACEHOLDER, switch)
        else:
            body = switch + "\n" + body

    return "\n".join(
        [FRONTPAGE_LAYOUT_CSS, '<div class="openmlsys-frontpage">', body, "</div>"]
    )
|
|
|
|
|
|
def inline_raw_html(
    block_lines: list[str],
    current_file: Path,
    frontpage_switch_label: str | None = None,
    frontpage_switch_href: str | None = None,
) -> str | None:
    """Inline a ``.. raw:: html`` eval_rst block.

    Returns the (asset-rewritten) HTML, or ``None`` when the block is not a
    raw-html include with a ``:file:`` option.  ``frontpage.html`` gets the
    additional front-page wrapper treatment.
    """
    meaningful = [line.strip() for line in block_lines if line.strip()]
    if not meaningful or meaningful[0] != ".. raw:: html":
        return None

    filename: str | None = None
    for candidate in meaningful[1:]:
        file_match = RAW_HTML_FILE_RE.match(candidate)
        if file_match is not None:
            filename = file_match.group(1)
            break

    if filename is None:
        return None

    source = resolve_raw_html_file(current_file, filename)
    html = rewrite_frontpage_assets(source.read_text(encoding="utf-8")).strip()
    if Path(filename).name != "frontpage.html":
        return html
    return wrap_frontpage_html(
        html,
        frontpage_switch_label=frontpage_switch_label,
        frontpage_switch_href=frontpage_switch_href,
    )
|
|
|
|
|
|
def chapter_label(item: TocItem, target: Path, title_cache: dict[Path, str]) -> str:
    """Prefer the toc entry's explicit label; fall back to the cached title."""
    if item.label:
        return item.label
    return title_cache[target]
|
|
|
|
|
|
def render_toc_list(entries: list[TocItem], current_file: Path, title_cache: dict[Path, str]) -> list[str]:
    """Render toc entries as markdown list lines.

    Chapters that follow a part heading are indented one level beneath it;
    chapters whose target cannot be resolved (or is untitled) are dropped.
    """
    lines: list[str] = []
    indent = 0
    for entry in entries:
        if entry.kind == "part":
            lines.append(f"- {entry.label}")
            indent = 1
        elif entry.target is not None:
            target = resolve_toc_target(current_file, entry.target)
            if target is not None and target in title_cache:
                label = chapter_label(entry, target, title_cache)
                link = relative_link(current_file, target)
                lines.append(f"{' ' * indent}- [{label}]({link})")
    return lines
|
|
|
|
|
|
def rewrite_markdown(
    markdown: str,
    current_file: Path,
    title_cache: dict[Path, str],
    bib_db: dict[str, dict[str, str]] | None = None,
    bibliography_title: str = DEFAULT_BIBLIOGRAPHY_TITLE,
    frontpage_switch_label: str | None = None,
    frontpage_switch_href: str | None = None,
    ref_label_map: dict[str, str] | None = None,
    current_source_path: str | None = None,
    fig_number_map: dict[str, str] | None = None,
) -> str:
    """Per-file conversion pipeline for mdBook output.

    Expands ```toc and ```eval_rst fences inline, then applies the
    equation, figure, directive, and citation rewrites in order.
    """
    output: list[str] = []
    lines = markdown.splitlines()
    index = 0

    while index < len(lines):
        stripped = lines[index].strip()
        if stripped in (f"```{TOC_FENCE}", f"```{EVAL_RST_FENCE}"):
            fence = stripped[3:]  # fence language after the backticks
            index += 1
            block_lines: list[str] = []
            while index < len(lines) and lines[index].strip() != "```":
                block_lines.append(lines[index])
                index += 1

            if fence == TOC_FENCE:
                # Replace the toc fence with a rendered markdown list,
                # padded with blank lines on both sides.
                entries = parse_toc_entries(block_lines)
                if entries:
                    if output and output[-1] != "":
                        output.append("")
                    rendered = render_toc_list(entries, current_file, title_cache)
                    output.extend(rendered)
                    if rendered and output and output[-1] != "":
                        output.append("")
            elif fence == EVAL_RST_FENCE:
                # Inline ``.. raw:: html`` includes; other rst blocks vanish.
                raw_html = inline_raw_html(
                    block_lines,
                    current_file,
                    frontpage_switch_label=frontpage_switch_label,
                    frontpage_switch_href=frontpage_switch_href,
                )
                if raw_html:
                    if output and output[-1] != "":
                        output.append("")
                    output.extend(raw_html.splitlines())
                    if output and output[-1] != "":
                        output.append("")
            index += 1  # skip the closing fence
            continue

        output.append(lines[index])
        index += 1

    while output and output[-1] == "":
        output.pop()

    raw = "\n".join(output) + "\n"
    # Equation numbering runs first so :eqref: can resolve to numbers below.
    result, label_map = process_equation_labels(raw)
    result = process_figure_captions(result, fig_number_map=fig_number_map)
    result = normalize_directives(
        result,
        label_map=label_map,
        ref_label_map=ref_label_map,
        current_source_path=current_source_path,
        fig_number_map=fig_number_map,
    )
    result = process_citations(result, bib_db or {}, bibliography_title=bibliography_title)
    # Finally drop pandoc-style {#id} attribute blocks from headings.
    result = re.sub(r"^(#{1,6}\s+.*?)\s+\{#[^}]+\}\s*$", r"\1", result, flags=re.MULTILINE)
    return result
|
|
|
|
|
|
def build_title_cache(
    source_dir: Path,
    placeholder_prefix: str | None = None,
) -> dict[Path, str]:
    """Map every renderable ``.md`` file under *source_dir* to its title.

    Skips ``_build`` output, the generated ``SUMMARY.md``, and placeholder
    stubs (as judged by *placeholder_prefix*).
    """
    titles: dict[Path, str] = {}
    for md_file in sorted(source_dir.rglob("*.md")):
        if md_file.name == "SUMMARY.md" or "_build" in md_file.parts:
            continue
        content = md_file.read_text(encoding="utf-8")
        if is_placeholder_markdown(content, placeholder_prefix):
            continue
        titles[md_file.resolve()] = extract_title(content, fallback=md_file.stem)
    return titles
|
|
|
|
|
|
def build_summary(source_dir: Path, title_cache: dict[Path, str]) -> str:
    """Build the SUMMARY.md content for mdBook from the root index's toc.

    Chapters listed before the first part heading become unnumbered prefix
    chapters; once a part heading appears, chapters are numbered and nested
    depth-first via their own toc blocks.
    """
    root_index = (source_dir / "index.md").resolve()
    root_markdown = root_index.read_text(encoding="utf-8")

    lines = ["# Summary", "", f"[{title_cache[root_index]}](index.md)"]
    seen: set[Path] = {root_index}  # guards against cycles and duplicates

    def append_entry(target: Path, indent: int, label: str | None = None) -> None:
        # Numbered chapter plus, recursively, its own toc children.
        target = target.resolve()
        if target in seen or target not in title_cache:
            return
        seen.add(target)
        rel = target.relative_to(source_dir.resolve()).as_posix()
        title = label or title_cache[target]
        lines.append(f"{' ' * indent}- [{title}]({rel})")

        child_markdown = target.read_text(encoding="utf-8")
        for block in parse_toc_blocks(child_markdown):
            for entry in block:
                if entry.kind != "chapter" or entry.target is None:
                    continue
                child_target = resolve_toc_target(target, entry.target)
                if child_target is not None:
                    append_entry(child_target, indent + 1, entry.label or None)

    def append_prefix_chapter(target: Path, label: str | None = None) -> None:
        # Unnumbered chapter before the first part heading (no children).
        target = target.resolve()
        if target in seen or target not in title_cache:
            return
        seen.add(target)
        rel = target.relative_to(source_dir.resolve()).as_posix()
        title = label or title_cache[target]
        lines.append(f"[{title}]({rel})")

    numbered_started = False
    for block in parse_toc_blocks(root_markdown):
        for entry in block:
            if entry.kind == "part":
                # A part heading switches the summary into numbered mode.
                if lines and lines[-1] != "":
                    lines.append("")
                lines.append(f"# {entry.label}")
                lines.append("")
                numbered_started = True
                continue

            if entry.target is None:
                continue

            target = resolve_toc_target(root_index, entry.target)
            if target is None:
                continue
            if numbered_started:
                append_entry(target, 0, entry.label or None)
            else:
                append_prefix_chapter(target, entry.label or None)

    return "\n".join(lines) + "\n"
|
|
|
|
|
|
def write_summary(
    source_dir: Path,
    summary_path: Path | None = None,
    placeholder_prefix: str | None = None,
) -> Path:
    """Generate SUMMARY.md for *source_dir* and return the written path.

    Defaults to ``<source_dir>/SUMMARY.md`` when *summary_path* is omitted.
    """
    source_dir = source_dir.resolve()
    if summary_path is None:
        summary_path = source_dir / "SUMMARY.md"
    else:
        summary_path = summary_path.resolve()

    titles = build_title_cache(source_dir, placeholder_prefix=placeholder_prefix)
    summary_path.write_text(build_summary(source_dir, titles), encoding="utf-8")
    return summary_path
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse CLI options: --source, --summary-output, --placeholder-prefix."""
    parser = argparse.ArgumentParser(
        description="Generate mdBook SUMMARY.md for a chapter directory."
    )
    parser.add_argument("--source", type=Path, required=True, help="Source chapter directory")
    parser.add_argument(
        "--summary-output",
        type=Path,
        required=True,
        help="Where to write the generated SUMMARY.md",
    )
    parser.add_argument(
        "--placeholder-prefix",
        default=None,
        help="If set, files whose entire contents start with this prefix are skipped from mdBook output.",
    )
    return parser.parse_args()
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: generate the summary and report where it was written."""
    options = parse_args()
    written = write_summary(
        options.source,
        summary_path=options.summary_output,
        placeholder_prefix=options.placeholder_prefix,
    )
    print(f"Wrote mdBook summary to {written}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run as a script: propagate main()'s exit status to the shell.
    raise SystemExit(main())
|