From 9d9425f4635a99e11c36025e27880dab51ff6e5c Mon Sep 17 00:00:00 2001 From: vega Date: Mon, 2 Mar 2026 22:16:56 +0800 Subject: [PATCH] Add tts skill --- SKILL.md | 25 ++ skills/speak/SKILL.md | 114 ++++++ skills/speak/scripts/noiz_tts.py | 201 ++++++++++ skills/speak/scripts/render_timeline.py | 486 ++++++++++++++++++++++++ skills/speak/scripts/text_to_srt.py | 115 ++++++ 5 files changed, 941 insertions(+) create mode 100644 SKILL.md create mode 100644 skills/speak/SKILL.md create mode 100644 skills/speak/scripts/noiz_tts.py create mode 100644 skills/speak/scripts/render_timeline.py create mode 100644 skills/speak/scripts/text_to_srt.py diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..7847316 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,25 @@ +--- +name: MockingBird +description: AI voice cloning and TTS toolkit. The only active skill is `speak`. +--- + +# MockingBird + +Voice cloning / text-to-speech toolkit built on an encoder-synthesizer-vocoder pipeline. + +## Available Skills + +| Skill | Path | Purpose | +|-------|------|---------| +| speak | `skills/speak/SKILL.md` | Text-to-speech (Kokoro local / Noiz cloud), timeline rendering, voice cloning | + +## Project Layout + +- `models/` — encoder, synthesizer, vocoder, ppg2mel, ppg_extractor +- `control/` — CLI and GUI entry points (toolbox, mkgui) +- `skills/speak/` — TTS skill; its scripts live in `skills/speak/scripts/` (`tts.sh`, `noiz_tts.py`, `render_timeline.py`, `text_to_srt.py`) +- `data/ckpt/` — pretrained checkpoints + +## When to Use + +If the user asks for anything related to speech synthesis, TTS, voice cloning, dubbing, or audio generation, read and follow `skills/speak/SKILL.md`. diff --git a/skills/speak/SKILL.md b/skills/speak/SKILL.md new file mode 100644 index 0000000..4418564 --- /dev/null +++ b/skills/speak/SKILL.md @@ -0,0 +1,114 @@ +--- +name: speak +description: Convert text into speech with Kokoro or Noiz, including simple and timeline-aligned modes. +--- + +# speak + +Convert any text into speech audio. 
Supports two backends (Kokoro local, Noiz cloud), two modes (simple or timeline-accurate), and per-segment voice control. + +## Triggers + +- text to speech / speak / say / tts +- voice clone / dubbing +- epub to audio / srt to audio / convert to audio + +## Simple Mode — text to audio + +```bash +# Kokoro (auto-detected when installed) +bash skills/speak/scripts/tts.sh speak -t "Hello world" -v af_sarah -o hello.wav +bash skills/speak/scripts/tts.sh speak -f article.txt -v zf_xiaoni --lang cmn -o out.mp3 --format mp3 + +# Noiz (auto-detected when NOIZ_API_KEY is set, or force with --backend noiz) +# If --voice-id is omitted, the script prints 5 available built-in voices and exits. +# Pick one from the output and re-run with --voice-id <voice_id>. +bash skills/speak/scripts/tts.sh speak -f input.txt --voice-id voice_abc --auto-emotion --emo '{"Joy":0.5}' -o out.wav + +# Noiz: optional --duration (float, seconds, range (0, 36]) for target audio length +bash skills/speak/scripts/tts.sh speak -t "Short line" --voice-id voice_abc --duration 3.5 -o out.wav + +# Voice cloning (Noiz only — no voice-id needed, uses ref audio) +# Use your own reference audio: local file path or URL (only when using Noiz). +bash skills/speak/scripts/tts.sh speak -t "Hello" --ref-audio ./ref.wav -o clone.wav +bash skills/speak/scripts/tts.sh speak -t "Hello" --ref-audio https://example.com/my_voice.wav -o clone.wav +``` + +## Timeline Mode — SRT to time-aligned audio + +For precise per-segment timing (dubbing, subtitles, video narration). + +### Step 1: Get or create an SRT + +If the user doesn't have one, generate from text: + +```bash +bash skills/speak/scripts/tts.sh to-srt -i article.txt -o article.srt +bash skills/speak/scripts/tts.sh to-srt -i article.txt -o article.srt --cps 15 --gap 500 +``` + +`--cps` = characters per second (default 4, good for Chinese; ~15 for English). The agent can also write SRT manually. 
+ +### Step 2: Create a voice map + +A JSON file that controls the default and per-segment voice settings. `segments` keys accept a single cue index (`"3"`) or an inclusive range (`"5-8"`); when several keys match the same cue they are applied in file order, so later entries override earlier ones. + +Kokoro voice map: + +```json +{ + "default": { "voice": "zf_xiaoni", "lang": "cmn" }, + "segments": { + "1": { "voice": "zm_yunxi" }, + "5-8": { "voice": "af_sarah", "lang": "en-us", "speed": 0.9 } + } +} +``` + +Noiz voice map (adds `emo`, `reference_audio` support). `reference_audio` can be a local path or a URL (user’s own audio; Noiz only): + +```json +{ + "default": { "voice_id": "voice_123", "target_lang": "zh" }, + "segments": { + "1": { "voice_id": "voice_host", "emo": { "Joy": 0.6 } }, + "2-4": { "reference_audio": "./refs/guest.wav" } + } +} +``` + +**Dynamic Reference Audio Slicing**: +If you are translating or dubbing a video and want each sentence to automatically use the audio from the original video at the exact same timestamp as its reference audio, use the `--ref-audio-track` argument instead of setting `reference_audio` in the map. Slicing is applied only to segments whose resolved settings contain neither `voice_id` nor `reference_audio`, so leave both out of the map (including `default`) for those segments: +```bash +bash skills/speak/scripts/tts.sh render --srt input.srt --voice-map vm.json --ref-audio-track original_video.mp4 -o output.wav +``` + +See `examples/` for full samples. + +### Step 3: Render + +```bash +bash skills/speak/scripts/tts.sh render --srt input.srt --voice-map vm.json -o output.wav +bash skills/speak/scripts/tts.sh render --srt input.srt --voice-map vm.json --backend noiz --auto-emotion -o output.wav +``` + +## When to Choose Which + +| Need | Recommended | +|------|-------------| +| Just read text aloud, no fuss | Kokoro (default) | +| EPUB/PDF audiobook with chapters | Kokoro (native support) | +| Voice blending (`"v1:60,v2:40"`) | Kokoro | +| Voice cloning from reference audio | Noiz | +| Emotion control (`emo` param) | Noiz | +| Exact server-side duration per segment | Noiz | + +> When the user needs emotion control + voice cloning + precise duration together, Noiz is the only backend that supports all three. 
+ +## Requirements + +- `ffmpeg` in PATH (timeline mode) +- Noiz: get your API key at [developers.noiz.ai](https://developers.noiz.ai), then run `bash skills/speak/scripts/tts.sh config --set-api-key YOUR_KEY` +- Kokoro: if already installed, pass `--backend kokoro` to use the local backend + +For backend details and full argument reference, see [reference.md](reference.md). diff --git a/skills/speak/scripts/noiz_tts.py b/skills/speak/scripts/noiz_tts.py new file mode 100644 index 0000000..0788855 --- /dev/null +++ b/skills/speak/scripts/noiz_tts.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +"""Simple TTS via Noiz API (no timeline). + +Supports direct text or text-file input, optional emotion enhancement, +voice cloning via reference audio, and emotion parameters. +Use kokoro-tts CLI directly for the Kokoro backend (no wrapper needed). +""" +import argparse +import base64 +import binascii +import json +import sys +from pathlib import Path +from typing import Any, Dict, Optional + +import requests + + +def normalize_api_key_base64(api_key: str) -> str: + key = api_key.strip() + if not key: + return key + padded = key + ("=" * (-len(key) % 4)) + try: + decoded = base64.b64decode(padded, validate=True) + canonical = base64.b64encode(decoded).decode("ascii").rstrip("=") + if decoded and canonical == key.rstrip("="): + return key + except binascii.Error: + pass + return base64.b64encode(key.encode("utf-8")).decode("ascii") + + +def call_emotion_enhance( + base_url: str, api_key: str, text: str, timeout: int +) -> str: + resp = requests.post( + f"{base_url.rstrip('/')}/emotion-enhance", + headers={"Authorization": api_key, "Content-Type": "application/json"}, + json={"text": text}, + timeout=timeout, + ) + if resp.status_code != 200: + raise RuntimeError( + f"/emotion-enhance failed: status={resp.status_code}, body={resp.text}" + ) + enhanced = resp.json().get("data", {}).get("emotion_enhance") + if not enhanced: + raise RuntimeError(f"/emotion-enhance returned no data: 
{resp.text}") + return enhanced + + +def synthesize( + base_url: str, + api_key: str, + text: str, + voice_id: Optional[str], + reference_audio: Optional[Path], + output_format: str, + speed: float, + emo: Optional[str], + target_lang: Optional[str], + similarity_enh: bool, + save_voice: bool, + duration: Optional[float], + timeout: int, + out_path: Path, +) -> float: + if duration is not None and not (0 < duration <= 36): + raise ValueError("duration must be in range (0, 36] seconds") + url = f"{base_url.rstrip('/')}/text-to-speech" + data: Dict[str, str] = { + "text": text, + "output_format": output_format, + "speed": str(speed), + } + if voice_id: + data["voice_id"] = voice_id + if emo: + data["emo"] = emo + if target_lang: + data["target_lang"] = target_lang + if similarity_enh: + data["similarity_enh"] = "true" + if save_voice: + data["save_voice"] = "true" + if duration is not None: + data["duration"] = str(duration) + + files = None + if reference_audio: + if not reference_audio.exists(): + raise FileNotFoundError(f"Reference audio not found: {reference_audio}") + files = { + "file": ( + reference_audio.name, + reference_audio.open("rb"), + "application/octet-stream", + ) + } + elif not voice_id: + raise ValueError("Either --voice-id or --reference-audio is required.") + + try: + resp = requests.post( + url, + headers={"Authorization": api_key}, + data=data, + files=files, + timeout=timeout, + ) + finally: + if files and files["file"][1]: + files["file"][1].close() + + if resp.status_code != 200: + raise RuntimeError( + f"/text-to-speech failed: status={resp.status_code}, body={resp.text}" + ) + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_bytes(resp.content) + dur = resp.headers.get("X-Audio-Duration") + return float(dur) if dur else -1.0 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Simple TTS via Noiz API (no timeline).") + g = parser.add_mutually_exclusive_group(required=True) + g.add_argument("--text", 
help="Text string to synthesize") + g.add_argument("--text-file", help="Path to text file") + parser.add_argument("--api-key", required=True) + parser.add_argument("--voice-id") + parser.add_argument("--reference-audio", help="Local audio for voice cloning") + parser.add_argument("--output", required=True) + parser.add_argument("--base-url", default="https://noiz.ai/v1") + parser.add_argument("--output-format", choices=["wav", "mp3"], default="wav") + parser.add_argument("--auto-emotion", action="store_true") + parser.add_argument("--emo", help='Emotion JSON string, e.g. \'{"Joy":0.5}\'') + parser.add_argument("--speed", type=float, default=1.0) + parser.add_argument("--target-lang") + parser.add_argument("--similarity-enh", action="store_true") + parser.add_argument("--save-voice", action="store_true") + parser.add_argument( + "--duration", + type=float, + default=None, + metavar="SEC", + help="Target audio duration in seconds (0, 36], optional", + ) + parser.add_argument("--timeout-sec", type=int, default=120) + args = parser.parse_args() + args.api_key = normalize_api_key_base64(args.api_key) + + try: + if args.text_file: + text = Path(args.text_file).read_text(encoding="utf-8").strip() + else: + text = args.text + + if not text: + raise ValueError("Input text is empty.") + + if len(text) > 5000: + print( + f"Warning: text is {len(text)} chars (max 5000). 
" + "Consider chunking for long texts.", + file=sys.stderr, + ) + + if args.auto_emotion: + text = call_emotion_enhance( + args.base_url, args.api_key, text, args.timeout_sec + ) + + ref = Path(args.reference_audio) if args.reference_audio else None + out_duration = synthesize( + base_url=args.base_url, + api_key=args.api_key, + text=text, + voice_id=args.voice_id, + reference_audio=ref, + output_format=args.output_format, + speed=args.speed, + emo=args.emo, + target_lang=args.target_lang, + similarity_enh=args.similarity_enh, + save_voice=args.save_voice, + duration=args.duration, + timeout=args.timeout_sec, + out_path=Path(args.output), + ) + print(f"Done. Output: {args.output} (duration: {out_duration}s)") + return 0 + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/skills/speak/scripts/render_timeline.py b/skills/speak/scripts/render_timeline.py new file mode 100644 index 0000000..a240f64 --- /dev/null +++ b/skills/speak/scripts/render_timeline.py @@ -0,0 +1,486 @@ +#!/usr/bin/env python3 +"""Timeline mode: render SRT to timeline-accurate audio. + +Supports two backends: + - kokoro (default): local CLI, uses ffmpeg atempo for duration matching + - noiz: cloud API with server-side duration forcing, emotion, voice cloning + +Parses SRT, resolves per-segment voice config from a voice-map JSON, +calls TTS for each segment, normalizes to exact duration, delays to +correct start time, and mixes into one timeline track. 
+""" +import argparse +import base64 +import binascii +import json +import re +import shutil +import subprocess +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +TIMESTAMP_RE = re.compile(r"^(\d{2}):(\d{2}):(\d{2})[,.](\d{3})$") + + +def normalize_api_key_base64(api_key: str) -> str: + key = api_key.strip() + if not key: + return key + padded = key + ("=" * (-len(key) % 4)) + try: + decoded = base64.b64decode(padded, validate=True) + canonical = base64.b64encode(decoded).decode("ascii").rstrip("=") + if decoded and canonical == key.rstrip("="): + return key + except binascii.Error: + pass + return base64.b64encode(key.encode("utf-8")).decode("ascii") + + +@dataclass +class Cue: + index: int + start_ms: int + end_ms: int + text: str + + @property + def duration_ms(self) -> int: + return max(1, self.end_ms - self.start_ms) + + +# ── SRT parsing ────────────────────────────────────────────────────── + + +def parse_timestamp_ms(value: str) -> int: + match = TIMESTAMP_RE.match(value.strip()) + if not match: + raise ValueError(f"Invalid SRT timestamp: {value}") + hh, mm, ss, ms = map(int, match.groups()) + return ((hh * 60 + mm) * 60 + ss) * 1000 + ms + + +def parse_srt(path: Path) -> List[Cue]: + content = path.read_text(encoding="utf-8", errors="replace") + blocks = re.split(r"\n\s*\n", content.strip()) + cues: List[Cue] = [] + for block in blocks: + lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()] + if len(lines) < 3: + continue + try: + idx = int(lines[0]) + except ValueError: + continue + if "-->" not in lines[1]: + continue + start_raw, end_raw = [s.strip() for s in lines[1].split("-->", 1)] + start_ms = parse_timestamp_ms(start_raw) + end_ms = parse_timestamp_ms(end_raw) + text = "\n".join(lines[2:]).strip() + if text: + cues.append(Cue(index=idx, start_ms=start_ms, end_ms=end_ms, text=text)) + if not cues: + raise ValueError("No valid cues parsed from 
SRT.") + return cues + + +# ── Voice map resolution ───────────────────────────────────────────── + + +def parse_segment_key(key: str) -> Tuple[int, int]: + key = key.strip() + if "-" in key: + left, right = key.split("-", 1) + return int(left), int(right) + v = int(key) + return v, v + + +def resolve_segment_cfg(index: int, config: Dict[str, Any]) -> Dict[str, Any]: + merged = dict(config.get("default", {})) + for key, seg_cfg in config.get("segments", {}).items(): + lo, hi = parse_segment_key(key) + if lo <= index <= hi: + merged.update(seg_cfg) + return merged + + +# ── ffmpeg helpers ──────────────────────────────────────────────────── + + +def _run_ff(cmd: List[str]) -> None: + proc = subprocess.run(cmd, capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError(f"ffmpeg failed: {' '.join(cmd)}\n{proc.stderr}") + + +def ensure_ffmpeg() -> None: + if not shutil.which("ffmpeg"): + raise RuntimeError("ffmpeg not found in PATH.") + + +def probe_duration_ms(path: Path) -> float: + proc = subprocess.run( + [ + "ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", str(path), + ], + capture_output=True, text=True, + ) + if proc.returncode != 0: + raise RuntimeError(f"ffprobe failed on {path}: {proc.stderr}") + return float(proc.stdout.strip()) * 1000 + + +def normalize_duration_pad_trim(inp: Path, outp: Path, target_ms: int) -> None: + """Pad short audio then trim to exact target duration (Noiz backend).""" + sec = target_ms / 1000.0 + _run_ff([ + "ffmpeg", "-y", "-i", str(inp), + "-af", f"apad=pad_dur={sec:.3f}", + "-t", f"{sec:.3f}", str(outp), + ]) + + +def normalize_duration_atempo(inp: Path, outp: Path, target_ms: int) -> None: + """Use atempo to stretch/compress audio to target duration (Kokoro backend).""" + actual_ms = probe_duration_ms(inp) + if actual_ms <= 0: + normalize_duration_pad_trim(inp, outp, target_ms) + return + + ratio = actual_ms / target_ms + # atempo accepts 0.5–100.0; 
chain filters for extreme ratios + filters = [] + r = ratio + while r > 100.0: + filters.append("atempo=100.0") + r /= 100.0 + while r < 0.5: + filters.append("atempo=0.5") + r /= 0.5 + filters.append(f"atempo={r:.6f}") + + _run_ff([ + "ffmpeg", "-y", "-i", str(inp), + "-af", ",".join(filters), + "-t", f"{target_ms / 1000.0:.3f}", str(outp), + ]) + + +def delay_segment(inp: Path, outp: Path, start_ms: int) -> None: + _run_ff([ + "ffmpeg", "-y", "-i", str(inp), + "-af", f"adelay={start_ms}:all=1", str(outp), + ]) + + +def mix_all(inputs: List[Path], outp: Path, total_ms: int) -> None: + if not inputs: + raise ValueError("No segments to mix.") + cmd = ["ffmpeg", "-y"] + for p in inputs: + cmd += ["-i", str(p)] + cmd += [ + "-filter_complex", + f"amix=inputs={len(inputs)}:duration=longest:dropout_transition=0", + "-t", f"{total_ms / 1000.0:.3f}", str(outp), + ] + _run_ff(cmd) + + +# ── Noiz backend ───────────────────────────────────────────────────── + + +def _noiz_emotion_enhance( + base_url: str, api_key: str, text: str, timeout: int +) -> str: + import requests # noqa: delayed import so kokoro path doesn't need requests + + resp = requests.post( + f"{base_url.rstrip('/')}/emotion-enhance", + headers={"Authorization": api_key, "Content-Type": "application/json"}, + json={"text": text}, + timeout=timeout, + ) + if resp.status_code != 200: + raise RuntimeError( + f"/emotion-enhance failed: status={resp.status_code}, body={resp.text}" + ) + enhanced = resp.json().get("data", {}).get("emotion_enhance") + if not enhanced: + raise RuntimeError(f"/emotion-enhance returned no data: {resp.text}") + return enhanced + + +def _bool_form(v: Any) -> str: + return "true" if bool(v) else "false" + + +def _resolve_reference_audio(ref: str, timeout: int) -> Tuple[Path, Optional[Path]]: + """Resolve reference_audio to a path. If ref is a URL, download to temp file. 
+ Returns (path_to_use, temp_path_to_cleanup_or_None).""" + if ref.startswith("http://") or ref.startswith("https://"): + import requests + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp.close() + r = requests.get(ref, timeout=timeout) + r.raise_for_status() + Path(tmp.name).write_bytes(r.content) + return Path(tmp.name), Path(tmp.name) + p = Path(ref) + if not p.exists(): + raise FileNotFoundError(f"reference_audio not found: {ref}") + return p, None + + +def _noiz_tts( + base_url: str, + api_key: str, + cue: Cue, + cfg: Dict[str, Any], + output_format: str, + timeout: int, + out_path: Path, +) -> float: + import requests + + url = f"{base_url.rstrip('/')}/text-to-speech" + payload: Dict[str, str] = { + "text": cue.text, + "duration": f"{cue.duration_ms / 1000.0:.3f}", + "output_format": output_format, + } + for field in ("voice_id", "quality_preset", "speed", "target_lang"): + if field in cfg and cfg[field] is not None: + payload[field] = str(cfg[field]) + if "similarity_enh" in cfg: + payload["similarity_enh"] = _bool_form(cfg["similarity_enh"]) + if "save_voice" in cfg: + payload["save_voice"] = _bool_form(cfg["save_voice"]) + if "emo" in cfg and cfg["emo"] is not None: + emo = cfg["emo"] + payload["emo"] = emo if isinstance(emo, str) else json.dumps(emo) + + files = None + ref_cleanup: Optional[Path] = None + ref = cfg.get("reference_audio") + if ref: + ref_path, ref_cleanup = _resolve_reference_audio(ref, timeout) + files = { + "file": ( + ref_path.name, + ref_path.open("rb"), + "application/octet-stream", + ) + } + elif not cfg.get("voice_id"): + raise ValueError( + f"Cue {cue.index}: either voice_id or reference_audio required." 
+ ) + + try: + resp = requests.post( + url, headers={"Authorization": api_key}, + data=payload, files=files, timeout=timeout, + ) + finally: + if files and files["file"][1]: + files["file"][1].close() + if ref_cleanup is not None: + ref_cleanup.unlink(missing_ok=True) + + if resp.status_code != 200: + raise RuntimeError( + f"/text-to-speech cue {cue.index}: " + f"status={resp.status_code}, body={resp.text}" + ) + out_path.write_bytes(resp.content) + dur_h = resp.headers.get("X-Audio-Duration") + return float(dur_h) if dur_h else -1.0 + + +# ── Kokoro backend ─────────────────────────────────────────────────── + + +def _ensure_kokoro() -> None: + if not shutil.which("kokoro-tts"): + raise RuntimeError("kokoro-tts CLI not found.") + + +def _kokoro_tts( + cue: Cue, + cfg: Dict[str, Any], + output_format: str, + out_path: Path, +) -> float: + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", delete=False, encoding="utf-8" + ) as tmp: + tmp.write(cue.text) + tmp_path = tmp.name + + try: + cmd = ["kokoro-tts", tmp_path, str(out_path)] + voice = cfg.get("voice") + if voice: + cmd += ["--voice", str(voice)] + lang = cfg.get("lang") + if lang: + cmd += ["--lang", str(lang)] + speed = cfg.get("speed") + if speed is not None: + cmd += ["--speed", str(speed)] + cmd += ["--format", output_format] + + proc = subprocess.run(cmd, capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError( + f"kokoro-tts failed for cue {cue.index}: {proc.stderr}" + ) + finally: + Path(tmp_path).unlink(missing_ok=True) + + if out_path.exists(): + return probe_duration_ms(out_path) / 1000.0 + raise RuntimeError(f"kokoro-tts produced no output for cue {cue.index}") + + +# ── main ───────────────────────────────────────────────────────────── + + +def main() -> int: + ap = argparse.ArgumentParser( + description="Render timeline-accurate speech from SRT." 
+ ) + ap.add_argument("--srt", required=True, help="Input SRT file") + ap.add_argument("--voice-map", required=True, help="Voice-map JSON") + ap.add_argument( + "--backend", choices=["kokoro", "noiz"], default="kokoro", + help="TTS backend (default: kokoro)", + ) + ap.add_argument("--api-key", help="API key (required for noiz backend)") + ap.add_argument("--output", required=True, help="Output audio file") + ap.add_argument("--base-url", default="https://noiz.ai/v1") + ap.add_argument("--work-dir", default=".tmp/tts") + ap.add_argument("--auto-emotion", action="store_true", + help="Noiz backend only: call /emotion-enhance before TTS") + ap.add_argument("--ref-audio-track", help="Original audio track to dynamically slice as reference audio per segment") + ap.add_argument("--output-format", choices=["wav", "mp3"], default="wav") + ap.add_argument("--timeout-sec", type=int, default=120) + args = ap.parse_args() + + if args.backend == "noiz" and not args.api_key: + print("Error: --api-key is required for noiz backend.", file=sys.stderr) + return 1 + if args.api_key: + args.api_key = normalize_api_key_base64(args.api_key) + + try: + ensure_ffmpeg() + if args.backend == "kokoro": + _ensure_kokoro() + + work = Path(args.work_dir) + work.mkdir(parents=True, exist_ok=True) + + cues = parse_srt(Path(args.srt)) + voice_map = json.loads(Path(args.voice_map).read_text(encoding="utf-8")) + + delayed: List[Path] = [] + report: List[Dict[str, Any]] = [] + + for cue in cues: + cfg = resolve_segment_cfg(cue.index, voice_map) + + if args.ref_audio_track and not cfg.get("voice_id") and not cfg.get("reference_audio"): + ref_slice_path = work / f"seg_{cue.index:04d}_ref.wav" + if not ref_slice_path.exists(): + _run_ff([ + "ffmpeg", "-y", + "-ss", f"{cue.start_ms / 1000.0:.3f}", + "-i", str(args.ref_audio_track), + "-t", f"{cue.duration_ms / 1000.0:.3f}", + "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", + str(ref_slice_path) + ]) + cfg["reference_audio"] = str(ref_slice_path) + + 
text = cue.text + + if args.backend == "noiz" and args.auto_emotion: + text = _noiz_emotion_enhance( + args.base_url, args.api_key, cue.text, args.timeout_sec + ) + + synth_cue = Cue(cue.index, cue.start_ms, cue.end_ms, text) + raw = work / f"seg_{cue.index:04d}_raw.{args.output_format}" + norm = work / f"seg_{cue.index:04d}_norm.wav" + dly = work / f"seg_{cue.index:04d}_delay.wav" + + if args.backend == "noiz": + api_dur = _noiz_tts( + args.base_url, args.api_key, synth_cue, + cfg, args.output_format, args.timeout_sec, raw, + ) + normalize_duration_pad_trim(raw, norm, cue.duration_ms) + else: + api_dur = _kokoro_tts(synth_cue, cfg, args.output_format, raw) + normalize_duration_atempo(raw, norm, cue.duration_ms) + + delay_segment(norm, dly, cue.start_ms) + delayed.append(dly) + + seg_report: Dict[str, Any] = { + "index": cue.index, + "start_ms": cue.start_ms, + "end_ms": cue.end_ms, + "duration_ms": cue.duration_ms, + "raw_duration_sec": api_dur, + "backend": args.backend, + } + if args.backend == "noiz": + seg_report["voice_id"] = cfg.get("voice_id") + seg_report["reference_audio"] = cfg.get("reference_audio") + seg_report["emo"] = cfg.get("emo") + else: + seg_report["voice"] = cfg.get("voice") + seg_report["lang"] = cfg.get("lang") + report.append(seg_report) + + timeline_wav = work / "timeline.wav" + total_ms = max(c.end_ms for c in cues) + mix_all(delayed, timeline_wav, total_ms) + + out = Path(args.output) + if out.suffix.lower() != ".wav": + _run_ff(["ffmpeg", "-y", "-i", str(timeline_wav), str(out)]) + else: + out.parent.mkdir(parents=True, exist_ok=True) + out.write_bytes(timeline_wav.read_bytes()) + + report_path = work / "render_report.json" + report_path.write_text( + json.dumps({ + "srt": args.srt, + "output": args.output, + "backend": args.backend, + "total_ms": total_ms, + "segments": report, + }, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + print(f"Done. 
Output: {out}") + print(f"Report: {report_path}") + return 0 + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/skills/speak/scripts/text_to_srt.py b/skills/speak/scripts/text_to_srt.py new file mode 100644 index 0000000..f59f500 --- /dev/null +++ b/skills/speak/scripts/text_to_srt.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""Convert plain text to SRT with auto-estimated timings. + +Splits text into sentences, estimates duration per sentence based on +character-per-second rate, and writes a valid SRT file. +""" +import argparse +import re +import sys +from pathlib import Path +from typing import List, Tuple + + +SENTENCE_SPLIT_RE = re.compile( + r'(?<=[。!?.!?\n])\s*' +) + + +def split_sentences(text: str) -> List[str]: + raw = SENTENCE_SPLIT_RE.split(text.strip()) + sentences = [s.strip() for s in raw if s.strip()] + return sentences + + +def estimate_timings( + sentences: List[str], + chars_per_second: float, + gap_ms: int, + start_offset_ms: int = 0, +) -> List[Tuple[int, int, int, str]]: + """Return list of (index, start_ms, end_ms, text).""" + result = [] + cursor_ms = start_offset_ms + for i, sentence in enumerate(sentences, start=1): + char_count = len(sentence) + duration_ms = max(500, int(char_count / chars_per_second * 1000)) + start_ms = cursor_ms + end_ms = start_ms + duration_ms + result.append((i, start_ms, end_ms, sentence)) + cursor_ms = end_ms + gap_ms + return result + + +def ms_to_srt_time(ms: int) -> str: + total_sec, millis = divmod(ms, 1000) + total_min, sec = divmod(total_sec, 60) + hour, minute = divmod(total_min, 60) + return f"{hour:02d}:{minute:02d}:{sec:02d},{millis:03d}" + + +def write_srt(entries: List[Tuple[int, int, int, str]], path: Path) -> None: + lines = [] + for idx, start_ms, end_ms, text in entries: + lines.append(str(idx)) + lines.append(f"{ms_to_srt_time(start_ms)} --> {ms_to_srt_time(end_ms)}") + lines.append(text) + 
lines.append("") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines), encoding="utf-8") + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Convert text file to SRT with auto-estimated timings." + ) + parser.add_argument("--input", required=True, help="Input text file path") + parser.add_argument("--output", required=True, help="Output SRT file path") + parser.add_argument( + "--chars-per-second", + type=float, + default=4.0, + help="Reading speed in characters per second (default: 4.0, good for Chinese; " + "use ~15 for English)", + ) + parser.add_argument( + "--gap-ms", + type=int, + default=300, + help="Gap between segments in milliseconds (default: 300)", + ) + parser.add_argument( + "--start-offset-ms", + type=int, + default=0, + help="Timeline start offset in milliseconds (default: 0)", + ) + args = parser.parse_args() + + try: + text = Path(args.input).read_text(encoding="utf-8").strip() + if not text: + raise ValueError("Input text is empty.") + + sentences = split_sentences(text) + if not sentences: + raise ValueError("No sentences found after splitting.") + + entries = estimate_timings( + sentences, + chars_per_second=args.chars_per_second, + gap_ms=args.gap_ms, + start_offset_ms=args.start_offset_ms, + ) + write_srt(entries, Path(args.output)) + print(f"Done. {len(entries)} segments written to {args.output}") + total_ms = entries[-1][2] if entries else 0 + print(f"Total duration: {ms_to_srt_time(total_ms)}") + return 0 + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main())