feat: integrate OpenAIParser into TitleParser

2026-04-14 10:30:35 +08:00 · 2023-09-30 19:42:31 +08:00
parent d093fdba0e
commit 3f517c4d2d
3 changed files with 54 additions and 7 deletions
--- a/backend/src/module/parser/openai.py
+++ b/backend/src/module/parser/openai.py
@@ -9,7 +9,8 @@ logger = logging.getLogger(__name__)
 DEFAULT_PROMPT = """\
 You will now play the role of a super assistant. 
 Your task is to extract structured data from unstructured text content and output it in JSON format. 
-If you are unable to extract any information, please leave the field empty. Do not fabricate data!
+If you are unable to extract any information, please keep all fields and leave the field empty or default value like `''`, `None`.
+But Do not fabricate data!

 the python structured data type is:

@@ -32,13 +33,13 @@ Example:

 ```
 input: "【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]"
-output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1}'
+output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1, "title_zh": "夏日重现", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'

 input: "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
-output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2}'
+output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2, "title_zh": "古见同学有交流障碍症", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'

 input: "[Lilith-Raws] 关于我在无意间被隔壁的天使变成废柴这件事 / Otonari no Tenshi-sama - 09 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4]"
-output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1}'
+output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1, "source": "WEB-DL", "title_zh": "关于我在无意间被隔壁的天使变成废柴这件事", "sub": "CHT", "title_jp": ""}'
 ```
 """

--- a/backend/src/module/parser/title_parser.py
+++ b/backend/src/module/parser/title_parser.py
@@ -2,8 +2,10 @@ import logging

 from module.conf import settings
 from module.models import Bangumi
+from module.models.bangumi import Episode
+from module.parser.openai import OpenAIParser

-from .analyser import raw_parser, tmdb_parser, torrent_parser, mikan_parser
+from .analyser import mikan_parser, raw_parser, tmdb_parser, torrent_parser

 logger = logging.getLogger(__name__)

@@ -43,14 +45,28 @@ class TitleParser:
            logger.debug(f"TMDB Matched, official title is {tmdb_info.title}")
            bangumi.poster_link = tmdb_info.poster_link
        else:
-            logger.warning(f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead.")
+            logger.warning(
+                f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead."
+            )
            logger.warning("Please change bangumi info manually.")

    @staticmethod
    def raw_parser(raw: str) -> Bangumi | None:
        language = settings.rss_parser.language
        try:
-            episode = raw_parser(raw)
+            # use OpenAI ChatGPT to parse raw title and get structured data
+            if settings.experimental.openai_enable:
+                gpt = OpenAIParser(
+                    api_key=settings.experimental.openai_api_key,
+                    api_base=settings.experimental.openai_api_base,
+                    model=settings.experimental.openai_model,
+                )
+                episode_dict = gpt.parse(raw, asdict=True)
+                print(f"Episode dict: {episode_dict}")
+                episode = Episode(**episode_dict)
+            else:
+                episode = raw_parser(raw)
+
            titles = {
                "zh": episode.title_zh,
                "en": episode.title_en,
--- a/backend/src/test/test_title_parser.py
+++ b/backend/src/test/test_title_parser.py
@@ -0,0 +1,30 @@
+import json
+import os
+
+import pytest
+from module.conf import settings
+from module.parser.title_parser import TitleParser
+
+
+class TestTitleParser:
+    def test_parse_without_openai(self):
+        text = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"
+        result = TitleParser.raw_parser(text)
+        assert result.group_name == "梦蓝字幕组"
+        assert result.title_raw == "New Doraemon"
+        assert result.dpi == "1080P"
+        assert result.season == 1
+        assert result.subtitle == "GB_JP"
+
+    @pytest.mark.skipif(
+        not settings.experimental.openai_enable,
+        reason="OpenAI is not enabled in settings",
+    )
+    def test_parse_with_openai(self):
+        text = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"
+        result = TitleParser.raw_parser(text)
+        assert result.group_name == "梦蓝字幕组"
+        assert result.title_raw == "New Doraemon"
+        assert result.dpi == "1080P"
+        assert result.season == 1
+        assert result.subtitle == "GB_JP"