From 3f517c4d2d6243febfc1eccf3b61d5d02b4a4cd6 Mon Sep 17 00:00:00 2001
From: 100gle <loogle.space@gmail.com>
Date: Sat, 30 Sep 2023 19:42:31 +0800
Subject: [PATCH] feat: integrate `OpenAIParser` into `TitleParser`

---
 backend/src/module/parser/openai.py       |  9 ++++---
 backend/src/module/parser/title_parser.py | 22 ++++++++++++++---
 backend/src/test/test_title_parser.py     | 30 +++++++++++++++++++++++
 3 files changed, 54 insertions(+), 7 deletions(-)
 create mode 100644 backend/src/test/test_title_parser.py

diff --git a/backend/src/module/parser/openai.py b/backend/src/module/parser/openai.py
index e14e7cc8..561e3e7a 100644
--- a/backend/src/module/parser/openai.py
+++ b/backend/src/module/parser/openai.py
@@ -9,7 +9,8 @@ logger = logging.getLogger(__name__)
 DEFAULT_PROMPT = """\
 You will now play the role of a super assistant. 
 Your task is to extract structured data from unstructured text content and output it in JSON format. 
-If you are unable to extract any information, please leave the field empty. Do not fabricate data!
+If you are unable to extract any information, please keep all fields and leave the field empty or default value like `''`, `None`.
+But Do not fabricate data!
 
 the python structured data type is:
 
@@ -32,13 +33,13 @@ Example:
 
 ```
 input: "【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]"
-output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1}'
+output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1, "title_zh": "夏日重现", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'
 
 input: "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
-output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2}'
+output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2, "title_zh": "古见同学有交流障碍症", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'
 
 input: "[Lilith-Raws] 关于我在无意间被隔壁的天使变成废柴这件事 / Otonari no Tenshi-sama - 09 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4]"
-output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1}'
+output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1, "source": "WEB-DL", "title_zh": "关于我在无意间被隔壁的天使变成废柴这件事", "sub": "CHT", "title_jp": "", "season_raw": ""}'
 ```
 """
 
diff --git a/backend/src/module/parser/title_parser.py b/backend/src/module/parser/title_parser.py
index 1730ee31..bec9e4ae 100644
--- a/backend/src/module/parser/title_parser.py
+++ b/backend/src/module/parser/title_parser.py
@@ -2,8 +2,10 @@ import logging
 
 from module.conf import settings
 from module.models import Bangumi
+from module.models.bangumi import Episode
+from module.parser.openai import OpenAIParser
 
-from .analyser import raw_parser, tmdb_parser, torrent_parser, mikan_parser
+from .analyser import mikan_parser, raw_parser, tmdb_parser, torrent_parser
 
 logger = logging.getLogger(__name__)
 
@@ -43,14 +45,28 @@ class TitleParser:
             logger.debug(f"TMDB Matched, official title is {tmdb_info.title}")
             bangumi.poster_link = tmdb_info.poster_link
         else:
-            logger.warning(f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead.")
+            logger.warning(
+                f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead."
+            )
             logger.warning("Please change bangumi info manually.")
 
     @staticmethod
     def raw_parser(raw: str) -> Bangumi | None:
         language = settings.rss_parser.language
         try:
-            episode = raw_parser(raw)
+            # When enabled in settings, let OpenAIParser extract the structured fields
+            if settings.experimental.openai_enable:
+                gpt = OpenAIParser(
+                    api_key=settings.experimental.openai_api_key,
+                    api_base=settings.experimental.openai_api_base,
+                    model=settings.experimental.openai_model,
+                )
+                episode_dict = gpt.parse(raw, asdict=True)
+                logger.debug(f"Episode dict: {episode_dict}")
+                episode = Episode(**episode_dict)
+            else:
+                episode = raw_parser(raw)
+
             titles = {
                 "zh": episode.title_zh,
                 "en": episode.title_en,
diff --git a/backend/src/test/test_title_parser.py b/backend/src/test/test_title_parser.py
new file mode 100644
index 00000000..e27b60fa
--- /dev/null
+++ b/backend/src/test/test_title_parser.py
@@ -0,0 +1,30 @@
+import pytest
+from module.conf import settings
+from module.parser.title_parser import TitleParser
+
+
+class TestTitleParser:
+    def test_parse_without_openai(self, monkeypatch):
+        """Parse via the non-OpenAI `raw_parser` path, whatever the local settings say."""
+        monkeypatch.setattr(settings.experimental, "openai_enable", False)
+        text = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"
+        result = TitleParser.raw_parser(text)
+        assert result.group_name == "梦蓝字幕组"
+        assert result.title_raw == "New Doraemon"
+        assert result.dpi == "1080P"
+        assert result.season == 1
+        assert result.subtitle == "GB_JP"
+
+    @pytest.mark.skipif(
+        not settings.experimental.openai_enable,
+        reason="OpenAI is not enabled in settings",
+    )
+    def test_parse_with_openai(self):
+        """Parse via the OpenAI branch; skipped unless OpenAI is enabled in settings."""
+        text = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"
+        result = TitleParser.raw_parser(text)
+        assert result.group_name == "梦蓝字幕组"
+        assert result.title_raw == "New Doraemon"
+        assert result.dpi == "1080P"
+        assert result.season == 1
+        assert result.subtitle == "GB_JP"