From 3f517c4d2d6243febfc1eccf3b61d5d02b4a4cd6 Mon Sep 17 00:00:00 2001 From: 100gle Date: Sat, 30 Sep 2023 19:42:31 +0800 Subject: [PATCH] feat: integrate `OpenAIParser`to `TitleParser` --- backend/src/module/parser/openai.py | 9 ++++--- backend/src/module/parser/title_parser.py | 22 ++++++++++++++--- backend/src/test/test_title_parser.py | 30 +++++++++++++++++++++++ 3 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 backend/src/test/test_title_parser.py diff --git a/backend/src/module/parser/openai.py b/backend/src/module/parser/openai.py index e14e7cc8..561e3e7a 100644 --- a/backend/src/module/parser/openai.py +++ b/backend/src/module/parser/openai.py @@ -9,7 +9,8 @@ logger = logging.getLogger(__name__) DEFAULT_PROMPT = """\ You will now play the role of a super assistant. Your task is to extract structured data from unstructured text content and output it in JSON format. -If you are unable to extract any information, please leave the field empty. Do not fabricate data! +If you are unable to extract any information, please keep all fields and leave the field empty or default value like `''`, `None`. +But Do not fabricate data! the python structured data type is: @@ -32,13 +33,13 @@ Example: ``` input: "【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]" -output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1}' +output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1, "title_zh": "夏日重现", "sub": "", "title_jp": "", "season_raw": "", "source": ""}' input: "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】" -output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2}' +output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2, "title_zh": "古见同学有交流障碍症", "sub": "", "title_jp": "", "season_raw": "", "source": ""}' input: "[Lilith-Raws] 关于我在无意间被隔壁的天使变成废柴这件事 / Otonari no Tenshi-sama - 09 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4]" -output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1}' +output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1, "source": "WEB-DL", "title_zh": "关于我在无意间被隔壁的天使变成废柴这件事", "sub": "CHT", "title_jp": ""}' ``` """ diff --git a/backend/src/module/parser/title_parser.py b/backend/src/module/parser/title_parser.py index 1730ee31..bec9e4ae 100644 --- a/backend/src/module/parser/title_parser.py +++ b/backend/src/module/parser/title_parser.py @@ -2,8 +2,10 @@ import logging from module.conf import settings from module.models import Bangumi +from module.models.bangumi import Episode +from module.parser.openai import OpenAIParser -from .analyser import raw_parser, tmdb_parser, torrent_parser, mikan_parser +from .analyser import mikan_parser, raw_parser, tmdb_parser, torrent_parser logger = logging.getLogger(__name__) @@ -43,14 +45,28 @@ class TitleParser: logger.debug(f"TMDB Matched, official title is {tmdb_info.title}") bangumi.poster_link = tmdb_info.poster_link else: - logger.warning(f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead.") + logger.warning( + f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead." + ) logger.warning("Please change bangumi info manually.") @staticmethod def raw_parser(raw: str) -> Bangumi | None: language = settings.rss_parser.language try: - episode = raw_parser(raw) + # use OpenAI ChatGPT to parse raw title and get structured data + if settings.experimental.openai_enable: + gpt = OpenAIParser( + api_key=settings.experimental.openai_api_key, + api_base=settings.experimental.openai_api_base, + model=settings.experimental.openai_model, + ) + episode_dict = gpt.parse(raw, asdict=True) + print(f"Episode dict: {episode_dict}") + episode = Episode(**episode_dict) + else: + episode = raw_parser(raw) + titles = { "zh": episode.title_zh, "en": episode.title_en, diff --git a/backend/src/test/test_title_parser.py b/backend/src/test/test_title_parser.py new file mode 100644 index 00000000..e27b60fa --- /dev/null +++ b/backend/src/test/test_title_parser.py @@ -0,0 +1,30 @@ +import json +import os + +import pytest +from module.conf import settings +from module.parser.title_parser import TitleParser + + +class TestTitleParser: + def test_parse_without_openai(self): + text = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]" + result = TitleParser.raw_parser(text) + assert result.group_name == "梦蓝字幕组" + assert result.title_raw == "New Doraemon" + assert result.dpi == "1080P" + assert result.season == 1 + assert result.subtitle == "GB_JP" + + @pytest.mark.skipif( + not settings.experimental.openai_enable, + reason="OpenAI is not enabled in settings", + ) + def test_parse_with_openai(self): + text = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]" + result = TitleParser.raw_parser(text) + assert result.group_name == "梦蓝字幕组" + assert result.title_raw == "New Doraemon" + assert result.dpi == "1080P" + assert result.season == 1 + assert result.subtitle == "GB_JP"