feat: integrate OpenAIParser into TitleParser

This commit is contained in:
100gle
2023-09-30 19:42:31 +08:00
parent d093fdba0e
commit 3f517c4d2d
3 changed files with 54 additions and 7 deletions

View File

@@ -9,7 +9,8 @@ logger = logging.getLogger(__name__)
DEFAULT_PROMPT = """\
You will now play the role of a super assistant.
Your task is to extract structured data from unstructured text content and output it in JSON format.
If you are unable to extract any information, please leave the field empty. Do not fabricate data!
If you are unable to extract any information, please keep all fields and leave the field empty or default value like `''`, `None`.
But Do not fabricate data!
the python structured data type is:
@@ -32,13 +33,13 @@ Example:
```
input: "【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]"
output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1}'
output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1, "title_zh": "夏日重现", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'
input: "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2}'
output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2, "title_zh": "古见同学有交流障碍症", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'
input: "[Lilith-Raws] 关于我在无意间被隔壁的天使变成废柴这件事 / Otonari no Tenshi-sama - 09 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4]"
output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1}'
output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1, "source": "WEB-DL", "title_zh": "关于我在无意间被隔壁的天使变成废柴这件事", "sub": "CHT", "title_jp": ""}'
```
"""

View File

@@ -2,8 +2,10 @@ import logging
from module.conf import settings
from module.models import Bangumi
from module.models.bangumi import Episode
from module.parser.openai import OpenAIParser
from .analyser import raw_parser, tmdb_parser, torrent_parser, mikan_parser
from .analyser import mikan_parser, raw_parser, tmdb_parser, torrent_parser
logger = logging.getLogger(__name__)
@@ -43,14 +45,28 @@ class TitleParser:
logger.debug(f"TMDB Matched, official title is {tmdb_info.title}")
bangumi.poster_link = tmdb_info.poster_link
else:
logger.warning(f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead.")
logger.warning(
f"Cannot match {bangumi.official_title} in TMDB. Use raw title instead."
)
logger.warning("Please change bangumi info manually.")
@staticmethod
def raw_parser(raw: str) -> Bangumi | None:
language = settings.rss_parser.language
try:
episode = raw_parser(raw)
# use OpenAI ChatGPT to parse raw title and get structured data
if settings.experimental.openai_enable:
gpt = OpenAIParser(
api_key=settings.experimental.openai_api_key,
api_base=settings.experimental.openai_api_base,
model=settings.experimental.openai_model,
)
episode_dict = gpt.parse(raw, asdict=True)
print(f"Episode dict: {episode_dict}")
episode = Episode(**episode_dict)
else:
episode = raw_parser(raw)
titles = {
"zh": episode.title_zh,
"en": episode.title_en,

View File

@@ -0,0 +1,30 @@
import json
import os
import pytest
from module.conf import settings
from module.parser.title_parser import TitleParser
class TestTitleParser:
    """Exercise ``TitleParser.raw_parser`` with the OpenAI branch off and on."""

    # Sample release title shared by both tests.
    RAW_TITLE = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"

    @staticmethod
    def _assert_parsed(result):
        """Assert the fields that both parsing paths are expected to yield."""
        # raw_parser is annotated ``Bangumi | None``; fail with a clear
        # assertion instead of an AttributeError when nothing was parsed.
        assert result is not None
        assert result.group_name == "梦蓝字幕组"
        assert result.title_raw == "New Doraemon"
        assert result.dpi == "1080P"
        assert result.season == 1
        assert result.subtitle == "GB_JP"

    def test_parse_without_openai(self, monkeypatch):
        """Parse the sample title with the OpenAI branch explicitly disabled."""
        # raw_parser picks its branch from settings.experimental.openai_enable,
        # so force the flag off; otherwise this test would take the OpenAI
        # branch whenever the local configuration enables it. monkeypatch
        # restores the original value after the test.
        monkeypatch.setattr(settings.experimental, "openai_enable", False)
        self._assert_parsed(TitleParser.raw_parser(self.RAW_TITLE))

    @pytest.mark.skipif(
        not settings.experimental.openai_enable,
        reason="OpenAI is not enabled in settings",
    )
    def test_parse_with_openai(self):
        """Parse the sample title through the OpenAI branch (skipped if disabled)."""
        self._assert_parsed(TitleParser.raw_parser(self.RAW_TITLE))