diff --git a/applications/utils/translation.py b/applications/utils/translation.py index 0ee2dd9..3bd9bb9 100644 --- a/applications/utils/translation.py +++ b/applications/utils/translation.py @@ -4,7 +4,8 @@ import time import requests -import translators as ts + +from component import translators as ts def translation_lyc_text(contents): diff --git a/component/translators/__init__.py b/component/translators/__init__.py new file mode 100644 index 0000000..5989e8a --- /dev/null +++ b/component/translators/__init__.py @@ -0,0 +1,5 @@ +__version__ = "5.8.0" +__author__ = "UlionTse" + + +from .server import translate_text, translate_html, translators_pool, get_languages, preaccelerate_and_speedtest diff --git a/component/translators/server.py b/component/translators/server.py new file mode 100644 index 0000000..92c1d05 --- /dev/null +++ b/component/translators/server.py @@ -0,0 +1,5391 @@ +# coding=utf-8 +# author=UlionTse + +""" +Copyright (C) 2017-2023 UlionTse + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +Email: uliontse@outlook.com + +translators Copyright (C) 2017-2023 UlionTse +This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. +This is free software, and you are welcome to redistribute it +under certain conditions; type `show c' for details. 
class TranslatorError(Exception):
    """Error type raised by the helpers and translator backends in this module."""


class Tse:
    """Shared base class: language validation, header building and decorators."""

    def __init__(self):
        self.author = 'Ulion.Tse'
        self.all_begin_time = time.time()
        # Defaults for the `update_session_after_freq` / `update_session_after_seconds` kwargs.
        self.default_session_freq = int(1e3)
        self.default_session_seconds = 1.5e3
        # Translators for which a bare 'en' is rewritten by check_en_lang().
        self.transform_en_translator_pool = (
            'itranslate', 'lingvanex', 'myMemory', 'apertium', 'cloudYi', 'translateMe')
        self.auto_pool = ('auto', 'detect', 'auto-detect', 'all')
        self.zh_pool = ('zh', 'zh-CN', 'zh-cn', 'zh-CHS', 'zh-Hans', 'zh-Hans_CN', 'cn', 'chi', 'Chinese')

    @staticmethod
    def time_stat(func):
        """Decorator: when `if_show_time_stat` is truthy, write the call duration to stderr.

        The reported time excludes the `sleep_seconds` kwarg. The printed name is
        `func.__name__` without its last four characters.
        """

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            if_show_time_stat = kwargs.get('if_show_time_stat', False)
            show_time_stat_precision = kwargs.get('show_time_stat_precision', 2)
            sleep_seconds = kwargs.get('sleep_seconds', 0)

            if if_show_time_stat and sleep_seconds >= 0:
                # perf_counter() is monotonic, so the difference cannot go negative on clock changes.
                t1 = time.perf_counter()
                result = func(*args, **kwargs)
                t2 = time.perf_counter()
                cost_time = round((t2 - t1 - sleep_seconds), show_time_stat_precision)
                sys.stderr.write(f'TimeSpent(function: {func.__name__[:-4]}): {cost_time}s\n')
                return result
            return func(*args, **kwargs)

        return _wrapper

    @staticmethod
    def get_timestamp() -> int:
        """Return the current time in integer milliseconds."""
        return int(time.time() * 1e3)

    @staticmethod
    def get_headers(host_url: str,
                    if_api: bool = False,
                    if_referer_for_host: bool = True,
                    if_ajax_for_api: bool = True,
                    if_json_for_api: bool = False,
                    if_multipart_for_api: bool = False,
                    if_http_override_for_api: bool = False
                    ) -> dict:
        """Build request headers for the host page (`if_api=False`) or the API endpoint.

        :param host_url: str, page URL used for Referer/Host and to derive Origin.
        :param if_api: bool, return the API header set instead of the host header set.
        :param if_referer_for_host: bool, key the host URL as 'Referer' (True) or 'Host' (False).
        :param if_ajax_for_api: bool, when False drop 'X-Requested-With' and use 'text/plain'.
        :param if_json_for_api: bool, use 'application/json' as Content-Type.
        :param if_multipart_for_api: bool, drop Content-Type entirely.
        :param if_http_override_for_api: bool, add 'X-HTTP-Method-Override: GET'.
        :return: dict of headers.
        """
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
        parsed = urllib.parse.urlparse(host_url.strip('/'))
        if parsed.scheme and parsed.netloc:
            # Take the origin from the parsed URL. Splitting host_url on the path text
            # breaks when that text also occurs inside the host name
            # (e.g. 'https://translate.com/translate').
            origin = f'{parsed.scheme}://{parsed.netloc}'
        else:
            # No scheme/netloc to work with: keep the historical derivation.
            origin = host_url.split(parsed.path)[0] if parsed.path else host_url

        host_headers = {
            'Referer' if if_referer_for_host else 'Host': host_url,
            "User-Agent": user_agent,
        }
        api_headers = {
            'Origin': origin,
            'Referer': host_url,
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            "User-Agent": user_agent,
        }
        if if_api and not if_ajax_for_api:
            api_headers.pop('X-Requested-With')
            api_headers.update({'Content-Type': 'text/plain'})
        if if_api and if_json_for_api:
            api_headers.update({'Content-Type': 'application/json'})
        if if_api and if_multipart_for_api:
            api_headers.pop('Content-Type')
        if if_api and if_http_override_for_api:
            api_headers.update({'X-HTTP-Method-Override': 'GET'})
        return host_headers if not if_api else api_headers

    def check_en_lang(self, from_lang: str, to_lang: str, default_translator: Optional[str] = None,
                      default_lang: str = 'en-US') -> Tuple[str, str]:
        """Rewrite a bare 'en' to `default_lang` for translators in the transform pool.

        For 'lingvanex' any 'en-*' code is replaced by `default_lang` with '-' turned into '_'.
        """
        if default_translator and default_translator in self.transform_en_translator_pool:
            from_lang = default_lang if from_lang == 'en' else from_lang
            to_lang = default_lang if to_lang == 'en' else to_lang
            is_lingvanex = default_translator == 'lingvanex'
            if is_lingvanex and from_lang[:3] == 'en-':
                from_lang = default_lang.replace('-', '_')
            if is_lingvanex and to_lang[:3] == 'en-':
                to_lang = default_lang.replace('-', '_')
        return from_lang, to_lang

    def check_language(self,
                       from_language: str,
                       to_language: str,
                       language_map: dict,
                       output_auto: str = 'auto',
                       output_zh: str = 'zh',
                       output_en_translator: Optional[str] = None,
                       output_en: str = 'en-US',
                       if_check_lang_reverse: bool = True,
                       ) -> Tuple[str, str]:
        """Normalize the language pair and validate it against `language_map`.

        Aliases in `auto_pool` / `zh_pool` are mapped to `output_auto` / `output_zh`.

        :raises TranslatorError: unsupported source, unsupported target, unsupported
            pair, or identical source and target.
        :return: (from_language, to_language) after normalization.
        """
        if output_en_translator:
            from_language, to_language = self.check_en_lang(from_language, to_language, output_en_translator, output_en)

        from_language = output_auto if from_language in self.auto_pool else from_language
        from_language = output_zh if from_language in self.zh_pool else from_language
        to_language = output_zh if to_language in self.zh_pool else to_language

        if from_language != output_auto and from_language not in language_map:
            raise TranslatorError(
                'Unsupported from_language[{}] in {}.'.format(from_language, sorted(language_map.keys())))
        elif to_language not in language_map and if_check_lang_reverse:
            raise TranslatorError('Unsupported to_language[{}] in {}.'.format(to_language, sorted(language_map.keys())))
        elif from_language != output_auto and to_language not in language_map[from_language]:
            raise TranslatorError('Unsupported translation: from [{0}] to [{1}]!'.format(from_language, to_language))
        elif from_language == to_language:
            raise TranslatorError(f'from_language[{from_language}] and to_language[{to_language}] should not be same.')
        return from_language, to_language

    @staticmethod
    def warning_auto_lang(translator: str, default_from_language: str, if_print_warning: bool = True) -> str:
        """Optionally warn that `translator` cannot auto-detect, then return the default source language."""
        if if_print_warning:
            warn_tips = f'Unsupported [from_language=auto({default_from_language} instead)] with [{translator}]!'
            warnings.warn(f'{warn_tips} Please specify it.')
        return default_from_language

    @staticmethod
    def debug_lang_kwargs(from_language: str, to_language: str, default_from_language: str,
                          if_print_warning: bool = True) -> dict:
        """Pack the keyword arguments that `debug_language_map` reads from a wrapped call."""
        return {
            'from_language': from_language,
            'to_language': to_language,
            'default_from_language': default_from_language,
            'if_print_warning': if_print_warning,
        }

    @staticmethod
    def debug_language_map(func):
        """Decorator: if `func` raises TranslatorError, return a minimal temporary language map.

        Only TranslatorError triggers the fallback; other exceptions propagate.
        """

        def make_temp_language_map(from_language: str, to_language: str, default_from_language: str) -> dict:
            if from_language == to_language or to_language == 'auto':
                raise TranslatorError(
                    f'Unable to build a temporary language map from [{from_language}] to [{to_language}].')

            temp_language_map = {from_language: to_language}
            if from_language != 'auto':
                temp_language_map.update({to_language: from_language})
            elif default_from_language != to_language:
                temp_language_map.update({default_from_language: to_language, to_language: default_from_language})

            return temp_language_map

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except TranslatorError as e:
                if kwargs.get('if_print_warning', True):
                    warnings.warn(f'GetLanguageMapError: {str(e)}.\nThe function make_temp_language_map() works.')
                return make_temp_language_map(kwargs.get('from_language'), kwargs.get('to_language'),
                                              kwargs.get('default_from_language'))

        return _wrapper

    @staticmethod
    def check_input_limit(query_text: str, input_limit: int) -> None:
        """Raise TranslatorError when `query_text` is longer than `input_limit` characters."""
        if len(query_text) > input_limit:
            raise TranslatorError(
                f'The length of `query_text` is {len(query_text)}, above the input limit {input_limit}.')

    @staticmethod
    def check_query(func):
        """Decorator: validate, strip and optionally truncate the `query_text` argument.

        `query_text` is read from the second positional argument or the `query_text`
        keyword. Behaviour is driven by the kwargs `if_ignore_empty_query`,
        `if_ignore_limit_of_length`, `limit_of_length` (default 20000) and
        `is_detail_result`.
        """

        def check_query_text(query_text: str,
                             if_ignore_empty_query: bool,
                             if_ignore_limit_of_length: bool,
                             limit_of_length: int
                             ) -> str:
            if not isinstance(query_text, str):
                raise TranslatorError(f'The `query_text` must be a str, not {type(query_text).__name__}.')

            query_text = query_text.strip()
            qt_length = len(query_text)
            if qt_length == 0 and not if_ignore_empty_query:
                raise TranslatorError("The `query_text` can't be empty!")
            if qt_length >= limit_of_length:
                if not if_ignore_limit_of_length:
                    raise TranslatorError('The length of `query_text` exceeds the limit.')
                warnings.warn(f'The length of `query_text` is {qt_length}, above {limit_of_length}.')
                return query_text[:limit_of_length - 1]
            return query_text

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            if_ignore_empty_query = kwargs.get('if_ignore_empty_query', False)
            if_ignore_limit_of_length = kwargs.get('if_ignore_limit_of_length', False)
            limit_of_length = kwargs.get('limit_of_length', 20000)
            is_detail_result = kwargs.get('is_detail_result', False)

            query_text = args[1] if len(args) >= 2 else kwargs.get('query_text')
            query_text = check_query_text(query_text, if_ignore_empty_query, if_ignore_limit_of_length, limit_of_length)
            if not query_text and if_ignore_empty_query:
                # Nothing to translate: short-circuit without calling the backend.
                return {'data': query_text} if is_detail_result else query_text

            if len(args) >= 2:
                new_args = list(args)
                new_args[1] = query_text
                return func(*tuple(new_args), **kwargs)
            return func(*args, **{**kwargs, **{'query_text': query_text}})

        return _wrapper

    @staticmethod
    def uncertified(func):
        """Decorator: re-raise any failure of `func` as a TranslatorError pointing to the status page."""

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:  # not a bare except: let KeyboardInterrupt/SystemExit through
                raise_tips1 = f'The function {func.__name__[:-4]}() has been not certified yet.'
                raise_tips2_url = 'https://github.com/UlionTse/translators#supported-translation-services'
                raise_tips2 = f'Please read for details: Status of Translator on this webpage({raise_tips2_url}).'
                raise TranslatorError(f'{raise_tips1} {raise_tips2}') from e

        return _wrapper

    # @staticmethod
    # def certified(func):
    #     @functools.wraps(func)
    #     def _wrapper(*args, **kwargs):
    #         try:
    #             return func(*args, **kwargs)
    #         except Exception as e:
    #             raise TranslatorError(e)
    #     return _wrapper
class GuestSeverRegion(Tse):
    """Work out which region ('CN' or another country code) the caller is in."""

    def __init__(self):
        super().__init__()
        self.get_addr_url = 'https://geolocation.onetrust.com/cookieconsentpub/v1/geo/location'
        self.get_ip_url = 'https://httpbin.org/ip'
        self.ip_api_addr_url = 'http://ip-api.com/json'  # must http.
        self.ip_tb_add_url = 'https://ip.taobao.com/outGetIpInfo'
        # A region set through this environment variable skips every network lookup.
        self.default_region = os.environ.get('translators_default_region', None)

    @property
    def get_server_region(self, if_judge_cn: bool = True) -> str:
        """Return the server region, asking the user on stdin as a last resort.

        This is a property, so it is always evaluated without arguments and
        `if_judge_cn` keeps its default of True.

        :raises TranslatorError: when the lookup fails with a connection error.
        :return: str, 'CN'/'EN' for the env-var and stdin paths, otherwise the value
            reported by the geolocation service.
        """
        if self.default_region:
            sys.stderr.write(f'Using customized region {self.default_region} server backend.\n\n')
            return ('CN' if self.default_region == 'China' else 'EN') if if_judge_cn else self.default_region

        def _headers_for(url: str) -> dict:
            return self.get_headers(url, if_api=False, if_referer_for_host=True)

        try:
            try:
                # The response text is sliced ([9:-2]) before JSON decoding --
                # presumably to drop a JSONP-style wrapper; confirm against the service.
                raw_text = requests.get(self.get_addr_url, headers=_headers_for(self.get_addr_url)).text
                data = json.loads(raw_text[9:-2])
                sys.stderr.write(f'Using region {data.get("stateName")} server backend.\n\n')
                return data.get('country') if if_judge_cn else data.get("stateName")
            except requests.exceptions.Timeout:
                # NOTE(review): no `timeout=` is passed above, so this fallback is
                # probably rarely reached -- confirm against the requests documentation.
                ip_address = requests.get(self.get_ip_url, headers=_headers_for(self.get_ip_url)).json()['origin']
                payload = {'ip': ip_address, 'accessKey': 'alibaba-inc'}
                data = requests.post(url=self.ip_tb_add_url, data=payload,
                                     headers=_headers_for(self.ip_tb_add_url)).json().get('data')
                return data.get('country_id')  # region_id

        except requests.exceptions.ConnectionError as e:
            raise TranslatorError('Unable to connect the Internet.\n\n') from e
        except Exception:
            # Was a bare `except:`, which also turned Ctrl-C / SystemExit into a prompt.
            warnings.warn('Unable to find server backend.\n\n')
            region = input('Please input your server region need to visit:\neg: [Qatar, China, ...]\n\n')
            sys.stderr.write(f'Using region {region} server backend.\n\n')
            return 'CN' if region == 'China' else 'EN'
class GoogleV1(Tse):
    """Google Translate backend using the legacy `translate_a/single` endpoint and a `tk` token."""

    def __init__(self, server_region='EN'):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = None
        self.cn_host_url = 'https://translate.google.cn'
        self.en_host_url = 'https://translate.google.com'
        self.api_url = None
        self.server_region = server_region
        self.host_headers = None
        self.language_map = None
        self.session = None
        self.tkk = None  # seed scraped from the host page, cached per session
        self.query_count = 0
        self.output_zh = 'zh-CN'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @staticmethod
    def _xr(a: int, b: str) -> int:
        """Apply the shift/add/xor steps encoded in `b` (three characters per step) to `a`."""
        size_b = len(b)
        c = 0
        while c < size_b - 2:
            d = b[c + 2]
            d = ord(d[0]) - 87 if 'a' <= d else int(d)  # 'a'..'f' -> 10..15, digits as-is
            d = (a % 2 ** 32) >> d if '+' == b[c + 1] else a << d
            a = a + d & (2 ** 32 - 1) if '+' == b[c] else a ^ d
            c += 3
        return a

    @staticmethod
    def _ints(text: str) -> List[int]:
        """Return the code units of `text`, splitting code points >= 2**16 into surrogate pairs."""
        ints = []
        for v in text:
            int_v = ord(v)
            if int_v < 2 ** 16:
                ints.append(int_v)
            else:
                # unicode, emoji
                ints.append(int((int_v - 2 ** 16) / 2 ** 10 + 55296))
                ints.append(int((int_v - 2 ** 16) % 2 ** 10 + 56320))
        return ints

    def acquire(self, text: str, tkk: str) -> str:
        """Compute the `tk` request token for `text` from the page seed `tkk` ('<int>.<int>')."""
        ints = self._ints(text)
        size = len(ints)
        e = []
        g = 0

        # Expand the code units into a byte sequence (1-4 bytes per character).
        while g < size:
            l = ints[g]
            if l < 2 ** 7:  # 128(ascii)
                e.append(l)
            else:
                if l < 2 ** 11:  # 2048
                    e.append(l >> 6 | 192)
                else:
                    if (l & 64512) == 55296 and g + 1 < size and ints[g + 1] & 64512 == 56320:
                        g += 1
                        l = 65536 + ((l & 1023) << 10) + (ints[g] & 1023)
                        e.append(l >> 18 | 240)
                        e.append(l >> 12 & 63 | 128)
                    else:
                        e.append(l >> 12 | 224)
                    e.append(l >> 6 & 63 | 128)
                e.append(l & 63 | 128)
            g += 1

        b = tkk if tkk != '0' else ''
        d = b.split('.')
        b = int(d[0]) if len(d) > 1 else 0

        a = b
        for value in e:
            a += value
            a = self._xr(a, '+-a^+6')
        a = self._xr(a, '+-3^+b+-f')
        a ^= int(d[1]) if len(d) > 1 else 0
        if a < 0:
            a = (a & (2 ** 31 - 1)) + 2 ** 31
        a %= int(1E6)
        return '{}.{}'.format(a, a ^ b)

    @Tse.debug_language_map
    def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict:
        """Map every `data-language-code` found in the host page to the full code list."""
        et = lxml.etree.HTML(host_html)
        lang_list = sorted(set(et.xpath('//*/@data-language-code')))
        return dict.fromkeys(lang_list, lang_list)

    def get_tkk(self, host_html: str) -> str:
        """Extract the `tkk:'...'` seed from the host page."""
        return re.compile("tkk:'(.*?)'").findall(host_html)[0]

    @Tse.time_stat
    @Tse.check_query
    def google_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                   **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://translate.google.com, https://translate.google.cn.
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
                :param if_use_cn_host: bool, default None.
                :param reset_host_url: str, default None.
                :param if_check_reset_host_url: bool, default True.
        :return: str or dict
        :raises TranslatorError: rejected `reset_host_url`, '.cn' host, over-long input,
            or unsupported language pair.
        """

        reset_host_url = kwargs.get('reset_host_url', None)
        if reset_host_url:
            # Honour the override on every call. Falling back to the default host when
            # the same override is passed twice would silently switch hosts.
            if kwargs.get('if_check_reset_host_url', True) \
                    and not reset_host_url.startswith('https://translate.google.'):
                raise TranslatorError(
                    f'Unsupported reset_host_url[{reset_host_url}]: must start with https://translate.google.')
            self.host_url = reset_host_url.strip('/')
        else:
            use_cn_condition = kwargs.get('if_use_cn_host', None) or self.server_region == 'CN'
            self.host_url = self.cn_host_url if use_cn_condition else self.en_host_url

        if self.host_url[-2:] == 'cn':
            raise TranslatorError('Google service was offline in inland of China on Oct 2022.')

        self.host_headers = self.host_headers or self.get_headers(self.host_url, if_api=False)

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time
                and self.api_url and self.tkk):
            self.begin_time = time.time()
            self.session = requests.Session()
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text

            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            # get_language_map() takes only the page HTML positionally; passing
            # session/timeout/proxies as well raises TypeError.
            self.language_map = self.get_language_map(host_html, **debug_lang_kwargs)
            self.tkk = self.get_tkk(host_html)

        # Everything below depends on this call's text and languages, so it is
        # recomputed on every call instead of only when the session is refreshed.
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)
        tk = self.acquire(query_text, self.tkk)

        api_url_part_1 = '/translate_a/single?client={0}&sl={1}&tl={2}&hl=zh-CN&dt=at&dt=bd&dt=ex'.format(
            'webapp', from_language, to_language)
        api_url_part_2 = '&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&source=bh&ssel=0&tsel=0&kc=1'
        api_url_part_3 = '&tk={0}&q={1}'.format(tk, urllib.parse.quote(query_text))
        self.api_url = ''.join([self.host_url, api_url_part_1, api_url_part_2, api_url_part_3])  # [t,webapp]

        r = self.session.get(self.api_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        return data if is_detail_result else ''.join([item[0] for item in data[0] if isinstance(item[0], str)])
class GoogleV2(Tse):
    """Google Translate backend using the `batchexecute` RPC endpoint."""

    def __init__(self, server_region='EN'):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = None
        self.cn_host_url = 'https://translate.google.cn'
        self.en_host_url = 'https://translate.google.com'
        self.api_url = None
        self.api_url_path = '/_/TranslateWebserverUi/data/batchexecute'
        self.server_region = server_region
        self.host_headers = None
        self.api_headers = None
        self.language_map = None
        self.session = None
        self.rpcid = 'MkEWBc'
        self.query_count = 0
        self.output_zh = 'zh-CN'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict:
        """Map every `data-language-code` found in the host page to the full code list."""
        et = lxml.etree.HTML(host_html)
        lang_list = sorted(set(et.xpath('//*/@data-language-code')))
        return dict.fromkeys(lang_list, lang_list)

    def get_rpc(self, query_text: str, from_language: str, to_language: str) -> dict:
        """Build the `f.req` form field: a JSON-encoded RPC envelope around the JSON-encoded query."""
        param = json.dumps([[query_text, from_language, to_language, True], [1]])
        rpc = json.dumps([[[self.rpcid, param, None, "generic"]]])
        return {'f.req': rpc}

    def get_info(self, host_html: str) -> dict:
        """Read `bl` and `f.sid` from the page's `window.WIZ_global_data` object."""
        data_str = re.compile(r'window.WIZ_global_data = (.*?);').findall(host_html)[0]
        # NOTE(review): this evaluates text scraped from a remote page in the execjs runtime.
        data = execjs.eval(data_str)
        return {'bl': data['cfb2h'], 'f.sid': data['FdrFJe']}

    def get_consent_cookie(self, consent_html: str) -> str:  # by mercuree. merged but not verify.
        """Build the CONSENT cookie from the hidden `v` input of the consent page ('cb' if absent)."""
        et = lxml.etree.HTML(consent_html)
        input_element = et.xpath('.//input[@type="hidden"][@name="v"]')
        cookie_value = input_element[0].attrib.get('value') if input_element else 'cb'
        return f'CONSENT=YES+{cookie_value}'  # cookie CONSENT=YES+cb works for now

    @Tse.time_stat
    @Tse.check_query
    def google_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                   **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://translate.google.com, https://translate.google.cn.
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
                :param if_use_cn_host: bool, default None.
                :param reset_host_url: str, default None.
                :param if_check_reset_host_url: bool, default True.
        :return: str or dict
        :raises TranslatorError: rejected `reset_host_url`, '.cn' host, over-long input,
            or unsupported language pair.
        """

        reset_host_url = kwargs.get('reset_host_url', None)
        if reset_host_url:
            # Honour the override on every call. Falling back to the default host when
            # the same override is passed twice would silently switch hosts.
            if kwargs.get('if_check_reset_host_url', True) \
                    and not reset_host_url.startswith('https://translate.google.'):
                raise TranslatorError(
                    f'Unsupported reset_host_url[{reset_host_url}]: must start with https://translate.google.')
            self.host_url = reset_host_url.strip('/')
        else:
            use_cn_condition = kwargs.get('if_use_cn_host', None) or self.server_region == 'CN'
            self.host_url = self.cn_host_url if use_cn_condition else self.en_host_url

        if self.host_url[-2:] == 'cn':
            raise TranslatorError('Google service was offline in inland of China on Oct 2022.')

        self.api_url = f'{self.host_url}{self.api_url_path}'
        self.host_headers = self.host_headers or self.get_headers(self.host_url, if_api=False)  # reuse cookie header
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_referer_for_host=True, if_ajax_for_api=True)

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            r = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            if 'consent.google.com' == urllib.parse.urlparse(r.url).hostname:
                # Redirected to the consent page: set the consent cookie and fetch again.
                self.host_headers.update({'cookie': self.get_consent_cookie(r.text)})
                host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                             proxies=proxies).text
            else:
                host_html = r.text
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(host_html, **debug_lang_kwargs)

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        rpc_data = self.get_rpc(query_text, from_language, to_language)
        rpc_data = urllib.parse.urlencode(rpc_data)
        r = self.session.post(self.api_url, headers=self.api_headers, data=rpc_data, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        json_data = json.loads(r.text[6:])  # the first 6 characters are skipped before JSON decoding
        data = json.loads(json_data[0][2])
        time.sleep(sleep_seconds)
        self.query_count += 1
        return {'data': data} if is_detail_result else ' '.join(
            [x[0] for x in (data[1][0][0][5] or data[1][0]) if x[0]])
class BaiduV1(Tse):
    """Baidu Fanyi backend using the `/transapi` endpoint."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://fanyi.baidu.com'
        self.api_url = 'https://fanyi.baidu.com/transapi'
        self.get_lang_url = None
        self.get_lang_url_pattern = 'https://fanyi-cdn.cdn.bcebos.com/webStatic/translation/js/index.(.*?).js'
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True)
        self.language_map = None
        self.session = None
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    # @Tse.debug_language_map
    # def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict:
    #     lang_str = re.compile('langMap: {(.*?)}').search(host_html.replace('\n', '').replace(' ', '')).group()[8:]
    #     return execjs.eval(lang_str)

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                         proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict:
        """Download the language script at `lang_url` and map every language code to the full list.

        :raises TranslatorError: when the language table is not found in the script.
            The decorator turns that into a temporary language map.
        """
        js_html = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).text
        matched = re.compile('exports={auto:(.*?)}}}},').search(js_html)
        if matched is None:
            # Raise the error type the decorator handles. An AttributeError on `None`
            # would bypass the temporary-map fallback.
            raise TranslatorError('Unable to find the language table in the language script')
        lang_str = matched.group()[8:-3]
        lang_list = re.compile('(\\w+):{zhName:').findall(lang_str)
        lang_list = sorted(set(lang_list))
        return dict.fromkeys(lang_list, lang_list)

    @Tse.time_stat
    @Tse.check_query
    def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                  **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://fanyi.baidu.com
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict
        :raises TranslatorError: over-long input, unsupported language pair, or the
            language-script URL is missing from the host page.
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                 proxies=proxies)  # must twice, send cookies.
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text

            if not self.get_lang_url:
                lang_url_match = re.compile(self.get_lang_url_pattern).search(host_html)
                if lang_url_match is None:
                    raise TranslatorError('Unable to find the language script URL in the host page.')
                self.get_lang_url = lang_url_match.group()

            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.get_lang_url, self.session, self.host_headers, timeout,
                                                      proxies, **debug_lang_kwargs)

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        payload = {
            'from': from_language,
            'to': to_language,
            'query': query_text,
            'source': 'txt',
        }
        r = self.session.post(self.api_url, data=payload, headers=self.api_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        return data if is_detail_result else '\n'.join([item['dst'] for item in data['data']])
class BaiduV2(Tse):
    """Baidu Fanyi backend using the signed `/v2transapi` endpoint."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://fanyi.baidu.com'
        self.api_url = 'https://fanyi.baidu.com/v2transapi'
        self.langdetect_url = 'https://fanyi.baidu.com/langdetect'
        self.get_sign_url = 'https://fanyi-cdn.cdn.bcebos.com/static/translation/pkg/index_bd36cef.js'
        self.get_lang_url = None
        self.get_lang_url_pattern = 'https://fanyi-cdn.cdn.bcebos.com/webStatic/translation/js/index.(.*?).js'
        self.acs_url = 'https://dlswbr.baidu.com/heicha/mm/{i}/acs-{i}.js'.format(i=2060)
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True)
        self.language_map = None
        self.session = None
        self.professional_field = ('common', 'medicine', 'electronics', 'mechanics', 'novel')
        self.token = None
        self.sign = None
        self.acs_token = None
        self._host_html = None  # host page of the current session, needed to sign each query
        self._sign_html = None  # downloaded sign script, cached per session
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                         proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict:
        """Download the language script at `lang_url` and map every language code to the full list.

        :raises TranslatorError: when the language table is not found in the script.
            The decorator turns that into a temporary language map.
        """
        js_html = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).text
        matched = re.compile('exports={auto:(.*?)}}}},').search(js_html)
        if matched is None:
            # Raise the error type the decorator handles. An AttributeError on `None`
            # would bypass the temporary-map fallback.
            raise TranslatorError('Unable to find the language table in the language script')
        lang_str = matched.group()[8:-3]
        lang_list = re.compile('(\\w+):{zhName:').findall(lang_str)
        lang_list = sorted(set(lang_list))
        return dict.fromkeys(lang_list, lang_list)

    def get_sign(self, query_text: str, host_html: str, ss: SessionType, headers: dict, timeout: float,
                 proxies: dict) -> str:
        """Compute the request signature for `query_text`.

        Reads `window.gtk` from `host_html`, extracts the signing function from the
        script at `get_sign_url` (downloaded once per session) and calls it with the
        query text and gtk.
        """
        gtk_list = re.compile("""window.gtk = '(.*?)';|window.gtk = "(.*?)";""").findall(host_html)[0]
        gtk = gtk_list[0] or gtk_list[1]

        if getattr(self, '_sign_html', None) is None:
            self._sign_html = ss.get(self.get_sign_url, headers=headers, timeout=timeout, proxies=proxies).text
        sign_html = self._sign_html
        begin_label = 'define("translation:widget/translate/input/pGrab",function(r,o,t){'
        end_label = 'var i=null;t.exports=e});'
        sign_js = sign_html[sign_html.find(begin_label) + len(begin_label):sign_html.find(end_label)]
        # Expose gtk as a second parameter of the extracted function.
        sign_js = sign_js.replace('function e(r)', 'function e(r,i)')
        return execjs.compile(sign_js).call('e', query_text, gtk)

    def get_tk(self, host_html: str) -> str:
        """Extract the `token` value from the host page."""
        tk_list = re.compile("""token: '(.*?)',|token: "(.*?)",""").findall(host_html)[0]
        return tk_list[0] or tk_list[1]

    # def get_acs_token(self):
    #     pass

    @Tse.time_stat
    @Tse.check_query
    def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                  **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://fanyi.baidu.com
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
                :param professional_field: str, default 'common'. Choose from ('common', 'medicine', 'electronics', 'mechanics', 'novel')
        :return: str or dict
        :raises TranslatorError: unknown `professional_field`, over-long input, unsupported
            language pair, or the language-script URL is missing from the host page.
        """

        use_domain = kwargs.get('professional_field', 'common')
        if use_domain not in self.professional_field:  # only support zh-en, en-zh.
            raise TranslatorError(
                f'Unsupported professional_field[{use_domain}] in {self.professional_field}.')

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time
                and self.token and self.sign and self._host_html):
            self.begin_time = time.time()
            self.session = requests.Session()
            _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                 proxies=proxies)  # must twice, send cookies.
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text
            self._host_html = host_html
            self._sign_html = None  # force a fresh download of the sign script for the new session
            self.token = self.get_tk(host_html)

            if not self.get_lang_url:
                lang_url_match = re.compile(self.get_lang_url_pattern).search(host_html)
                if lang_url_match is None:
                    raise TranslatorError('Unable to find the language script URL in the host page.')
                self.get_lang_url = lang_url_match.group()

            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.get_lang_url, self.session, self.host_headers, timeout,
                                                      proxies, **debug_lang_kwargs)

        # The signature is a function of the query text, so it is recomputed for every
        # call. Reusing the value from the session refresh would sign the wrong text.
        self.sign = self.get_sign(query_text, self._host_html, self.session, self.host_headers, timeout, proxies)

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)
        if from_language == 'auto':
            res = self.session.post(self.langdetect_url, headers=self.api_headers, data={"query": query_text},
                                    timeout=timeout, proxies=proxies)
            from_language = res.json()['lan']

        params = {"from": from_language, "to": to_language}
        payload = {
            "from": from_language,
            "to": to_language,
            "query": query_text,  # from urllib.parse import quote_plus
            "transtype": "realtime",  # ["translang","realtime"]
            "simple_means_flag": "3",
            "sign": self.sign,
            "token": self.token,
            "domain": use_domain,
        }
        payload = urllib.parse.urlencode(payload).encode('utf-8')
        # self.api_headers.update({'Acs-Token': self.acs_token})
        r = self.session.post(self.api_url, params=params, data=payload, headers=self.api_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        return data if is_detail_result else '\n'.join([x['dst'] for x in data['trans_result']['data']])
+ host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.token = self.get_tk(host_html) + self.sign = self.get_sign(query_text, host_html, self.session, self.host_headers, timeout, proxies) + + if not self.get_lang_url: + self.get_lang_url = re.compile(self.get_lang_url_pattern).search(host_html).group() + + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.get_lang_url, self.session, self.host_headers, timeout, + proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + if from_language == 'auto': + res = self.session.post(self.langdetect_url, headers=self.api_headers, data={"query": query_text}, + timeout=timeout, proxies=proxies) + from_language = res.json()['lan'] + + params = {"from": from_language, "to": to_language} + payload = { + "from": from_language, + "to": to_language, + "query": query_text, # from urllib.parse import quote_plus + "transtype": "realtime", # ["translang","realtime"] + "simple_means_flag": "3", + "sign": self.sign, + "token": self.token, + "domain": use_domain, + } + payload = urllib.parse.urlencode(payload).encode('utf-8') + # self.api_headers.update({'Acs-Token': self.acs_token}) + r = self.session.post(self.api_url, params=params, data=payload, headers=self.api_headers, timeout=timeout, + proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else '\n'.join([x['dst'] for x in data['trans_result']['data']]) + + +class YoudaoV1(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://fanyi.youdao.com' + self.api_url = 'https://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' + self.language_url = 
def get_sign_key(self, host_html: str, ss: 'SessionType', timeout: float, proxies: dict) -> str:
    """
    Find the salt string that the Youdao web client appends when it signs a request.

    The versioned ``fanyi.min.js`` URL is looked up in ``host_html`` (and remembered in
    ``self.get_sign_url``); when that lookup or download fails, the script at
    ``self.get_sign_old_url`` is used instead.

    :param host_html: str, HTML of the host page.
    :param ss: requests session used for the download.
    :param timeout: float or None.
    :param proxies: dict or None.
    :return: str, the key found in the script, or the built-in default key when none is found.
    """
    try:
        if not self.get_sign_url:
            self.get_sign_url = re.compile(self.get_sign_pattern).search(host_html).group()
        r = ss.get(self.get_sign_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        r = ss.get(self.get_sign_old_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
    sign = re.compile('md5\\("fanyideskweb" \\+ e \\+ i \\+ "(.*?)"\\)').findall(r.text)
    return sign[0] if sign and sign != [''] else "Ygy_4c=r#e#4EX^NUGUc5"  # v1.1.10
+ :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.sign_key): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.sign_key = self.get_sign_key(host_html, self.session, timeout, proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.language_url, self.session, self.host_headers, timeout, + proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + form = self.get_form(query_text, from_language, to_language, self.sign_key) + r = self.session.post(self.api_url, data=form, headers=self.api_headers, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else '\n'.join( + [' '.join([it['tgt'] for it in item]) for item in data['translateResult']]) + + 
def get_sign(self, key: str, timestmp: int) -> str:
    """
    Build the ``sign`` field of a request.

    :param key: str, key that is appended to the signed text.
    :param timestmp: int, value sent as ``mysticTime``.
    :return: str, hexadecimal MD5 digest of the signed text.
    """
    digest = hashlib.md5()
    digest.update(f'client=fanyideskweb&mysticTime={timestmp}&product=webfanyi&key={key}'.encode())
    return digest.hexdigest()
def decrypt(self, cipher_text: str, decrypt_dictionary: dict) -> str:
    """
    Substitute every character of ``cipher_text`` and base64-decode the result.

    :param cipher_text: str, text to decode.
    :param decrypt_dictionary: dict, replacement string for each character of ``cipher_text``.
    :return: str, the decoded text.
    :raise KeyError: if a character of ``cipher_text`` has no entry in ``decrypt_dictionary``.
    """
    substituted = ''.join(decrypt_dictionary[symbol] for symbol in cipher_text)
    raw_bytes = base64.b64decode(substituted)
    return raw_bytes.decode()
Choose from ('0','1','2','3') + :return: str or dict + """ + + domain = kwargs.get('professional_field', '0') + if domain not in self.professional_field: + raise TranslatorError + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.secret_key): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + _ = self.session.get(self.login_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.professional_field_map = \ + self.session.get(self.domain_url, headers=self.host_headers, timeout=timeout, proxies=proxies).json()[ + 'data'] + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.language_url, self.session, self.host_headers, timeout, + proxies, **debug_lang_kwargs) + + self.get_js_url = ''.join([self.host_url, '/', re.compile(self.get_js_pattern).search(host_html).group()]) + js_html = self.session.get(self.get_js_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + + self.decode_key = re.compile('decodeKey:"(.*?)",').search(js_html).group(1) + 
class YoudaoV3(Tse):
    """Translator that posts to the Youdao demo endpoint ``https://aidemo.youdao.com/trans``."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://ai.youdao.com/product-fanyi-text.s'
        self.api_url = 'https://aidemo.youdao.com/trans'
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True)
        self.language_map = None
        self.session = None
        self.query_count = 0
        self.output_zh = 'zh-CHS'
        self.input_limit = int(1e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict:
        """
        Build the language map from the language selector of the host page.

        :param host_html: str, HTML of the host page.
        :return: dict, every listed language mapped to ``[self.output_zh]`` and
                 ``self.output_zh`` mapped to the sorted list of those languages.
        """
        pair_prefix = f'{self.output_zh}2'
        option_values = lxml.etree.HTML(host_html).xpath('//*[@id="customSelectOption"]/li/a/@val')
        # Only '<output_zh>2<lang>' options are used; the part after the first '2' is the language.
        targets = [value.split('2')[1] for value in option_values if pair_prefix in value]
        targets.sort()

        language_map = {target: [self.output_zh] for target in targets}
        language_map[self.output_zh] = targets
        return language_map

    @Tse.time_stat
    @Tse.check_query
    def youdao_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                   **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://ai.youdao.com/product-fanyi-text.s
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict
        """

        timeout = kwargs.get('timeout')
        proxies = kwargs.get('proxies')
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # The session is rebuilt every `update_session_after_freq` queries (including the
        # very first one) and whenever it is older than `update_session_after_seconds`.
        within_freq = self.query_count % update_session_after_freq != 0
        within_time = time.time() - self.begin_time < update_session_after_seconds
        if not self.session or not self.language_map or not within_freq or not within_time:
            self.begin_time = time.time()
            self.session = requests.Session()
            host_page = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies)
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(host_page.text, **debug_lang_kwargs)

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)
        if from_language == 'auto':
            # With automatic detection both fields are sent as 'Auto'.
            from_language = 'Auto'
            to_language = 'Auto'

        form = urllib.parse.urlencode({'q': query_text, 'from': from_language, 'to': to_language})
        r = self.session.post(self.api_url, data=form, headers=self.api_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data
        return data['translation'][0]
@Tse.debug_language_map
def get_language_map(self, ss: SessionType, language_url: str, timeout: Optional[float], proxies: Optional[dict],
                     **kwargs: LangMapKwargsType) -> dict:
    """
    Download the front-end script and evaluate the language-pair object found in it.

    :param ss: requests session used for the download.
    :param language_url: str, URL of the script that contains the language pairs.
    :param timeout: float or None.
    :param proxies: dict or None.
    :return: dict, the evaluated language-pair object.
    """
    response = ss.get(language_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
    response.raise_for_status()

    pair_pattern = re.compile('C={(.*?)}|languagePair = {(.*?)}', flags=re.S)  # C=
    matched = pair_pattern.search(response.text)
    # NOTE(review): the matched text comes from a remote script and is executed by the JS runtime.
    return execjs.eval(matched.group())
+ :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.qtv_qtk): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + self.qtv_qtk = self.get_qt(self.session, timeout, proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.session, self.get_language_url, timeout, proxies, + **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + payload = { + 'source': from_language, + 'target': to_language, + 'sourceText': query_text, + 'qtv': self.qtv_qtk.get('qtv', ''), + 'qtk': self.qtv_qtk.get('qtk', ''), + 'ticket': '', + 'randstr': '', + 'sessionUuid': f'translate_uuid{self.get_timestamp()}', + } + r = self.session.post(self.api_url, headers=self.api_headers, data=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + 
class QQTranSmart(Tse):
    """Client for Tencent TranSmart (``https://transmart.qq.com/api/imt``).

    A translation takes two POST requests to the same endpoint: ``text_analysis``
    to obtain sentence boundaries, then ``auto_translation`` on the pieces.
    """

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://transmart.qq.com'
        self.api_url = 'https://transmart.qq.com/api/imt'
        self.get_lang_url = None
        self.get_lang_url_pattern = '/assets/vendor.(.*?).js'  # e4c6831c
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=True)
        self.language_map = None
        self.session = None
        self.uuid = str(uuid.uuid4())
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, timeout: Optional[float], proxies: Optional[dict],
                         **kwargs: LangMapKwargsType) -> dict:
        """
        Collect the language codes listed in the ``lngs:[...]`` arrays of the vendor script.

        :param lang_url: str, URL of the vendor script.
        :param ss: requests session used for the download.
        :param timeout: float or None.
        :param proxies: dict or None.
        :return: dict, every language code mapped to the same sorted list of all codes.
        """
        js_html = ss.get(lang_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text
        lang_str_list = re.compile('lngs:\\[(.*?)]').findall(js_html)  # 'lngs:\\[(.*?)\\]'
        # NOTE(review): each array body comes from a remote script and is evaluated by the JS runtime.
        lang_groups = [execjs.eval(f'[{x}]') for x in lang_str_list]
        lang_list = sorted({lang for langs in lang_groups for lang in langs})
        return dict.fromkeys(lang_list, lang_list)

    def get_clientKey(self) -> str:
        """Return the client key: a fixed browser description, this instance's uuid and the current timestamp."""
        return f'browser-firefox-110.0.0-Windows 10-{self.uuid}-{self.get_timestamp()}'

    def split_sentence(self, data: dict) -> List[str]:
        """
        Cut ``data['text']`` at every sentence start and end offset.

        :param data: dict with ``text`` and ``sentence_list`` (items carrying ``start`` and ``len``).
        :return: list of str, the slices between consecutive offsets, i.e. the sentences
                 interleaved with whatever text lies between them.
        """
        offsets = []
        for item in data['sentence_list']:
            offsets.extend((item['start'], item['start'] + item['len']))
        return [data['text'][begin:end] for begin, end in zip(offsets, offsets[1:])]

    @Tse.time_stat
    @Tse.check_query
    def qqTranSmart_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                        **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://transmart.qq.com
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text

            if not self.get_lang_url:
                self.get_lang_url = f'{self.host_url}{re.compile(self.get_lang_url_pattern).search(host_html).group()}'
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.get_lang_url, self.session, timeout, proxies,
                                                      **debug_lang_kwargs)

        if from_language == 'auto':
            from_language = self.warning_auto_lang('qqTranSmart', self.default_from_language, if_print_warning)
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        client_key = self.get_clientKey()
        self.api_headers.update({'Cookie': f'client_key={client_key}'})

        split_payload = {
            'header': {
                'fn': 'text_analysis',
                'client_key': client_key,
            },
            'type': 'plain',
            'text': query_text,
            'normalize': {'merge_broken_line': 'false'}
        }
        split_r = self.session.post(self.api_url, json=split_payload, headers=self.api_headers, timeout=timeout,
                                    proxies=proxies)
        # Same check as for the translation request below, so that an HTTP failure
        # surfaces as such rather than as a JSON/KeyError further down.
        split_r.raise_for_status()
        text_list = self.split_sentence(split_r.json())

        api_payload = {
            'header': {
                'fn': 'auto_translation',
                'client_key': client_key,
            },
            'type': 'plain',
            'model_category': 'normal',
            'source': {
                'lang': from_language,
                'text_list': [''] + text_list + [''],
            },
            'target': {'lang': to_language}
        }
        r = self.session.post(self.api_url, json=api_payload, headers=self.api_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        return data if is_detail_result else ''.join(data['auto_translation'])
def get_dmtrack_pageid(self, host_response: 'ResponseType') -> str:
    """
    Build the ``dmtrack_pageid`` request parameter from the host page response.

    The id embedded in the page is used when present: its first 16 characters are kept
    and the following 10 are converted to hexadecimal when they form a decimal number.
    Otherwise the ``cna`` cookie (default ``"001"``) is lower-cased, stripped of
    everything but ``a-z`` and digits, and cut to 16 characters. The hexadecimal
    timestamp and nine ``'0'`` characters are appended and the result is cut to 42
    characters.

    :param host_response: response of the host page request (``.text`` and ``.cookies`` are read).
    :return: str, at most 42 characters.
    """
    try:
        e = re.compile("dmtrack_pageid='(\\w+)';").findall(host_response.text)[0]
    except Exception:
        # Best-effort lookup; ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        e = ''

    if not e:
        e = host_response.cookies.get_dict().get("cna", "001")
        e = re.compile('[^a-z\\d]').sub(repl='', string=e.lower())[:16]
    else:
        n, r = e[0:16], e[16:26]
        i = hex(int(r, 10))[2:] if re.compile('^[\\-+]?[0-9]+$').match(r) else r
        e = n + i

    s = self.get_timestamp()
    # The padding used to be built by nine rounds of hex(int(0 * 1e10))[2:], i.e. nine '0' characters.
    o = ''.join([e, hex(s)[2:], '0' * 9])
    return o[:42]
+ :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param professional_field: str, default 'message', choose from ("general","message","offer") + :return: str or dict + """ + + use_domain = kwargs.get('professional_field', 'message') + if use_domain not in self.professional_field: + raise TranslatorError + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.dmtrack_pageid): + self.begin_time = time.time() + self.session = requests.Session() + host_response = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.dmtrack_pageid = self.get_dmtrack_pageid(host_response) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.session, self.get_language_url, use_domain, + self.dmtrack_pageid, timeout, proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + payload = { + "srcLanguage": from_language, + "tgtLanguage": 
@Tse.debug_language_map
def get_language_map(self, lang_html: str, **kwargs: LangMapKwargsType) -> dict:
    """
    Build the language map from the portal's interface-text resource.

    :param lang_html: str, text fetched from `self.get_language_url`.
    :param **kwargs: LangMapKwargsType, not read by this method itself.
    :return: dict, every language code mapped to the full list of language codes.
    """
    # get_d_lang_map() performs the same extraction and filtering of (code, name)
    # pairs; its keys are the language codes, already de-duplicated and in order.
    lang_list = list(self.get_d_lang_map(lang_html))
    return {}.fromkeys(lang_list, lang_list)
-> dict: + lang_paragraph = re.compile('"en_US":{(.*?)},"zh_CN":{').search(lang_html).group().replace('",', '",\n') + lang_items = re.compile('interface.(.*?)":"(.*?)"').findall(lang_paragraph) + _fn_filter = lambda k, v: 1 if (len(k) <= 3 or (len(k) == 5 and '-' in k)) and len(v.split(' ')) <= 2 else 0 + lang_items = sorted([(k, v) for k, v in lang_items if _fn_filter(k, v)]) + return {k: v for k, v in lang_items} + + @Tse.time_stat + @Tse.check_query + def alibaba_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://translate.alibaba.com + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. 
+ :param professional_field: str, default 'message', choose from ("general",) + :return: str or dict + """ + + use_domain = kwargs.get('professional_field', 'general') + if use_domain not in self.professional_field: + raise TranslatorError + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.csrf_token): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.get_language_url = f'https:{re.compile(self.get_language_pattern).search(host_html).group()}' + lang_html = self.session.get(self.get_language_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(lang_html, **debug_lang_kwargs) + self.detail_language_map = self.get_d_lang_map(lang_html) + + _ = self.session.get(self.csrf_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.csrf_token = self.session.get(self.csrf_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).json() + self.api_headers.update({self.csrf_token['headerName']: 
def get_ig_iid(self, host_html: str) -> dict:
    """
    Extract the `IG` value from the translator page and pair it with the fixed `IID`.

    :param host_html: str, HTML text of the translator host page.
    :return: dict, {'iid': str, 'ig': str}.
    :raises IndexError: if `host_html` contains no `IG:"..."` entry.
    """
    # The page served to a plain request differs from the one a browser gets, so the
    # IID is hard-coded instead of being read from '//*[@id="tta_outGDCont"]/@data-iid'.
    iid = 'translator.5028'
    ig = re.compile('IG:"(.*?)"').findall(host_html)[0]
    return {'iid': iid, 'ig': ig}
+ :return: str or dict + """ + + use_cn_condition = kwargs.get('if_use_cn_host', None) or self.server_region == 'CN' + self.host_url = self.cn_host_url if use_cn_condition else self.en_host_url + self.api_url = self.host_url.replace('Translator', 'ttranslatev3') + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True) + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.tk and self.ig_iid): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.tk = self.get_tk(host_html) + self.ig_iid = self.get_ig_iid(host_html) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh, output_auto=self.output_auto) + + payload = { + 'text': query_text, + 'fromLang': from_language, + 'to': to_language, + 'tryFetchingGenderDebiasedTranslations': 'true' + } + payload = {**payload, 
**self.tk} + api_url_param = f'?isVertical=1&&IG={self.ig_iid["ig"]}&IID={self.ig_iid["iid"]}' + api_url = ''.join([self.api_url, api_url_param]) + r = self.session.post(api_url, headers=self.host_headers, data=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data[0] if is_detail_result else data[0]['translations'][0]['text'] + + +class Sogou(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://fanyi.sogou.com/text' + self.api_url = 'https://fanyi.sogou.com/api/transpc/text/result' + self.get_language_old_url = 'https://search.sogoucdn.com/translate/pc/static/js/app.7016e0df.js' + self.get_language_pattern = '//search.sogoucdn.com/translate/pc/static/js/vendors.(.*?).js' + self.get_language_url = None + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True) + self.language_map = None + self.uuid = None + self.session = None + self.query_count = 0 + self.output_zh = 'zh-CHS' + self.input_limit = int(5e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, host_html: str, lang_old_url: str, ss: SessionType, timeout: Optional[float], + proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict: + try: + if not self.get_language_url: + lang_url_path = re.compile(self.get_language_pattern).search(host_html).group() + self.get_language_url = ''.join(['https:', lang_url_path]) + lang_html = ss.get(self.get_language_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + except: + lang_html = ss.get(lang_old_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + + lang_list_str = re.compile('"ALL":\\[(.*?)]').search(lang_html).group().replace('!0', '1').replace('!1', '0')[ + 6:] + lang_item_list = json.loads(lang_list_str) + lang_list = [item['lang'] for item in 
def get_form(self, query_text: str, from_language: str, to_language: str, uid: str) -> dict:
    """
    Assemble the form fields of a translation request, including its signature.

    :param query_text: str, text to translate.
    :param from_language: str, source language code.
    :param to_language: str, target language code.
    :param uid: str, sent as the `uuid` field.
    :return: dict, form data for the API request.
    """
    secret_code = '109984457'  # window.__INITIAL_STATE__.common.CONFIG.secretCode
    # Signature: md5 hex digest of source language + target language + text + secret code.
    signed_text = ''.join(("", from_language, to_language, query_text, secret_code))
    signature = hashlib.md5(signed_text.encode()).hexdigest()
    return {
        "from": from_language,
        "to": to_language,
        "text": query_text,
        "uuid": uid,
        "s": signature,
        "client": "pc",  # wap
        "fr": "browser_pc",  # browser_wap
        "needQc": "1",
    }
+ :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.uuid): + self.uuid = str(uuid.uuid4()) + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, self.get_language_old_url, self.session, timeout, + proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + payload = self.get_form(query_text, from_language, to_language, self.uuid) + r = self.session.post(self.api_url, headers=self.api_headers, data=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else data['data']['translate']['dit'] + + +class Caiyun(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://fanyi.caiyunapp.com' + self.api_url = 
def crypt(self, if_de: bool = True) -> dict:
    """
    Build the character substitution table between `cipher_key` and `normal_key`.

    :param if_de: bool, default True. True maps cipher characters to normal characters,
                  False maps normal characters to cipher characters.
    :return: dict, one entry per character position of the two key strings.
    """
    source, target = self.cipher_key, self.normal_key
    if not if_de:
        source, target = target, source
    return dict(zip(source, target))
def decrypt(self, cipher_text: str) -> str:
    """
    Decode a string returned by the API back to plain text.

    :param cipher_text: str, substituted Base64 text.
    :return: str, every character mapped through `self.decrypt_dictionary`, then
             Base64-decoded.
    """
    table = self.decrypt_dictionary
    base64_text = ''.join([table[char] for char in cipher_text])
    return base64.b64decode(base64_text).decode()
+ :param professional_field: str, default None, choose from (None, "medicine","law","machinery") + :return: str or dict + """ + + use_domain = kwargs.get('professional_field', None) + if use_domain not in (None, "medicine", "law", "machinery"): + raise TranslatorError + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.tk and self.jwt): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + js_url_path = re.compile(self.get_js_pattern).search(host_html).group() + self.get_js_url = ''.join([self.host_url, js_url_path]) + js_html = self.session.get(self.get_js_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.tk = self.get_tk(js_html) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(js_html, **debug_lang_kwargs) + + self.api_headers.update({ + "app-name": "xy", + "device-id": "", + "os-type": "web", + "os-version": "", + "version": "1.8.0", + "X-Authorization": self.tk, + }) + jwt_r = self.session.post(self.get_jwt_url, json=self.browser_data, 
headers=self.api_headers, + timeout=timeout, proxies=proxies) + self.jwt = jwt_r.json()['jwt'] + self.api_headers.update({"T-Authorization": self.jwt}) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + payload = { + "cached": "true", + "dict": "true", + "media": "text", + "os_type": "web", + "replaced": "true", + "request_id": "web_fanyi", + "source": query_text.split('\n'), + "trans_type": f"{from_language}2{to_language}", + "browser_id": self.browser_data['browser_id'], + } + + if from_language == 'auto': + payload.update({'detect': 'true'}) + if use_domain: + payload.update({"dict_name": use_domain, "use_common_dict": "true"}) + + _ = self.session.options(self.api_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + r = self.session.post(self.api_url, headers=self.api_headers, json=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else '\n'.join([self.decrypt(item) for item in data['target']]) + + +class Deepl(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://www.deepl.com/translator' + self.api_url = 'https://www2.deepl.com/jsonrpc' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_ajax_for_api=False, if_json_for_api=True) + self.params = {'split': {'method': 'LMT_split_text'}, 'handle': {'method': 'LMT_handle_jobs'}} + self.request_id = int(random.randrange(100, 10000) * 10000 + 4) + self.language_map = None + self.session = None + self.query_count = 0 + self.output_zh = 'zh' + self.input_limit = int(5e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict: + lang_list = 
def context_sentences_param(self, sentences: List[str], from_language: str, to_language: str) -> dict:
    """
    Build the JSON body of the `handle` request: one translation job per sentence.

    :param sentences: List[str], sentences to translate, in order.
    :param from_language: str, sent as `source_lang_user_selected`.
    :param to_language: str, sent as `target_lang`.
    :return: dict, `self.params['handle']` merged with the `id`, `jsonrpc` and `params` entries.
    """
    # Pad both ends with '' so that every real sentence has a neighbour on each side.
    padded = [''] + sentences + ['']
    # One beam once there are at least two real sentences (padded length >= 4), else four.
    beams = 1 if len(padded) >= 4 else 4

    body = {
        'id': self.request_id + 1,
        # NOTE(review): the leading space in ' 2.0' is kept exactly as found, while
        # split_sentences_param sends '2.0' - confirm which one is intended.
        'jsonrpc': ' 2.0',
        'params': {
            'priority': 1,  # -1 if 'quality': 'fast'
            'timestamp': self.get_timestamp(),
            'commonJobParams': {
                # 'regionalVariant': 'en-US',
                'browserType': 1,
                'mode': 'translate',
            },
            'jobs': [],
            'lang': {
                'preference': {
                    'weight': {},
                    'default': 'default',
                },
                'source_lang_user_selected': from_language,  # "source_lang_computed"
                'target_lang': to_language,
            },
        },
    }

    jobs = body['params']['jobs']
    for position in range(1, len(padded) - 1):
        previous_text = padded[position - 1]
        next_text = padded[position + 1]
        jobs.append({
            'kind': 'default',
            # 'quality': 'fast',  # -1
            'sentences': [{'id': position - 1, 'prefix': '', 'text': padded[position]}],
            # All preceding real sentences, unless the direct predecessor is empty.
            'raw_en_context_before': padded[1:position] if previous_text else [],
            # Only the direct successor, unless it is empty.
            'raw_en_context_after': [next_text] if next_text else [],
            'preferred_num_beams': beams,
        })

    return {**self.params['handle'], **body}
+ :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, language_map=self.language_map, + output_zh=self.output_zh, output_auto='auto') + from_language = from_language.upper() if from_language != 'auto' else from_language + to_language = to_language.upper() if 
to_language != 'auto' else to_language + + ssp_data = self.split_sentences_param(query_text, from_language) + r_s = self.session.post(self.api_url, params=self.params['split'], json=ssp_data, headers=self.api_headers, + timeout=timeout, proxies=proxies) + r_s.raise_for_status() + s_data = r_s.json() + + s_sentences = [it['sentences'][0]['text'] for item in s_data['result']['texts'] for it in item['chunks']] + h_data = self.context_sentences_param(s_sentences, from_language, to_language) + + r_cs = self.session.post(self.api_url, params=self.params['handle'], json=h_data, headers=self.api_headers, + timeout=timeout, proxies=proxies) + r_cs.raise_for_status() + data = r_cs.json() + time.sleep(sleep_seconds) + self.request_id += 3 + self.query_count += 1 + return data if is_detail_result else '\n'.join( + item['beams'][0]['sentences'][0]["text"] for item in data['result']['translations']) + + +class Yandex(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.home_url = 'https://yandex.com' + self.host_url = 'https://translate.yandex.com' + self.api_url = 'https://translate.yandex.net/api/v1/tr.json/translate' + self.api_host = 'https://translate.yandex.net' + self.detect_language_url = 'https://translate.yandex.net/api/v1/tr.json/detect' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True) + self.api_headers.update({'Referer': self.api_host, 'x-retpath-y': self.host_url}) + self.language_map = None + self.session = None + self.sid = None + self.yu = None + self.yum = None + self.sprvk = None + self.query_count = 0 + self.output_zh = 'zh' + self.input_limit = int(1e4) # ten thousand. 
def get_sid(self, host_html: str) -> str:
    """
    Extract the session id from the host page.

    :param host_html: str, HTML text of the host page.
    :return: str, the first `SID: '...'` value with each dot-separated part reversed.
    :raises TranslatorError: if the value cannot be extracted; the message is the captcha
                             notice when the page contains it, else the original error text.
    """
    sid_pattern = re.compile("SID: '(.*?)',")
    try:
        raw_sid = sid_pattern.findall(host_html)[0]
        reversed_parts = [part[::-1] for part in raw_sid.split('.')]
    except Exception as e:
        captcha_info = 'SmartCaptcha needs verification'
        message = captcha_info if captcha_info in host_html else str(e)
        raise TranslatorError(message)
    return '.'.join(reversed_parts)
+ :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param reset_host_url: str, default None. eg: 'https://translate.yandex.fr' + :param if_check_reset_host_url: bool, default True. + :return: str or dict + """ + + reset_host_url = kwargs.get('reset_host_url', None) + if reset_host_url and reset_host_url != self.host_url: + if kwargs.get('if_check_reset_host_url', True) and not reset_host_url[:25] == 'https://translate.yandex.': + raise TranslatorError + self.host_url = reset_host_url.strip('/') + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.sid and self.yu): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.home_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, 
proxies=proxies) + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, **debug_lang_kwargs) + + self.sid = self.get_sid(host_html) + self.yum = self.get_yum() + self.yu = self.session.cookies.get_dict().get( + 'yuidss') or f'{random.randint(int(1e8), int(9e8))}{int(time.time())}' + self.sprvk = self.session.cookies.get_dict().get('spravka') + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + if from_language == 'auto': + from_language = self.detect_language(self.session, query_text, self.sid, self.yu, self.api_headers, timeout, + proxies) + + params = { + 'id': f'{self.sid}-{self.query_count}-0', + 'source_lang': from_language, + 'target_lang': to_language, + 'srv': 'tr-text', + 'reason': 'paste', # 'auto' + 'format': 'text', + 'ajax': 1, + 'yu': self.yu, + } + if self.sprvk: + params.update({'sprvk': self.sprvk, 'yum': self.yum}) + + payload = urllib.parse.urlencode({'text': query_text, 'options': 4}) + r = self.session.post(self.api_url, params=params, data=payload, headers=self.api_headers, timeout=timeout, + proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else '\n'.join(data['text']) + + +class Argos(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://translate.argosopentech.com' + self.api_url = f'{self.host_url}/translate' + self.language_url = f'{self.host_url}/languages' + self.host_headers = self.get_headers(self.host_url, if_api=False, if_ajax_for_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_ajax_for_api=False, if_json_for_api=True) + self.language_headers = 
self.get_headers(self.host_url, if_api=False, if_json_for_api=True) + self.host_pool = ['https://translate.argosopentech.com', 'https://libretranslate.de', + 'https://translate.astian.org', 'https://translate.mentality.rip', + 'https://translate.api.skitzen.com', 'https://trans.zillyhuhn.com'] + self.language_map = None + self.session = None + self.query_count = 0 + self.output_zh = 'zh' + self.input_limit = int(5e3) # unknown + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float], + proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict: + lang_list = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).json() + lang_list = sorted([lang['code'] for lang in lang_list]) + return {}.fromkeys(lang_list, lang_list) + + @Tse.time_stat + @Tse.check_query + def argos_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://translate.argosopentech.com + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param reset_host_url: str, default None. 
+ :return: str or dict + """ + + reset_host_url = kwargs.get('reset_host_url', None) + if reset_host_url and reset_host_url != self.host_url: + if reset_host_url not in self.host_pool: + raise TranslatorError + self.host_url = reset_host_url + self.api_url = f'{self.host_url}/translate' + self.language_url = f'{self.host_url}/languages' + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.language_url, self.session, self.language_headers, timeout, + proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + payload = {'q': query_text, 'source': from_language, 'target': to_language, 'format': 'text'} + r = self.session.post(self.api_url, headers=self.api_headers, json=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + 
return data if is_detail_result else data['translatedText'] + + +class Iciba(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://www.iciba.com/fy' + self.api_url = 'https://ifanyi.iciba.com/index.php' + self.host_headers = self.get_headers(self.host_url, if_api=False, if_ajax_for_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_ajax_for_api=True, if_json_for_api=False) + self.language_headers = self.get_headers(self.host_url, if_api=False, if_json_for_api=True) + self.language_map = None + self.session = None + self.s_y2 = 'ifanyiweb8hc9s98e' + self.query_count = 0 + self.output_zh = 'zh' + self.input_limit = int(3e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, api_url: str, ss: SessionType, headers: dict, timeout: Optional[float], + proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict: + params = {'c': 'trans', 'm': 'getLanguage', 'q': 0, 'type': 'en', 'str': ''} + dd = ss.get(api_url, params=params, headers=headers, timeout=timeout, proxies=proxies).json() + lang_list = sorted(list(set([lang for d in dd for lang in dd[d]]))) + return {}.fromkeys(lang_list, lang_list) + + @Tse.time_stat + @Tse.check_query + def iciba_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://www.iciba.com/fy + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. 
+ :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.api_url, self.session, self.language_headers, timeout, + proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + sign = hashlib.md5(f"6key_web_fanyi{self.s_y2}{query_text}".encode()).hexdigest()[:16] # strip() + params = {'c': 'trans', 'm': 'fy', 'client': 6, 'auth_user': 'key_web_fanyi', 'sign': sign} + payload = {'from': from_language, 'to': to_language, 'q': query_text} + r = self.session.post(self.api_url, headers=self.api_headers, params=params, data=payload, timeout=timeout, + proxies=proxies) + r.raise_for_status() + 
data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else data['content'] if data.get('isSensitive') == 1 else data['content']['out'] + + +class IflytekV1(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://saas.xfyun.cn/translate?tabKey=text' + self.api_url = 'https://saas.xfyun.cn/ai-application/trans/its' + self.language_old_url = 'https://saas.xfyun.cn/_next/static/4bzLSGCWUNl67Xal-AfIl/pages/translate.js' + self.language_url_pattern = '/_next/static/(\w+([-]?\w+))/pages/translate.js' + self.language_url = None + self.cookies_url = 'https://sso.xfyun.cn//SSOService/login/getcookies' + self.info_url = 'https://saas.xfyun.cn/ai-application/user/info' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True) + self.language_map = None + self.session = None + self.query_count = 0 + self.output_zh = 'cn' + self.input_limit = int(2e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, host_html: str, ss: SessionType, headers: dict, timeout: Optional[float], + proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict: + try: + if not self.language_url: + url_path = re.compile(self.language_url_pattern).search(host_html).group() + self.language_url = f'{self.host_url[:21]}{url_path}' + r = ss.get(self.language_url, headers=headers, timeout=timeout, proxies=proxies) + except: + r = ss.get(self.language_old_url, headers=headers, timeout=timeout, proxies=proxies) + + js_html = r.text + lang_str = re.compile('languageList:\\(e={(.*?)}').search(js_html).group()[16:] + lang_list = sorted(list(execjs.eval(lang_str).keys())) + return {}.fromkeys(lang_list, lang_list) + + @Tse.uncertified + @Tse.time_stat + @Tse.check_query + def iflytek_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) 
-> Union[str, dict]: + """ + https://saas.xfyun.cn/translate?tabKey=text + :param query_text: str, must. + :param from_language: str, default 'zh', unsupported 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + _ = self.session.get(self.cookies_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + _ = self.session.get(self.info_url, headers=self.host_headers, timeout=timeout, 
proxies=proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, self.session, self.host_headers, timeout, proxies, + **debug_lang_kwargs) + + if from_language == 'auto': + from_language = self.warning_auto_lang('iflytek', self.default_from_language, if_print_warning) + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + # cipher_query_text = base64.b64encode(query_text.encode()).decode() + cipher_query_text = query_text + payload = {'from': from_language, 'to': to_language, 'text': cipher_query_text} + r = self.session.post(self.api_url, headers=self.api_headers, data=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else json.loads(data['data'])['trans_result']['dst'] + + +class IflytekV2(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://fanyi.xfyun.cn/console/trans/text' # https://www.iflyrec.com/html/translate.html + self.api_url = 'https://fanyi.xfyun.cn/api-tran/trans/its' + self.detect_language_url = 'https://fanyi.xfyun.cn/api-tran/trans/detection' + self.language_url_pattern = '/js/trans-text/index.(.*?).js' + self.language_url = None + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True) + self.language_map = None + self.session = None + self.query_count = 0 + self.output_zh = 'cn' + self.input_limit = int(2e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, host_html: str, ss: SessionType, headers: dict, timeout: Optional[float], + proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict: + host_true_url = 
f'https://{urllib.parse.urlparse(self.host_url).hostname}' + + et = lxml.etree.HTML(host_html) + host_js_url = f"""{host_true_url}{et.xpath('//script[@type="module"]/@src')[0]}""" + host_js_html = ss.get(host_js_url, headers=headers, timeout=timeout, proxies=proxies).text + self.language_url = f"""{host_true_url}{re.compile(self.language_url_pattern).search(host_js_html).group()}""" + + lang_js_html = ss.get(self.language_url, headers=headers, timeout=timeout, proxies=proxies).text + lang_list = re.compile('languageCode:"(.*?)",').findall(lang_js_html) + lang_list = sorted(list(set(lang_list))) + return {}.fromkeys(lang_list, lang_list) + + @Tse.uncertified + @Tse.time_stat + @Tse.check_query + def iflytek_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://fanyi.xfyun.cn/console/trans/text + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. 
+ :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, self.session, self.host_headers, timeout, proxies, + **debug_lang_kwargs) + + if from_language == 'auto': + params = {'text': query_text} + detect_r = self.session.get(self.detect_language_url, params=params, headers=self.host_headers, + timeout=timeout, proxies=proxies) + from_language = detect_r.json()[ + 'data'] if detect_r.status_code == 200 and detect_r.text.strip() != '' else self.output_zh + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + payload = {'from': from_language, 'to': to_language, 'text': query_text} + r = self.session.post(self.api_url, headers=self.api_headers, data=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data 
if is_detail_result else json.loads(data['data'])['trans_result']['dst'] + + +class Iflyrec(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://fanyi.iflyrec.com' + self.api_url = 'https://fanyi.iflyrec.com/TranslationService/v1/textAutoTranslation' + self.detect_lang_url = 'https://fanyi.iflyrec.com/TranslationService/v1/languageDetection' + self.language_url = 'https://fanyi.iflyrec.com/TranslationService/v1/textTranslation/languages' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=True) + self.lang_index = {'zh': 1, 'en': 2, 'ja': 3, 'ko': 4, 'ru': 5, 'fr': 6, 'es': 7, 'vi': 8, 'yue': 9, 'ar': 12, + 'de': 13, 'it': 14} + self.lang_index_mirror = {v: k for k, v in self.lang_index.items()} + self.language_map = None + self.session = None + self.query_count = 0 + self.output_zh = 'zh' + self.input_limit = int(2e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, lang_index: dict, **kwargs: LangMapKwargsType) -> dict: + lang_list = sorted(list(lang_index.keys())) + lang_map = {lang: ['zh'] for lang in lang_list if lang != 'zh'} + return {**lang_map, **{'zh': lang_list}} + + @Tse.time_stat + @Tse.check_query + def iflyrec_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://fanyi.iflyrec.com + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. 
+ :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.lang_index, **debug_lang_kwargs) + + if from_language == 'auto': + params = {'t': self.get_timestamp()} + form = {'originalText': query_text} + detect_r = self.session.post(self.detect_lang_url, params=params, json=form, headers=self.api_headers, + timeout=timeout, proxies=proxies) + from_language_id = detect_r.json()['biz'][0]['detectionLanguage'] + from_language = self.lang_index_mirror[from_language_id] + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + + api_params = {'t': self.get_timestamp()} 
+ api_form = { + 'from': self.lang_index[from_language], + 'to': self.lang_index[to_language], + 'openTerminology': 'false', + 'contents': [{'text': t.strip(), 'frontBlankLine': 0} for t in query_text.split('\n') if t.strip() != ''], + } + r = self.session.post(self.api_url, params=api_params, json=api_form, headers=self.api_headers, timeout=timeout, + proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else '\n'.join([item['translateResult'] for item in data['biz']]) + + +class Reverso(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://www.reverso.net/text-translation' + self.api_url = 'https://api.reverso.net/translate/v1/translation' + self.language_url = None + self.language_pattern = 'https://cdn.reverso.net/trans/v(\\d).(\\d).(\\d)/main.js' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=True) + self.session = None + self.language_map = None + self.decrypt_language_map = None + self.query_count = 0 + self.output_zh = 'zh' # 'chi', because there are self.language_tran + self.input_limit = int(2e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, lang_html: str, **kwargs: LangMapKwargsType) -> dict: + lang_dict_str = re.compile('={eng:(.*?)}').search(lang_html).group()[1:] + lang_dict = execjs.eval(lang_dict_str) + lang_list = sorted(list(lang_dict.values())) + return {}.fromkeys(lang_list, lang_list) + + def decrypt_lang_map(self, lang_html: str) -> dict: + lang_dict_str = re.compile('={eng:(.*?)}').search(lang_html).group()[1:] + lang_dict = execjs.eval(lang_dict_str) + return {k: v for v, k in lang_dict.items()} + + @Tse.time_stat + @Tse.check_query + def reverso_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: 
ApiKwargsType) -> Union[str, dict]: + """ + https://www.reverso.net/text-translation + :param query_text: str, must. + :param from_language: str, default 'zh', unsupported 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.decrypt_language_map): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.language_url = re.compile(self.language_pattern).search(host_html).group() + lang_html = self.session.get(self.language_url, 
headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.decrypt_language_map = self.decrypt_lang_map(lang_html) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(lang_html, **debug_lang_kwargs) + + if from_language == 'auto': + from_language = self.warning_auto_lang('reverso', self.default_from_language, if_print_warning) + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh) + from_language, to_language = self.decrypt_language_map[from_language], self.decrypt_language_map[to_language] + + payload = { + 'format': 'text', + 'from': from_language, + 'to': to_language, + 'input': query_text, + 'options': { + 'contextResults': 'true', + 'languageDetection': 'true', + 'sentenceSplitter': 'true', + 'origin': 'translation.web', + } + } + r = self.session.post(self.api_url, json=payload, headers=self.api_headers, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else ''.join(data['translation']) + + +class Itranslate(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://itranslate.com/translate' + self.api_url = 'https://web-api.itranslateapp.com/v3/texts/translate' + self.manifest_url = 'https://itranslate-webapp-production.web.app/manifest.json' + self.language_url = None + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=True) + self.session = None + self.language_map = None + self.api_key = None + self.query_count = 0 + self.output_zh = 'zh-CN' + self.input_limit = int(1e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, lang_html: str, **kwargs: 
LangMapKwargsType) -> dict: + lang_str = re.compile('\\[{dialect:"auto",(.*?)}]').search(lang_html).group() + lang_origin_list = execjs.eval(lang_str) + lang_list = sorted(list(set([dd['dialect'] for dd in lang_origin_list]))) + return {}.fromkeys(lang_list, lang_list) + + def get_apikey(self, lang_html: str) -> str: + return re.compile('"API-KEY":"(.*?)"').findall(lang_html)[0] + + @Tse.time_stat + @Tse.check_query + def itranslate_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://itranslate.com/translate + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. 
+ :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + + if not self.language_url: + manifest_data = self.session.get(self.manifest_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).json() + self.language_url = manifest_data.get('main.js') + + lang_html = self.session.get(self.language_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(lang_html, **debug_lang_kwargs) + + self.api_key = self.get_apikey(lang_html) + self.api_headers.update({'API-KEY': self.api_key}) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh, + output_en_translator='itranslate', output_en='en-US') + + payload = { + 'source': {'dialect': from_language, 'text': query_text, 'with': ['synonyms']}, + 'target': {'dialect': to_language}, + } + r = self.session.post(self.api_url, headers=self.api_headers, 
class TranslateCom(Tse):
    """Client for the translate.com machine-translation web endpoints."""

    def __init__(self):
        """Set up endpoints, request headers and lazily-filled session state."""
        super().__init__()
        self.begin_time = time.time()  # start of the current session window
        self.host_url = 'https://www.translate.com/machine-translation'
        self.api_url = 'https://www.translate.com/translator/translate_mt'
        self.lang_detect_url = 'https://www.translate.com/translator/ajax_lang_auto_detect'
        self.language_url = 'https://www.translate.com/ajax/language/ht/all'
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=False)
        self.session = None  # created by the first request, then refreshed periodically
        self.language_map = None
        self.language_description = None  # raw JSON document fetched from language_url
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(1.5e4)  # original author's note: fifteen thousand letters left today.
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_desc: dict, **kwargs: LangMapKwargsType) -> dict:
        """Map every source language code to the target codes it supports.

        :param lang_desc: iterable of items carrying 'code' and
            'availableTranslationLanguages' (itself a list of items with 'code').
        :param **kwargs: not read here. NOTE(review): presumably consumed by the
            ``debug_language_map`` decorator -- confirm against ``Tse``.
        :return: dict, ``{source_code: [target_code, ...]}``.
        """
        return {
            source['code']: [target['code'] for target in source['availableTranslationLanguages']]
            for source in lang_desc
        }

    @Tse.time_stat
    @Tse.check_query
    def translateCom_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                         **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://www.translate.com/machine-translation
        :param query_text: str, must.
        :param from_language: str, default 'auto' (resolved through the detection endpoint).
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict (the full JSON response when ``is_detail_result`` is true).
        :raises requests.HTTPError: if the detection or the translation request
            answers with an error status.
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Reuse the session only while a language map exists, the query counter
        # is not on a refresh boundary, and the session window has not expired.
        within_freq = self.query_count % update_session_after_freq != 0
        within_time = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and within_freq and within_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            # Visit the host page first; the response itself is not used.
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            lang_r = self.session.get(self.language_url, headers=self.host_headers, timeout=timeout,
                                      proxies=proxies)
            self.language_description = lang_r.json()
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.language_description, **debug_lang_kwargs)

        if from_language == 'auto':
            detect_form = {'text_to_translate': query_text}
            r_detect = self.session.post(self.lang_detect_url, data=detect_form, headers=self.api_headers,
                                         timeout=timeout, proxies=proxies)
            # Fail with the HTTP status instead of an opaque JSON/KeyError when
            # detection is rejected, mirroring the translation request below.
            r_detect.raise_for_status()
            from_language = r_detect.json()['language']

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        payload = {
            'text_to_translate': query_text,
            'source_lang': from_language,
            'translated_lang': to_language,
            'use_cache_only': 'false',
        }
        r = self.session.post(self.api_url, data=payload, headers=self.api_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        # Original author's note: the response's translation_source is microsoft.
        return data if is_detail_result else data['translated_text']
class Papago(Tse):
    """Client for the Naver Papago web translation endpoints."""

    def __init__(self):
        """Set up endpoints, request headers and lazily-filled session state."""
        super().__init__()
        self.begin_time = time.time()  # start of the current session window
        self.host_url = 'https://papago.naver.com'
        self.api_url = 'https://papago.naver.com/apis/n2mt/translate'  # nsmt
        self.web_api_url = 'https://papago.naver.net/website'
        self.lang_detect_url = 'https://papago.naver.com/apis/langs/dect'
        self.language_url = None  # built from the host page on session refresh
        self.language_url_pattern = '/home.(.*?).chunk.js'
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=False)
        self.language_map = None
        self.session = None
        self.device_id = None  # random UUID string, regenerated on session refresh
        self.auth_key = None  # values noted by the original author: 'v1.7.1_12f919c9b5', 'v1.6.7_cc60b67557'
        self.query_count = 0
        self.output_zh = 'zh-CN'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_html: str, **kwargs: LangMapKwargsType) -> dict:
        """Extract the language codes from the ``={ALL:...}`` table in *lang_html*.

        :param lang_html: str, script source containing the ``={ALL:...}`` object.
        :param **kwargs: not read here. NOTE(review): presumably consumed by the
            ``debug_language_map`` decorator -- confirm against ``Tse``.
        :return: dict mapping every code to the same sorted list of all codes
            ('all' and 'auto' excluded).
        :raises AttributeError: if the table is not found (``search`` returns None).
        """
        table_js = re.compile('={ALL:(.*?)}').search(lang_html).group()[1:]
        # Lower-case everything, then restore the two region-cased Chinese codes.
        table_js = table_js.lower().replace('zh-cn', 'zh-CN').replace('zh-tw', 'zh-TW')
        # Keys are either quoted (,"key":) or bare (,key:); exactly one group is non-empty.
        key_pairs = re.compile(',"(.*?)":|,(.*?):').findall(table_js)
        codes = sorted(
            code for code in (quoted or bare for quoted, bare in key_pairs)
            if code not in ('all', 'auto')
        )
        return dict.fromkeys(codes, codes)

    def get_auth_key(self, lang_html: str) -> str:
        """Return the first ``AUTH_KEY:"..."`` value in *lang_html*.

        :raises IndexError: if no such assignment is present.
        """
        return re.compile('AUTH_KEY:"(.*?)"').findall(lang_html)[0]

    def get_authorization(self, url: str, auth_key: str, device_id: str, timestamp: int) -> str:
        """Build the ``PPG`` authorization header value for one request.

        The signature is the base64-encoded HMAC-MD5 of
        ``"{device_id}\\n{url}\\n{timestamp}"`` keyed with *auth_key*.

        :return: str, ``'PPG {device_id}:{signature}'``.
        """
        message = f'{device_id}\n{url}\n{timestamp}'.encode()
        digest = hmac.new(key=auth_key.encode(), msg=message, digestmod='md5').digest()
        return f'PPG {device_id}:{base64.b64encode(digest).decode()}'

    @Tse.time_stat
    @Tse.check_query
    def papago_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                   **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://papago.naver.com
        :param query_text: str, must.
        :param from_language: str, default 'auto' (resolved through the detection endpoint).
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict (the full JSON response when ``is_detail_result`` is true).
        :raises requests.HTTPError: if the detection or the translation request
            answers with an error status.
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Reuse the session only while the language map and auth key exist, the
        # query counter is not on a refresh boundary, and the window is open.
        within_freq = self.query_count % update_session_after_freq != 0
        within_time = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and within_freq and within_time and self.auth_key):
            self.device_id = str(uuid.uuid4())
            self.begin_time = time.time()
            self.session = requests.Session()
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text
            # The host page references the script that holds the language table and auth key.
            url_path = re.compile(self.language_url_pattern).search(host_html).group()
            self.language_url = ''.join([self.host_url, url_path])
            lang_html = self.session.get(self.language_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(lang_html, **debug_lang_kwargs)
            self.auth_key = self.get_auth_key(lang_html)

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        # Each request is signed separately with its own timestamp and target URL.
        detect_time = self.get_timestamp()
        detect_auth = self.get_authorization(self.lang_detect_url, self.auth_key, self.device_id, detect_time)
        detect_add_headers = {'device-type': 'pc', 'timestamp': str(detect_time), 'authorization': detect_auth}
        detect_headers = {**self.api_headers, **detect_add_headers}

        if from_language == 'auto':
            detect_form = urllib.parse.urlencode({'query': query_text})
            r_detect = self.session.post(self.lang_detect_url, headers=detect_headers, data=detect_form,
                                         timeout=timeout, proxies=proxies)
            # Fail with the HTTP status instead of an opaque JSON/KeyError when
            # detection is rejected, mirroring the translation request below.
            r_detect.raise_for_status()
            from_language = r_detect.json()['langCode']

        trans_time = self.get_timestamp()
        trans_auth = self.get_authorization(self.api_url, self.auth_key, self.device_id, trans_time)
        # The translation headers are the detection headers with a fresh
        # timestamp/signature plus the partner id.
        trans_headers = detect_headers
        trans_headers.update({
            'x-apigw-partnerid': 'papago',
            'timestamp': str(trans_time),
            'authorization': trans_auth,
        })

        payload = {
            'deviceId': self.device_id,
            'text': query_text, 'source': from_language, 'target': to_language, 'locale': 'en',
            'dict': 'true', 'dictDisplay': 30, 'honorific': 'false', 'instant': 'false', 'paging': 'false',
        }
        payload = urllib.parse.urlencode(payload)
        r = self.session.post(self.api_url, headers=trans_headers, data=payload, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        return data if is_detail_result else data['translatedText']
def get_auth(self, auth_url: str, ss: SessionType, headers: dict, timeout: float, proxies: dict) -> dict:
    """Download the script at *auth_url* and collect its ``,NAME="value"`` pairs.

    :param auth_url: str, URL of the script to fetch.
    :param ss: session used for the GET request.
    :param headers: dict, request headers.
    :param timeout: float, request timeout.
    :param proxies: dict, request proxies.
    :return: dict, ``{NAME: value}`` for every ``,NAME="value"`` occurrence;
        when a name repeats, the last occurrence wins.
    """
    response = ss.get(auth_url, headers=headers, timeout=timeout, proxies=proxies)
    assignments = re.findall(',(.*?)="(.*?)"', response.text)
    return dict(assignments)
+ :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param lingvanex_mode: str, default "B2C", choose from ("B2B", "B2C"). + :return: str or dict + """ + + mode = kwargs.get('lingvanex_mode', 'B2C') + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not ( + self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.auth_info and self.mode == mode): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.auth_info = self.get_auth(self.auth_url, self.session, self.host_headers, timeout, proxies) + + if mode not in self.model_pool: + raise TranslatorError + + if mode != self.mode: + self.mode = mode + self.api_url = ''.join([self.auth_info[f'{mode}_BASE_URL'], self.auth_info['TRANSLATE_URL']]) + 
class Mglip(Tse):
    """Client for the fy.mglip.com translation endpoint (codes: zh, mon, xle)."""

    def __init__(self):
        """Set up endpoints, request headers, the fixed language map and session state."""
        super().__init__()
        self.begin_time = time.time()  # start of the current session window
        self.host_url = 'http://fy.mglip.com/pc'  # must http
        self.api_url = 'http://fy.mglip.com/t2t'
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=False)
        # Fixed table: every supported code maps to the full list of codes.
        supported = ['zh', 'mon', 'xle']
        self.language_map = dict.fromkeys(supported, list(supported))
        self.session = None
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(5e2)
        self.default_from_language = self.output_zh

    @Tse.time_stat
    @Tse.check_query
    def mglip_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'mon',
                  **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        http://fy.mglip.com/pc
        :param query_text: str, must.
        :param from_language: str, default 'auto', equals 'zh'.
        :param to_language: str, default 'mon'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict (the full JSON response when ``is_detail_result`` is true).
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Both conditions are computed up front, before the session check.
        within_freq = self.query_count % update_session_after_freq != 0
        within_time = time.time() - self.begin_time < update_session_after_seconds
        session_usable = self.session and self.language_map and within_freq and within_time
        if not session_usable:
            self.begin_time = time.time()
            self.session = requests.Session()
            # Visit the host page first; the response itself is not used.
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)

        if from_language == 'auto':
            from_language = self.warning_auto_lang('mglip', self.default_from_language, if_print_warning)
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        form = urllib.parse.urlencode({'userInput': query_text, 'from': from_language, 'to': to_language})
        r = self.session.post(self.api_url, headers=self.api_headers, data=form, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1

        if is_detail_result:
            return data
        # The first entry holds the text under 'paragraph' when its type is
        # 'trans', otherwise under 'data'.
        first_entry = data['datas'][0]
        if first_entry['type'] == 'trans':
            return first_entry['paragraph']
        return first_entry['data']
@property
def professional_field_map(self) -> dict:
    """Return the table of professional-field presets, keyed by field name.

    Each value is a fresh ``{'category': str, 'glossary_list': list}`` dict.
    Three groups exist: fields that set only a category (including the empty
    default ``''``), fields that set only an ``ailab/<name>`` glossary, and
    fields that set both.

    :return: dict, rebuilt on every access.
    """
    category_only = ('', 'clean', 'novel', 'finance', 'biomedical')
    glossary_only = ('ai', 'menu', 'techfirm')
    category_and_glossary = ('ecommerce', 'technique')

    presets = {}
    for name in category_only:
        presets[name] = {'category': name, 'glossary_list': []}
    for name in glossary_only:
        presets[name] = {'category': '', 'glossary_list': [f'ailab/{name}']}
    for name in category_and_glossary:
        presets[name] = {'category': name, 'glossary_list': [f'ailab/{name}']}
    return presets
+ :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param professional_field: str, default '', choose from ('', 'clean') + :return: str or dict + """ + + use_domain = kwargs.get('professional_field', '') + if use_domain not in self.professional_field_map: + raise TranslatorError + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_auto=self.output_auto, output_zh=self.output_zh) + params = { + 'msToken': self.ms_token, + 'X-Bogus': self.x_bogus, + 
class ModernMt(Tse):
    """Client for the ModernMT web translation endpoint."""

    def __init__(self):
        """Set up endpoints, request headers and lazily-filled session state."""
        super().__init__()
        self.begin_time = time.time()  # start of the current session window
        self.host_url = 'https://www.modernmt.com/translate'
        self.api_url = 'https://webapi.modernmt.com/translate'
        self.language_url = 'https://www.modernmt.com/scripts/app.bundle.js'
        self.host_headers = self.get_headers(self.host_url, if_api=False)
        self.api_headers = self.get_headers(
            self.host_url,
            if_api=True,
            if_json_for_api=True,
            if_http_override_for_api=True,
        )
        self.session = None
        self.language_map = None
        self.query_count = 0
        self.output_zh = 'zh-CN'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                         proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict:
        """Fetch the script at *lang_url* and read the language table embedded in it.

        The table is the first single-quoted ``'{...}'`` JSON object found in
        the script; its keys are taken as the language codes.

        :param lang_url: str, URL of the script to download.
        :param ss: session used for the GET request.
        :param headers: dict, request headers.
        :param timeout: float or None, request timeout.
        :param proxies: dict or None, request proxies.
        :param **kwargs: not read here. NOTE(review): presumably consumed by the
            ``debug_language_map`` decorator -- confirm against ``Tse``.
        :return: dict mapping every code to the same sorted list of all codes.
        :raises AttributeError: if no quoted object is found (``search`` returns None).
        """
        bundle_js = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).text
        quoted_json = re.compile('''('{(.*?)}')''').search(bundle_js).group(0)
        language_table = json.loads(quoted_json[1:-1])  # strip the surrounding single quotes
        codes = sorted(language_table)
        return dict.fromkeys(codes, codes)

    @Tse.time_stat
    @Tse.check_query
    def modernMt_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                     **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://www.modernmt.com/translate
        :param query_text: str, must.
        :param from_language: str, default 'auto' (sent to the API as an empty source).
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict (the full JSON response when ``is_detail_result`` is true).
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Both conditions are computed up front, before the session check.
        within_freq = self.query_count % update_session_after_freq != 0
        within_time = time.time() - self.begin_time < update_session_after_seconds
        session_usable = self.session and self.language_map and within_freq and within_time
        if not session_usable:
            self.begin_time = time.time()
            self.session = requests.Session()
            # Visit the host page first; the response itself is not used.
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.language_url, self.session, self.host_headers,
                                                      timeout, proxies, **debug_lang_kwargs)

        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        timestamp = self.get_timestamp()
        # The request carries an MD5 over the web key, the timestamp and the query text.
        verify_source = f'webkey_E3sTuMjpP8Jez49GcYpDVH7r#{timestamp}#{query_text}'
        if from_language == 'auto':
            source_language = ''
        else:
            source_language = from_language
        payload = {
            'q': query_text,
            'source': source_language,
            'target': to_language,
            'ts': timestamp,
            'verify': hashlib.md5(verify_source.encode()).hexdigest(),
            'hints': '',
            'multiline': 'true',
        }
        r = self.session.post(self.api_url, json=payload, headers=self.api_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data
        return data['data']['translation']
@Tse.time_stat + @Tse.check_query + def myMemory_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', + **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://mymemory.translated.net + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param myMemory_mode: str, default "web", choose from ("web", "api"). 
+ :return: str or dict + """ + + mode = kwargs.get('myMemory_mode', 'web') + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(host_html, self.get_matecat_language_url, self.session, + self.host_headers, timeout, proxies, **debug_lang_kwargs) + + if from_language == 'auto': + from_language = self.warning_auto_lang('myMemory', self.default_from_language, if_print_warning) + from_language, to_language = self.check_language(from_language, to_language, self.language_map, + output_zh=self.output_zh, + output_en_translator='myMemory', output_en='en-GB') + + params = { + 'q': query_text, + 'langpair': f'{from_language}|{to_language}' + } + params = params if mode == 'api' else {**params, **{'mtonly': 1}} + api_url = self.api_api_url if mode == 'api' else self.api_web_url + + r = self.session.get(api_url, params=params, headers=self.host_headers, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = 
class Mirai(Tse):
    """Client for the Mirai Translate trial page (https://miraitranslate.com/trial/)."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.home_url = 'https://miraitranslate.com'
        self.host_url = 'https://miraitranslate.com/trial/'
        self.api_url = 'https://trial.miraitranslate.com/trial/api/translate.php'
        self.lang_url = None  # resolved from the host page on session refresh
        self.lang_url_pattern = 'main-es2015.(.*?).js'
        self.detect_lang_url = 'https://trial.miraitranslate.com/trial/api/detect_lang.php'
        self.trace_url = 'https://trial.miraitranslate.com/trial/api/trace.php'
        self.host_headers = self.get_headers(self.home_url, if_api=False)
        self.api_json_headers = self.get_headers(self.home_url, if_api=True, if_json_for_api=True)
        self.api_text_headers = self.get_headers(self.home_url, if_api=True, if_ajax_for_api=False)
        self.session = None
        self.language_map = None
        self.tran_key = None  # scraped from the host page on session refresh
        self.trans_id = str(uuid.uuid4())
        self.user_id = str(uuid.uuid4())
        self.lang_zh_map = {'zh-CN': 'zh', 'zh-TW': 'zt'}
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(2e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                         proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict:
        """
        Download the script at ``lang_url`` and collect the language pairs it mentions.

        :param lang_url: str, URL of the JavaScript bundle to scan.
        :param ss: session used for the request.
        :param headers: dict, request headers.
        :param timeout: float or None, forwarded to the request.
        :param proxies: dict or None, forwarded to the request.
        :param **kwargs: not read in this body; presumably consumed by ``Tse.debug_language_map``.
        :return: dict mapping each source code to the list of its target codes, in order of appearance.
        """
        js_html = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).text
        lang_pairs = re.findall(r'"/trial/(\w{2})/(\w{2})"', js_html)
        # Single pass instead of re-scanning every pair for every key; same keys, order and lists.
        language_map = {}
        for source, target in lang_pairs:
            language_map.setdefault(source, []).append(target)
        return language_map

    @Tse.uncertified
    @Tse.time_stat
    @Tse.check_query
    def mirai_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'ja',
                  **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        Translate ``query_text`` with https://miraitranslate.com/en/trial/

        :param query_text: str, must.
        :param from_language: str, default 'auto'. 'auto' triggers a call to the detection endpoint.
        :param to_language: str, default 'ja'.
        :param **kwargs:
                :param timeout: float, default None. Forwarded to every HTTP request.
                :param proxies: dict, default None. Forwarded to every HTTP request.
                :param sleep_seconds: float, default 0. Pause after the translation request.
                :param is_detail_result: bool, default False. Return the whole JSON response.
                :param update_session_after_freq: int, default ``self.default_session_freq``.
                :param update_session_after_seconds: float, default ``self.default_session_seconds``.
                :param if_print_warning: bool, default True.
                :param if_ignore_limit_of_length, limit_of_length, if_ignore_empty_query,
                       if_show_time_stat, show_time_stat_precision: not read in this body; presumably
                       handled by the ``Tse`` decorators (original docs give defaults False, 20000,
                       False, False, 2).
        :return: str or dict
        :raises TranslatorError: when the translation key or the language script name cannot be
                found in the host page.
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.tran_key):
            self.begin_time = time.time()
            self.session = requests.Session()
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text
            tran_match = re.search('var tran = "(.*?)";', host_html)
            lang_url_match = re.search(self.lang_url_pattern, host_html)
            if tran_match is None or lang_url_match is None:
                # Previously this surfaced as AttributeError on None; report the real cause instead.
                raise TranslatorError('Unable to find the translation key or language script in the host page.')
            self.tran_key = tran_match.group(1)
            self.lang_url = f'https://miraitranslate.com/trial/inmt/{lang_url_match.group()}'
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.lang_url, self.session, self.api_json_headers, timeout,
                                                      proxies, **debug_lang_kwargs)

        if from_language == 'auto':
            r = self.session.post(self.detect_lang_url, headers=self.api_json_headers, json={'text': query_text},
                                  timeout=timeout, proxies=proxies)
            detected = r.json()['language']
            # zh-CN / zh-TW are translated to the service's own codes; every other code, including
            # a 'zh...' variant missing from the map (formerly a KeyError), is used unchanged.
            from_language = self.lang_zh_map.get(detected, detected)
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)

        # Millisecond-precision UTC timestamp. timespec='milliseconds' always emits the fraction,
        # whereas slicing isoformat()[:-3] cut into the seconds when microsecond == 0.
        now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        trace_data = {
            'operationType': 'SLA',
            'lang': from_language,
            'source': query_text,
            'userId': self.user_id,
            'transId': self.trans_id,
            'uniqueId': self.tran_key,
            'date': f"{now_utc.isoformat(timespec='milliseconds')}Z",
        }
        self.session.post(self.trace_url, json=trace_data, headers=self.api_text_headers, timeout=timeout,
                          proxies=proxies)

        payload = {
            'input': query_text,
            'source': from_language,
            'target': to_language,
            'tran': self.tran_key,
            'adaptPhrases': [],
            'filter_profile': 'nmt',
            'profile': 'inmt',
            'usePrefix': 'false',
            'zt': 'true' if 'zt' in (from_language, to_language) else 'false',
            'InmtTarget': '',
            'InmtTranslateType': 'gisting',
        }
        r = self.session.post(self.api_url, data=payload, headers=self.api_text_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data
        # NOTE(review): the original read only 'ouputs', which looks like a misspelling of
        # 'outputs'; both are accepted until the live response format is confirmed.
        outputs_key = 'ouputs' if 'ouputs' in data else 'outputs'
        return data[outputs_key][0]['output'][0]['translation']
class Apertium(Tse):
    """Client for the Apertium translation service (https://www.apertium.org/)."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://www.apertium.org/'
        self.api_url = 'https://apertium.org/apy/translate'
        self.get_lang_url = 'https://www.apertium.org/index.js'
        self.detect_lang_url = 'https://apertium.org/apy/identifyLang'
        self.host_headers = self.get_headers(self.host_url, if_api=False, if_referer_for_host=True)
        self.api_headers = self.get_headers(self.host_url, if_api=True)
        self.session = None
        self.language_map = None
        self.query_count = 0
        self.output_zh = None  # unsupported
        self.output_en = 'eng'
        self.input_limit = int(1e4)  # almost no limit.
        self.default_from_language = 'spa'

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                         proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict:
        """
        Download the script at ``lang_url`` and collect the language pairs declared in it.

        :param lang_url: str, URL of the JavaScript file to scan.
        :param ss: session used for the request.
        :param headers: dict, request headers.
        :param timeout: float or None, forwarded to the request.
        :param proxies: dict or None, forwarded to the request.
        :param **kwargs: not read in this body; presumably consumed by ``Tse.debug_language_map``.
        :return: dict mapping each source code to the list of its target codes, in order of appearance.
        """
        script_text = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).text
        pair_pattern = re.compile('{sourceLanguage:"(.*?)",targetLanguage:"(.*?)"}')
        language_map = {}
        for source, target in pair_pattern.findall(script_text):
            language_map.setdefault(source, []).append(target)
        return language_map

    @Tse.time_stat
    @Tse.check_query
    def apertium_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                     **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        Translate ``query_text`` with https://www.apertium.org/

        :param query_text: str, must.
        :param from_language: str, default 'auto'. 'auto' triggers a call to the identifyLang endpoint.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None. Forwarded to every HTTP request.
                :param proxies: dict, default None. Forwarded to every HTTP request.
                :param sleep_seconds: float, default 0. Pause after the translation request.
                :param is_detail_result: bool, default False. Return the whole JSON response.
                :param update_session_after_freq: int, default ``self.default_session_freq``.
                :param update_session_after_seconds: float, default ``self.default_session_seconds``.
                :param if_print_warning: bool, default True.
                :param if_ignore_limit_of_length, limit_of_length, if_ignore_empty_query,
                       if_show_time_stat, show_time_stat_precision: not read in this body; presumably
                       handled by the ``Tse`` decorators (original docs give defaults False, 20000,
                       False, False, 2).
        :return: str or dict
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Keep the current session only while neither the query-count nor the age threshold is hit.
        freq_ok = self.query_count % update_session_after_freq != 0
        time_ok = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and freq_ok and time_ok):
            self.begin_time = time.time()
            self.session = requests.Session()
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.get_lang_url, self.session, self.host_headers, timeout,
                                                      proxies, **debug_lang_kwargs)

        if from_language == 'auto':
            detect_body = urllib.parse.urlencode({'q': query_text})
            detect_response = self.session.post(self.detect_lang_url, data=detect_body, headers=self.api_headers,
                                                timeout=timeout, proxies=proxies)
            langs = detect_response.json()
            # Take the key with the highest value in the detection response.
            ranked = sorted(langs, key=lambda code: langs[code], reverse=True)
            from_language = ranked[0]
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_en_translator='apertium', output_en=self.output_en)

        form_fields = {
            'q': query_text,
            'langpair': f'{from_language}|{to_language}',
            'prefs': '',
            'markUnknown': 'no',
        }
        r = self.session.post(self.api_url, data=urllib.parse.urlencode(form_fields), headers=self.api_headers,
                              timeout=timeout, proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data
        return data['responseData']['translatedText']
class Tilde(Tse):
    """Client for the Tilde translation service (https://translate.tilde.com/)."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://translate.tilde.com/'
        self.api_url = 'https://letsmt.eu/ws/service.svc/json/TranslateEx'
        self.get_config_url = 'https://translate.tilde.com/assets/config.local.json'  # ?version=46852
        self.subscribe_url = 'https://translate.tilde.com/assets/subscriptions-config.local.json'
        self.plausible_url = 'https://plausible.io/api/event'
        self.auth_url = 'https://auth.tilde.com/auth/realms/Tilde/protocol/openid-connect/login-status-iframe.html/init'
        self.speech_url = 'https://va.tilde.com/dl/directline/aHR0cDovL3Byb2RrOHNib3R0aWxkZTQ=/tokens/speech'
        self.host_headers = self.get_headers(self.host_url, if_api=False, if_referer_for_host=True)
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=True)
        self.session = None
        self.language_map = None
        self.langpair_ids = None
        self.config_data = None
        self.sys_data = None
        self.query_count = 0
        self.output_zh = None  # unsupported
        self.output_en = 'eng'
        self.input_limit = int(5e3)  # unknown
        self.default_from_language = 'lv'  # 'fr'

    @Tse.debug_language_map
    def get_language_map(self, sys_data: dict, **kwargs: LangMapKwargsType) -> dict:
        """
        Group the target languages of all 'General'-domain systems by source language.

        :param sys_data: dict, system list; only entries of ``sys_data['System']`` whose
                'Domain' contains 'General' are used.
        :param **kwargs: not read in this body; presumably consumed by ``Tse.debug_language_map``.
        :return: dict mapping each source code to the list of its target codes, in order of appearance.
        """
        language_map = {}
        for system in sys_data['System']:
            if 'General' not in system['Domain']:
                continue
            source_code = system['SourceLanguage']['Code']
            target_code = system['TargetLanguage']['Code']
            language_map.setdefault(source_code, []).append(target_code)
        return language_map

    def get_langpair_ids(self, sys_data: dict) -> dict:
        """
        Map '<source>-<target>' to the system ID for every 'General'-domain system.

        :param sys_data: dict, same structure as for ``get_language_map``.
        :return: dict; when a pair occurs more than once the last ID wins.
        """
        pair_ids = {}
        for system in sys_data['System']:
            if 'General' in system['Domain']:
                pair_key = f"{system['SourceLanguage']['Code']}-{system['TargetLanguage']['Code']}"
                pair_ids[pair_key] = system['ID']
        return pair_ids

    @Tse.uncertified
    @Tse.time_stat
    @Tse.check_query
    def tilde_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                  **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        Translate ``query_text`` with https://translate.tilde.com/

        :param query_text: str, must.
        :param from_language: str, default 'auto'. 'auto' is replaced through ``warning_auto_lang``.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None. Forwarded to every HTTP request.
                :param proxies: dict, default None. Forwarded to every HTTP request.
                :param sleep_seconds: float, default 0. Pause after the translation request.
                :param is_detail_result: bool, default False. Return the whole JSON response.
                :param update_session_after_freq: int, default ``self.default_session_freq``.
                :param update_session_after_seconds: float, default ``self.default_session_seconds``.
                :param if_print_warning: bool, default True.
                :param if_ignore_limit_of_length, limit_of_length, if_ignore_empty_query,
                       if_show_time_stat, show_time_stat_precision: not read in this body; presumably
                       handled by the ``Tse`` decorators (original docs give defaults False, 20000,
                       False, False, 2).
        :return: str or dict
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Keep the current session only while neither the query-count nor the age threshold is hit.
        freq_ok = self.query_count % update_session_after_freq != 0
        time_ok = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and freq_ok and time_ok):
            self.begin_time = time.time()
            self.session = requests.Session()
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            config_response = self.session.get(self.get_config_url, headers=self.host_headers, timeout=timeout,
                                               proxies=proxies)
            self.config_data = config_response.json()
            api_config = self.config_data['mt']['api']
            self.api_headers.update({'client-id': api_config['clientId']})  # must lower keyword

            system_list_url = api_config['systemListUrl']
            system_list_params = {'appID': api_config['appID'], 'uiLanguageID': api_config['uiLanguageID']}
            system_list_response = self.session.get(system_list_url, params=system_list_params,
                                                    headers=self.api_headers, timeout=timeout, proxies=proxies)
            self.sys_data = system_list_response.json()
            self.langpair_ids = self.get_langpair_ids(self.sys_data)
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.sys_data, **debug_lang_kwargs)

        if from_language == 'auto':
            from_language = self.warning_auto_lang('tilde', self.default_from_language, if_print_warning)
        from_language, to_language = self.check_language(from_language, to_language, self.language_map)

        payload = {
            'text': query_text,
            'appID': self.config_data['mt']['api']['appID'],
            'systemID': self.langpair_ids[f'{from_language}-{to_language}'],
            'options': 'widget=text,alignment,markSentences',
        }
        r = self.session.post(self.api_url, json=payload, headers=self.api_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data
        return data['translation']
class CloudYi(Tse):
    """Client for the CloudTranslation service (https://www.cloudtranslation.com)."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.home_url = 'https://www.cloudtranslation.com'
        self.host_url = 'https://www.cloudtranslation.com/#/translate'
        self.api_url = 'https://www.cloudtranslation.com/official-website/v1/transOneSrcText'
        self.get_lang_url = 'https://online.cloudtranslation.com/api/v1.0/site/get_all_language_and_domain'
        self.detect_lang_url = 'https://online.cloudtranslation.com/api/v1.0/request_translate/langid'
        self.get_cookie_url = 'https://online.cloudtranslation.com/api/v1.0/site/sites_language_list'
        self.host_headers = self.get_headers(self.home_url, if_api=False, if_referer_for_host=True)
        self.api_headers = self.get_headers(self.home_url, if_api=True, if_json_for_api=True)
        self.session = None
        self.language_map = None
        self.langpair_domain = None
        self.professional_field = None
        self.query_count = 0
        self.output_zh = 'zh-cn'
        self.output_en = 'en-us'
        self.output_auto = 'all'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, d_lang_map: dict, **kwargs: LangMapKwargsType) -> dict:
        """
        Map each source language to its target language codes.

        :param d_lang_map: dict, response of ``self.get_lang_url``; reads ``['data']['src_to_tgt']``.
        :param **kwargs: not read in this body; presumably consumed by ``Tse.debug_language_map``.
        :return: dict, source key -> list of 'language_code' values.
        """
        language_map = {}
        for source, targets in d_lang_map['data']['src_to_tgt'].items():
            language_map[source] = [target['language_code'] for target in targets]
        return language_map

    def get_langpair_domain(self, d_lang_map: dict) -> dict:
        """
        Map each language-pair key to the domain codes available for it.

        :param d_lang_map: dict, reads ``['data']['language_pair_to_domain']``.
        :return: dict, pair key -> list of 'domain_code' values.
        """
        pair_domains = {}
        for pair_key, domains in d_lang_map['data']['language_pair_to_domain'].items():
            pair_domains[pair_key] = [domain['domain_code'] for domain in domains]
        return pair_domains

    def get_professional_field_list(self, d_lang_map: dict) -> set:
        """
        Collect every distinct domain code over all language pairs.

        :param d_lang_map: dict, reads ``['data']['language_pair_to_domain']``.
        :return: set of 'domain_code' values.
        """
        fields = set()
        for domains in d_lang_map['data']['language_pair_to_domain'].values():
            fields.update(domain['domain_code'] for domain in domains)
        return fields

    @Tse.time_stat
    @Tse.check_query
    def cloudYi_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                    **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        Translate ``query_text`` with https://www.cloudtranslation.com/#/translate

        :param query_text: str, must.
        :param from_language: str, default 'auto'. 'auto' triggers a call to the langid endpoint.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None. Forwarded to every HTTP request.
                :param proxies: dict, default None. Forwarded to every HTTP request.
                :param sleep_seconds: float, default 0. Pause after the translation request.
                :param is_detail_result: bool, default False. Return the whole JSON response.
                :param update_session_after_freq: int, default ``self.default_session_freq``.
                :param update_session_after_seconds: float, default ``self.default_session_seconds``.
                :param if_print_warning: bool, default True.
                :param professional_field: str, default 'general'. Replaced by the first available
                       domain when it is not offered for the language pair.
                :param if_ignore_limit_of_length, limit_of_length, if_ignore_empty_query,
                       if_show_time_stat, show_time_stat_precision: not read in this body; presumably
                       handled by the ``Tse`` decorators (original docs give defaults False, 20000,
                       False, False, 2).
        :return: str or dict
        :raises TranslatorError: when no domain is known for the language pair.
        """

        use_domain = kwargs.get('professional_field', 'general')
        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # Keep the current session only while neither the query-count nor the age threshold is hit.
        freq_ok = self.query_count % update_session_after_freq != 0
        time_ok = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and freq_ok and time_ok):
            self.begin_time = time.time()
            self.session = requests.Session()
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            self.session.get(self.get_cookie_url, headers=self.api_headers, timeout=timeout, proxies=proxies)
            lang_response = self.session.get(self.get_lang_url, headers=self.api_headers, timeout=timeout,
                                             proxies=proxies)
            d_lang_map = lang_response.json()
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(d_lang_map, **debug_lang_kwargs)
            self.langpair_domain = self.get_langpair_domain(d_lang_map)
            self.professional_field = self.get_professional_field_list(d_lang_map)

        if from_language == 'auto':
            detect_response = self.session.post(self.detect_lang_url, json={'text': query_text},
                                                headers=self.api_headers, timeout=timeout, proxies=proxies)
            from_language = detect_response.json()['data']['language']
        # must lower
        from_language = from_language.lower()
        to_language = to_language.lower()
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh,
                                                         output_en_translator='cloudYi', output_en=self.output_en)

        domains = self.langpair_domain.get(f'{from_language}_{to_language}')
        if not domains:
            raise TranslatorError
        if use_domain not in domains:
            use_domain = domains[0]

        payload = {
            'text': query_text,
            'domain': use_domain,
            'srcLangCode': from_language,
            'tgtLangCode': to_language,
        }
        r = self.session.post(self.api_url, json=payload, headers=self.api_headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data
        return data['data']['translation']
class SysTran(Tse):
    """Client for the SYSTRAN translation box (https://www.systran.net/translate/)."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.home_url = 'https://www.systran.net'
        self.host_url = 'https://www.systran.net/translate/'
        self.api_url = 'https://api-translate.systran.net/translation/text/translate'
        self.get_lang_url = 'https://api-translate.systran.net/translation/supportedLanguages'
        self.get_token_url = 'https://translate.systran.net/oidc/token'
        self.get_client_url = 'https://www.systran.net/wp-content/themes/systran/translator/js/translateBox.bundle.js'
        self.host_headers = self.get_headers(self.home_url, if_api=False, if_referer_for_host=True)
        self.api_ajax_headers = self.get_headers(self.home_url, if_api=True, if_ajax_for_api=True)
        self.api_json_headers = self.get_headers(self.home_url, if_api=True, if_json_for_api=True)
        self.session = None
        self.language_map = None
        self.professional_field = None
        self.langpair_domain = None
        self.client_data = None
        self.token_data = None
        self.query_count = 0
        self.output_zh = 'zh'
        self.input_limit = int(5e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, d_lang_map: dict, **kwargs: LangMapKwargsType) -> dict:
        """
        Group the supported target languages by source language.

        :param d_lang_map: dict, response of ``self.get_lang_url``; reads ``['languagePairs']``.
        :param **kwargs: not read in this body; presumably consumed by ``Tse.debug_language_map``.
        :return: dict mapping each 'source' to the list of its 'target' values, in order of appearance.
        """
        # Single pass instead of re-scanning all pairs for every pair; same keys, order and lists.
        language_map = {}
        for pair in d_lang_map['languagePairs']:
            language_map.setdefault(pair['source'], []).append(pair['target'])
        return language_map

    def get_professional_field_list(self, d_lang_map: dict) -> set:
        """
        Collect every distinct profile domain over all language pairs.

        :param d_lang_map: dict, reads ``['languagePairs'][i]['profiles'][j]['selectors']['domain']``.
        :return: set of domain names.
        """
        return {profile['selectors']['domain']
                for pair in d_lang_map['languagePairs'] for profile in pair['profiles']}

    def get_langpair_domain(self, d_lang_map: dict) -> dict:
        """
        Index profile selectors by '<source>__<target>__<domain>'.

        :param d_lang_map: dict, reads ``['languagePairs']`` and each pair's 'profiles'.
        :return: dict, key -> {'domain', 'owner', 'size'}; for duplicate keys the last profile wins.
        """
        pair_domains = {}
        for pair in d_lang_map['languagePairs']:
            for profile in pair['profiles']:
                selectors = profile['selectors']
                key = f'{pair["source"]}__{pair["target"]}__{selectors["domain"]}'
                pair_domains[key] = {
                    'domain': selectors['domain'],
                    'owner': selectors['owner'],
                    'size': selectors['size'],
                }
        return pair_domains

    def get_client_data(self, client_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                        proxies: Optional[dict]) -> dict:
        """
        Scrape the OIDC client credentials from the script at ``client_url``.

        :param client_url: str, URL of the JavaScript bundle to scan.
        :param ss: session used for the request.
        :param headers: dict, request headers.
        :param timeout: float or None, forwarded to the request.
        :param proxies: dict or None, forwarded to the request.
        :return: dict with 'grant_type', 'client_id' and 'client_secret'.
        :raises TranslatorError: when the credentials pattern is not found in the script.
        """
        js_html = ss.get(client_url, headers=headers, timeout=timeout, proxies=proxies).text
        credentials = re.search(r'"https://translate.systran.net/oidc",\w="(.*?)",\w="(.*?)";', js_html)
        if credentials is None:
            # Previously this surfaced as AttributeError on None; report the real cause instead.
            raise TranslatorError('Unable to find the client credentials in the SYSTRAN script.')
        return {
            'grant_type': 'client_credentials',
            'client_id': credentials.group(1),
            'client_secret': credentials.group(2),
        }

    @Tse.time_stat
    @Tse.check_query
    def sysTran_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                    **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        Translate ``query_text`` with https://www.systran.net/translate/

        :param query_text: str, must. Split on newlines; blank paragraphs are not sent.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None. Forwarded to every HTTP request.
                :param proxies: dict, default None. Forwarded to every HTTP request.
                :param sleep_seconds: float, default 0. Pause after the translation request.
                :param is_detail_result: bool, default False. Return the whole JSON response.
                :param update_session_after_freq: int, default ``self.default_session_freq``.
                :param update_session_after_seconds: float, default ``self.default_session_seconds``.
                :param if_print_warning: bool, default True.
                :param professional_field: str, default 'Generic'. A falsy value skips the
                       domain selection.
                :param if_ignore_limit_of_length, limit_of_length, if_ignore_empty_query,
                       if_show_time_stat, show_time_stat_precision: not read in this body; presumably
                       handled by the ``Tse`` decorators (original docs give defaults False, 20000,
                       False, False, 2).
        :return: str or dict
        :raises TranslatorError: when ``professional_field`` is not available for the language pair.
        """

        use_domain = kwargs.get('professional_field', 'Generic')
        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0
        not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            self.client_data = self.get_client_data(self.get_client_url, self.session, self.host_headers, timeout,
                                                    proxies)
            token_request = urllib.parse.urlencode(self.client_data)
            self.token_data = self.session.post(self.get_token_url, data=token_request,
                                                headers=self.api_ajax_headers, timeout=timeout,
                                                proxies=proxies).json()

            # All later API calls authenticate with the freshly issued token.
            self.api_json_headers.update({
                'authorization': f'{self.token_data["token_type"]} {self.token_data["access_token"]}',
                'x-user-agent': 'File Translate Box Portable',
            })

            d_lang_map = self.session.get(self.get_lang_url, headers=self.api_json_headers, timeout=timeout,
                                          proxies=proxies).json()
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(d_lang_map, **debug_lang_kwargs)
            self.professional_field = self.get_professional_field_list(d_lang_map)
            self.langpair_domain = self.get_langpair_domain(d_lang_map)

        # NOTE(review): unlike the sibling translators, the language check runs before 'auto'
        # is replaced; order kept as in the original.
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         output_zh=self.output_zh)
        if from_language == 'auto':
            from_language = self.warning_auto_lang('sysTran', self.default_from_language, if_print_warning)

        payload = {
            'target': to_language,
            'source': from_language if from_language != 'auto' else None,
            'inputs': [paragraph for paragraph in query_text.split('\n') if paragraph.strip()],
            'format': 'text/plain',
            'autodetectionMode': 'single',
            'withInfo': 'true',
            'withAnnotations': 'true',
            'profileId': None,
            'domain': None,
            'owner': None,
            'size': None,
        }
        if use_domain and from_language != 'auto':
            domain_payload = self.langpair_domain.get(f'{from_language}__{to_language}__{use_domain}')
            if not domain_payload:
                raise TranslatorError(
                    f'Unsupported professional_field {use_domain!r} for {from_language} -> {to_language}.')
            payload.update(domain_payload)

        r = self.session.post(self.api_url, json=payload, headers=self.api_json_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        if is_detail_result:
            return data

        # One output item per input paragraph: sentences are joined by spaces, paragraphs by newlines.
        translated_paragraphs = []
        for item in data['outputs']:
            sentences = item['output']['documents'][0]['trans_units'][0]['sentences']
            translated_paragraphs.append(
                ' '.join(sentence['alt_transes'][0]['target']['text'] for sentence in sentences))
        return '\n'.join(translated_paragraphs)
class TranslateMe(Tse):
    """Client for the web translator at https://translateme.network/.

    Requests go to the site's ``admin-ajax.php`` endpoint, one POST per
    non-blank paragraph of the query. ``translateMe_api`` is the public entry
    point: when neither side of the requested pair is English it chains two
    ``_translateMe_api`` calls, using English as the pivot language.
    """

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.host_url = 'https://translateme.network/'
        self.api_url = 'https://translateme.network/wp-admin/admin-ajax.php'
        self.host_headers = self.get_headers(self.host_url, if_api=False, if_referer_for_host=True)
        self.api_headers = self.get_headers(self.host_url, if_api=True, if_ajax_for_api=True)
        self.session = None
        self.language_map = None
        self.query_count = 0
        self.output_zh = 'Chinese'
        self.output_en = 'English'
        self.input_limit = int(1e2)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict:
        """
        Build the language map from the ``data-lang`` attributes of the host page.
        :param host_html: str, html of the host page.
        :return: dict, every language mapped to the full (sorted, de-duplicated) language list.
        :raise TranslatorError: if the page exposes no ``data-lang`` attribute.
        """
        lang_list = sorted(set(re.findall('data-lang="(.*?)"', host_html)))
        if not lang_list:
            raise TranslatorError('No `data-lang` attribute found in the translateMe host page.')
        return dict.fromkeys(lang_list, lang_list)

    def _prepare_request(self, query_text: str, from_language: str, to_language: str,
                         kwargs: dict) -> Tuple[str, str]:
        """
        Shared preamble of both api methods: enforce the input limit, (re)create
        the session and language map when they are missing or stale, then
        resolve and validate the language pair.
        :param query_text: str, text about to be translated.
        :param from_language: str, requested source language, may be 'auto'.
        :param to_language: str, requested target language.
        :param kwargs: dict, the keyword options passed to the calling api method.
        :return: tuple, the checked (from_language, to_language).
        """
        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        if_print_warning = kwargs.get('if_print_warning', True)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = self.query_count % update_session_after_freq != 0
        not_update_cond_time = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout,
                                         proxies=proxies).text
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(host_html, **debug_lang_kwargs)

        if from_language == 'auto':
            from_language = self.warning_auto_lang('translateMe', self.default_from_language, if_print_warning)
        return self.check_language(from_language, to_language, self.language_map,
                                   output_zh=self.output_zh,
                                   output_en_translator='translateMe', output_en=self.output_en)

    # The decorators below are left disabled on the private single-hop method;
    # presumably because `translateMe_api` already applies them -- confirm before re-enabling.
    # @Tse.uncertified
    # @Tse.time_stat
    # @Tse.check_query
    def _translateMe_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                         **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://translateme.network/
        Single-hop translation; one side of the pair must be English.
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs: same options as `translateMe_api`.
        :return: str or dict
        :raise TranslatorError: if English is neither the source nor the target language.
        """
        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        is_detail_result = kwargs.get('is_detail_result', False)

        from_language, to_language = self._prepare_request(query_text, from_language, to_language, kwargs)
        if self.output_en not in (from_language, to_language):
            raise TranslatorError('Must use English as an intermediate translation.')

        data_list = []
        for paragraph in query_text.split('\n'):
            if not paragraph.strip():
                continue  # blank paragraphs are not sent to the server
            payload = urllib.parse.urlencode({
                'text': paragraph,
                'lang_from': from_language,
                'lang_to': to_language,
                'action': 'tm_my_action',
                'type': 'convert',
            })
            r = self.session.post(self.api_url, data=payload, headers=self.api_headers, timeout=timeout,
                                  proxies=proxies)
            r.raise_for_status()
            data_list.append(r.json())
        time.sleep(sleep_seconds)
        self.query_count += 1
        return {'data': data_list} if is_detail_result else '\n'.join(item['to'] for item in data_list)

    @Tse.uncertified
    @Tse.time_stat
    @Tse.check_query
    def translateMe_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                        **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://translateme.network/
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict
        """
        from_language, to_language = self._prepare_request(query_text, from_language, to_language, kwargs)

        if self.output_en in (from_language, to_language):
            return self._translateMe_api(query_text, from_language, to_language, **kwargs)

        # Neither side is English: pivot through English. The first hop must
        # return plain text (it is the input of the second hop) and stays silent.
        tmp_kwargs = {**kwargs, 'is_detail_result': False, 'if_show_time_stat': False}
        next_query_text = self._translateMe_api(query_text, from_language, self.output_en, **tmp_kwargs)
        return self._translateMe_api(next_query_text, self.output_en, to_language, **kwargs)
def get_langpair_domain(self, dd: dict) -> dict:
    """
    Index the translation engine of every language pair by pair and model.
    :param dd: dict, must contain 'language_pairs', a list of pair descriptions.
    :return: dict, '{source}__{target}__{model}' -> {'translation_engine': engine pk}.
    """
    engine_by_pair = {}
    for pair in dd['language_pairs']:
        source_code = pair['source_language']['code']
        target_code = pair['target_language']['code']
        model_code = pair['translation_model']['code']
        # A repeated key keeps the engine of its last occurrence.
        engine_by_pair[f'{source_code}__{target_code}__{model_code}'] = {
            'translation_engine': pair['engine']['pk'],
        }
    return engine_by_pair
+ :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param professional_field: str, default 'general'. Choose from ('general', 'admin'). + :return: str or dict + """ + + use_domain = kwargs.get('professional_field', 'general') + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, + proxies=proxies).text + self.token = re.compile('"csrfmiddlewaretoken": "(.*?)"').search(host_html).group(1) + d_lang_str = re.compile('var languagePairs = JSON.parse\\((.*?)\\);').search(host_html).group() + d_lang_map = json.loads(d_lang_str[43:-4].replace('"', '"')) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(d_lang_map, **debug_lang_kwargs) + self.professional_field = 
self.get_professional_field_list(d_lang_map) + self.langpair_domain = self.get_langpair_domain(d_lang_map) + + if from_language == 'auto': + payload = { + 'text': query_text, + 'csrfmiddlewaretoken': self.token, + } + payload = urllib.parse.urlencode(payload) + r = self.session.post(self.detect_lang_url, data=payload, headers=self.api_headers, timeout=timeout, + proxies=proxies) + from_language = r.json()['lang_id'] + from_language, to_language = self.check_language(from_language, to_language, self.language_map) + + payload = { + 'input_text': query_text, + 'source_language': from_language, + 'target_language': to_language, + 'translation_model': use_domain, + 'translation_engine': 1, + 'csrfmiddlewaretoken': self.token, + } + + domain_payload = self.langpair_domain.get(f'{from_language}__{to_language}__{use_domain}') + if not domain_payload: + raise TranslatorError + else: + payload.update(domain_payload) + + payload = urllib.parse.urlencode(payload) + r = self.session.post(self.api_url, data=payload, headers=self.api_headers, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else data['translated_text'].replace('', '\n').replace('
class LanguageWire(Tse):
    """Client for LanguageWire Translate
    (https://www.languagewire.com/en/technology/languagewire-translate).

    A session is primed by visiting the host page and posting to the cookie
    endpoint with the `x-lwt-*` headers; translations are then requested as
    JSON from `api_url`.
    """

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()
        self.home_url = 'https://jwt.languagewire.com'
        self.host_url = 'https://www.languagewire.com/en/technology/languagewire-translate'
        self.api_url = 'https://lwt.languagewire.com/f/api/v1/translations/text'
        self.lang_url = 'https://lwt.languagewire.com/f/api/v1/language-pairs?includeVariants=true'
        self.cookie_url = 'https://lwt.languagewire.com/f/api/v1/auth/cookie'
        self.lwt_js_url = 'https://lwt.languagewire.com/en/main.6f20295b104bc52a.js'
        self.host_headers = self.get_headers(self.home_url, if_api=False, if_referer_for_host=True)
        self.api_headers = self.get_headers(self.home_url, if_api=True, if_json_for_api=True)
        self.session = None
        self.language_map = None
        self.lwt_data = None
        self.query_count = 0
        self.output_zh = None  # unsupported
        self.input_limit = int(5e3)
        self.default_from_language = 'fr'
        self.default_en_to_language = 'en-US'

    @Tse.debug_language_map
    def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float],
                         proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict:
        """
        Download the supported language pairs and group the targets by source.
        :param lang_url: str, url of the language-pairs endpoint.
        :param ss: requests session used for the download.
        :param headers: dict, request headers.
        :param timeout: float or None.
        :param proxies: dict or None.
        :return: dict, source 'mmtCode' -> list of target 'mmtCode' in server order.
        """
        d_lang_map = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).json()
        # One pass over the pairs; keys appear in first-seen order and each
        # list keeps the server order.
        language_map = {}
        for pair in d_lang_map:
            source_code = pair['sourceLanguage']['mmtCode']
            language_map.setdefault(source_code, []).append(pair['targetLanguage']['mmtCode'])
        return language_map

    def get_lwt_data(self) -> dict:
        """
        Return the `x-lwt-*` headers sent with every api request.
        NOTE: an earlier variant scraped "X-LWT-Application-ID" and
        "X-LWT-Build-ID" from the script at `self.lwt_js_url`; the values are
        hard-coded here instead.
        :return: dict
        """
        return {
            'x-lwt-application-id': 'LWT_WEB',
            'x-lwt-build-id': '346775',
        }

    @Tse.time_stat
    @Tse.check_query
    def languageWire_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                         **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://www.languagewire.com/en/technology/languagewire-translate
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict
        """

        timeout = kwargs.get('timeout', None)
        proxies = kwargs.get('proxies', None)
        sleep_seconds = kwargs.get('sleep_seconds', 0)
        if_print_warning = kwargs.get('if_print_warning', True)
        is_detail_result = kwargs.get('is_detail_result', False)
        update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq)
        update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        not_update_cond_freq = self.query_count % update_session_after_freq != 0
        not_update_cond_time = time.time() - self.begin_time < update_session_after_seconds
        if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time):
            self.begin_time = time.time()
            self.session = requests.Session()
            # Visit the host page first, then post to the cookie endpoint with
            # the x-lwt-* headers before any api call is made.
            _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies)
            self.lwt_data = self.get_lwt_data()
            self.api_headers.update(self.lwt_data)

            _ = self.session.post(self.cookie_url, headers=self.api_headers, timeout=timeout, proxies=proxies)
            debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language,
                                                       if_print_warning)
            self.language_map = self.get_language_map(self.lang_url, self.session, self.api_headers, timeout,
                                                      proxies, **debug_lang_kwargs)

        if from_language == 'auto':
            from_language = self.warning_auto_lang('languageWire', self.default_from_language, if_print_warning)
        # Plain 'en' is replaced by the regional default ('en-US') before validation.
        to_language = self.default_en_to_language if to_language == 'en' else to_language
        from_language, to_language = self.check_language(from_language, to_language, self.language_map,
                                                         if_check_lang_reverse=False)

        payload = {
            'sourceText': query_text,
            'sourceLanguage': from_language,
            'targetLanguage': to_language,
        }
        r = self.session.post(self.api_url, json=payload, headers=self.api_headers, timeout=timeout,
                              proxies=proxies)
        r.raise_for_status()
        data = r.json()
        time.sleep(sleep_seconds)
        self.query_count += 1
        return data if is_detail_result else data['translation']
+ :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, + if_print_warning) + self.language_map = self.get_language_map(self.lang_list, **debug_lang_kwargs) + + if from_language == 'auto': + from_language = self.warning_auto_lang('judic', self.default_from_language, if_print_warning) + from_language, to_language = self.check_language(from_language, to_language, self.language_map) + + payload = { + 'sourceText': query_text, + 'inputLang': from_language, + 'outputLang': to_language + } + r = self.session.post(self.api_url, json=payload, headers=self.api_headers, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else data['translation'] + + +class Yeekit(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.home_url = 'https://www.yeekit.com' + self.host_url = 'https://www.yeekit.com/site/translate' + self.api_url 
class Yeekit(Tse):
    """Client for the Yeekit web translator (https://www.yeekit.com/site/translate)."""

    def __init__(self):
        super().__init__()
        self.begin_time = time.time()

        # Endpoints and the headers derived from them.
        self.home_url = 'https://www.yeekit.com'
        self.host_url = 'https://www.yeekit.com/site/translate'
        self.api_url = 'https://www.yeekit.com/site/dotranslate'
        self.lang_url = 'https://www.yeekit.com/js/translate.js'
        self.host_headers = self.get_headers(self.home_url, if_api=False, if_referer_for_host=True)
        self.api_headers = self.get_headers(self.home_url, if_api=True, if_ajax_for_api=True)

        # Session state, filled in lazily by `yeekit_api`.
        self.session = None
        self.language_map = None
        self.query_count = 0

        # Language configuration (the language list is fixed, not fetched).
        self.lang_list = ['zh', 'en', 'ar', 'de', 'ru', 'fr', 'cz', 'pt', 'jp', 'es']
        self.output_zh = 'zh'
        self.input_limit = int(1e3)
        self.default_from_language = self.output_zh

    @Tse.debug_language_map
    def get_language_map(self, lang_list: List[str], **kwargs: LangMapKwargsType) -> dict:
        """
        Map every language of `lang_list` to the whole list.
        :param lang_list: list of language codes.
        :return: dict
        """
        return dict.fromkeys(lang_list, lang_list)

    @Tse.uncertified  # not code, but server.
    @Tse.time_stat
    @Tse.check_query
    def yeekit_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en',
                   **kwargs: ApiKwargsType) -> Union[str, dict]:
        """
        https://www.yeekit.com/site/translate
        :param query_text: str, must.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param **kwargs:
                :param timeout: float, default None.
                :param proxies: dict, default None.
                :param sleep_seconds: float, default 0.
                :param is_detail_result: bool, default False.
                :param if_ignore_limit_of_length: bool, default False.
                :param limit_of_length: int, default 20000.
                :param if_ignore_empty_query: bool, default False.
                :param update_session_after_freq: int, default 1000.
                :param update_session_after_seconds: float, default 1500.
                :param if_show_time_stat: bool, default False.
                :param show_time_stat_precision: int, default 2.
                :param if_print_warning: bool, default True.
        :return: str or dict
        """

        request_timeout = kwargs.get('timeout', None)
        request_proxies = kwargs.get('proxies', None)
        pause_seconds = kwargs.get('sleep_seconds', 0)
        warn = kwargs.get('if_print_warning', True)
        want_detail = kwargs.get('is_detail_result', False)
        refresh_every = kwargs.get('update_session_after_freq', self.default_session_freq)
        refresh_after = kwargs.get('update_session_after_seconds', self.default_session_seconds)
        self.check_input_limit(query_text, self.input_limit)

        # The session is rebuilt when it (or the language map) is missing, when
        # the query counter hits a multiple of `refresh_every`, or when it is too old.
        count_ok = self.query_count % refresh_every != 0
        age_ok = time.time() - self.begin_time < refresh_after
        if not self.session or not self.language_map or not count_ok or not age_ok:
            self.begin_time = time.time()
            self.session = requests.Session()
            self.session.get(self.host_url, headers=self.host_headers, timeout=request_timeout,
                             proxies=request_proxies)
            lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, warn)
            self.language_map = self.get_language_map(self.lang_list, **lang_kwargs)

        if from_language == 'auto':
            from_language = self.warning_auto_lang('yeekit', self.default_from_language, warn)
        from_language, to_language = self.check_language(from_language, to_language, self.language_map)

        form_body = urllib.parse.urlencode({
            'content[]': query_text,
            'sourceLang': f'n{from_language}',
            'targetLang': f'n{to_language}',
        })
        response = self.session.post(self.api_url, data=form_body, headers=self.api_headers,
                                     timeout=request_timeout, proxies=request_proxies)
        response.raise_for_status()
        data = response.json()
        time.sleep(pause_seconds)
        self.query_count += 1

        if want_detail:
            return data
        # The first element of the response is itself a JSON document.
        segments = json.loads(data[0])['translation'][0]['translated'][0]['translation list']
        return '\n'.join(' '.join(p) for p in segments)
= os.cpu_count() + self.server_region = GuestSeverRegion().get_server_region + self._alibaba = AlibabaV2() + self.alibaba = self._alibaba.alibaba_api + self._apertium = Apertium() + self.apertium = self._apertium.apertium_api + self._argos = Argos() + self.argos = self._argos.argos_api + self._baidu = BaiduV1() # V2 + self.baidu = self._baidu.baidu_api + self._bing = Bing(server_region=self.server_region) + self.bing = self._bing.bing_api + self._caiyun = Caiyun() + self.caiyun = self._caiyun.caiyun_api + self._cloudYi = CloudYi() + self.cloudYi = self._cloudYi.cloudYi_api + self._deepl = Deepl() + self.deepl = self._deepl.deepl_api + self._elia = Elia() + self.elia = self._elia.elia_api + self._google = GoogleV2(server_region=self.server_region) + self.google = self._google.google_api + self._iciba = Iciba() + self.iciba = self._iciba.iciba_api + self._iflytek = IflytekV2() + self.iflytek = self._iflytek.iflytek_api + self._iflyrec = Iflyrec() + self.iflyrec = self._iflyrec.iflyrec_api + self._itranslate = Itranslate() + self.itranslate = self._itranslate.itranslate_api + self._judic = Judic() + self.judic = self._judic.judic_api + self._languageWire = LanguageWire() + self.languageWire = self._languageWire.languageWire_api + self._lingvanex = Lingvanex() + self.lingvanex = self._lingvanex.lingvanex_api + self._mglip = Mglip() + self.mglip = self._mglip.mglip_api + self._mirai = Mirai() + self.mirai = self._mirai.mirai_api + self._modernMt = ModernMt() + self.modernMt = self._modernMt.modernMt_api + self._myMemory = MyMemory() + self.myMemory = self._myMemory.myMemory_api + self._papago = Papago() + self.papago = self._papago.papago_api + self._qqFanyi = QQFanyi() + self.qqFanyi = self._qqFanyi.qqFanyi_api + self._qqTranSmart = QQTranSmart() + self.qqTranSmart = self._qqTranSmart.qqTranSmart_api + self._reverso = Reverso() + self.reverso = self._reverso.reverso_api + self._sogou = Sogou() + self.sogou = self._sogou.sogou_api + self._sysTran = SysTran() + 
self.sysTran = self._sysTran.sysTran_api + self._tilde = Tilde() + self.tilde = self._tilde.tilde_api + self._translateCom = TranslateCom() + self.translateCom = self._translateCom.translateCom_api + self._translateMe = TranslateMe() + self.translateMe = self._translateMe.translateMe_api + self._utibet = Utibet() + self.utibet = self._utibet.utibet_api + self._volcEngine = VolcEngine() + self.volcEngine = self._volcEngine.volcEngine_api + self._yandex = Yandex() + self.yandex = self._yandex.yandex_api + self._yeekit = Yeekit() + self.yeekit = self._yeekit.yeekit_api + self._youdao = YoudaoV3() + self.youdao = self._youdao.youdao_api + self._translators_dict = { + 'alibaba': self._alibaba, 'apertium': self._apertium, 'argos': self._argos, 'baidu': self._baidu, + 'bing': self._bing, + 'caiyun': self._caiyun, 'cloudYi': self._cloudYi, 'deepl': self._deepl, 'elia': self._elia, + 'google': self._google, + 'iciba': self._iciba, 'iflytek': self._iflytek, 'iflyrec': self._iflyrec, 'itranslate': self._itranslate, + 'judic': self._judic, + 'languageWire': self._languageWire, 'lingvanex': self._lingvanex, + 'mglip': self._mglip, 'mirai': self._mirai, + 'modernMt': self._modernMt, 'myMemory': self._myMemory, 'papago': self._papago, 'qqFanyi': self._qqFanyi, + 'qqTranSmart': self._qqTranSmart, + 'reverso': self._reverso, 'sogou': self._sogou, 'sysTran': self._sysTran, 'tilde': self._tilde, + 'translateCom': self._translateCom, + 'translateMe': self._translateMe, 'utibet': self._utibet, 'volcEngine': self._volcEngine, + 'yandex': self._yandex, 'yeekit': self._yeekit, + 'youdao': self._youdao, + } + self.translators_dict = { + 'alibaba': self.alibaba, 'apertium': self.apertium, 'argos': self.argos, 'baidu': self.baidu, + 'bing': self.bing, + 'caiyun': self.caiyun, 'cloudYi': self.cloudYi, 'deepl': self.deepl, 'elia': self.elia, + 'google': self.google, + 'iciba': self.iciba, 'iflytek': self.iflytek, 'iflyrec': self.iflyrec, 'itranslate': self.itranslate, + 'judic': self.judic, + 
def translate_text(self,
                   query_text: str,
                   translator: str = 'bing',
                   from_language: str = 'auto',
                   to_language: str = 'en',
                   if_use_preacceleration: bool = False,
                   **kwargs: ApiKwargsType,
                   ) -> Union[str, dict]:
    """
    Translate `query_text` with the chosen translator.
    :param query_text: str, must.
    :param translator: str, default 'bing'.
    :param from_language: str, default 'auto'.
    :param to_language: str, default 'en'.
    :param if_use_preacceleration: bool, default False.
    :param **kwargs:
            :param is_detail_result: bool, default False.
            :param professional_field: str, support alibaba(), baidu(), caiyun(), cloudYi(), elia(), sysTran(), youdao(), volcEngine() only.
            :param timeout: float, default None.
            :param proxies: dict, default None.
            :param sleep_seconds: float, default 0.
            :param update_session_after_freq: int, default 1000.
            :param update_session_after_seconds: float, default 1500.
            :param if_use_cn_host: bool, default False. Support google(), bing() only.
            :param reset_host_url: str, default None. Support google(), argos(), yandex() only.
            :param if_check_reset_host_url: bool, default True. Support google(), yandex() only.
            :param if_ignore_empty_query: bool, default False.
            :param if_ignore_limit_of_length: bool, default False.
            :param limit_of_length: int, default 20000.
            :param if_show_time_stat: bool, default False.
            :param show_time_stat_precision: int, default 2.
            :param if_print_warning: bool, default True.
            :param lingvanex_model: str, default 'B2C', choose from ("B2C", "B2B").
            :param myMemory_mode: str, default "web", choose from ("web", "api").
    :return: str or dict
    :raise TranslatorError: if `translator` is not in `self.translators_pool`.
    """

    if translator not in self.translators_pool:
        raise TranslatorError(
            f'Unsupported translator {translator!r}, choose from {self.translators_pool}.')

    # Preacceleration is run at most once, and only when explicitly requested.
    if if_use_preacceleration and not self.pre_acceleration_label:
        _ = self.preaccelerate()

    api = self.translators_dict[translator]
    return api(query_text=query_text, from_language=from_language, to_language=to_language, **kwargs)
+ :param if_check_reset_host_url: bool, default True. Support google(), yandex() only. + :param if_ignore_empty_query: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :param lingvanex_model: str, default 'B2C', choose from ("B2C", "B2B"). + :param myMemory_mode: str, default "web", choose from ("web", "api"). + :return: str + """ + + if translator not in self.translators_pool or kwargs.get('is_detail_result', False) or n_jobs > self.cpu_cnt: + raise TranslatorError + + if not self.pre_acceleration_label and if_use_preacceleration: + _ = self.preaccelerate() + + def _translate_text(sentence: str) -> Tuple[str, str]: + return sentence, self.translators_dict[translator](query_text=sentence, from_language=from_language, + to_language=to_language, **kwargs) + + pattern = re.compile( + "(?:^|(?<=>))([\\s\\S]*?)(?:(?=<)|$)") # TODO:
+ sentence_list = list(set(pattern.findall(html_text))) + + n_jobs = self.cpu_cnt if n_jobs <= 0 else n_jobs + with pathos.multiprocessing.ProcessPool(n_jobs) as pool: + result_list = pool.map(_translate_text, sentence_list) + + result_dict = {text: ts_text for text, ts_text in result_list} + _get_result_func = lambda k: result_dict.get(k.group(1), '') + return pattern.sub(repl=_get_result_func, string=html_text) + + def _test_translate(self, _ts: str, timeout: Optional[float] = None, if_show_time_stat: bool = False) -> str: + from_language = self.not_zh_langs[_ts] if _ts in self.not_zh_langs else 'auto' + to_language = self.not_en_langs[_ts] if _ts in self.not_en_langs else 'en' + result = self.translators_dict[_ts]( + query_text=self.example_query_text, + translator=_ts, + from_language=from_language, + to_language=to_language, + if_print_warning=False, + timeout=timeout, + if_show_time_stat=if_show_time_stat + ) + return result + + def get_languages(self, translator: str = 'bing'): + language_map = self._translators_dict[translator].language_map + if language_map: + return language_map + + _ = self._test_translate(_ts=translator) + return self._translators_dict[translator].language_map + + def preaccelerate(self, timeout: Optional[float] = None, if_show_time_stat: bool = True, **kwargs: str) -> dict: + if self.pre_acceleration_label > 0: + raise TranslatorError('Preacceleration can only be performed once.') + + self.example_query_text = kwargs.get('example_query_text', self.example_query_text) + + sys.stderr.write('Preacceleration-Process will take a few minutes.\n') + sys.stderr.write('Tips: The smaller `timeout` value, the fewer translators pass the test ' + 'and the less time it takes to preaccelerate. 
However, the slow speed of ' + 'preacceleration does not mean the slow speed of later translation.\n\n') + + for i in tqdm.tqdm(range(len(self.translators_pool)), desc='Preacceleration Process', ncols=80): + _ts = self.translators_pool[i] + try: + _ = self._test_translate(_ts, timeout, if_show_time_stat) + self.success_translators_pool.append(_ts) + except: + self.failure_translators_pool.append(_ts) + + self.pre_acceleration_label += 1 + return {'success': self.success_translators_pool, 'failure': self.failure_translators_pool} + + def speedtest(self, **kwargs: List[str]) -> None: + if self.pre_acceleration_label < 1: + raise TranslatorError('Preacceleration first.') + + test_translators_pool = kwargs.get('test_translators_pool', self.success_translators_pool) + + sys.stderr.write('SpeedTest-Process will take a few seconds.\n\n') + for i in tqdm.tqdm(range(len(test_translators_pool)), desc='SpeedTest Process', ncols=80): + _ts = test_translators_pool[i] + try: + _ = self._test_translate(_ts, timeout=None, if_show_time_stat=True) + except: + pass + return + + def preaccelerate_and_speedtest(self, timeout: Optional[float] = None, **kwargs: str) -> dict: + result = self.preaccelerate(timeout=timeout, **kwargs) + sys.stderr.write('\n\n') + self.speedtest() + return result + + +tss = TranslatorsServer() + +_alibaba = tss._alibaba +alibaba = tss.alibaba +_apertium = tss._apertium +apertium = tss.apertium +_argos = tss._argos +argos = tss.argos +_baidu = tss._baidu +baidu = tss.baidu +_bing = tss._bing +bing = tss.bing +_caiyun = tss._caiyun +caiyun = tss.caiyun +_cloudYi = tss._cloudYi +cloudYi = tss.cloudYi +_deepl = tss._deepl +deepl = tss.deepl +_elia = tss._elia +elia = tss.elia +_google = tss._google +google = tss.google +_iciba = tss._iciba +iciba = tss.iciba +_iflytek = tss._iflytek +iflytek = tss.iflytek +_iflyrec = tss._iflyrec +iflyrec = tss.iflyrec +_itranslate = tss._itranslate +itranslate = tss.itranslate +_judic = tss._judic +judic = tss.judic 
+_languageWire = tss._languageWire
+languageWire = tss.languageWire
+_lingvanex = tss._lingvanex
+lingvanex = tss.lingvanex
+_mglip = tss._mglip
+mglip = tss.mglip
+_mirai = tss._mirai
+mirai = tss.mirai
+_modernMt = tss._modernMt
+modernMt = tss.modernMt
+_myMemory = tss._myMemory
+myMemory = tss.myMemory
+# NOTE(review): `__all__` at the top of this module lists 'niutrans' and
+# '_niutrans', but no such alias is bound in this list - a star-import would
+# presumably fail on the missing names; confirm and drop them from `__all__`
+# or bind them here.
+_papago = tss._papago
+papago = tss.papago
+_qqFanyi = tss._qqFanyi
+qqFanyi = tss.qqFanyi
+_qqTranSmart = tss._qqTranSmart
+qqTranSmart = tss.qqTranSmart
+_reverso = tss._reverso
+reverso = tss.reverso
+_sogou = tss._sogou
+sogou = tss.sogou
+_sysTran = tss._sysTran
+sysTran = tss.sysTran
+_tilde = tss._tilde
+tilde = tss.tilde
+_translateCom = tss._translateCom
+translateCom = tss.translateCom
+_translateMe = tss._translateMe
+translateMe = tss.translateMe
+_utibet = tss._utibet
+utibet = tss.utibet
+_volcEngine = tss._volcEngine
+volcEngine = tss.volcEngine
+_yandex = tss._yandex
+yandex = tss.yandex
+_yeekit = tss._yeekit
+yeekit = tss.yeekit
+_youdao = tss._youdao
+youdao = tss.youdao
+
+# Module-level entry points: each name is an attribute of the shared `tss`
+# instance.
+translate_text = tss.translate_text
+translate_html = tss.translate_html
+translators_pool = tss.translators_pool
+get_languages = tss.get_languages
+
+# Warm-up and benchmarking helpers of the same `tss` instance.
+preaccelerate = tss.preaccelerate
+speedtest = tss.speedtest
+preaccelerate_and_speedtest = tss.preaccelerate_and_speedtest
+# sys.stderr.write(f'Support translators {translators_pool} only.\n')
diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile
index f0c0663..e8dcc8f 100644
--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@@ -13,7 +13,11 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
   libpq-dev \
   default-libmysqlclient-dev \
   libffi-dev \
-  libjpeg-dev
+  libjpeg-dev \
+  libxml2 \
+  libxslt1-dev \
+  libssl-dev \
+  python-dev
 # Requirements are installed here to ensure they will be cached.
 COPY ./requirements .
 # Create Python Dependency and Sub-Dependency Wheels.
diff --git a/requirements/base.txt b/requirements/base.txt index c442cdb..303a6dc 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -6,7 +6,7 @@ django-cors-headers==3.2.1 django-filter==2.0.0 djangorestframework==3.8.1 python-dateutil==2.8.2 -requests==2.27.1 +requests==2.31.0 gunicorn==20.1.0 gevent==21.12.0 djangorestframework-jwt==1.11.0 @@ -19,5 +19,4 @@ redis==3.2.0 mysqlclient==1.4.4 sqlalchemy==1.4.23 PyExecJS==1.5.1 -translators==5.8.0