remove gc

This commit is contained in:
jxxghp
2025-06-08 21:19:15 +08:00
parent 7f8e50f83d
commit 0a20234268
2 changed files with 30 additions and 38 deletions

View File

@@ -1,8 +1,8 @@
from typing import Callable, Any, Optional
import gc
from playwright.sync_api import sync_playwright, Page
from cf_clearance import sync_cf_retry, sync_stealth
from playwright.sync_api import sync_playwright, Page
from app.log import logger
@@ -67,8 +67,6 @@ class PlaywrightHelper:
context.close()
if browser:
browser.close()
# 强制垃圾回收
gc.collect()
except Exception as e:
logger.error(f"Playwright初始化失败: {str(e)}")
@@ -120,8 +118,6 @@ class PlaywrightHelper:
context.close()
if browser:
browser.close()
# 强制垃圾回收
gc.collect()
except Exception as e:
logger.error(f"Playwright初始化失败: {str(e)}")

View File

@@ -1,4 +1,3 @@
import gc
import re
import traceback
from typing import List, Tuple, Union, Optional
@@ -18,11 +17,11 @@ class RssHelper:
"""
RSS帮助类解析RSS报文、获取RSS地址等
"""
# RSS解析限制配置
MAX_RSS_SIZE = 50 * 1024 * 1024 # 50MB最大RSS文件大小
MAX_RSS_ITEMS = 1000 # 最大解析条目数
# 各站点RSS链接获取配置
rss_link_conf = {
"default": {
@@ -228,7 +227,8 @@ class RssHelper:
},
}
def parse(self, url, proxy: bool = False, timeout: Optional[int] = 15, headers: dict = None) -> Union[List[dict], None, bool]:
def parse(self, url, proxy: bool = False,
timeout: Optional[int] = 15, headers: dict = None) -> Union[List[dict], None, bool]:
"""
解析RSS订阅URL获取RSS中的种子信息
:param url: RSS地址
@@ -241,7 +241,7 @@ class RssHelper:
ret_array: list = []
if not url:
return False
try:
ret = RequestUtils(proxies=settings.PROXY if proxy else None,
timeout=timeout, headers=headers).get_res(url)
@@ -250,7 +250,7 @@ class RssHelper:
except Exception as err:
logger.error(f"获取RSS失败{str(err)} - {traceback.format_exc()}")
return False
if ret:
ret_xml = None
root = None
@@ -258,9 +258,9 @@ class RssHelper:
# 检查响应大小避免处理过大的RSS文件
raw_data = ret.content
if raw_data and len(raw_data) > self.MAX_RSS_SIZE:
logger.warning(f"RSS文件过大: {len(raw_data)/1024/1024:.1f}MB跳过解析")
logger.warning(f"RSS文件过大: {len(raw_data) / 1024 / 1024:.1f}MB跳过解析")
return False
if raw_data:
try:
result = chardet.detect(raw_data)
@@ -279,7 +279,7 @@ class RssHelper:
ret.encoding = ret.apparent_encoding
if not ret_xml:
ret_xml = ret.text
# 使用lxml.etree解析XML
parser = None
try:
@@ -307,42 +307,39 @@ class RssHelper:
finally:
if parser is not None:
del parser
if root is None:
logger.error("无法解析RSS内容")
return False
# 查找所有item或entry节点
items = root.xpath('.//item | .//entry')
# 限制处理的条目数量
items_count = min(len(items), self.MAX_RSS_ITEMS)
if len(items) > self.MAX_RSS_ITEMS:
logger.warning(f"RSS条目过多: {len(items)},仅处理前{self.MAX_RSS_ITEMS}")
for i, item in enumerate(items[:items_count]):
for item in items[:items_count]:
try:
# 定期执行垃圾回收
if i > 0 and i % 100 == 0:
gc.collect()
# 使用xpath提取信息更高效
title_nodes = item.xpath('.//title')
title = title_nodes[0].text if title_nodes and title_nodes[0].text else ""
if not title:
continue
# 描述
desc_nodes = item.xpath('.//description | .//summary')
description = desc_nodes[0].text if desc_nodes and desc_nodes[0].text else ""
# 种子页面
link_nodes = item.xpath('.//link')
if link_nodes:
link = link_nodes[0].text if hasattr(link_nodes[0], 'text') and link_nodes[0].text else link_nodes[0].get('href', '')
link = link_nodes[0].text if hasattr(link_nodes[0], 'text') and link_nodes[0].text else \
link_nodes[0].get('href', '')
else:
link = ""
# 种子链接
enclosure_nodes = item.xpath('.//enclosure')
enclosure = enclosure_nodes[0].get('url', '') if enclosure_nodes else ""
@@ -351,24 +348,24 @@ class RssHelper:
# 部分RSS只有link没有enclosure
if not enclosure and link:
enclosure = link
# 大小
size = 0
if enclosure_nodes:
size_attr = enclosure_nodes[0].get('length', '0')
if size_attr and str(size_attr).isdigit():
size = int(size_attr)
# 发布日期
pubdate_nodes = item.xpath('.//pubDate | .//published | .//updated')
pubdate = ""
if pubdate_nodes and pubdate_nodes[0].text:
pubdate = StringUtils.get_time(pubdate_nodes[0].text)
# 获取豆瓣昵称
nickname_nodes = item.xpath('.//*[local-name()="creator"]')
nickname = nickname_nodes[0].text if nickname_nodes and nickname_nodes[0].text else ""
# 返回对象
tmp_dict = {
'title': title,
@@ -382,11 +379,11 @@ class RssHelper:
if nickname:
tmp_dict['nickname'] = nickname
ret_array.append(tmp_dict)
except Exception as e1:
logger.debug(f"解析RSS条目失败{str(e1)} - {traceback.format_exc()}")
continue
except Exception as e2:
logger.error(f"解析RSS失败{str(e2)} - {traceback.format_exc()}")
# RSS过期检查
@@ -403,8 +400,7 @@ class RssHelper:
del root
if ret_xml is not None:
del ret_xml
gc.collect()
return ret_array
def get_rss_link(self, url: str, cookie: str, ua: str, proxy: bool = False) -> Tuple[str, str]:
@@ -446,7 +442,7 @@ class RssHelper:
return "", f"获取 {url} RSS链接失败错误码{res.status_code},错误原因:{res.reason}"
else:
return "", f"获取RSS链接失败无法连接 {url} "
# 解析HTML
if html_text:
html = None
@@ -459,7 +455,7 @@ class RssHelper:
finally:
if html is not None:
del html
return "", f"获取RSS链接失败{url}"
except Exception as e:
return "", f"获取 {url} RSS链接失败{str(e)}"