system.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. """
  2. 系统管理工具
  3. 实现系统状态查询和爬虫触发功能。
  4. """
  5. from pathlib import Path
  6. from typing import Dict, List, Optional
  7. from ..services.data_service import DataService
  8. from ..utils.validators import validate_platforms
  9. from ..utils.errors import MCPError, CrawlTaskError
  10. class SystemManagementTools:
  11. """系统管理工具类"""
  12. def __init__(self, project_root: str = None):
  13. """
  14. 初始化系统管理工具
  15. Args:
  16. project_root: 项目根目录
  17. """
  18. self.data_service = DataService(project_root)
  19. if project_root:
  20. self.project_root = Path(project_root)
  21. else:
  22. # 获取项目根目录
  23. current_file = Path(__file__)
  24. self.project_root = current_file.parent.parent.parent
  25. def get_system_status(self) -> Dict:
  26. """
  27. 获取系统运行状态和健康检查信息
  28. Returns:
  29. 系统状态字典
  30. Example:
  31. >>> tools = SystemManagementTools()
  32. >>> result = tools.get_system_status()
  33. >>> print(result['system']['version'])
  34. """
  35. try:
  36. # 获取系统状态
  37. status = self.data_service.get_system_status()
  38. return {
  39. "success": True,
  40. "summary": {
  41. "description": "系统运行状态和健康检查信息"
  42. },
  43. "data": status
  44. }
  45. except MCPError as e:
  46. return {
  47. "success": False,
  48. "error": e.to_dict()
  49. }
  50. except Exception as e:
  51. return {
  52. "success": False,
  53. "error": {
  54. "code": "INTERNAL_ERROR",
  55. "message": str(e)
  56. }
  57. }
  58. def _load_crawl_config(self):
  59. """加载爬取配置,返回 (config_data, target_platforms_config)"""
  60. import yaml
  61. config_path = self.project_root / "config" / "config.yaml"
  62. if not config_path.exists():
  63. raise CrawlTaskError(
  64. "配置文件不存在",
  65. suggestion=f"请确保配置文件存在: {config_path}"
  66. )
  67. with open(config_path, "r", encoding="utf-8") as f:
  68. config_data = yaml.safe_load(f)
  69. platforms_config = config_data.get("platforms", {})
  70. if not platforms_config.get("enabled", True):
  71. raise CrawlTaskError(
  72. "热榜平台已禁用",
  73. suggestion="请检查 config/config.yaml 中的 platforms.enabled 配置"
  74. )
  75. all_platforms = [p for p in platforms_config.get("sources", []) if p.get("enabled", True)]
  76. if not all_platforms:
  77. raise CrawlTaskError(
  78. "配置文件中没有平台配置",
  79. suggestion="请检查 config/config.yaml 中的 platforms.sources 配置"
  80. )
  81. return config_data, all_platforms
  82. def _resolve_target_platforms(self, all_platforms: list, platforms: Optional[List[str]]):
  83. """根据用户指定的平台列表过滤,返回 (target_platforms, ids_list)"""
  84. if platforms:
  85. target_platforms = [p for p in all_platforms if p["id"] in platforms]
  86. if not target_platforms:
  87. raise CrawlTaskError(
  88. f"指定的平台不存在: {platforms}",
  89. suggestion=f"可用平台: {[p['id'] for p in all_platforms]}"
  90. )
  91. else:
  92. target_platforms = all_platforms
  93. ids = []
  94. for platform in target_platforms:
  95. if "name" in platform:
  96. ids.append((platform["id"], platform["name"]))
  97. else:
  98. ids.append(platform["id"])
  99. return target_platforms, ids
  100. def _persist_crawl_data(self, storage, news_data, save_to_local, results, id_to_name, failed_ids, current_time, crawl_time_str):
  101. """持久化爬取数据,返回 (save_success, save_error_msg, saved_files)"""
  102. save_success = False
  103. save_error_msg = ""
  104. saved_files = {}
  105. try:
  106. if storage.save_news_data(news_data):
  107. save_success = True
  108. if save_to_local:
  109. txt_path = storage.save_txt_snapshot(news_data)
  110. if txt_path:
  111. saved_files["txt"] = txt_path
  112. html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
  113. html_filename = f"{crawl_time_str}.html"
  114. html_path = storage.save_html_report(html_content, html_filename)
  115. if html_path:
  116. saved_files["html"] = html_path
  117. except Exception as e:
  118. print(f"[System] 数据保存失败: {e}")
  119. save_success = False
  120. save_error_msg = str(e)
  121. return save_success, save_error_msg, saved_files
  122. def _build_crawl_response(self, results, id_to_name, failed_ids, current_time, include_url,
  123. save_success, save_to_local, save_error_msg, saved_files):
  124. """构建爬取结果响应字典"""
  125. import time
  126. news_response_data = []
  127. for platform_id, titles_data in results.items():
  128. platform_name = id_to_name.get(platform_id, platform_id)
  129. for title, info in titles_data.items():
  130. news_item = {
  131. "platform_id": platform_id,
  132. "platform_name": platform_name,
  133. "title": title,
  134. "ranks": info.get("ranks", [])
  135. }
  136. if include_url:
  137. news_item["url"] = info.get("url", "")
  138. news_item["mobile_url"] = info.get("mobileUrl", "")
  139. news_response_data.append(news_item)
  140. result = {
  141. "success": True,
  142. "summary": {
  143. "description": "爬取任务执行结果",
  144. "task_id": f"crawl_{int(time.time())}",
  145. "status": "completed",
  146. "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
  147. "total_news": len(news_response_data),
  148. "platforms": list(results.keys()),
  149. "failed_platforms": failed_ids,
  150. "saved_to_local": save_success and save_to_local
  151. },
  152. "data": news_response_data
  153. }
  154. if save_success:
  155. if save_to_local:
  156. result["saved_files"] = saved_files
  157. result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
  158. else:
  159. result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果,未生成TXT快照)"
  160. else:
  161. result["saved_to_local"] = False
  162. result["save_error"] = save_error_msg
  163. if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
  164. result["note"] = "爬取成功,但无法写入数据库(Docker只读模式)。数据仅在本次返回中有效。"
  165. else:
  166. result["note"] = f"爬取成功但保存失败: {save_error_msg}"
  167. return result
  168. def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict:
  169. """
  170. 手动触发一次临时爬取任务(可选持久化)
  171. Args:
  172. platforms: 指定平台列表,为空则爬取所有平台
  173. save_to_local: 是否保存到本地 output 目录,默认 False
  174. include_url: 是否包含URL链接,默认False(节省token)
  175. Returns:
  176. 爬取结果字典,包含新闻数据和保存路径(如果保存)
  177. """
  178. try:
  179. from trendradar.crawler.fetcher import DataFetcher
  180. from trendradar.storage.local import LocalStorageBackend
  181. from trendradar.storage.base import convert_crawl_results_to_news_data
  182. from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
  183. from ..services.cache_service import get_cache
  184. platforms = validate_platforms(platforms)
  185. # 1. 加载配置
  186. config_data, all_platforms = self._load_crawl_config()
  187. target_platforms, ids = self._resolve_target_platforms(all_platforms, platforms)
  188. print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
  189. # 2. 执行爬取
  190. advanced = config_data.get("advanced", {})
  191. crawler_config = advanced.get("crawler", {})
  192. proxy_url = crawler_config.get("default_proxy") if crawler_config.get("use_proxy") else None
  193. fetcher = DataFetcher(proxy_url=proxy_url)
  194. results, id_to_name, failed_ids = fetcher.crawl_websites(
  195. ids_list=ids,
  196. request_interval=crawler_config.get("request_interval", 100)
  197. )
  198. # 3. 转换与持久化
  199. timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
  200. current_time = get_configured_time(timezone)
  201. crawl_date = format_date_folder(None, timezone)
  202. crawl_time_str = format_time_filename(timezone)
  203. news_data = convert_crawl_results_to_news_data(
  204. results=results, id_to_name=id_to_name,
  205. failed_ids=failed_ids, crawl_time=crawl_time_str, crawl_date=crawl_date
  206. )
  207. storage = LocalStorageBackend(
  208. data_dir=str(self.project_root / "output"),
  209. enable_txt=True, enable_html=True, timezone=timezone
  210. )
  211. try:
  212. save_success, save_error_msg, saved_files = self._persist_crawl_data(
  213. storage, news_data, save_to_local, results, id_to_name, failed_ids, current_time, crawl_time_str
  214. )
  215. finally:
  216. get_cache().clear()
  217. print("[System] 缓存已清除")
  218. storage.cleanup()
  219. # 4. 构建响应
  220. return self._build_crawl_response(
  221. results, id_to_name, failed_ids, current_time, include_url,
  222. save_success, save_to_local, save_error_msg, saved_files
  223. )
  224. except MCPError as e:
  225. return {"success": False, "error": e.to_dict()}
  226. except Exception as e:
  227. import traceback
  228. return {
  229. "success": False,
  230. "error": {
  231. "code": "INTERNAL_ERROR",
  232. "message": str(e),
  233. "traceback": traceback.format_exc()
  234. }
  235. }
  236. def _generate_simple_html(self, results: Dict, id_to_name: Dict, failed_ids: List, now) -> str:
  237. """生成简化的 HTML 报告"""
  238. html = """<!DOCTYPE html>
  239. <html>
  240. <head>
  241. <meta charset="UTF-8">
  242. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  243. <title>MCP 爬取结果</title>
  244. <style>
  245. body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
  246. .container { max-width: 900px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; }
  247. h1 { color: #333; border-bottom: 2px solid #4CAF50; padding-bottom: 10px; }
  248. .platform { margin-bottom: 30px; }
  249. .platform-name { background: #4CAF50; color: white; padding: 10px; border-radius: 5px; margin-bottom: 10px; }
  250. .news-item { padding: 8px; border-bottom: 1px solid #eee; }
  251. .rank { color: #666; font-weight: bold; margin-right: 10px; }
  252. .title { color: #333; }
  253. .link { color: #1976D2; text-decoration: none; margin-left: 10px; font-size: 0.9em; }
  254. .link:hover { text-decoration: underline; }
  255. .failed { background: #ffebee; padding: 10px; border-radius: 5px; margin-top: 20px; }
  256. .failed h3 { color: #c62828; margin-top: 0; }
  257. .timestamp { color: #666; font-size: 0.9em; text-align: right; margin-top: 20px; }
  258. </style>
  259. </head>
  260. <body>
  261. <div class="container">
  262. <h1>MCP 爬取结果</h1>
  263. """
  264. # 添加时间戳
  265. html += f' <p class="timestamp">爬取时间: {now.strftime("%Y-%m-%d %H:%M:%S")}</p>\n\n'
  266. # 遍历每个平台
  267. for platform_id, titles_data in results.items():
  268. platform_name = id_to_name.get(platform_id, platform_id)
  269. html += f' <div class="platform">\n'
  270. html += f' <div class="platform-name">{platform_name}</div>\n'
  271. # 排序标题
  272. sorted_items = []
  273. for title, info in titles_data.items():
  274. ranks = info.get("ranks", [])
  275. url = info.get("url", "")
  276. mobile_url = info.get("mobileUrl", "")
  277. rank = ranks[0] if ranks else 999
  278. sorted_items.append((rank, title, url, mobile_url))
  279. sorted_items.sort(key=lambda x: x[0])
  280. # 显示新闻
  281. for rank, title, url, mobile_url in sorted_items:
  282. html += f' <div class="news-item">\n'
  283. html += f' <span class="rank">{rank}.</span>\n'
  284. html += f' <span class="title">{self._html_escape(title)}</span>\n'
  285. if url:
  286. html += f' <a class="link" href="{self._html_escape(url)}" target="_blank">链接</a>\n'
  287. if mobile_url and mobile_url != url:
  288. html += f' <a class="link" href="{self._html_escape(mobile_url)}" target="_blank">移动版</a>\n'
  289. html += ' </div>\n'
  290. html += ' </div>\n\n'
  291. # 失败的平台
  292. if failed_ids:
  293. html += ' <div class="failed">\n'
  294. html += ' <h3>请求失败的平台</h3>\n'
  295. html += ' <ul>\n'
  296. for platform_id in failed_ids:
  297. html += f' <li>{self._html_escape(platform_id)}</li>\n'
  298. html += ' </ul>\n'
  299. html += ' </div>\n'
  300. html += """ </div>
  301. </body>
  302. </html>"""
  303. return html
  304. def _html_escape(self, text: str) -> str:
  305. """HTML 转义"""
  306. if not isinstance(text, str):
  307. text = str(text)
  308. return (
  309. text.replace("&", "&amp;")
  310. .replace("<", "&lt;")
  311. .replace(">", "&gt;")
  312. .replace('"', "&quot;")
  313. .replace("'", "&#x27;")
  314. )
  315. def check_version(self, proxy_url: Optional[str] = None) -> Dict:
  316. """
  317. 检查版本更新
  318. 同时检查 TrendRadar 和 MCP Server 两个组件的版本更新。
  319. 远程版本 URL 从 config.yaml 获取:
  320. - version_check_url: TrendRadar 版本
  321. - mcp_version_check_url: MCP Server 版本
  322. Args:
  323. proxy_url: 可选的代理URL,用于访问远程版本
  324. Returns:
  325. 版本检查结果字典,包含:
  326. - success: 是否成功
  327. - trendradar: TrendRadar 版本检查结果
  328. - mcp: MCP Server 版本检查结果
  329. - any_update: 是否有任何组件需要更新
  330. Example:
  331. >>> tools = SystemManagementTools()
  332. >>> result = tools.check_version()
  333. >>> print(result['data']['any_update'])
  334. """
  335. import yaml
  336. import requests
  337. def parse_version(version_str: str):
  338. """将版本号字符串解析为元组"""
  339. try:
  340. parts = version_str.strip().split(".")
  341. if len(parts) != 3:
  342. raise ValueError("版本号格式不正确")
  343. return int(parts[0]), int(parts[1]), int(parts[2])
  344. except (ValueError, AttributeError, TypeError):
  345. return 0, 0, 0
  346. def check_single_version(
  347. name: str,
  348. local_version: str,
  349. remote_url: str,
  350. proxies: Optional[Dict],
  351. headers: Dict
  352. ) -> Dict:
  353. """检查单个组件的版本"""
  354. try:
  355. response = requests.get(
  356. remote_url, proxies=proxies, headers=headers, timeout=10
  357. )
  358. response.raise_for_status()
  359. remote_version = response.text.strip()
  360. local_tuple = parse_version(local_version)
  361. remote_tuple = parse_version(remote_version)
  362. need_update = local_tuple < remote_tuple
  363. if need_update:
  364. message = f"发现新版本 {remote_version},当前版本 {local_version},建议更新"
  365. elif local_tuple > remote_tuple:
  366. message = f"当前版本 {local_version} 高于远程版本 {remote_version}(可能是开发版本)"
  367. else:
  368. message = f"当前版本 {local_version} 已是最新版本"
  369. return {
  370. "success": True,
  371. "name": name,
  372. "current_version": local_version,
  373. "remote_version": remote_version,
  374. "need_update": need_update,
  375. "current_parsed": list(local_tuple),
  376. "remote_parsed": list(remote_tuple),
  377. "message": message
  378. }
  379. except requests.exceptions.Timeout:
  380. return {
  381. "success": False,
  382. "name": name,
  383. "current_version": local_version,
  384. "error": "获取远程版本超时"
  385. }
  386. except requests.exceptions.RequestException as e:
  387. return {
  388. "success": False,
  389. "name": name,
  390. "current_version": local_version,
  391. "error": f"网络请求失败: {str(e)}"
  392. }
  393. except Exception as e:
  394. return {
  395. "success": False,
  396. "name": name,
  397. "current_version": local_version,
  398. "error": str(e)
  399. }
  400. try:
  401. # 导入本地版本
  402. from trendradar import __version__ as trendradar_version
  403. from mcp_server import __version__ as mcp_version
  404. # 从配置文件获取远程版本 URL
  405. config_path = self.project_root / "config" / "config.yaml"
  406. if not config_path.exists():
  407. return {
  408. "success": False,
  409. "error": {
  410. "code": "CONFIG_NOT_FOUND",
  411. "message": f"配置文件不存在: {config_path}"
  412. }
  413. }
  414. with open(config_path, "r", encoding="utf-8") as f:
  415. config_data = yaml.safe_load(f)
  416. advanced_config = config_data.get("advanced", {})
  417. trendradar_url = advanced_config.get(
  418. "version_check_url",
  419. "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version"
  420. )
  421. mcp_url = advanced_config.get(
  422. "mcp_version_check_url",
  423. "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version_mcp"
  424. )
  425. # 配置代理
  426. proxies = None
  427. if proxy_url:
  428. proxies = {"http": proxy_url, "https": proxy_url}
  429. # 请求头
  430. headers = {
  431. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  432. "Accept": "text/plain, */*",
  433. "Cache-Control": "no-cache",
  434. }
  435. # 检查两个版本
  436. trendradar_result = check_single_version(
  437. "TrendRadar", trendradar_version, trendradar_url, proxies, headers
  438. )
  439. mcp_result = check_single_version(
  440. "MCP Server", mcp_version, mcp_url, proxies, headers
  441. )
  442. # 判断是否有任何更新
  443. any_update = (
  444. (trendradar_result.get("success") and trendradar_result.get("need_update", False)) or
  445. (mcp_result.get("success") and mcp_result.get("need_update", False))
  446. )
  447. return {
  448. "success": True,
  449. "summary": {
  450. "description": "版本检查结果(TrendRadar + MCP Server)",
  451. "any_update": any_update
  452. },
  453. "data": {
  454. "trendradar": trendradar_result,
  455. "mcp": mcp_result,
  456. "any_update": any_update
  457. }
  458. }
  459. except ImportError as e:
  460. return {
  461. "success": False,
  462. "error": {
  463. "code": "IMPORT_ERROR",
  464. "message": f"无法导入版本信息: {str(e)}"
  465. }
  466. }
  467. except Exception as e:
  468. return {
  469. "success": False,
  470. "error": {
  471. "code": "INTERNAL_ERROR",
  472. "message": str(e)
  473. }
  474. }