system.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
"""
System management tools.

Implements system status queries and on-demand crawler triggering.
"""
from pathlib import Path
from typing import Dict, List, Optional

from ..services.data_service import DataService
from ..utils.errors import MCPError, CrawlTaskError
from ..utils.validators import validate_platforms
  10. class SystemManagementTools:
  11. """系统管理工具类"""
  12. def __init__(self, project_root: str = None):
  13. """
  14. 初始化系统管理工具
  15. Args:
  16. project_root: 项目根目录
  17. """
  18. self.data_service = DataService(project_root)
  19. if project_root:
  20. self.project_root = Path(project_root)
  21. else:
  22. # 获取项目根目录
  23. current_file = Path(__file__)
  24. self.project_root = current_file.parent.parent.parent
  25. def get_system_status(self) -> Dict:
  26. """
  27. 获取系统运行状态和健康检查信息
  28. Returns:
  29. 系统状态字典
  30. Example:
  31. >>> tools = SystemManagementTools()
  32. >>> result = tools.get_system_status()
  33. >>> print(result['system']['version'])
  34. """
  35. try:
  36. # 获取系统状态
  37. status = self.data_service.get_system_status()
  38. return {
  39. **status,
  40. "success": True
  41. }
  42. except MCPError as e:
  43. return {
  44. "success": False,
  45. "error": e.to_dict()
  46. }
  47. except Exception as e:
  48. return {
  49. "success": False,
  50. "error": {
  51. "code": "INTERNAL_ERROR",
  52. "message": str(e)
  53. }
  54. }
  55. def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict:
  56. """
  57. 手动触发一次临时爬取任务(可选持久化)
  58. Args:
  59. platforms: 指定平台列表,为空则爬取所有平台
  60. save_to_local: 是否保存到本地 output 目录,默认 False
  61. include_url: 是否包含URL链接,默认False(节省token)
  62. Returns:
  63. 爬取结果字典,包含新闻数据和保存路径(如果保存)
  64. Example:
  65. >>> tools = SystemManagementTools()
  66. >>> # 临时爬取,不保存
  67. >>> result = tools.trigger_crawl(platforms=['zhihu', 'weibo'])
  68. >>> print(result['data'])
  69. >>> # 爬取并保存到本地
  70. >>> result = tools.trigger_crawl(platforms=['zhihu'], save_to_local=True)
  71. >>> print(result['saved_files'])
  72. """
  73. try:
  74. import time
  75. import yaml
  76. from trendradar.crawler.fetcher import DataFetcher
  77. from trendradar.storage.local import LocalStorageBackend
  78. from trendradar.storage.base import convert_crawl_results_to_news_data
  79. from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
  80. from ..services.cache_service import get_cache
  81. # 参数验证
  82. platforms = validate_platforms(platforms)
  83. # 加载配置文件
  84. config_path = self.project_root / "config" / "config.yaml"
  85. if not config_path.exists():
  86. raise CrawlTaskError(
  87. "配置文件不存在",
  88. suggestion=f"请确保配置文件存在: {config_path}"
  89. )
  90. # 读取配置
  91. with open(config_path, "r", encoding="utf-8") as f:
  92. config_data = yaml.safe_load(f)
  93. # 获取平台配置
  94. all_platforms = config_data.get("platforms", [])
  95. if not all_platforms:
  96. raise CrawlTaskError(
  97. "配置文件中没有平台配置",
  98. suggestion="请检查 config/config.yaml 中的 platforms 配置"
  99. )
  100. # 过滤平台
  101. if platforms:
  102. target_platforms = [p for p in all_platforms if p["id"] in platforms]
  103. if not target_platforms:
  104. raise CrawlTaskError(
  105. f"指定的平台不存在: {platforms}",
  106. suggestion=f"可用平台: {[p['id'] for p in all_platforms]}"
  107. )
  108. else:
  109. target_platforms = all_platforms
  110. # 构建平台ID列表
  111. ids = []
  112. for platform in target_platforms:
  113. if "name" in platform:
  114. ids.append((platform["id"], platform["name"]))
  115. else:
  116. ids.append(platform["id"])
  117. print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
  118. # 初始化数据获取器
  119. advanced = config_data.get("advanced", {})
  120. crawler_config = advanced.get("crawler", {})
  121. proxy_url = None
  122. if crawler_config.get("use_proxy"):
  123. proxy_url = crawler_config.get("default_proxy")
  124. fetcher = DataFetcher(proxy_url=proxy_url)
  125. request_interval = crawler_config.get("request_interval", 100)
  126. # 执行爬取
  127. results, id_to_name, failed_ids = fetcher.crawl_websites(
  128. ids_list=ids,
  129. request_interval=request_interval
  130. )
  131. # 获取当前时间(统一使用 trendradar 的时间工具)
  132. # 从配置中读取时区,默认为 Asia/Shanghai
  133. timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
  134. current_time = get_configured_time(timezone)
  135. crawl_date = format_date_folder(None, timezone)
  136. crawl_time_str = format_time_filename(timezone)
  137. # 转换为标准数据模型
  138. news_data = convert_crawl_results_to_news_data(
  139. results=results,
  140. id_to_name=id_to_name,
  141. failed_ids=failed_ids,
  142. crawl_time=crawl_time_str,
  143. crawl_date=crawl_date
  144. )
  145. # 初始化存储后端
  146. storage = LocalStorageBackend(
  147. data_dir=str(self.project_root / "output"),
  148. enable_txt=True,
  149. enable_html=True,
  150. timezone=timezone
  151. )
  152. # 尝试持久化数据
  153. save_success = False
  154. save_error_msg = ""
  155. saved_files = {}
  156. try:
  157. # 1. 保存到 SQLite (核心持久化)
  158. if storage.save_news_data(news_data):
  159. save_success = True
  160. # 2. 如果请求保存到本地,生成 TXT/HTML 快照
  161. if save_to_local:
  162. # 保存 TXT
  163. txt_path = storage.save_txt_snapshot(news_data)
  164. if txt_path:
  165. saved_files["txt"] = txt_path
  166. # 保存 HTML (使用简化版生成器)
  167. html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
  168. html_filename = f"{crawl_time_str}.html"
  169. html_path = storage.save_html_report(html_content, html_filename)
  170. if html_path:
  171. saved_files["html"] = html_path
  172. except Exception as e:
  173. # 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError)
  174. print(f"[System] 数据保存失败: {e}")
  175. save_success = False
  176. save_error_msg = str(e)
  177. # 3. 清除缓存,确保下次查询获取最新数据
  178. # 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的
  179. get_cache().clear()
  180. print("[System] 缓存已清除")
  181. # 构建返回结果
  182. news_response_data = []
  183. for platform_id, titles_data in results.items():
  184. platform_name = id_to_name.get(platform_id, platform_id)
  185. for title, info in titles_data.items():
  186. news_item = {
  187. "platform_id": platform_id,
  188. "platform_name": platform_name,
  189. "title": title,
  190. "ranks": info.get("ranks", [])
  191. }
  192. if include_url:
  193. news_item["url"] = info.get("url", "")
  194. news_item["mobile_url"] = info.get("mobileUrl", "")
  195. news_response_data.append(news_item)
  196. result = {
  197. "success": True,
  198. "task_id": f"crawl_{int(time.time())}",
  199. "status": "completed",
  200. "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
  201. "platforms": list(results.keys()),
  202. "total_news": len(news_response_data),
  203. "failed_platforms": failed_ids,
  204. "data": news_response_data,
  205. "saved_to_local": save_success and save_to_local
  206. }
  207. if save_success:
  208. if save_to_local:
  209. result["saved_files"] = saved_files
  210. result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
  211. else:
  212. result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果,未生成TXT快照)"
  213. else:
  214. # 明确告知用户保存失败
  215. result["saved_to_local"] = False
  216. result["save_error"] = save_error_msg
  217. if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
  218. result["note"] = "爬取成功,但无法写入数据库(Docker只读模式)。数据仅在本次返回中有效。"
  219. else:
  220. result["note"] = f"爬取成功但保存失败: {save_error_msg}"
  221. # 清理资源
  222. storage.cleanup()
  223. return result
  224. except MCPError as e:
  225. return {
  226. "success": False,
  227. "error": e.to_dict()
  228. }
  229. except Exception as e:
  230. import traceback
  231. return {
  232. "success": False,
  233. "error": {
  234. "code": "INTERNAL_ERROR",
  235. "message": str(e),
  236. "traceback": traceback.format_exc()
  237. }
  238. }
  239. def _generate_simple_html(self, results: Dict, id_to_name: Dict, failed_ids: List, now) -> str:
  240. """生成简化的 HTML 报告"""
  241. html = """<!DOCTYPE html>
  242. <html>
  243. <head>
  244. <meta charset="UTF-8">
  245. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  246. <title>MCP 爬取结果</title>
  247. <style>
  248. body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
  249. .container { max-width: 900px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; }
  250. h1 { color: #333; border-bottom: 2px solid #4CAF50; padding-bottom: 10px; }
  251. .platform { margin-bottom: 30px; }
  252. .platform-name { background: #4CAF50; color: white; padding: 10px; border-radius: 5px; margin-bottom: 10px; }
  253. .news-item { padding: 8px; border-bottom: 1px solid #eee; }
  254. .rank { color: #666; font-weight: bold; margin-right: 10px; }
  255. .title { color: #333; }
  256. .link { color: #1976D2; text-decoration: none; margin-left: 10px; font-size: 0.9em; }
  257. .link:hover { text-decoration: underline; }
  258. .failed { background: #ffebee; padding: 10px; border-radius: 5px; margin-top: 20px; }
  259. .failed h3 { color: #c62828; margin-top: 0; }
  260. .timestamp { color: #666; font-size: 0.9em; text-align: right; margin-top: 20px; }
  261. </style>
  262. </head>
  263. <body>
  264. <div class="container">
  265. <h1>MCP 爬取结果</h1>
  266. """
  267. # 添加时间戳
  268. html += f' <p class="timestamp">爬取时间: {now.strftime("%Y-%m-%d %H:%M:%S")}</p>\n\n'
  269. # 遍历每个平台
  270. for platform_id, titles_data in results.items():
  271. platform_name = id_to_name.get(platform_id, platform_id)
  272. html += f' <div class="platform">\n'
  273. html += f' <div class="platform-name">{platform_name}</div>\n'
  274. # 排序标题
  275. sorted_items = []
  276. for title, info in titles_data.items():
  277. ranks = info.get("ranks", [])
  278. url = info.get("url", "")
  279. mobile_url = info.get("mobileUrl", "")
  280. rank = ranks[0] if ranks else 999
  281. sorted_items.append((rank, title, url, mobile_url))
  282. sorted_items.sort(key=lambda x: x[0])
  283. # 显示新闻
  284. for rank, title, url, mobile_url in sorted_items:
  285. html += f' <div class="news-item">\n'
  286. html += f' <span class="rank">{rank}.</span>\n'
  287. html += f' <span class="title">{self._html_escape(title)}</span>\n'
  288. if url:
  289. html += f' <a class="link" href="{self._html_escape(url)}" target="_blank">链接</a>\n'
  290. if mobile_url and mobile_url != url:
  291. html += f' <a class="link" href="{self._html_escape(mobile_url)}" target="_blank">移动版</a>\n'
  292. html += ' </div>\n'
  293. html += ' </div>\n\n'
  294. # 失败的平台
  295. if failed_ids:
  296. html += ' <div class="failed">\n'
  297. html += ' <h3>请求失败的平台</h3>\n'
  298. html += ' <ul>\n'
  299. for platform_id in failed_ids:
  300. html += f' <li>{self._html_escape(platform_id)}</li>\n'
  301. html += ' </ul>\n'
  302. html += ' </div>\n'
  303. html += """ </div>
  304. </body>
  305. </html>"""
  306. return html
  307. def _html_escape(self, text: str) -> str:
  308. """HTML 转义"""
  309. if not isinstance(text, str):
  310. text = str(text)
  311. return (
  312. text.replace("&", "&amp;")
  313. .replace("<", "&lt;")
  314. .replace(">", "&gt;")
  315. .replace('"', "&quot;")
  316. .replace("'", "&#x27;")
  317. )