__main__.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739
  1. # coding=utf-8
  2. """
  3. TrendRadar 主程序
  4. 热点新闻聚合与分析工具
  5. 支持: python -m trendradar
  6. """
  7. import os
  8. import webbrowser
  9. from pathlib import Path
  10. from typing import Dict, List, Tuple, Optional
  11. import requests
  12. from trendradar.context import AppContext
  13. from trendradar import __version__
  14. from trendradar.core import load_config
  15. from trendradar.crawler import DataFetcher
  16. from trendradar.storage import convert_crawl_results_to_news_data
  17. def check_version_update(
  18. current_version: str, version_url: str, proxy_url: Optional[str] = None
  19. ) -> Tuple[bool, Optional[str]]:
  20. """检查版本更新"""
  21. try:
  22. proxies = None
  23. if proxy_url:
  24. proxies = {"http": proxy_url, "https": proxy_url}
  25. headers = {
  26. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  27. "Accept": "text/plain, */*",
  28. "Cache-Control": "no-cache",
  29. }
  30. response = requests.get(
  31. version_url, proxies=proxies, headers=headers, timeout=10
  32. )
  33. response.raise_for_status()
  34. remote_version = response.text.strip()
  35. print(f"当前版本: {current_version}, 远程版本: {remote_version}")
  36. # 比较版本
  37. def parse_version(version_str):
  38. try:
  39. parts = version_str.strip().split(".")
  40. if len(parts) != 3:
  41. raise ValueError("版本号格式不正确")
  42. return int(parts[0]), int(parts[1]), int(parts[2])
  43. except:
  44. return 0, 0, 0
  45. current_tuple = parse_version(current_version)
  46. remote_tuple = parse_version(remote_version)
  47. need_update = current_tuple < remote_tuple
  48. return need_update, remote_version if need_update else None
  49. except Exception as e:
  50. print(f"版本检查失败: {e}")
  51. return False, None
  52. # === 主分析器 ===
  53. class NewsAnalyzer:
  54. """新闻分析器"""
  55. # 模式策略定义
  56. MODE_STRATEGIES = {
  57. "incremental": {
  58. "mode_name": "增量模式",
  59. "description": "增量模式(只关注新增新闻,无新增时不推送)",
  60. "realtime_report_type": "实时增量",
  61. "summary_report_type": "当日汇总",
  62. "should_send_realtime": True,
  63. "should_generate_summary": True,
  64. "summary_mode": "daily",
  65. },
  66. "current": {
  67. "mode_name": "当前榜单模式",
  68. "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
  69. "realtime_report_type": "实时当前榜单",
  70. "summary_report_type": "当前榜单汇总",
  71. "should_send_realtime": True,
  72. "should_generate_summary": True,
  73. "summary_mode": "current",
  74. },
  75. "daily": {
  76. "mode_name": "当日汇总模式",
  77. "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
  78. "realtime_report_type": "",
  79. "summary_report_type": "当日汇总",
  80. "should_send_realtime": False,
  81. "should_generate_summary": True,
  82. "summary_mode": "daily",
  83. },
  84. }
  85. def __init__(self):
  86. # 加载配置
  87. print("正在加载配置...")
  88. config = load_config()
  89. print(f"TrendRadar v{__version__} 配置加载完成")
  90. print(f"监控平台数量: {len(config['PLATFORMS'])}")
  91. print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
  92. # 创建应用上下文
  93. self.ctx = AppContext(config)
  94. self.request_interval = self.ctx.config["REQUEST_INTERVAL"]
  95. self.report_mode = self.ctx.config["REPORT_MODE"]
  96. self.rank_threshold = self.ctx.rank_threshold
  97. self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
  98. self.is_docker_container = self._detect_docker_environment()
  99. self.update_info = None
  100. self.proxy_url = None
  101. self._setup_proxy()
  102. self.data_fetcher = DataFetcher(self.proxy_url)
  103. # 初始化存储管理器(使用 AppContext)
  104. self._init_storage_manager()
  105. if self.is_github_actions:
  106. self._check_version_update()
  107. def _init_storage_manager(self) -> None:
  108. """初始化存储管理器(使用 AppContext)"""
  109. # 获取数据保留天数(支持环境变量覆盖)
  110. env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip()
  111. if env_retention:
  112. # 环境变量覆盖配置
  113. self.ctx.config["STORAGE"]["RETENTION_DAYS"] = int(env_retention)
  114. self.storage_manager = self.ctx.get_storage_manager()
  115. print(f"存储后端: {self.storage_manager.backend_name}")
  116. retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0)
  117. if retention_days > 0:
  118. print(f"数据保留天数: {retention_days} 天")
  119. def _detect_docker_environment(self) -> bool:
  120. """检测是否运行在 Docker 容器中"""
  121. try:
  122. if os.environ.get("DOCKER_CONTAINER") == "true":
  123. return True
  124. if os.path.exists("/.dockerenv"):
  125. return True
  126. return False
  127. except Exception:
  128. return False
  129. def _should_open_browser(self) -> bool:
  130. """判断是否应该打开浏览器"""
  131. return not self.is_github_actions and not self.is_docker_container
  132. def _setup_proxy(self) -> None:
  133. """设置代理配置"""
  134. if not self.is_github_actions and self.ctx.config["USE_PROXY"]:
  135. self.proxy_url = self.ctx.config["DEFAULT_PROXY"]
  136. print("本地环境,使用代理")
  137. elif not self.is_github_actions and not self.ctx.config["USE_PROXY"]:
  138. print("本地环境,未启用代理")
  139. else:
  140. print("GitHub Actions环境,不使用代理")
  141. def _check_version_update(self) -> None:
  142. """检查版本更新"""
  143. try:
  144. need_update, remote_version = check_version_update(
  145. __version__, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
  146. )
  147. if need_update and remote_version:
  148. self.update_info = {
  149. "current_version": __version__,
  150. "remote_version": remote_version,
  151. }
  152. print(f"发现新版本: {remote_version} (当前: {__version__})")
  153. else:
  154. print("版本检查完成,当前为最新版本")
  155. except Exception as e:
  156. print(f"版本检查出错: {e}")
  157. def _get_mode_strategy(self) -> Dict:
  158. """获取当前模式的策略配置"""
  159. return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])
  160. def _has_notification_configured(self) -> bool:
  161. """检查是否配置了任何通知渠道"""
  162. cfg = self.ctx.config
  163. return any(
  164. [
  165. cfg["FEISHU_WEBHOOK_URL"],
  166. cfg["DINGTALK_WEBHOOK_URL"],
  167. cfg["WEWORK_WEBHOOK_URL"],
  168. (cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"]),
  169. (
  170. cfg["EMAIL_FROM"]
  171. and cfg["EMAIL_PASSWORD"]
  172. and cfg["EMAIL_TO"]
  173. ),
  174. (cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"]),
  175. cfg["BARK_URL"],
  176. cfg["SLACK_WEBHOOK_URL"],
  177. ]
  178. )
  179. def _has_valid_content(
  180. self, stats: List[Dict], new_titles: Optional[Dict] = None
  181. ) -> bool:
  182. """检查是否有有效的新闻内容"""
  183. if self.report_mode == "incremental":
  184. # 增量模式:必须有新增标题且匹配了关键词才推送
  185. has_new_titles = bool(
  186. new_titles and any(len(titles) > 0 for titles in new_titles.values())
  187. )
  188. has_matched_news = any(stat["count"] > 0 for stat in stats)
  189. return has_new_titles and has_matched_news
  190. elif self.report_mode == "current":
  191. # current模式:只要stats有内容就说明有匹配的新闻
  192. return any(stat["count"] > 0 for stat in stats)
  193. else:
  194. # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻
  195. has_matched_news = any(stat["count"] > 0 for stat in stats)
  196. has_new_news = bool(
  197. new_titles and any(len(titles) > 0 for titles in new_titles.values())
  198. )
  199. return has_matched_news or has_new_news
  200. def _load_analysis_data(
  201. self,
  202. quiet: bool = False,
  203. ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
  204. """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据"""
  205. try:
  206. # 获取当前配置的监控平台ID列表
  207. current_platform_ids = self.ctx.platform_ids
  208. if not quiet:
  209. print(f"当前监控平台: {current_platform_ids}")
  210. all_results, id_to_name, title_info = self.ctx.read_today_titles(
  211. current_platform_ids, quiet=quiet
  212. )
  213. if not all_results:
  214. print("没有找到当天的数据")
  215. return None
  216. total_titles = sum(len(titles) for titles in all_results.values())
  217. if not quiet:
  218. print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
  219. new_titles = self.ctx.detect_new_titles(current_platform_ids, quiet=quiet)
  220. word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
  221. return (
  222. all_results,
  223. id_to_name,
  224. title_info,
  225. new_titles,
  226. word_groups,
  227. filter_words,
  228. global_filters,
  229. )
  230. except Exception as e:
  231. print(f"数据加载失败: {e}")
  232. return None
  233. def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
  234. """从当前抓取结果构建标题信息"""
  235. title_info = {}
  236. for source_id, titles_data in results.items():
  237. title_info[source_id] = {}
  238. for title, title_data in titles_data.items():
  239. ranks = title_data.get("ranks", [])
  240. url = title_data.get("url", "")
  241. mobile_url = title_data.get("mobileUrl", "")
  242. title_info[source_id][title] = {
  243. "first_time": time_info,
  244. "last_time": time_info,
  245. "count": 1,
  246. "ranks": ranks,
  247. "url": url,
  248. "mobileUrl": mobile_url,
  249. }
  250. return title_info
  251. def _run_analysis_pipeline(
  252. self,
  253. data_source: Dict,
  254. mode: str,
  255. title_info: Dict,
  256. new_titles: Dict,
  257. word_groups: List[Dict],
  258. filter_words: List[str],
  259. id_to_name: Dict,
  260. failed_ids: Optional[List] = None,
  261. is_daily_summary: bool = False,
  262. global_filters: Optional[List[str]] = None,
  263. quiet: bool = False,
  264. ) -> Tuple[List[Dict], Optional[str]]:
  265. """统一的分析流水线:数据处理 → 统计计算 → HTML生成"""
  266. # 统计计算(使用 AppContext)
  267. stats, total_titles = self.ctx.count_frequency(
  268. data_source,
  269. word_groups,
  270. filter_words,
  271. id_to_name,
  272. title_info,
  273. new_titles,
  274. mode=mode,
  275. global_filters=global_filters,
  276. quiet=quiet,
  277. )
  278. # HTML生成(如果启用)
  279. html_file = None
  280. if self.ctx.config["STORAGE"]["FORMATS"]["HTML"]:
  281. html_file = self.ctx.generate_html(
  282. stats,
  283. total_titles,
  284. failed_ids=failed_ids,
  285. new_titles=new_titles,
  286. id_to_name=id_to_name,
  287. mode=mode,
  288. is_daily_summary=is_daily_summary,
  289. update_info=self.update_info if self.ctx.config["SHOW_VERSION_UPDATE"] else None,
  290. )
  291. return stats, html_file
  292. def _send_notification_if_needed(
  293. self,
  294. stats: List[Dict],
  295. report_type: str,
  296. mode: str,
  297. failed_ids: Optional[List] = None,
  298. new_titles: Optional[Dict] = None,
  299. id_to_name: Optional[Dict] = None,
  300. html_file_path: Optional[str] = None,
  301. ) -> bool:
  302. """统一的通知发送逻辑,包含所有判断条件"""
  303. has_notification = self._has_notification_configured()
  304. cfg = self.ctx.config
  305. if (
  306. cfg["ENABLE_NOTIFICATION"]
  307. and has_notification
  308. and self._has_valid_content(stats, new_titles)
  309. ):
  310. # 推送窗口控制
  311. if cfg["PUSH_WINDOW"]["ENABLED"]:
  312. push_manager = self.ctx.create_push_manager()
  313. time_range_start = cfg["PUSH_WINDOW"]["TIME_RANGE"]["START"]
  314. time_range_end = cfg["PUSH_WINDOW"]["TIME_RANGE"]["END"]
  315. if not push_manager.is_in_time_range(time_range_start, time_range_end):
  316. now = self.ctx.get_time()
  317. print(
  318. f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
  319. )
  320. return False
  321. if cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]:
  322. if push_manager.has_pushed_today():
  323. print(f"推送窗口控制:今天已推送过,跳过本次推送")
  324. return False
  325. else:
  326. print(f"推送窗口控制:今天首次推送")
  327. # 准备报告数据
  328. report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode)
  329. # 是否发送版本更新信息
  330. update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None
  331. # 使用 NotificationDispatcher 发送到所有渠道
  332. dispatcher = self.ctx.create_notification_dispatcher()
  333. results = dispatcher.dispatch_all(
  334. report_data=report_data,
  335. report_type=report_type,
  336. update_info=update_info_to_send,
  337. proxy_url=self.proxy_url,
  338. mode=mode,
  339. html_file_path=html_file_path,
  340. )
  341. if not results:
  342. print("未配置任何通知渠道,跳过通知发送")
  343. return False
  344. # 如果成功发送了任何通知,且启用了每天只推一次,则记录推送
  345. if (
  346. cfg["PUSH_WINDOW"]["ENABLED"]
  347. and cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]
  348. and any(results.values())
  349. ):
  350. push_manager = self.ctx.create_push_manager()
  351. push_manager.record_push(report_type)
  352. return True
  353. elif cfg["ENABLE_NOTIFICATION"] and not has_notification:
  354. print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
  355. elif not cfg["ENABLE_NOTIFICATION"]:
  356. print(f"跳过{report_type}通知:通知功能已禁用")
  357. elif (
  358. cfg["ENABLE_NOTIFICATION"]
  359. and has_notification
  360. and not self._has_valid_content(stats, new_titles)
  361. ):
  362. mode_strategy = self._get_mode_strategy()
  363. if "实时" in report_type:
  364. if self.report_mode == "incremental":
  365. has_new = bool(
  366. new_titles and any(len(titles) > 0 for titles in new_titles.values())
  367. )
  368. if not has_new:
  369. print("跳过实时推送通知:增量模式下未检测到新增的新闻")
  370. else:
  371. print("跳过实时推送通知:增量模式下新增新闻未匹配到关键词")
  372. else:
  373. print(
  374. f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
  375. )
  376. else:
  377. print(
  378. f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
  379. )
  380. return False
  381. def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
  382. """生成汇总报告(带通知)"""
  383. summary_type = (
  384. "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总"
  385. )
  386. print(f"生成{summary_type}报告...")
  387. # 加载分析数据
  388. analysis_data = self._load_analysis_data()
  389. if not analysis_data:
  390. return None
  391. all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
  392. analysis_data
  393. )
  394. # 运行分析流水线
  395. stats, html_file = self._run_analysis_pipeline(
  396. all_results,
  397. mode_strategy["summary_mode"],
  398. title_info,
  399. new_titles,
  400. word_groups,
  401. filter_words,
  402. id_to_name,
  403. is_daily_summary=True,
  404. global_filters=global_filters,
  405. )
  406. if html_file:
  407. print(f"{summary_type}报告已生成: {html_file}")
  408. # 发送通知
  409. self._send_notification_if_needed(
  410. stats,
  411. mode_strategy["summary_report_type"],
  412. mode_strategy["summary_mode"],
  413. failed_ids=[],
  414. new_titles=new_titles,
  415. id_to_name=id_to_name,
  416. html_file_path=html_file,
  417. )
  418. return html_file
  419. def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
  420. """生成汇总HTML"""
  421. summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
  422. print(f"生成{summary_type}HTML...")
  423. # 加载分析数据(静默模式,避免重复输出日志)
  424. analysis_data = self._load_analysis_data(quiet=True)
  425. if not analysis_data:
  426. return None
  427. all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
  428. analysis_data
  429. )
  430. # 运行分析流水线(静默模式,避免重复输出日志)
  431. _, html_file = self._run_analysis_pipeline(
  432. all_results,
  433. mode,
  434. title_info,
  435. new_titles,
  436. word_groups,
  437. filter_words,
  438. id_to_name,
  439. is_daily_summary=True,
  440. global_filters=global_filters,
  441. quiet=True,
  442. )
  443. if html_file:
  444. print(f"{summary_type}HTML已生成: {html_file}")
  445. return html_file
  446. def _initialize_and_check_config(self) -> None:
  447. """通用初始化和配置检查"""
  448. now = self.ctx.get_time()
  449. print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
  450. if not self.ctx.config["ENABLE_CRAWLER"]:
  451. print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
  452. return
  453. has_notification = self._has_notification_configured()
  454. if not self.ctx.config["ENABLE_NOTIFICATION"]:
  455. print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取")
  456. elif not has_notification:
  457. print("未配置任何通知渠道,将只进行数据抓取,不发送通知")
  458. else:
  459. print("通知功能已启用,将发送通知")
  460. mode_strategy = self._get_mode_strategy()
  461. print(f"报告模式: {self.report_mode}")
  462. print(f"运行模式: {mode_strategy['description']}")
  463. def _crawl_data(self) -> Tuple[Dict, Dict, List]:
  464. """执行数据爬取"""
  465. ids = []
  466. for platform in self.ctx.platforms:
  467. if "name" in platform:
  468. ids.append((platform["id"], platform["name"]))
  469. else:
  470. ids.append(platform["id"])
  471. print(
  472. f"配置的监控平台: {[p.get('name', p['id']) for p in self.ctx.platforms]}"
  473. )
  474. print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
  475. Path("output").mkdir(parents=True, exist_ok=True)
  476. results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
  477. ids, self.request_interval
  478. )
  479. # 转换为 NewsData 格式并保存到存储后端
  480. crawl_time = self.ctx.format_time()
  481. crawl_date = self.ctx.format_date()
  482. news_data = convert_crawl_results_to_news_data(
  483. results, id_to_name, failed_ids, crawl_time, crawl_date
  484. )
  485. # 保存到存储后端(SQLite)
  486. if self.storage_manager.save_news_data(news_data):
  487. print(f"数据已保存到存储后端: {self.storage_manager.backend_name}")
  488. # 保存 TXT 快照(如果启用)
  489. txt_file = self.storage_manager.save_txt_snapshot(news_data)
  490. if txt_file:
  491. print(f"TXT 快照已保存: {txt_file}")
  492. # 兼容:同时保存到原有 TXT 格式(确保向后兼容)
  493. if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
  494. title_file = self.ctx.save_titles(results, id_to_name, failed_ids)
  495. print(f"标题已保存到: {title_file}")
  496. return results, id_to_name, failed_ids
  497. def _execute_mode_strategy(
  498. self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
  499. ) -> Optional[str]:
  500. """执行模式特定逻辑"""
  501. # 获取当前监控平台ID列表
  502. current_platform_ids = self.ctx.platform_ids
  503. new_titles = self.ctx.detect_new_titles(current_platform_ids)
  504. time_info = self.ctx.format_time()
  505. if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
  506. self.ctx.save_titles(results, id_to_name, failed_ids)
  507. word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
  508. # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性
  509. if self.report_mode == "current":
  510. # 加载完整的历史数据(已按当前平台过滤)
  511. analysis_data = self._load_analysis_data()
  512. if analysis_data:
  513. (
  514. all_results,
  515. historical_id_to_name,
  516. historical_title_info,
  517. historical_new_titles,
  518. _,
  519. _,
  520. _,
  521. ) = analysis_data
  522. print(
  523. f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}"
  524. )
  525. stats, html_file = self._run_analysis_pipeline(
  526. all_results,
  527. self.report_mode,
  528. historical_title_info,
  529. historical_new_titles,
  530. word_groups,
  531. filter_words,
  532. historical_id_to_name,
  533. failed_ids=failed_ids,
  534. global_filters=global_filters,
  535. )
  536. combined_id_to_name = {**historical_id_to_name, **id_to_name}
  537. if html_file:
  538. print(f"HTML报告已生成: {html_file}")
  539. # 发送实时通知(使用完整历史数据的统计结果)
  540. summary_html = None
  541. if mode_strategy["should_send_realtime"]:
  542. self._send_notification_if_needed(
  543. stats,
  544. mode_strategy["realtime_report_type"],
  545. self.report_mode,
  546. failed_ids=failed_ids,
  547. new_titles=historical_new_titles,
  548. id_to_name=combined_id_to_name,
  549. html_file_path=html_file,
  550. )
  551. else:
  552. print("❌ 严重错误:无法读取刚保存的数据文件")
  553. raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
  554. else:
  555. title_info = self._prepare_current_title_info(results, time_info)
  556. stats, html_file = self._run_analysis_pipeline(
  557. results,
  558. self.report_mode,
  559. title_info,
  560. new_titles,
  561. word_groups,
  562. filter_words,
  563. id_to_name,
  564. failed_ids=failed_ids,
  565. global_filters=global_filters,
  566. )
  567. if html_file:
  568. print(f"HTML报告已生成: {html_file}")
  569. # 发送实时通知(如果需要)
  570. summary_html = None
  571. if mode_strategy["should_send_realtime"]:
  572. self._send_notification_if_needed(
  573. stats,
  574. mode_strategy["realtime_report_type"],
  575. self.report_mode,
  576. failed_ids=failed_ids,
  577. new_titles=new_titles,
  578. id_to_name=id_to_name,
  579. html_file_path=html_file,
  580. )
  581. # 生成汇总报告(如果需要)
  582. summary_html = None
  583. if mode_strategy["should_generate_summary"]:
  584. if mode_strategy["should_send_realtime"]:
  585. # 如果已经发送了实时通知,汇总只生成HTML不发送通知
  586. summary_html = self._generate_summary_html(
  587. mode_strategy["summary_mode"]
  588. )
  589. else:
  590. # daily模式:直接生成汇总报告并发送通知
  591. summary_html = self._generate_summary_report(mode_strategy)
  592. # 打开浏览器(仅在非容器环境)
  593. if self._should_open_browser() and html_file:
  594. if summary_html:
  595. summary_url = "file://" + str(Path(summary_html).resolve())
  596. print(f"正在打开汇总报告: {summary_url}")
  597. webbrowser.open(summary_url)
  598. else:
  599. file_url = "file://" + str(Path(html_file).resolve())
  600. print(f"正在打开HTML报告: {file_url}")
  601. webbrowser.open(file_url)
  602. elif self.is_docker_container and html_file:
  603. if summary_html:
  604. print(f"汇总报告已生成(Docker环境): {summary_html}")
  605. else:
  606. print(f"HTML报告已生成(Docker环境): {html_file}")
  607. return summary_html
  608. def run(self) -> None:
  609. """执行分析流程"""
  610. try:
  611. self._initialize_and_check_config()
  612. mode_strategy = self._get_mode_strategy()
  613. results, id_to_name, failed_ids = self._crawl_data()
  614. self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
  615. except Exception as e:
  616. print(f"分析流程执行出错: {e}")
  617. raise
  618. finally:
  619. # 清理资源(包括过期数据清理和数据库连接关闭)
  620. self.ctx.cleanup()
  621. def main():
  622. """主程序入口"""
  623. try:
  624. analyzer = NewsAnalyzer()
  625. analyzer.run()
  626. except FileNotFoundError as e:
  627. print(f"❌ 配置文件错误: {e}")
  628. print("\n请确保以下文件存在:")
  629. print(" • config/config.yaml")
  630. print(" • config/frequency_words.txt")
  631. print("\n参考项目文档进行正确配置")
  632. except Exception as e:
  633. print(f"❌ 程序运行错误: {e}")
  634. raise
  635. if __name__ == "__main__":
  636. main()