generator.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. # coding=utf-8
  2. """
  3. 报告生成模块
  4. 提供报告数据准备和 HTML 生成功能:
  5. - prepare_report_data: 准备报告数据
  6. - generate_html_report: 生成 HTML 报告
  7. """
  8. from pathlib import Path
  9. from typing import Dict, List, Optional, Callable
  10. def prepare_report_data(
  11. stats: List[Dict],
  12. failed_ids: Optional[List] = None,
  13. new_titles: Optional[Dict] = None,
  14. id_to_name: Optional[Dict] = None,
  15. mode: str = "daily",
  16. rank_threshold: int = 3,
  17. matches_word_groups_func: Optional[Callable] = None,
  18. load_frequency_words_func: Optional[Callable] = None,
  19. show_new_section: bool = True,
  20. ) -> Dict:
  21. """
  22. 准备报告数据
  23. Args:
  24. stats: 统计结果列表
  25. failed_ids: 失败的 ID 列表
  26. new_titles: 新增标题
  27. id_to_name: ID 到名称的映射
  28. mode: 报告模式 (daily/incremental/current)
  29. rank_threshold: 排名阈值
  30. matches_word_groups_func: 词组匹配函数
  31. load_frequency_words_func: 加载频率词函数
  32. show_new_section: 是否显示新增热点区域
  33. Returns:
  34. Dict: 准备好的报告数据
  35. """
  36. processed_new_titles = []
  37. # 在增量模式下或配置关闭时隐藏新增新闻区域
  38. hide_new_section = mode == "incremental" or not show_new_section
  39. # 只有在非隐藏模式下才处理新增新闻部分
  40. if not hide_new_section:
  41. filtered_new_titles = {}
  42. if new_titles and id_to_name:
  43. # 如果提供了匹配函数,使用它过滤
  44. if matches_word_groups_func and load_frequency_words_func:
  45. word_groups, filter_words, global_filters = load_frequency_words_func()
  46. for source_id, titles_data in new_titles.items():
  47. filtered_titles = {}
  48. for title, title_data in titles_data.items():
  49. if matches_word_groups_func(title, word_groups, filter_words, global_filters):
  50. filtered_titles[title] = title_data
  51. if filtered_titles:
  52. filtered_new_titles[source_id] = filtered_titles
  53. else:
  54. # 没有匹配函数时,使用全部
  55. filtered_new_titles = new_titles
  56. # 打印过滤后的新增热点数(与推送显示一致)
  57. original_new_count = sum(len(titles) for titles in new_titles.values()) if new_titles else 0
  58. filtered_new_count = sum(len(titles) for titles in filtered_new_titles.values()) if filtered_new_titles else 0
  59. if original_new_count > 0:
  60. print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)")
  61. if filtered_new_titles and id_to_name:
  62. for source_id, titles_data in filtered_new_titles.items():
  63. source_name = id_to_name.get(source_id, source_id)
  64. source_titles = []
  65. for title, title_data in titles_data.items():
  66. url = title_data.get("url", "")
  67. mobile_url = title_data.get("mobileUrl", "")
  68. ranks = title_data.get("ranks", [])
  69. processed_title = {
  70. "title": title,
  71. "source_name": source_name,
  72. "time_display": "",
  73. "count": 1,
  74. "ranks": ranks,
  75. "rank_threshold": rank_threshold,
  76. "url": url,
  77. "mobile_url": mobile_url,
  78. "is_new": True,
  79. }
  80. source_titles.append(processed_title)
  81. if source_titles:
  82. processed_new_titles.append(
  83. {
  84. "source_id": source_id,
  85. "source_name": source_name,
  86. "titles": source_titles,
  87. }
  88. )
  89. processed_stats = []
  90. for stat in stats:
  91. if stat["count"] <= 0:
  92. continue
  93. processed_titles = []
  94. for title_data in stat["titles"]:
  95. processed_title = {
  96. "title": title_data["title"],
  97. "source_name": title_data["source_name"],
  98. "time_display": title_data["time_display"],
  99. "count": title_data["count"],
  100. "ranks": title_data["ranks"],
  101. "rank_threshold": title_data["rank_threshold"],
  102. "url": title_data.get("url", ""),
  103. "mobile_url": title_data.get("mobileUrl", ""),
  104. "is_new": title_data.get("is_new", False),
  105. }
  106. processed_titles.append(processed_title)
  107. processed_stats.append(
  108. {
  109. "word": stat["word"],
  110. "count": stat["count"],
  111. "percentage": stat.get("percentage", 0),
  112. "titles": processed_titles,
  113. }
  114. )
  115. return {
  116. "stats": processed_stats,
  117. "new_titles": processed_new_titles,
  118. "failed_ids": failed_ids or [],
  119. "total_new_count": sum(
  120. len(source["titles"]) for source in processed_new_titles
  121. ),
  122. }
  123. def generate_html_report(
  124. stats: List[Dict],
  125. total_titles: int,
  126. failed_ids: Optional[List] = None,
  127. new_titles: Optional[Dict] = None,
  128. id_to_name: Optional[Dict] = None,
  129. mode: str = "daily",
  130. update_info: Optional[Dict] = None,
  131. rank_threshold: int = 3,
  132. output_dir: str = "output",
  133. date_folder: str = "",
  134. time_filename: str = "",
  135. render_html_func: Optional[Callable] = None,
  136. matches_word_groups_func: Optional[Callable] = None,
  137. load_frequency_words_func: Optional[Callable] = None,
  138. ) -> str:
  139. """
  140. 生成 HTML 报告
  141. 每次生成 HTML 后会:
  142. 1. 保存时间戳快照到 output/html/日期/时间.html(历史记录)
  143. 2. 复制到 output/html/latest/{mode}.html(最新报告)
  144. 3. 复制到 output/index.html 和根目录 index.html(入口)
  145. Args:
  146. stats: 统计结果列表
  147. total_titles: 总标题数
  148. failed_ids: 失败的 ID 列表
  149. new_titles: 新增标题
  150. id_to_name: ID 到名称的映射
  151. mode: 报告模式 (daily/incremental/current)
  152. update_info: 更新信息
  153. rank_threshold: 排名阈值
  154. output_dir: 输出目录
  155. date_folder: 日期文件夹名称
  156. time_filename: 时间文件名
  157. render_html_func: HTML 渲染函数
  158. matches_word_groups_func: 词组匹配函数
  159. load_frequency_words_func: 加载频率词函数
  160. Returns:
  161. str: 生成的 HTML 文件路径(时间戳快照路径)
  162. """
  163. # 时间戳快照文件名
  164. snapshot_filename = f"{time_filename}.html"
  165. # 构建输出路径(扁平化结构:output/html/日期/)
  166. snapshot_path = Path(output_dir) / "html" / date_folder
  167. snapshot_path.mkdir(parents=True, exist_ok=True)
  168. snapshot_file = str(snapshot_path / snapshot_filename)
  169. # 准备报告数据
  170. report_data = prepare_report_data(
  171. stats,
  172. failed_ids,
  173. new_titles,
  174. id_to_name,
  175. mode,
  176. rank_threshold,
  177. matches_word_groups_func,
  178. load_frequency_words_func,
  179. )
  180. # 渲染 HTML 内容
  181. if render_html_func:
  182. html_content = render_html_func(
  183. report_data, total_titles, mode, update_info
  184. )
  185. else:
  186. # 默认简单 HTML
  187. html_content = f"<html><body><h1>Report</h1><pre>{report_data}</pre></body></html>"
  188. # 1. 保存时间戳快照(历史记录)
  189. with open(snapshot_file, "w", encoding="utf-8") as f:
  190. f.write(html_content)
  191. # 2. 复制到 html/latest/{mode}.html(最新报告)
  192. latest_dir = Path(output_dir) / "html" / "latest"
  193. latest_dir.mkdir(parents=True, exist_ok=True)
  194. latest_file = latest_dir / f"{mode}.html"
  195. with open(latest_file, "w", encoding="utf-8") as f:
  196. f.write(html_content)
  197. # 3. 复制到 index.html(入口)
  198. # output/index.html(供 Docker Volume 挂载访问)
  199. output_index = Path(output_dir) / "index.html"
  200. with open(output_index, "w", encoding="utf-8") as f:
  201. f.write(html_content)
  202. # 根目录 index.html(供 GitHub Pages 访问)
  203. root_index = Path("index.html")
  204. with open(root_index, "w", encoding="utf-8") as f:
  205. f.write(html_content)
  206. return snapshot_file