| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236 |
- # coding=utf-8
- """
- 报告生成模块
- 提供报告数据准备和 HTML 生成功能:
- - prepare_report_data: 准备报告数据
- - generate_html_report: 生成 HTML 报告
- """
- from pathlib import Path
- from typing import Dict, List, Optional, Callable
def prepare_report_data(
    stats: List[Dict],
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    rank_threshold: int = 3,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
    show_new_section: bool = True,
) -> Dict:
    """
    Normalize keyword statistics and newly seen titles into one report payload.

    Args:
        stats: Keyword statistic entries. Each entry is read for "word",
            "count", "titles" and (optionally) "percentage"; entries whose
            "count" is not positive are dropped.
        failed_ids: IDs that failed; returned as-is, or as an empty list.
        new_titles: Mapping of source ID -> {title: title data}. Title data is
            read for "url", "mobileUrl" and "ranks", all optional.
        id_to_name: Mapping of source ID -> display name. A source ID missing
            from the mapping is used as its own display name.
        mode: Report mode (daily/incremental/current). "incremental" suppresses
            the new-titles section.
        rank_threshold: Rank threshold copied onto every new-title row.
        matches_word_groups_func: Predicate called as
            (title, word_groups, filter_words, global_filters).
        load_frequency_words_func: Zero-argument callable returning the
            (word_groups, filter_words, global_filters) triple.
        show_new_section: When false, the new-titles section is suppressed.

    Returns:
        Dict with keys "stats", "new_titles", "failed_ids" and
        "total_new_count".
    """
    new_sections = []

    # The new-titles section is built only when it is enabled, the report is
    # not incremental, and both the titles and the ID -> name mapping exist.
    section_enabled = show_new_section and mode != "incremental"
    if section_enabled and new_titles and id_to_name:
        if matches_word_groups_func and load_frequency_words_func:
            word_groups, filter_words, global_filters = load_frequency_words_func()
            kept_by_source = {}
            for source_id, titles_data in new_titles.items():
                kept = {
                    title: info
                    for title, info in titles_data.items()
                    if matches_word_groups_func(
                        title, word_groups, filter_words, global_filters
                    )
                }
                if kept:
                    kept_by_source[source_id] = kept
        else:
            # Both callables are required for filtering; otherwise keep all.
            kept_by_source = new_titles

        # Report how many new titles survived filtering.
        raw_total = sum(len(titles) for titles in new_titles.values())
        kept_total = sum(len(titles) for titles in kept_by_source.values())
        if raw_total > 0:
            print(f"频率词过滤后:{kept_total} 条新增热点匹配(原始 {raw_total} 条)")

        for source_id, titles_data in kept_by_source.items():
            display_name = id_to_name.get(source_id, source_id)
            rows = [
                {
                    "title": title,
                    "source_name": display_name,
                    "time_display": "",
                    "count": 1,
                    "ranks": info.get("ranks", []),
                    "rank_threshold": rank_threshold,
                    "url": info.get("url", ""),
                    "mobile_url": info.get("mobileUrl", ""),
                    "is_new": True,
                }
                for title, info in titles_data.items()
            ]
            # Sources without any title are left out of the section.
            if rows:
                new_sections.append(
                    {
                        "source_id": source_id,
                        "source_name": display_name,
                        "titles": rows,
                    }
                )

    keyword_sections = []
    for stat in stats:
        if stat["count"] <= 0:
            continue
        title_rows = [
            {
                "title": entry["title"],
                "source_name": entry["source_name"],
                "time_display": entry["time_display"],
                "count": entry["count"],
                "ranks": entry["ranks"],
                "rank_threshold": entry["rank_threshold"],
                "url": entry.get("url", ""),
                "mobile_url": entry.get("mobileUrl", ""),
                "is_new": entry.get("is_new", False),
            }
            for entry in stat["titles"]
        ]
        keyword_sections.append(
            {
                "word": stat["word"],
                "count": stat["count"],
                "percentage": stat.get("percentage", 0),
                "titles": title_rows,
            }
        )

    return {
        "stats": keyword_sections,
        "new_titles": new_sections,
        "failed_ids": failed_ids or [],
        "total_new_count": sum(len(section["titles"]) for section in new_sections),
    }
def generate_html_report(
    stats: List[Dict],
    total_titles: int,
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    update_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    output_dir: str = "output",
    date_folder: str = "",
    time_filename: str = "",
    render_html_func: Optional[Callable] = None,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
    show_new_section: bool = True,
) -> str:
    """
    Render the report as HTML and write it to every published location.

    The same HTML content is written to four files:
      1. <output_dir>/html/<date_folder>/<time_filename>.html (snapshot)
      2. <output_dir>/html/latest/<mode>.html (latest report for the mode)
      3. <output_dir>/index.html
      4. index.html in the current working directory

    Args:
        stats: Keyword statistic entries, forwarded to prepare_report_data.
        total_titles: Total number of titles, forwarded to the renderer.
        failed_ids: IDs that failed.
        new_titles: Newly seen titles.
        id_to_name: Mapping of source ID -> display name.
        mode: Report mode (daily/incremental/current); also names the file
            written under html/latest/.
        update_info: Update information, forwarded to the renderer.
        rank_threshold: Rank threshold.
        output_dir: Root output directory.
        date_folder: Name of the per-date snapshot folder.
        time_filename: Snapshot file name without the ".html" extension.
        render_html_func: Renderer called as
            (report_data, total_titles, mode, update_info) and expected to
            return the HTML text. When omitted, a minimal fallback page that
            dumps the report data is produced.
        matches_word_groups_func: Title-matching predicate, forwarded to
            prepare_report_data.
        load_frequency_words_func: Frequency-word loader, forwarded to
            prepare_report_data.
        show_new_section: Forwarded to prepare_report_data; when false the
            new-titles section is left out of the report data.

    Returns:
        str: Path of the snapshot file (location 1 above).
    """
    # Function-scope import so the file's top-level imports stay untouched.
    from html import escape

    output_root = Path(output_dir)
    snapshot_file = output_root / "html" / date_folder / f"{time_filename}.html"
    latest_file = output_root / "html" / "latest" / f"{mode}.html"

    report_data = prepare_report_data(
        stats,
        failed_ids,
        new_titles,
        id_to_name,
        mode,
        rank_threshold,
        matches_word_groups_func,
        load_frequency_words_func,
        show_new_section=show_new_section,
    )

    if render_html_func:
        html_content = render_html_func(report_data, total_titles, mode, update_info)
    else:
        # Titles come from external sources; escape them so "<", ">" and "&"
        # in the dumped data cannot inject markup into the fallback page.
        dumped = escape(str(report_data), quote=False)
        html_content = f"<html><body><h1>Report</h1><pre>{dumped}</pre></body></html>"

    targets = (
        snapshot_file,  # timestamped history entry
        latest_file,  # latest report for this mode
        output_root / "index.html",  # entry page inside the output directory
        Path("index.html"),  # entry page in the current working directory
    )
    for target in targets:
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            f.write(html_content)

    return str(snapshot_file)
|