analyzer.py 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779
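# coding=utf-8
"""
Statistics and analysis module.

Provides news statistics and analysis helpers:
- calculate_news_weight: compute the ranking weight of a news item
- format_time_display: format the first/last seen times for display
- count_word_frequency: group hot-list titles by keyword group
- count_rss_frequency: group RSS items by keyword group
- convert_keyword_stats_to_platform_stats: regroup keyword stats by platform
"""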
  9. from typing import Dict, List, Tuple, Optional, Callable
  10. from trendradar.core.frequency import matches_word_groups, _word_matches
  11. from trendradar.utils.time import DEFAULT_TIMEZONE
  12. def calculate_news_weight(
  13. title_data: Dict,
  14. rank_threshold: int,
  15. weight_config: Dict,
  16. ) -> float:
  17. """
  18. 计算新闻权重,用于排序
  19. Args:
  20. title_data: 标题数据,包含 ranks 和 count
  21. rank_threshold: 排名阈值
  22. weight_config: 权重配置 {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}
  23. Returns:
  24. float: 计算出的权重值
  25. """
  26. ranks = title_data.get("ranks", [])
  27. if not ranks:
  28. return 0.0
  29. count = title_data.get("count", len(ranks))
  30. # 单次遍历计算排名分数总和与高排名次数
  31. rank_score_sum = 0
  32. high_rank_count = 0
  33. for rank in ranks:
  34. rank_score_sum += 11 - min(rank, 10)
  35. if rank <= rank_threshold:
  36. high_rank_count += 1
  37. # 归一化到 0~100(与 frequency_weight、hotness_weight 量纲对齐)
  38. rank_weight = (rank_score_sum / len(ranks)) * 10
  39. # 频次权重:min(出现次数, 10) × 10
  40. frequency_weight = min(count, 10) * 10
  41. # 热度加成:高排名次数 / 总出现次数 × 100
  42. hotness_ratio = high_rank_count / len(ranks)
  43. hotness_weight = hotness_ratio * 100
  44. total_weight = (
  45. rank_weight * weight_config["RANK_WEIGHT"]
  46. + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
  47. + hotness_weight * weight_config["HOTNESS_WEIGHT"]
  48. )
  49. return total_weight
  50. def format_time_display(
  51. first_time: str,
  52. last_time: str,
  53. convert_time_func: Callable[[str], str],
  54. ) -> str:
  55. """
  56. 格式化时间显示(将 HH-MM 转换为 HH:MM)
  57. Args:
  58. first_time: 首次出现时间
  59. last_time: 最后出现时间
  60. convert_time_func: 时间格式转换函数
  61. Returns:
  62. str: 格式化后的时间显示字符串
  63. """
  64. if not first_time:
  65. return ""
  66. # 转换为显示格式
  67. first_display = convert_time_func(first_time)
  68. last_display = convert_time_func(last_time)
  69. if first_display == last_display or not last_display:
  70. return first_display
  71. else:
  72. return f"[{first_display} ~ {last_display}]"
  73. def count_word_frequency(
  74. results: Dict,
  75. word_groups: List[Dict],
  76. filter_words: List[str],
  77. id_to_name: Dict,
  78. title_info: Optional[Dict] = None,
  79. rank_threshold: int = 3,
  80. new_titles: Optional[Dict] = None,
  81. mode: str = "daily",
  82. global_filters: Optional[List[str]] = None,
  83. weight_config: Optional[Dict] = None,
  84. max_news_per_keyword: int = 0,
  85. sort_by_position_first: bool = False,
  86. is_first_crawl_func: Optional[Callable[[], bool]] = None,
  87. convert_time_func: Optional[Callable[[str], str]] = None,
  88. quiet: bool = False,
  89. ) -> Tuple[List[Dict], int]:
  90. """
  91. 统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题
  92. Args:
  93. results: 抓取结果 {source_id: {title: title_data}}
  94. word_groups: 词组配置列表
  95. filter_words: 过滤词列表
  96. id_to_name: ID 到名称的映射
  97. title_info: 标题统计信息(可选)
  98. rank_threshold: 排名阈值
  99. new_titles: 新增标题(可选)
  100. mode: 报告模式 (daily/incremental/current)
  101. global_filters: 全局过滤词(可选)
  102. weight_config: 权重配置
  103. max_news_per_keyword: 每个关键词最大显示数量
  104. sort_by_position_first: 是否优先按配置位置排序
  105. is_first_crawl_func: 检测是否是当天第一次爬取的函数
  106. convert_time_func: 时间格式转换函数
  107. quiet: 是否静默模式(不打印日志)
  108. Returns:
  109. Tuple[List[Dict], int]: (统计结果列表, 总标题数)
  110. """
  111. # 默认权重配置
  112. if weight_config is None:
  113. weight_config = {
  114. "RANK_WEIGHT": 0.6,
  115. "FREQUENCY_WEIGHT": 0.3,
  116. "HOTNESS_WEIGHT": 0.1,
  117. }
  118. # 默认时间转换函数
  119. if convert_time_func is None:
  120. convert_time_func = lambda x: x
  121. # 默认首次爬取检测函数
  122. if is_first_crawl_func is None:
  123. is_first_crawl_func = lambda: True
  124. # 如果没有配置词组,创建一个包含所有新闻的虚拟词组
  125. if not word_groups:
  126. print("频率词配置为空,将显示所有新闻")
  127. word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
  128. filter_words = [] # 清空过滤词,显示所有新闻
  129. is_first_today = is_first_crawl_func()
  130. # 确定处理的数据源和新增标记逻辑
  131. if mode == "incremental":
  132. if is_first_today:
  133. # 增量模式 + 当天第一次:处理所有新闻,都标记为新增
  134. results_to_process = results
  135. all_news_are_new = True
  136. else:
  137. # 增量模式 + 当天非第一次:只处理新增的新闻
  138. results_to_process = new_titles if new_titles else {}
  139. all_news_are_new = True
  140. elif mode == "current":
  141. # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
  142. if title_info:
  143. latest_time = None
  144. for source_titles in title_info.values():
  145. for title_data in source_titles.values():
  146. last_time = title_data.get("last_time", "")
  147. if last_time:
  148. if latest_time is None or last_time > latest_time:
  149. latest_time = last_time
  150. # 只处理 last_time 等于最新时间的新闻
  151. if latest_time:
  152. results_to_process = {}
  153. for source_id, source_titles in results.items():
  154. if source_id in title_info:
  155. filtered_titles = {}
  156. for title, title_data in source_titles.items():
  157. if title in title_info[source_id]:
  158. info = title_info[source_id][title]
  159. if info.get("last_time") == latest_time:
  160. filtered_titles[title] = title_data
  161. if filtered_titles:
  162. results_to_process[source_id] = filtered_titles
  163. if not quiet:
  164. print(
  165. f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
  166. )
  167. else:
  168. results_to_process = results
  169. else:
  170. results_to_process = results
  171. all_news_are_new = False
  172. else:
  173. # 当日汇总模式:处理所有新闻
  174. results_to_process = results
  175. all_news_are_new = False
  176. total_input_news = sum(len(titles) for titles in results.values())
  177. filter_status = (
  178. "全部显示"
  179. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  180. else "频率词过滤"
  181. )
  182. print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
  183. word_stats = {}
  184. total_titles = 0
  185. processed_titles = {}
  186. matched_new_count = 0
  187. if title_info is None:
  188. title_info = {}
  189. if new_titles is None:
  190. new_titles = {}
  191. for group in word_groups:
  192. group_key = group["group_key"]
  193. word_stats[group_key] = {"count": 0, "titles": {}}
  194. for source_id, titles_data in results_to_process.items():
  195. total_titles += len(titles_data)
  196. if source_id not in processed_titles:
  197. processed_titles[source_id] = {}
  198. for title, title_data in titles_data.items():
  199. if title in processed_titles.get(source_id, {}):
  200. continue
  201. # 使用统一的匹配逻辑
  202. matches_frequency_words = matches_word_groups(
  203. title, word_groups, filter_words, global_filters
  204. )
  205. if not matches_frequency_words:
  206. continue
  207. # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
  208. if (mode == "incremental" and all_news_are_new) or (
  209. mode == "current" and is_first_today
  210. ):
  211. matched_new_count += 1
  212. source_ranks = title_data.get("ranks", [])
  213. source_url = title_data.get("url", "")
  214. source_mobile_url = title_data.get("mobileUrl", "")
  215. # 找到匹配的词组(防御性转换确保类型安全)
  216. title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
  217. for group in word_groups:
  218. required_words = group["required"]
  219. normal_words = group["normal"]
  220. # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
  221. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
  222. group_key = group["group_key"]
  223. word_stats[group_key]["count"] += 1
  224. if source_id not in word_stats[group_key]["titles"]:
  225. word_stats[group_key]["titles"][source_id] = []
  226. else:
  227. # 原有的匹配逻辑(支持正则语法)
  228. if required_words:
  229. all_required_present = all(
  230. _word_matches(req_item, title_lower)
  231. for req_item in required_words
  232. )
  233. if not all_required_present:
  234. continue
  235. if normal_words:
  236. any_normal_present = any(
  237. _word_matches(normal_item, title_lower)
  238. for normal_item in normal_words
  239. )
  240. if not any_normal_present:
  241. continue
  242. group_key = group["group_key"]
  243. word_stats[group_key]["count"] += 1
  244. if source_id not in word_stats[group_key]["titles"]:
  245. word_stats[group_key]["titles"][source_id] = []
  246. first_time = ""
  247. last_time = ""
  248. count_info = 1
  249. ranks = source_ranks if source_ranks else []
  250. url = source_url
  251. mobile_url = source_mobile_url
  252. rank_timeline = []
  253. # 对于 current 模式,从历史统计信息中获取完整数据
  254. if (
  255. mode == "current"
  256. and title_info
  257. and source_id in title_info
  258. and title in title_info[source_id]
  259. ):
  260. info = title_info[source_id][title]
  261. first_time = info.get("first_time", "")
  262. last_time = info.get("last_time", "")
  263. count_info = info.get("count", 1)
  264. if "ranks" in info and info["ranks"]:
  265. ranks = info["ranks"]
  266. url = info.get("url", source_url)
  267. mobile_url = info.get("mobileUrl", source_mobile_url)
  268. rank_timeline = info.get("rank_timeline", [])
  269. elif (
  270. title_info
  271. and source_id in title_info
  272. and title in title_info[source_id]
  273. ):
  274. info = title_info[source_id][title]
  275. first_time = info.get("first_time", "")
  276. last_time = info.get("last_time", "")
  277. count_info = info.get("count", 1)
  278. if "ranks" in info and info["ranks"]:
  279. ranks = info["ranks"]
  280. url = info.get("url", source_url)
  281. mobile_url = info.get("mobileUrl", source_mobile_url)
  282. rank_timeline = info.get("rank_timeline", [])
  283. if not ranks:
  284. ranks = [99]
  285. time_display = format_time_display(first_time, last_time, convert_time_func)
  286. source_name = id_to_name.get(source_id, source_id)
  287. # 判断是否为新增
  288. is_new = False
  289. if all_news_are_new:
  290. # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
  291. is_new = True
  292. elif new_titles and source_id in new_titles:
  293. # 检查是否在新增列表中
  294. new_titles_for_source = new_titles[source_id]
  295. is_new = title in new_titles_for_source
  296. word_stats[group_key]["titles"][source_id].append(
  297. {
  298. "title": title,
  299. "source_name": source_name,
  300. "first_time": first_time,
  301. "last_time": last_time,
  302. "time_display": time_display,
  303. "count": count_info,
  304. "ranks": ranks,
  305. "rank_threshold": rank_threshold,
  306. "url": url,
  307. "mobileUrl": mobile_url,
  308. "is_new": is_new,
  309. "rank_timeline": rank_timeline,
  310. }
  311. )
  312. if source_id not in processed_titles:
  313. processed_titles[source_id] = {}
  314. processed_titles[source_id][title] = True
  315. break
  316. # 最后统一打印汇总信息
  317. if mode == "incremental":
  318. if is_first_today:
  319. total_input_news = sum(len(titles) for titles in results.values())
  320. filter_status = (
  321. "全部显示"
  322. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  323. else "频率词匹配"
  324. )
  325. if not quiet:
  326. print(
  327. f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
  328. )
  329. else:
  330. if new_titles:
  331. total_new_count = sum(len(titles) for titles in new_titles.values())
  332. filter_status = (
  333. "全部显示"
  334. if len(word_groups) == 1
  335. and word_groups[0]["group_key"] == "全部新闻"
  336. else "匹配频率词"
  337. )
  338. if not quiet:
  339. print(
  340. f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
  341. )
  342. if matched_new_count == 0 and len(word_groups) > 1:
  343. print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
  344. else:
  345. if not quiet:
  346. print("增量模式:未检测到新增新闻")
  347. elif mode == "current":
  348. total_input_news = sum(len(titles) for titles in results_to_process.values())
  349. if is_first_today:
  350. filter_status = (
  351. "全部显示"
  352. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  353. else "频率词匹配"
  354. )
  355. if not quiet:
  356. print(
  357. f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
  358. )
  359. else:
  360. matched_count = sum(stat["count"] for stat in word_stats.values())
  361. filter_status = (
  362. "全部显示"
  363. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  364. else "频率词匹配"
  365. )
  366. if not quiet:
  367. print(
  368. f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
  369. )
  370. stats = []
  371. # 创建 group_key 到位置、最大数量、显示名称的映射
  372. group_key_to_position = {
  373. group["group_key"]: idx for idx, group in enumerate(word_groups)
  374. }
  375. group_key_to_max_count = {
  376. group["group_key"]: group.get("max_count", 0) for group in word_groups
  377. }
  378. group_key_to_display_name = {
  379. group["group_key"]: group.get("display_name") for group in word_groups
  380. }
  381. for group_key, data in word_stats.items():
  382. all_titles = []
  383. for source_id, title_list in data["titles"].items():
  384. all_titles.extend(title_list)
  385. # 按权重排序
  386. sorted_titles = sorted(
  387. all_titles,
  388. key=lambda x: (
  389. -calculate_news_weight(x, rank_threshold, weight_config),
  390. min(x["ranks"]) if x["ranks"] else 999,
  391. -x["count"],
  392. ),
  393. )
  394. # 应用最大显示数量限制(优先级:单独配置 > 全局配置)
  395. group_max_count = group_key_to_max_count.get(group_key, 0)
  396. if group_max_count == 0:
  397. # 使用全局配置
  398. group_max_count = max_news_per_keyword
  399. if group_max_count > 0:
  400. sorted_titles = sorted_titles[:group_max_count]
  401. # 优先使用 display_name,否则使用 group_key
  402. display_word = group_key_to_display_name.get(group_key) or group_key
  403. stats.append(
  404. {
  405. "word": display_word,
  406. "count": data["count"],
  407. "position": group_key_to_position.get(group_key, 999),
  408. "titles": sorted_titles,
  409. "percentage": (
  410. round(data["count"] / total_titles * 100, 2)
  411. if total_titles > 0
  412. else 0
  413. ),
  414. }
  415. )
  416. # 根据配置选择排序优先级
  417. if sort_by_position_first:
  418. # 先按配置位置,再按热点条数
  419. stats.sort(key=lambda x: (x["position"], -x["count"]))
  420. else:
  421. # 先按热点条数,再按配置位置(原逻辑)
  422. stats.sort(key=lambda x: (-x["count"], x["position"]))
  423. # 打印过滤后的匹配新闻数
  424. matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
  425. if not quiet and mode == "daily":
  426. print(f"当日汇总模式:处理 {total_titles} 条新闻,模式:频率词过滤")
  427. print(f"频率词过滤后:{matched_news_count} 条新闻匹配")
  428. return stats, total_titles
  429. def count_rss_frequency(
  430. rss_items: List[Dict],
  431. word_groups: List[Dict],
  432. filter_words: List[str],
  433. global_filters: Optional[List[str]] = None,
  434. new_items: Optional[List[Dict]] = None,
  435. max_news_per_keyword: int = 0,
  436. sort_by_position_first: bool = False,
  437. timezone: str = DEFAULT_TIMEZONE,
  438. rank_threshold: int = 5,
  439. quiet: bool = False,
  440. ) -> Tuple[List[Dict], int]:
  441. """
  442. 按关键词分组统计 RSS 条目(与热榜统计格式一致)
  443. Args:
  444. rss_items: RSS 条目列表,每个条目包含:
  445. - title: 标题
  446. - feed_id: RSS 源 ID
  447. - feed_name: RSS 源名称
  448. - url: 文章链接
  449. - published_at: 发布时间(ISO 格式)
  450. word_groups: 词组配置列表
  451. filter_words: 过滤词列表
  452. global_filters: 全局过滤词(可选)
  453. new_items: 新增条目列表(可选,用于标记 is_new)
  454. max_news_per_keyword: 每个关键词最大显示数量
  455. sort_by_position_first: 是否优先按配置位置排序
  456. timezone: 时区名称(用于时间格式化)
  457. quiet: 是否静默模式
  458. Returns:
  459. Tuple[List[Dict], int]: (统计结果列表, 总条目数)
  460. 统计结果格式与热榜一致:
  461. [
  462. {
  463. "word": "关键词",
  464. "count": 5,
  465. "position": 0,
  466. "titles": [
  467. {
  468. "title": "标题",
  469. "source_name": "Hacker News",
  470. "time_display": "12-29 08:20",
  471. "count": 1,
  472. "ranks": [1], # RSS 用发布时间顺序作为排名
  473. "rank_threshold": 50,
  474. "url": "...",
  475. "mobile_url": "",
  476. "is_new": True/False
  477. }
  478. ],
  479. "percentage": 10.0
  480. }
  481. ]
  482. """
  483. from trendradar.utils.time import format_iso_time_friendly
  484. if not rss_items:
  485. return [], 0
  486. # 如果没有配置词组,创建一个包含所有条目的虚拟词组
  487. if not word_groups:
  488. if not quiet:
  489. print("[RSS] 频率词配置为空,将显示所有 RSS 条目")
  490. word_groups = [{"required": [], "normal": [], "group_key": "全部 RSS"}]
  491. filter_words = []
  492. # 创建新增条目的 URL 集合,用于快速查找
  493. new_urls = set()
  494. if new_items:
  495. for item in new_items:
  496. if item.get("url"):
  497. new_urls.add(item["url"])
  498. # 初始化词组统计
  499. word_stats = {}
  500. for group in word_groups:
  501. group_key = group["group_key"]
  502. word_stats[group_key] = {"count": 0, "titles": []}
  503. total_items = len(rss_items)
  504. processed_urls = set() # 用于去重
  505. # 为每个条目分配一个基于发布时间的"排名"
  506. # 按发布时间排序,最新的排在前面
  507. sorted_items = sorted(
  508. rss_items,
  509. key=lambda x: x.get("published_at", ""),
  510. reverse=True
  511. )
  512. url_to_rank = {item.get("url", ""): idx + 1 for idx, item in enumerate(sorted_items)}
  513. for item in rss_items:
  514. title = item.get("title", "")
  515. url = item.get("url", "")
  516. # 去重
  517. if url and url in processed_urls:
  518. continue
  519. if url:
  520. processed_urls.add(url)
  521. # 使用统一的匹配逻辑
  522. if not matches_word_groups(title, word_groups, filter_words, global_filters):
  523. continue
  524. # 找到匹配的词组
  525. title_lower = title.lower()
  526. for group in word_groups:
  527. required_words = group["required"]
  528. normal_words = group["normal"]
  529. group_key = group["group_key"]
  530. # "全部 RSS" 模式:所有条目都匹配
  531. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部 RSS":
  532. matched = True
  533. else:
  534. # 检查必须词(支持正则语法)
  535. if required_words:
  536. all_required_present = all(
  537. _word_matches(req_item, title_lower)
  538. for req_item in required_words
  539. )
  540. if not all_required_present:
  541. continue
  542. # 检查普通词(支持正则语法)
  543. if normal_words:
  544. any_normal_present = any(
  545. _word_matches(normal_item, title_lower)
  546. for normal_item in normal_words
  547. )
  548. if not any_normal_present:
  549. continue
  550. matched = True
  551. if matched:
  552. word_stats[group_key]["count"] += 1
  553. # 格式化时间显示
  554. published_at = item.get("published_at", "")
  555. time_display = format_iso_time_friendly(published_at, timezone, include_date=True) if published_at else ""
  556. # 判断是否为新增
  557. is_new = url in new_urls if url else False
  558. # 获取排名(基于发布时间顺序)
  559. rank = url_to_rank.get(url, 99) if url else 99
  560. title_data = {
  561. "title": title,
  562. "source_name": item.get("feed_name", item.get("feed_id", "RSS")),
  563. "time_display": time_display,
  564. "count": 1, # RSS 条目通常只出现一次
  565. "ranks": [rank],
  566. "rank_threshold": rank_threshold,
  567. "url": url,
  568. "mobile_url": "",
  569. "is_new": is_new,
  570. }
  571. word_stats[group_key]["titles"].append(title_data)
  572. break # 一个条目只匹配第一个词组
  573. # 构建统计结果
  574. stats = []
  575. group_key_to_position = {
  576. group["group_key"]: idx for idx, group in enumerate(word_groups)
  577. }
  578. group_key_to_max_count = {
  579. group["group_key"]: group.get("max_count", 0) for group in word_groups
  580. }
  581. group_key_to_display_name = {
  582. group["group_key"]: group.get("display_name") for group in word_groups
  583. }
  584. for group_key, data in word_stats.items():
  585. if data["count"] == 0:
  586. continue
  587. # 按发布时间排序(最新在前)
  588. sorted_titles = sorted(
  589. data["titles"],
  590. key=lambda x: x["ranks"][0] if x["ranks"] else 999
  591. )
  592. # 应用最大显示数量限制
  593. group_max_count = group_key_to_max_count.get(group_key, 0)
  594. if group_max_count == 0:
  595. group_max_count = max_news_per_keyword
  596. if group_max_count > 0:
  597. sorted_titles = sorted_titles[:group_max_count]
  598. # 优先使用 display_name,否则使用 group_key
  599. display_word = group_key_to_display_name.get(group_key) or group_key
  600. stats.append({
  601. "word": display_word,
  602. "count": data["count"],
  603. "position": group_key_to_position.get(group_key, 999),
  604. "titles": sorted_titles,
  605. "percentage": round(data["count"] / total_items * 100, 2) if total_items > 0 else 0,
  606. })
  607. # 排序
  608. if sort_by_position_first:
  609. stats.sort(key=lambda x: (x["position"], -x["count"]))
  610. else:
  611. stats.sort(key=lambda x: (-x["count"], x["position"]))
  612. matched_count = sum(stat["count"] for stat in stats)
  613. if not quiet:
  614. print(f"[RSS] 关键词分组统计:{matched_count}/{total_items} 条匹配")
  615. return stats, total_items
  616. def convert_keyword_stats_to_platform_stats(
  617. keyword_stats: List[Dict],
  618. weight_config: Dict,
  619. rank_threshold: int = 5,
  620. ) -> List[Dict]:
  621. """
  622. 将按关键词分组的统计数据转换为按平台分组的统计数据
  623. Args:
  624. keyword_stats: 原始按关键词分组的统计数据
  625. weight_config: 权重配置
  626. rank_threshold: 排名阈值
  627. Returns:
  628. 按平台分组的统计数据,格式与原 stats 一致
  629. """
  630. # 1. 收集所有新闻,按平台分组
  631. platform_map: Dict[str, List[Dict]] = {}
  632. for stat in keyword_stats:
  633. keyword = stat["word"]
  634. for title_data in stat["titles"]:
  635. source_name = title_data["source_name"]
  636. if source_name not in platform_map:
  637. platform_map[source_name] = []
  638. # 复制 title_data 并添加匹配的关键词
  639. title_with_keyword = title_data.copy()
  640. title_with_keyword["matched_keyword"] = keyword
  641. platform_map[source_name].append(title_with_keyword)
  642. # 2. 去重(同一平台下相同标题只保留一条,保留第一个匹配的关键词)
  643. for source_name, titles in platform_map.items():
  644. seen_titles: Dict[str, bool] = {}
  645. unique_titles = []
  646. for title_data in titles:
  647. title_text = title_data["title"]
  648. if title_text not in seen_titles:
  649. seen_titles[title_text] = True
  650. unique_titles.append(title_data)
  651. platform_map[source_name] = unique_titles
  652. # 3. 按权重排序每个平台内的新闻
  653. for source_name, titles in platform_map.items():
  654. platform_map[source_name] = sorted(
  655. titles,
  656. key=lambda x: (
  657. -calculate_news_weight(x, rank_threshold, weight_config),
  658. min(x["ranks"]) if x["ranks"] else 999,
  659. -x["count"],
  660. ),
  661. )
  662. # 4. 构建平台统计结果
  663. platform_stats = []
  664. for source_name, titles in platform_map.items():
  665. platform_stats.append({
  666. "word": source_name, # 平台名作为分组标识
  667. "count": len(titles),
  668. "titles": titles,
  669. "percentage": 0, # 可后续计算
  670. })
  671. # 5. 按新闻条数排序平台
  672. platform_stats.sort(key=lambda x: -x["count"])
  673. return platform_stats