| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776 |
- # coding=utf-8
- """
- 统计分析模块
- 提供新闻统计和分析功能:
- - calculate_news_weight: 计算新闻权重
- - format_time_display: 格式化时间显示
- - count_word_frequency: 统计词频
- """
- from typing import Dict, List, Tuple, Optional, Callable
- from trendradar.core.frequency import matches_word_groups, _word_matches
def calculate_news_weight(
    title_data: Dict,
    rank_threshold: int,
    weight_config: Dict,
) -> float:
    """Compute the sort weight of a single news item.

    The weight is a linear combination of three components:

    * rank score: the mean of ``11 - min(rank, 10)`` over all recorded ranks;
    * frequency score: ``min(count, 10) * 10``;
    * hotness score: the share of ranks that are ``<= rank_threshold``,
      multiplied by 100.

    Args:
        title_data: Item data. Reads ``ranks`` (list of rank numbers) and
            ``count`` (number of appearances; falls back to ``len(ranks)``
            when the key is missing or its value is ``None``).
        rank_threshold: Ranks at or below this value count as "high".
        weight_config: Mapping with the keys ``RANK_WEIGHT``,
            ``FREQUENCY_WEIGHT`` and ``HOTNESS_WEIGHT``.

    Returns:
        float: The combined weight, or ``0.0`` when the item has no ranks.

    Raises:
        KeyError: If ``weight_config`` lacks one of the three weight keys.
    """
    ranks = title_data.get("ranks", [])
    if not ranks:
        return 0.0

    # A stored ``count`` of None would break ``min()`` below, so treat it
    # the same as a missing key.
    count = title_data.get("count")
    if count is None:
        count = len(ranks)

    appearances = len(ranks)

    # Rank component: every rank is clamped to 10 before scoring.
    rank_weight = sum(11 - min(rank, 10) for rank in ranks) / appearances

    # Frequency component: capped at 10 appearances.
    frequency_weight = min(count, 10) * 10

    # Hotness component: percentage of appearances inside the threshold.
    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
    hotness_weight = high_rank_count / appearances * 100

    return (
        rank_weight * weight_config["RANK_WEIGHT"]
        + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
        + hotness_weight * weight_config["HOTNESS_WEIGHT"]
    )
def format_time_display(
    first_time: str,
    last_time: str,
    convert_time_func: Callable[[str], str],
) -> str:
    """Build the display string for an item's first/last seen times.

    Both timestamps are passed through ``convert_time_func`` before being
    rendered. A single time is shown when the two converted values are equal
    or when there is no last time; otherwise a ``[first ~ last]`` range is
    returned.

    Args:
        first_time: Time the item was first seen; empty means "no time info".
        last_time: Time the item was last seen; may be empty or ``None``.
        convert_time_func: Converts a stored time string to its display form.

    Returns:
        str: ``""`` when ``first_time`` is empty, the converted first time
        when there is nothing distinct to show for the last time, otherwise
        ``"[<first> ~ <last>]"``.
    """
    if not first_time:
        return ""

    first_display = convert_time_func(first_time)
    # Do not hand an empty/None value to the converter: string-based
    # converters would raise on None, and there is nothing to convert anyway.
    last_display = convert_time_func(last_time) if last_time else ""

    if not last_display or first_display == last_display:
        return first_display
    return f"[{first_display} ~ {last_display}]"
def count_word_frequency(
    results: Dict,
    word_groups: List[Dict],
    filter_words: List[str],
    id_to_name: Dict,
    title_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    new_titles: Optional[Dict] = None,
    mode: str = "daily",
    global_filters: Optional[List[str]] = None,
    weight_config: Optional[Dict] = None,
    max_news_per_keyword: int = 0,
    sort_by_position_first: bool = False,
    is_first_crawl_func: Optional[Callable[[], bool]] = None,
    convert_time_func: Optional[Callable[[str], str]] = None,
    quiet: bool = False,
) -> Tuple[List[Dict], int]:
    """Group crawled titles by keyword group and build per-group statistics.

    Each title is assigned to the first word group it matches (required
    words must all match, normal words need at least one match). Titles are
    enriched with history from ``title_info`` when available and flagged as
    new according to ``mode`` / ``new_titles``.

    Args:
        results: Crawl results, ``{source_id: {title: title_data}}``.
        word_groups: Word-group configs; each has ``required``, ``normal``
            and ``group_key`` and optionally ``max_count`` / ``display_name``.
            When empty, a single catch-all group is used and ``filter_words``
            is ignored.
        filter_words: Filter words forwarded to ``matches_word_groups``.
        id_to_name: Maps a source id to its display name.
        title_info: Optional per-title history,
            ``{source_id: {title: info}}``.
        rank_threshold: Rank threshold stored on every title entry and used
            for weighting.
        new_titles: Optional newly seen titles, ``{source_id: {title: ...}}``.
        mode: Report mode: ``"daily"``, ``"incremental"`` or ``"current"``.
            Any other value is handled like ``"daily"`` when selecting data.
        global_filters: Optional global filter words.
        weight_config: Weights for ``calculate_news_weight``; defaults to
            0.4 / 0.3 / 0.3 (rank / frequency / hotness).
        max_news_per_keyword: Global cap on titles per group (0 = no cap);
            a group's own non-zero ``max_count`` takes precedence.
        sort_by_position_first: Sort groups by config position first instead
            of by match count first.
        is_first_crawl_func: Returns whether this is the first crawl of the
            day; defaults to always ``True``.
        convert_time_func: Converts stored times for display; defaults to
            the identity function.
        quiet: When ``True`` nothing is printed.

    Returns:
        Tuple[List[Dict], int]: ``(stats, total_titles)`` where ``stats`` has
        one entry per word group (``word``, ``count``, ``position``,
        ``titles``, ``percentage``) and ``total_titles`` is the number of
        titles that were examined.
    """
    if weight_config is None:
        weight_config = {
            "RANK_WEIGHT": 0.4,
            "FREQUENCY_WEIGHT": 0.3,
            "HOTNESS_WEIGHT": 0.3,
        }
    if convert_time_func is None:
        convert_time_func = lambda x: x
    if is_first_crawl_func is None:
        is_first_crawl_func = lambda: True

    # Without configured groups, fall back to one virtual group that accepts
    # every title; filter words are cleared so nothing is hidden.
    if not word_groups:
        if not quiet:
            print("频率词配置为空,将显示所有新闻")
        word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
        filter_words = []

    # word_groups is not modified below, so this predicate is computed once.
    show_all = len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"

    is_first_today = is_first_crawl_func()

    # Decide which titles to process and whether all of them count as new.
    if mode == "incremental":
        # First crawl of the day: everything is processed and marked new.
        # Later crawls: only the newly seen titles are processed.
        all_news_are_new = True
        if is_first_today:
            results_to_process = results
        else:
            results_to_process = new_titles if new_titles else {}
    elif mode == "current":
        # Only titles whose last_time equals the most recent last_time in
        # the history are processed; their statistics still come from the
        # full history.
        all_news_are_new = False
        results_to_process = results
        if title_info:
            latest_time = None
            for source_titles in title_info.values():
                for title_data in source_titles.values():
                    last_time = title_data.get("last_time", "")
                    if last_time and (latest_time is None or last_time > latest_time):
                        latest_time = last_time

            if latest_time:
                results_to_process = {}
                for source_id, source_titles in results.items():
                    if source_id not in title_info:
                        continue
                    known = title_info[source_id]
                    filtered_titles = {
                        title: title_data
                        for title, title_data in source_titles.items()
                        if title in known
                        and known[title].get("last_time") == latest_time
                    }
                    if filtered_titles:
                        results_to_process[source_id] = filtered_titles
                if not quiet:
                    current_count = sum(
                        len(titles) for titles in results_to_process.values()
                    )
                    print(
                        f"当前榜单模式:最新时间 {latest_time},筛选出 {current_count} 条当前榜单新闻"
                    )
    else:
        # Daily summary: process everything.
        results_to_process = results
        all_news_are_new = False
        if not quiet:
            total_input_news = sum(len(titles) for titles in results.values())
            filter_status = "全部显示" if show_all else "频率词过滤"
            print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")

    if title_info is None:
        title_info = {}
    if new_titles is None:
        new_titles = {}

    word_stats = {
        group["group_key"]: {"count": 0, "titles": {}} for group in word_groups
    }
    total_titles = 0
    matched_new_count = 0
    counts_as_new_match = (mode == "incremental" and all_news_are_new) or (
        mode == "current" and is_first_today
    )

    for source_id, titles_data in results_to_process.items():
        total_titles += len(titles_data)
        seen_titles = set()

        for title, title_data in titles_data.items():
            if title in seen_titles:
                continue

            # Shared matching logic (filters + word groups).
            if not matches_word_groups(
                title, word_groups, filter_words, global_filters
            ):
                continue

            if counts_as_new_match:
                matched_new_count += 1

            source_ranks = title_data.get("ranks", [])
            source_url = title_data.get("url", "")
            source_mobile_url = title_data.get("mobileUrl", "")

            # Defensive conversion: titles are expected to be str.
            title_lower = str(title).lower()

            for group in word_groups:
                if not show_all:
                    required_words = group["required"]
                    normal_words = group["normal"]
                    if required_words and not all(
                        _word_matches(req_item, title_lower)
                        for req_item in required_words
                    ):
                        continue
                    if normal_words and not any(
                        _word_matches(normal_item, title_lower)
                        for normal_item in normal_words
                    ):
                        continue

                group_key = group["group_key"]
                group_stat = word_stats[group_key]
                group_stat["count"] += 1
                source_entries = group_stat["titles"].setdefault(source_id, [])

                # Start from the crawl data, then prefer the history record
                # when one exists for this source/title.
                first_time = ""
                last_time = ""
                count_info = 1
                ranks = source_ranks if source_ranks else []
                url = source_url
                mobile_url = source_mobile_url
                rank_timeline = []

                if (
                    title_info
                    and source_id in title_info
                    and title in title_info[source_id]
                ):
                    info = title_info[source_id][title]
                    first_time = info.get("first_time", "")
                    last_time = info.get("last_time", "")
                    count_info = info.get("count", 1)
                    if "ranks" in info and info["ranks"]:
                        ranks = info["ranks"]
                    url = info.get("url", source_url)
                    mobile_url = info.get("mobileUrl", source_mobile_url)
                    rank_timeline = info.get("rank_timeline", [])

                if not ranks:
                    # Placeholder rank for items without rank information.
                    ranks = [99]

                time_display = format_time_display(
                    first_time, last_time, convert_time_func
                )
                source_name = id_to_name.get(source_id, source_id)

                if all_news_are_new:
                    is_new = True
                elif new_titles and source_id in new_titles:
                    is_new = title in new_titles[source_id]
                else:
                    is_new = False

                source_entries.append(
                    {
                        "title": title,
                        "source_name": source_name,
                        "first_time": first_time,
                        "last_time": last_time,
                        "time_display": time_display,
                        "count": count_info,
                        "ranks": ranks,
                        "rank_threshold": rank_threshold,
                        "url": url,
                        "mobileUrl": mobile_url,
                        "is_new": is_new,
                        "rank_timeline": rank_timeline,
                    }
                )
                seen_titles.add(title)
                # A title belongs to the first matching group only.
                break

    # Summary output for the incremental / current modes.
    if not quiet:
        if mode == "incremental":
            if is_first_today:
                total_input_news = sum(len(titles) for titles in results.values())
                filter_status = "全部显示" if show_all else "频率词匹配"
                print(
                    f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
                )
            elif new_titles:
                total_new_count = sum(len(titles) for titles in new_titles.values())
                filter_status = "全部显示" if show_all else "匹配频率词"
                print(
                    f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
                )
                if matched_new_count == 0 and len(word_groups) > 1:
                    print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
            else:
                print("增量模式:未检测到新增新闻")
        elif mode == "current":
            total_input_news = sum(
                len(titles) for titles in results_to_process.values()
            )
            filter_status = "全部显示" if show_all else "频率词匹配"
            if is_first_today:
                print(
                    f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
                )
            else:
                matched_count = sum(stat["count"] for stat in word_stats.values())
                print(
                    f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
                )

    # Per-group lookups: config position, per-group cap, display name.
    group_key_to_position = {
        group["group_key"]: idx for idx, group in enumerate(word_groups)
    }
    group_key_to_max_count = {
        # ``or 0`` also covers an explicit ``max_count: None``.
        group["group_key"]: group.get("max_count", 0) or 0
        for group in word_groups
    }
    group_key_to_display_name = {
        group["group_key"]: group.get("display_name") for group in word_groups
    }

    def _title_sort_key(entry):
        # Highest weight first, then best rank, then most appearances.
        return (
            -calculate_news_weight(entry, rank_threshold, weight_config),
            min(entry["ranks"]) if entry["ranks"] else 999,
            -entry["count"],
        )

    stats = []
    for group_key, data in word_stats.items():
        all_titles = [
            entry for title_list in data["titles"].values() for entry in title_list
        ]
        sorted_titles = sorted(all_titles, key=_title_sort_key)

        # Cap precedence: per-group setting, then the global setting.
        group_max_count = group_key_to_max_count.get(group_key, 0)
        if group_max_count == 0:
            group_max_count = max_news_per_keyword
        if group_max_count > 0:
            sorted_titles = sorted_titles[:group_max_count]

        display_word = group_key_to_display_name.get(group_key) or group_key

        stats.append(
            {
                "word": display_word,
                "count": data["count"],
                "position": group_key_to_position.get(group_key, 999),
                "titles": sorted_titles,
                "percentage": (
                    round(data["count"] / total_titles * 100, 2)
                    if total_titles > 0
                    else 0
                ),
            }
        )

    if sort_by_position_first:
        # Config position first, then match count.
        stats.sort(key=lambda x: (x["position"], -x["count"]))
    else:
        # Match count first, then config position.
        stats.sort(key=lambda x: (-x["count"], x["position"]))

    # Number of titles that remain after the per-group caps were applied.
    matched_news_count = sum(
        len(stat["titles"]) for stat in stats if stat["count"] > 0
    )
    if not quiet and mode == "daily":
        print(f"当日汇总模式:处理 {total_titles} 条新闻,模式:频率词过滤")
        print(f"频率词过滤后:{matched_news_count} 条新闻匹配")

    return stats, total_titles
def count_rss_frequency(
    rss_items: List[Dict],
    word_groups: List[Dict],
    filter_words: List[str],
    global_filters: Optional[List[str]] = None,
    new_items: Optional[List[Dict]] = None,
    max_news_per_keyword: int = 0,
    sort_by_position_first: bool = False,
    timezone: str = "Asia/Shanghai",
    rank_threshold: int = 5,
    quiet: bool = False,
) -> Tuple[List[Dict], int]:
    """Group RSS items by keyword group and build per-group statistics.

    Items are de-duplicated by URL and each one is assigned to the first
    word group it matches. Every item gets a synthetic "rank": its 1-based
    position when all items are ordered by ``published_at``, newest first.

    Args:
        rss_items: RSS items. The keys read here are ``title``, ``url``,
            ``published_at``, ``feed_name`` and ``feed_id``.
        word_groups: Word-group configs; each has ``required``, ``normal``
            and ``group_key`` and optionally ``max_count`` / ``display_name``.
            When empty, a single catch-all group is used and ``filter_words``
            is ignored.
        filter_words: Filter words forwarded to ``matches_word_groups``.
        global_filters: Optional global filter words.
        new_items: Optional list of new items; an item is flagged ``is_new``
            when its URL appears in this list.
        max_news_per_keyword: Global cap on titles per group (0 = no cap);
            a group's own non-zero ``max_count`` takes precedence.
        sort_by_position_first: Sort groups by config position first instead
            of by match count first.
        timezone: Time zone name passed to the time formatter.
        rank_threshold: Stored on every title entry as ``rank_threshold``.
        quiet: When ``True`` nothing is printed.

    Returns:
        Tuple[List[Dict], int]: ``(stats, total_items)``. ``stats`` contains
        only groups with at least one match; each entry has ``word``,
        ``count``, ``position``, ``titles`` and ``percentage``. Each title
        entry has ``title``, ``source_name``, ``time_display``, ``count``,
        ``ranks``, ``rank_threshold``, ``url``, ``mobile_url`` and ``is_new``.
        ``total_items`` is ``len(rss_items)`` (before de-duplication).
    """
    if not rss_items:
        return [], 0

    # Imported after the early return so the empty-input path does not need
    # the time utilities.
    from trendradar.utils.time import format_iso_time_friendly

    # Without configured groups, fall back to one virtual group that accepts
    # every item.
    if not word_groups:
        if not quiet:
            print("[RSS] 频率词配置为空,将显示所有 RSS 条目")
        word_groups = [{"required": [], "normal": [], "group_key": "全部 RSS"}]
        filter_words = []

    show_all = len(word_groups) == 1 and word_groups[0]["group_key"] == "全部 RSS"

    # URLs of new items, for O(1) is_new lookups.
    new_urls = {item["url"] for item in (new_items or []) if item.get("url")}

    word_stats = {
        group["group_key"]: {"count": 0, "titles": []} for group in word_groups
    }
    total_items = len(rss_items)
    processed_urls = set()

    # Rank = position in newest-first order. ``or ""`` keeps the sort from
    # raising when an item carries ``published_at=None``.
    sorted_items = sorted(
        rss_items,
        key=lambda x: x.get("published_at") or "",
        reverse=True,
    )
    url_to_rank = {
        item.get("url", ""): idx + 1 for idx, item in enumerate(sorted_items)
    }

    for item in rss_items:
        # A missing or None title is treated as an empty title.
        title = item.get("title") or ""
        url = item.get("url", "")

        # De-duplicate by URL; items without a URL are never de-duplicated.
        if url:
            if url in processed_urls:
                continue
            processed_urls.add(url)

        # Shared matching logic (filters + word groups).
        if not matches_word_groups(title, word_groups, filter_words, global_filters):
            continue

        title_lower = str(title).lower()

        for group in word_groups:
            if not show_all:
                required_words = group["required"]
                normal_words = group["normal"]
                if required_words and not all(
                    _word_matches(req_item, title_lower)
                    for req_item in required_words
                ):
                    continue
                if normal_words and not any(
                    _word_matches(normal_item, title_lower)
                    for normal_item in normal_words
                ):
                    continue

            group_key = group["group_key"]
            word_stats[group_key]["count"] += 1

            published_at = item.get("published_at", "")
            time_display = (
                format_iso_time_friendly(published_at, timezone, include_date=True)
                if published_at
                else ""
            )

            word_stats[group_key]["titles"].append(
                {
                    "title": title,
                    "source_name": item.get("feed_name", item.get("feed_id", "RSS")),
                    "time_display": time_display,
                    "count": 1,  # each RSS item is counted as one appearance
                    "ranks": [url_to_rank.get(url, 99) if url else 99],
                    "rank_threshold": rank_threshold,
                    "url": url,
                    "mobile_url": "",
                    "is_new": url in new_urls if url else False,
                }
            )
            # An item belongs to the first matching group only.
            break

    # Per-group lookups: config position, per-group cap, display name.
    group_key_to_position = {
        group["group_key"]: idx for idx, group in enumerate(word_groups)
    }
    group_key_to_max_count = {
        # ``or 0`` also covers an explicit ``max_count: None``.
        group["group_key"]: group.get("max_count", 0) or 0
        for group in word_groups
    }
    group_key_to_display_name = {
        group["group_key"]: group.get("display_name") for group in word_groups
    }

    stats = []
    for group_key, data in word_stats.items():
        if data["count"] == 0:
            continue

        # Newest first (rank 1 is the most recently published item).
        sorted_titles = sorted(
            data["titles"],
            key=lambda x: x["ranks"][0] if x["ranks"] else 999,
        )

        # Cap precedence: per-group setting, then the global setting.
        group_max_count = group_key_to_max_count.get(group_key, 0)
        if group_max_count == 0:
            group_max_count = max_news_per_keyword
        if group_max_count > 0:
            sorted_titles = sorted_titles[:group_max_count]

        display_word = group_key_to_display_name.get(group_key) or group_key

        stats.append(
            {
                "word": display_word,
                "count": data["count"],
                "position": group_key_to_position.get(group_key, 999),
                "titles": sorted_titles,
                "percentage": (
                    round(data["count"] / total_items * 100, 2)
                    if total_items > 0
                    else 0
                ),
            }
        )

    if sort_by_position_first:
        stats.sort(key=lambda x: (x["position"], -x["count"]))
    else:
        stats.sort(key=lambda x: (-x["count"], x["position"]))

    matched_count = sum(stat["count"] for stat in stats)
    if not quiet:
        print(f"[RSS] 关键词分组统计:{matched_count}/{total_items} 条匹配")

    return stats, total_items
def convert_keyword_stats_to_platform_stats(
    keyword_stats: List[Dict],
    weight_config: Dict,
    rank_threshold: int = 5,
) -> List[Dict]:
    """Regroup keyword-based statistics by platform (source name).

    Every title is copied, tagged with the keyword group it came from
    (``matched_keyword``) and filed under its ``source_name``. Within a
    platform a title text is kept only once: the first occurrence wins, so
    it carries the first keyword that matched it.

    Args:
        keyword_stats: Statistics grouped by keyword; each entry provides
            ``word`` and ``titles``.
        weight_config: Weight configuration passed to
            ``calculate_news_weight``.
        rank_threshold: Rank threshold passed to ``calculate_news_weight``.

    Returns:
        List[Dict]: One entry per platform with ``word`` (the platform
        name), ``count``, ``titles`` (sorted by weight) and ``percentage``
        (always 0 here), ordered by descending ``count``.
    """
    by_platform: Dict[str, List[Dict]] = {}
    seen_by_platform: Dict[str, set] = {}

    # Group and de-duplicate in a single pass, preserving encounter order.
    for stat in keyword_stats:
        keyword = stat["word"]
        for entry in stat["titles"]:
            platform = entry["source_name"]
            bucket = by_platform.setdefault(platform, [])
            seen = seen_by_platform.setdefault(platform, set())

            text = entry["title"]
            if text in seen:
                continue
            seen.add(text)

            tagged = entry.copy()
            tagged["matched_keyword"] = keyword
            bucket.append(tagged)

    def _weight_order(entry: Dict):
        # Highest weight first, then best rank, then most appearances.
        best_rank = min(entry["ranks"]) if entry["ranks"] else 999
        weight = calculate_news_weight(entry, rank_threshold, weight_config)
        return (-weight, best_rank, -entry["count"])

    platform_stats = []
    for platform, entries in by_platform.items():
        ordered = sorted(entries, key=_weight_order)
        platform_stats.append(
            {
                "word": platform,  # the platform name acts as the group label
                "count": len(ordered),
                "titles": ordered,
                "percentage": 0,  # left for callers to fill in
            }
        )

    # Platforms with the most titles come first.
    platform_stats.sort(key=lambda stat: -stat["count"])
    return platform_stats
|