|
|
@@ -4,12 +4,17 @@
|
|
|
提供热度趋势分析、平台对比、关键词共现、情感分析等高级分析功能。
|
|
|
"""
|
|
|
|
|
|
+import os
|
|
|
import re
|
|
|
from collections import Counter, defaultdict
|
|
|
from datetime import datetime, timedelta
|
|
|
from typing import Dict, List, Optional, Union
|
|
|
from difflib import SequenceMatcher
|
|
|
|
|
|
+import yaml
|
|
|
+
|
|
|
+from trendradar.core.analyzer import calculate_news_weight as _calculate_news_weight
|
|
|
+
|
|
|
from ..services.data_service import DataService
|
|
|
from ..utils.validators import (
|
|
|
validate_platforms,
|
|
|
@@ -22,13 +27,43 @@ from ..utils.validators import (
|
|
|
from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
|
|
|
|
|
|
|
|
|
+def _get_weight_config() -> Dict:
|
|
|
+ """
|
|
|
+ 从 config.yaml 读取权重配置
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 权重配置字典,包含 RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT
|
|
|
+ """
|
|
|
+ # 默认值
|
|
|
+ default_config = {
|
|
|
+ "RANK_WEIGHT": 0.6,
|
|
|
+ "FREQUENCY_WEIGHT": 0.3,
|
|
|
+ "HOTNESS_WEIGHT": 0.1,
|
|
|
+ }
|
|
|
+
|
|
|
+ try:
|
|
|
+ current_dir = os.path.dirname(os.path.abspath(__file__))
|
|
|
+ config_path = os.path.join(current_dir, "..", "..", "config", "config.yaml")
|
|
|
+ config_path = os.path.normpath(config_path)
|
|
|
+
|
|
|
+ with open(config_path, 'r', encoding='utf-8') as f:
|
|
|
+ config = yaml.safe_load(f)
|
|
|
+ weight = config.get('advanced', {}).get('weight', {})
|
|
|
+ return {
|
|
|
+ "RANK_WEIGHT": weight.get('rank', 0.6),
|
|
|
+ "FREQUENCY_WEIGHT": weight.get('frequency', 0.3),
|
|
|
+ "HOTNESS_WEIGHT": weight.get('hotness', 0.1),
|
|
|
+ }
|
|
|
+ except Exception:
|
|
|
+ return default_config
|
|
|
+
|
|
|
+
|
|
|
def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
|
|
|
"""
|
|
|
计算新闻权重(用于排序)
|
|
|
|
|
|
- - 排名权重 (60%):新闻在榜单中的排名
|
|
|
- - 频次权重 (30%):新闻出现的次数
|
|
|
- - 热度权重 (10%):高排名出现的比例
|
|
|
+ 复用 trendradar.core.analyzer.calculate_news_weight 实现,
|
|
|
+ 权重配置从 config.yaml 的 advanced.weight 读取。
|
|
|
|
|
|
Args:
|
|
|
news_data: 新闻数据字典,包含 ranks 和 count 字段
|
|
|
@@ -37,41 +72,7 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
|
|
|
Returns:
|
|
|
权重分数(0-100之间的浮点数)
|
|
|
"""
|
|
|
- ranks = news_data.get("ranks", [])
|
|
|
- if not ranks:
|
|
|
- return 0.0
|
|
|
-
|
|
|
- count = news_data.get("count", len(ranks))
|
|
|
-
|
|
|
- # 权重配置(与 config.yaml 保持一致)
|
|
|
- RANK_WEIGHT = 0.6
|
|
|
- FREQUENCY_WEIGHT = 0.3
|
|
|
- HOTNESS_WEIGHT = 0.1
|
|
|
-
|
|
|
- # 1. 排名权重:Σ(11 - min(rank, 10)) / 出现次数
|
|
|
- rank_scores = []
|
|
|
- for rank in ranks:
|
|
|
- score = 11 - min(rank, 10)
|
|
|
- rank_scores.append(score)
|
|
|
-
|
|
|
- rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
|
|
|
-
|
|
|
- # 2. 频次权重:min(出现次数, 10) × 10
|
|
|
- frequency_weight = min(count, 10) * 10
|
|
|
-
|
|
|
- # 3. 热度加成:高排名次数 / 总出现次数 × 100
|
|
|
- high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
|
|
|
- hotness_ratio = high_rank_count / len(ranks) if ranks else 0
|
|
|
- hotness_weight = hotness_ratio * 100
|
|
|
-
|
|
|
- # 综合权重
|
|
|
- total_weight = (
|
|
|
- rank_weight * RANK_WEIGHT
|
|
|
- + frequency_weight * FREQUENCY_WEIGHT
|
|
|
- + hotness_weight * HOTNESS_WEIGHT
|
|
|
- )
|
|
|
-
|
|
|
- return total_weight
|
|
|
+ return _calculate_news_weight(news_data, rank_threshold, _get_weight_config())
|
|
|
|
|
|
|
|
|
class AnalyticsTools:
|
|
|
@@ -2158,6 +2159,8 @@ class AnalyticsTools:
|
|
|
"""
|
|
|
对新闻列表进行相似度聚合
|
|
|
|
|
|
+ 使用双层过滤策略:先用 Jaccard 快速粗筛,再用 SequenceMatcher 精确计算
|
|
|
+
|
|
|
Args:
|
|
|
news_list: 新闻列表
|
|
|
threshold: 相似度阈值
|
|
|
@@ -2169,17 +2172,31 @@ class AnalyticsTools:
|
|
|
if not news_list:
|
|
|
return []
|
|
|
|
|
|
- # 按权重排序,优先保留高权重新闻作为代表
|
|
|
- sorted_news = sorted(news_list, key=lambda x: x.get("weight", 0), reverse=True)
|
|
|
+ # 预计算字符集合用于快速过滤
|
|
|
+ prepared_news = []
|
|
|
+ for news in news_list:
|
|
|
+ char_set = set(news["title"])
|
|
|
+ prepared_news.append({
|
|
|
+ "data": news,
|
|
|
+ "char_set": char_set,
|
|
|
+ "set_len": len(char_set)
|
|
|
+ })
|
|
|
+
|
|
|
+ # 按权重排序
|
|
|
+ sorted_items = sorted(prepared_news, key=lambda x: x["data"].get("weight", 0), reverse=True)
|
|
|
|
|
|
aggregated = []
|
|
|
used_indices = set()
|
|
|
+ PRE_FILTER_RATIO = 0.5 # 粗筛阈值系数
|
|
|
|
|
|
- for i, news in enumerate(sorted_news):
|
|
|
+ for i, item in enumerate(sorted_items):
|
|
|
if i in used_indices:
|
|
|
continue
|
|
|
|
|
|
- # 创建聚合组
|
|
|
+ news = item["data"]
|
|
|
+ base_set = item["char_set"]
|
|
|
+ base_len = item["set_len"]
|
|
|
+
|
|
|
group = {
|
|
|
"representative_title": news["title"],
|
|
|
"platforms": [news["platform_name"]],
|
|
|
@@ -2205,13 +2222,35 @@ class AnalyticsTools:
|
|
|
used_indices.add(i)
|
|
|
|
|
|
# 查找相似新闻
|
|
|
- for j, other_news in enumerate(sorted_news):
|
|
|
+ for j in range(i + 1, len(sorted_items)):
|
|
|
if j in used_indices:
|
|
|
continue
|
|
|
|
|
|
- similarity = self._calculate_similarity(news["title"], other_news["title"])
|
|
|
+ compare_item = sorted_items[j]
|
|
|
+ compare_set = compare_item["char_set"]
|
|
|
+ compare_len = compare_item["set_len"]
|
|
|
+
|
|
|
+ # 快速粗筛:长度检查
|
|
|
+ if base_len == 0 or compare_len == 0:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 快速粗筛:长度比例检查
|
|
|
+ if min(base_len, compare_len) / max(base_len, compare_len) < (threshold * PRE_FILTER_RATIO):
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 快速粗筛:Jaccard 相似度
|
|
|
+ intersection = len(base_set & compare_set)
|
|
|
+ union = len(base_set | compare_set)
|
|
|
+ jaccard_sim = intersection / union if union > 0 else 0
|
|
|
+
|
|
|
+ if jaccard_sim < (threshold * PRE_FILTER_RATIO):
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 精确计算:SequenceMatcher
|
|
|
+ other_news = compare_item["data"]
|
|
|
+ real_similarity = self._calculate_similarity(news["title"], other_news["title"])
|
|
|
|
|
|
- if similarity >= threshold:
|
|
|
+ if real_similarity >= threshold:
|
|
|
# 合并到当前组
|
|
|
if other_news["platform_name"] not in group["platforms"]:
|
|
|
group["platforms"].append(other_news["platform_name"])
|