Parcourir la source

fix: 修复平台配置解析错误等问题

sansan il y a 4 mois
Parent
commit
d14eba178e

+ 1 - 1
mcp_server/__init__.py

@@ -5,4 +5,4 @@ TrendRadar MCP Server
 
 """
 
-__version__ = "3.1.6"
+__version__ = "3.1.7"

+ 84 - 45
mcp_server/tools/analytics.py

@@ -4,12 +4,17 @@
 提供热度趋势分析、平台对比、关键词共现、情感分析等高级分析功能。
 """
 
+import os
 import re
 from collections import Counter, defaultdict
 from datetime import datetime, timedelta
 from typing import Dict, List, Optional, Union
 from difflib import SequenceMatcher
 
+import yaml
+
+from trendradar.core.analyzer import calculate_news_weight as _calculate_news_weight
+
 from ..services.data_service import DataService
 from ..utils.validators import (
     validate_platforms,
@@ -22,13 +27,43 @@ from ..utils.validators import (
 from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
 
 
+def _get_weight_config() -> Dict:
+    """
+    从 config.yaml 读取权重配置
+
+    Returns:
+        权重配置字典,包含 RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT
+    """
+    # 默认值
+    default_config = {
+        "RANK_WEIGHT": 0.6,
+        "FREQUENCY_WEIGHT": 0.3,
+        "HOTNESS_WEIGHT": 0.1,
+    }
+
+    try:
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        config_path = os.path.join(current_dir, "..", "..", "config", "config.yaml")
+        config_path = os.path.normpath(config_path)
+
+        with open(config_path, 'r', encoding='utf-8') as f:
+            config = yaml.safe_load(f)
+            weight = config.get('advanced', {}).get('weight', {})
+            return {
+                "RANK_WEIGHT": weight.get('rank', 0.6),
+                "FREQUENCY_WEIGHT": weight.get('frequency', 0.3),
+                "HOTNESS_WEIGHT": weight.get('hotness', 0.1),
+            }
+    except Exception:
+        return default_config
+
+
 def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
     """
     计算新闻权重(用于排序)
 
-    - 排名权重 (60%):新闻在榜单中的排名
-    - 频次权重 (30%):新闻出现的次数
-    - 热度权重 (10%):高排名出现的比例
+    复用 trendradar.core.analyzer.calculate_news_weight 实现,
+    权重配置从 config.yaml 的 advanced.weight 读取。
 
     Args:
         news_data: 新闻数据字典,包含 ranks 和 count 字段
@@ -37,41 +72,7 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
     Returns:
         权重分数(0-100之间的浮点数)
     """
-    ranks = news_data.get("ranks", [])
-    if not ranks:
-        return 0.0
-
-    count = news_data.get("count", len(ranks))
-
-    # 权重配置(与 config.yaml 保持一致)
-    RANK_WEIGHT = 0.6
-    FREQUENCY_WEIGHT = 0.3
-    HOTNESS_WEIGHT = 0.1
-
-    # 1. 排名权重:Σ(11 - min(rank, 10)) / 出现次数
-    rank_scores = []
-    for rank in ranks:
-        score = 11 - min(rank, 10)
-        rank_scores.append(score)
-
-    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
-
-    # 2. 频次权重:min(出现次数, 10) × 10
-    frequency_weight = min(count, 10) * 10
-
-    # 3. 热度加成:高排名次数 / 总出现次数 × 100
-    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
-    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
-    hotness_weight = hotness_ratio * 100
-
-    # 综合权重
-    total_weight = (
-        rank_weight * RANK_WEIGHT
-        + frequency_weight * FREQUENCY_WEIGHT
-        + hotness_weight * HOTNESS_WEIGHT
-    )
-
-    return total_weight
+    return _calculate_news_weight(news_data, rank_threshold, _get_weight_config())
 
 
 class AnalyticsTools:
@@ -2158,6 +2159,8 @@ class AnalyticsTools:
         """
         对新闻列表进行相似度聚合
 
+        使用双层过滤策略:先用 Jaccard 快速粗筛,再用 SequenceMatcher 精确计算
+
         Args:
             news_list: 新闻列表
             threshold: 相似度阈值
@@ -2169,17 +2172,31 @@ class AnalyticsTools:
         if not news_list:
             return []
 
-        # 按权重排序,优先保留高权重新闻作为代表
-        sorted_news = sorted(news_list, key=lambda x: x.get("weight", 0), reverse=True)
+        # 预计算字符集合用于快速过滤
+        prepared_news = []
+        for news in news_list:
+            char_set = set(news["title"])
+            prepared_news.append({
+                "data": news,
+                "char_set": char_set,
+                "set_len": len(char_set)
+            })
+
+        # 按权重排序
+        sorted_items = sorted(prepared_news, key=lambda x: x["data"].get("weight", 0), reverse=True)
 
         aggregated = []
         used_indices = set()
+        PRE_FILTER_RATIO = 0.5  # 粗筛阈值系数
 
-        for i, news in enumerate(sorted_news):
+        for i, item in enumerate(sorted_items):
             if i in used_indices:
                 continue
 
-            # 创建聚合组
+            news = item["data"]
+            base_set = item["char_set"]
+            base_len = item["set_len"]
+
             group = {
                 "representative_title": news["title"],
                 "platforms": [news["platform_name"]],
@@ -2205,13 +2222,35 @@ class AnalyticsTools:
             used_indices.add(i)
 
             # 查找相似新闻
-            for j, other_news in enumerate(sorted_news):
+            for j in range(i + 1, len(sorted_items)):
                 if j in used_indices:
                     continue
 
-                similarity = self._calculate_similarity(news["title"], other_news["title"])
+                compare_item = sorted_items[j]
+                compare_set = compare_item["char_set"]
+                compare_len = compare_item["set_len"]
+
+                # 快速粗筛:长度检查
+                if base_len == 0 or compare_len == 0:
+                    continue
+
+                # 快速粗筛:长度比例检查
+                if min(base_len, compare_len) / max(base_len, compare_len) < (threshold * PRE_FILTER_RATIO):
+                    continue
+
+                # 快速粗筛:Jaccard 相似度
+                intersection = len(base_set & compare_set)
+                union = len(base_set | compare_set)
+                jaccard_sim = intersection / union if union > 0 else 0
+
+                if jaccard_sim < (threshold * PRE_FILTER_RATIO):
+                    continue
+
+                # 精确计算:SequenceMatcher
+                other_news = compare_item["data"]
+                real_similarity = self._calculate_similarity(news["title"], other_news["title"])
 
-                if similarity >= threshold:
+                if real_similarity >= threshold:
                     # 合并到当前组
                     if other_news["platform_name"] not in group["platforms"]:
                         group["platforms"].append(other_news["platform_name"])

+ 9 - 3
mcp_server/tools/system.py

@@ -113,12 +113,18 @@ class SystemManagementTools:
             with open(config_path, "r", encoding="utf-8") as f:
                 config_data = yaml.safe_load(f)
 
-            # 获取平台配置
-            all_platforms = config_data.get("platforms", [])
+            # 获取平台配置(嵌套结构:{enabled: bool, sources: [...]})
+            platforms_config = config_data.get("platforms", {})
+            if not platforms_config.get("enabled", True):
+                raise CrawlTaskError(
+                    "热榜平台已禁用",
+                    suggestion="请检查 config/config.yaml 中的 platforms.enabled 配置"
+                )
+            all_platforms = platforms_config.get("sources", [])
             if not all_platforms:
                 raise CrawlTaskError(
                     "配置文件中没有平台配置",
-                    suggestion="请检查 config/config.yaml 中的 platforms 配置"
+                    suggestion="请检查 config/config.yaml 中的 platforms.sources 配置"
                 )
 
             # 过滤平台

+ 4 - 2
mcp_server/utils/validators.py

@@ -167,8 +167,10 @@ def get_supported_platforms() -> List[str]:
 
         with open(config_path, 'r', encoding='utf-8') as f:
             config = yaml.safe_load(f)
-            platforms = config.get('platforms', [])
-            return [p['id'] for p in platforms if 'id' in p]
+            platforms_config = config.get('platforms', {})
+            # 处理嵌套结构:{enabled: bool, sources: [...]}
+            sources = platforms_config.get('sources', [])
+            return [p['id'] for p in sources if 'id' in p]
     except Exception as e:
         # 降级方案:返回空列表,允许所有平台
         print(f"警告:无法加载平台配置 ({config_path}): {e}")

+ 1 - 1
version_mcp

@@ -1 +1 @@
-3.1.6
+3.1.7