Przeglądaj źródła

fix: 修复平台配置解析错误等问题

sansan 4 miesięcy temu
rodzic
commit
d14eba178e

+ 1 - 1
mcp_server/__init__.py

@@ -5,4 +5,4 @@ TrendRadar MCP Server
 
 
 """
 """
 
 
-__version__ = "3.1.6"
+__version__ = "3.1.7"

+ 84 - 45
mcp_server/tools/analytics.py

@@ -4,12 +4,17 @@
 提供热度趋势分析、平台对比、关键词共现、情感分析等高级分析功能。
 提供热度趋势分析、平台对比、关键词共现、情感分析等高级分析功能。
 """
 """
 
 
+import os
 import re
 import re
 from collections import Counter, defaultdict
 from collections import Counter, defaultdict
 from datetime import datetime, timedelta
 from datetime import datetime, timedelta
 from typing import Dict, List, Optional, Union
 from typing import Dict, List, Optional, Union
 from difflib import SequenceMatcher
 from difflib import SequenceMatcher
 
 
+import yaml
+
+from trendradar.core.analyzer import calculate_news_weight as _calculate_news_weight
+
 from ..services.data_service import DataService
 from ..services.data_service import DataService
 from ..utils.validators import (
 from ..utils.validators import (
     validate_platforms,
     validate_platforms,
@@ -22,13 +27,43 @@ from ..utils.validators import (
 from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
 from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
 
 
 
 
+def _get_weight_config() -> Dict:
+    """
+    Load the news-weight coefficients from config/config.yaml.
+
+    The coefficients are read from the ``advanced.weight`` section
+    (keys ``rank``, ``frequency`` and ``hotness``). Loading is best-effort:
+    if the file cannot be read or parsed, or a section or value is missing,
+    null or not numeric, the built-in default is used for that entry.
+
+    Returns:
+        Dict with the keys RANK_WEIGHT, FREQUENCY_WEIGHT and HOTNESS_WEIGHT.
+    """
+    # Built-in defaults; also the single source for per-key fallbacks.
+    default_config = {
+        "RANK_WEIGHT": 0.6,
+        "FREQUENCY_WEIGHT": 0.3,
+        "HOTNESS_WEIGHT": 0.1,
+    }
+    # Result key -> key name used under advanced.weight in config.yaml.
+    yaml_keys = {
+        "RANK_WEIGHT": 'rank',
+        "FREQUENCY_WEIGHT": 'frequency',
+        "HOTNESS_WEIGHT": 'hotness',
+    }
+
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    config_path = os.path.normpath(
+        os.path.join(current_dir, "..", "..", "config", "config.yaml")
+    )
+
+    try:
+        with open(config_path, 'r', encoding='utf-8') as f:
+            config = yaml.safe_load(f)
+    except (OSError, ValueError, yaml.YAMLError):
+        # Missing/unreadable file, undecodable bytes or invalid YAML.
+        return default_config
+
+    # safe_load returns None for an empty file, and a section may be null or
+    # a scalar; check types explicitly instead of relying on AttributeError.
+    advanced = config.get('advanced') if isinstance(config, dict) else None
+    weight = advanced.get('weight') if isinstance(advanced, dict) else None
+    if not isinstance(weight, dict):
+        return default_config
+
+    result = dict(default_config)
+    for name, yaml_key in yaml_keys.items():
+        value = weight.get(yaml_key)
+        # A key that is present but null (or otherwise non-numeric) must not
+        # leak into the weight arithmetic; bool is excluded because it is an
+        # int subclass.
+        if isinstance(value, (int, float)) and not isinstance(value, bool):
+            result[name] = value
+    return result
+
+
+
+
 def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
 def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
     """
     """
     计算新闻权重(用于排序)
     计算新闻权重(用于排序)
 
 
-    - 排名权重 (60%):新闻在榜单中的排名
-    - 频次权重 (30%):新闻出现的次数
-    - 热度权重 (10%):高排名出现的比例
+    复用 trendradar.core.analyzer.calculate_news_weight 实现,
+    权重配置从 config.yaml 的 advanced.weight 读取。
 
 
     Args:
     Args:
         news_data: 新闻数据字典,包含 ranks 和 count 字段
         news_data: 新闻数据字典,包含 ranks 和 count 字段
@@ -37,41 +72,7 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
     Returns:
     Returns:
         权重分数(0-100之间的浮点数)
         权重分数(0-100之间的浮点数)
     """
     """
-    ranks = news_data.get("ranks", [])
-    if not ranks:
-        return 0.0
-
-    count = news_data.get("count", len(ranks))
-
-    # 权重配置(与 config.yaml 保持一致)
-    RANK_WEIGHT = 0.6
-    FREQUENCY_WEIGHT = 0.3
-    HOTNESS_WEIGHT = 0.1
-
-    # 1. 排名权重:Σ(11 - min(rank, 10)) / 出现次数
-    rank_scores = []
-    for rank in ranks:
-        score = 11 - min(rank, 10)
-        rank_scores.append(score)
-
-    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
-
-    # 2. 频次权重:min(出现次数, 10) × 10
-    frequency_weight = min(count, 10) * 10
-
-    # 3. 热度加成:高排名次数 / 总出现次数 × 100
-    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
-    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
-    hotness_weight = hotness_ratio * 100
-
-    # 综合权重
-    total_weight = (
-        rank_weight * RANK_WEIGHT
-        + frequency_weight * FREQUENCY_WEIGHT
-        + hotness_weight * HOTNESS_WEIGHT
-    )
-
-    return total_weight
+    return _calculate_news_weight(news_data, rank_threshold, _get_weight_config())
 
 
 
 
 class AnalyticsTools:
 class AnalyticsTools:
@@ -2158,6 +2159,8 @@ class AnalyticsTools:
         """
         """
         对新闻列表进行相似度聚合
         对新闻列表进行相似度聚合
 
 
+        使用双层过滤策略:先用 Jaccard 快速粗筛,再用 SequenceMatcher 精确计算
+
         Args:
         Args:
             news_list: 新闻列表
             news_list: 新闻列表
             threshold: 相似度阈值
             threshold: 相似度阈值
@@ -2169,17 +2172,31 @@ class AnalyticsTools:
         if not news_list:
         if not news_list:
             return []
             return []
 
 
-        # 按权重排序,优先保留高权重新闻作为代表
-        sorted_news = sorted(news_list, key=lambda x: x.get("weight", 0), reverse=True)
+        # 预计算字符集合用于快速过滤
+        prepared_news = []
+        for news in news_list:
+            char_set = set(news["title"])
+            prepared_news.append({
+                "data": news,
+                "char_set": char_set,
+                "set_len": len(char_set)
+            })
+
+        # 按权重排序
+        sorted_items = sorted(prepared_news, key=lambda x: x["data"].get("weight", 0), reverse=True)
 
 
         aggregated = []
         aggregated = []
         used_indices = set()
         used_indices = set()
+        PRE_FILTER_RATIO = 0.5  # 粗筛阈值系数
 
 
-        for i, news in enumerate(sorted_news):
+        for i, item in enumerate(sorted_items):
             if i in used_indices:
             if i in used_indices:
                 continue
                 continue
 
 
-            # 创建聚合组
+            news = item["data"]
+            base_set = item["char_set"]
+            base_len = item["set_len"]
+
             group = {
             group = {
                 "representative_title": news["title"],
                 "representative_title": news["title"],
                 "platforms": [news["platform_name"]],
                 "platforms": [news["platform_name"]],
@@ -2205,13 +2222,35 @@ class AnalyticsTools:
             used_indices.add(i)
             used_indices.add(i)
 
 
             # 查找相似新闻
             # 查找相似新闻
-            for j, other_news in enumerate(sorted_news):
+            for j in range(i + 1, len(sorted_items)):
                 if j in used_indices:
                 if j in used_indices:
                     continue
                     continue
 
 
-                similarity = self._calculate_similarity(news["title"], other_news["title"])
+                compare_item = sorted_items[j]
+                compare_set = compare_item["char_set"]
+                compare_len = compare_item["set_len"]
+
+                # 快速粗筛:长度检查
+                if base_len == 0 or compare_len == 0:
+                    continue
+
+                # 快速粗筛:长度比例检查
+                if min(base_len, compare_len) / max(base_len, compare_len) < (threshold * PRE_FILTER_RATIO):
+                    continue
+
+                # 快速粗筛:Jaccard 相似度
+                intersection = len(base_set & compare_set)
+                union = len(base_set | compare_set)
+                jaccard_sim = intersection / union if union > 0 else 0
+
+                if jaccard_sim < (threshold * PRE_FILTER_RATIO):
+                    continue
+
+                # 精确计算:SequenceMatcher
+                other_news = compare_item["data"]
+                real_similarity = self._calculate_similarity(news["title"], other_news["title"])
 
 
-                if similarity >= threshold:
+                if real_similarity >= threshold:
                     # 合并到当前组
                     # 合并到当前组
                     if other_news["platform_name"] not in group["platforms"]:
                     if other_news["platform_name"] not in group["platforms"]:
                         group["platforms"].append(other_news["platform_name"])
                         group["platforms"].append(other_news["platform_name"])

+ 9 - 3
mcp_server/tools/system.py

@@ -113,12 +113,18 @@ class SystemManagementTools:
             with open(config_path, "r", encoding="utf-8") as f:
             with open(config_path, "r", encoding="utf-8") as f:
                 config_data = yaml.safe_load(f)
                 config_data = yaml.safe_load(f)
 
 
-            # 获取平台配置
-            all_platforms = config_data.get("platforms", [])
+            # 获取平台配置(嵌套结构:{enabled: bool, sources: [...]})
+            platforms_config = config_data.get("platforms", {})
+            if not platforms_config.get("enabled", True):
+                raise CrawlTaskError(
+                    "热榜平台已禁用",
+                    suggestion="请检查 config/config.yaml 中的 platforms.enabled 配置"
+                )
+            all_platforms = platforms_config.get("sources", [])
             if not all_platforms:
             if not all_platforms:
                 raise CrawlTaskError(
                 raise CrawlTaskError(
                     "配置文件中没有平台配置",
                     "配置文件中没有平台配置",
-                    suggestion="请检查 config/config.yaml 中的 platforms 配置"
+                    suggestion="请检查 config/config.yaml 中的 platforms.sources 配置"
                 )
                 )
 
 
             # 过滤平台
             # 过滤平台

+ 4 - 2
mcp_server/utils/validators.py

@@ -167,8 +167,10 @@ def get_supported_platforms() -> List[str]:
 
 
         with open(config_path, 'r', encoding='utf-8') as f:
         with open(config_path, 'r', encoding='utf-8') as f:
             config = yaml.safe_load(f)
             config = yaml.safe_load(f)
-            platforms = config.get('platforms', [])
-            return [p['id'] for p in platforms if 'id' in p]
+            platforms_config = config.get('platforms', {})
+            # 处理嵌套结构:{enabled: bool, sources: [...]}
+            sources = platforms_config.get('sources', [])
+            return [p['id'] for p in sources if 'id' in p]
     except Exception as e:
     except Exception as e:
         # 降级方案:返回空列表,允许所有平台
         # 降级方案:返回空列表,允许所有平台
         print(f"警告:无法加载平台配置 ({config_path}): {e}")
         print(f"警告:无法加载平台配置 ({config_path}): {e}")

+ 1 - 1
version_mcp

@@ -1 +1 @@
-3.1.6
+3.1.7