6 miesięcy temu · 7468f8adcc
--- a/README-EN.md
+++ b/README-EN.md
@@ -13,7 +13,7 @@
 
															 [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
														
 
															 [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
														
 
															 [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
														
 
															-[![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
														
 
															+[![Version](https://img.shields.io/badge/version-v4.0.3-blue.svg)](https://github.com/sansan0/TrendRadar)
														
 
															 [![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
														
 
															 [![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
														
@@ -330,10 +330,11 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 
															 - **Major Version Upgrade**: When upgrading across major versions (e.g., from v1.x to v2.y), we recommend deleting your existing fork and re-forking, which saves effort and avoids config conflicts
														
 
															-### 2025/12/17 - v4.0.1
														
 
															+### 2025/12/20 - v4.0.3
														
 
															+
														
 
															+- Added URL normalization to fix duplicate push issues caused by dynamic parameters (e.g., Weibo's `band_rank`)
														
 
															+- Fixed incremental mode detection logic to correctly identify historical titles
														
 
															-- StorageManager adds push record proxy methods
														
 
															-- S3 client switches to virtual-hosted style for better compatibility (supports Tencent Cloud COS and more services)
														
 
															 ### 2025/12/13 - mcp-v1.1.0
														
@@ -349,6 +350,12 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 
															 <summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
														
 
															+### 2025/12/17 - v4.0.1
														
 
															+
														
 
															+- StorageManager adds push record proxy methods
														
 
															+- S3 client switches to virtual-hosted style for better compatibility (supports Tencent Cloud COS and more services)
														
 
															+
														
 
															+
														
 
															 ### 2025/12/13 - v4.0.0
														
 
															 **🎉 Major Update: Comprehensive Refactoring of Storage and Core Architecture**
														
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 
															 [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
														
 
															 [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
														
 
															 [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
														
 
															-[![Version](https://img.shields.io/badge/version-v4.0.2-blue.svg)](https://github.com/sansan0/TrendRadar)
														
 
															+[![Version](https://img.shields.io/badge/version-v4.0.3-blue.svg)](https://github.com/sansan0/TrendRadar)
														
 
															 [![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
														
 
															 [![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
														
@@ -377,10 +377,10 @@ GitHub 一键 Fork 即可使用，无需编程基础。
 
															-### 2025/12/17 - v4.0.1
														
 
															+### 2025/12/20 - v4.0.3
														
 
															-- StorageManager 添加推送记录代理方法
														
 
															-- S3 客户端切换至 virtual-hosted style 以提升兼容性（支持腾讯云 COS 等更多服务）
														
 
															+- 新增 URL 标准化功能，解决微博等平台因动态参数（如 `band_rank`）导致的重复推送问题
														
 
															+- 修复增量模式检测逻辑，正确识别历史标题
														
 
															 ### 2025/12/13 - mcp-v1.1.0
														
@@ -397,6 +397,13 @@ GitHub 一键 Fork 即可使用，无需编程基础。
 
															 <summary>👉 点击展开：<strong>历史更新</strong></summary>
														
 
															+
														
 
															+### 2025/12/17 - v4.0.1
														
 
															+
														
 
															+- StorageManager 添加推送记录代理方法
														
 
															+- S3 客户端切换至 virtual-hosted style 以提升兼容性（支持腾讯云 COS 等更多服务）
														
 
															+
														
 
															+
														
 
															 ### 2025/12/13 - v4.0.0
														
 
															 **🎉 重大更新：全面重构存储和核心架构**
														
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -73,7 +73,7 @@ crawler:
 
															 # 推送模式选择
														
 
															 report:
														
 
															-  mode: "daily" # 可选: "daily"|"incremental"|"current"
														
 
															+  mode: "current" # 可选: "daily"|"incremental"|"current"
														
 
															   rank_threshold: 5 # 排名高亮阈值
														
 
															   sort_by_position_first: false # 排序优先级：true=先按配置位置排序，false=先按热点条数排序
														
 
															   max_news_per_keyword: 0 # 每个关键词最大显示数量，0=不限制
														
--- a/trendradar/__init__.py
+++ b/trendradar/__init__.py
@@ -9,5 +9,5 @@ TrendRadar - 热点新闻聚合与分析工具
 
															 from trendradar.context import AppContext
														
 
															-__version__ = "4.0.2"
														
 
															+__version__ = "4.0.3"
														
 
															 __all__ = ["AppContext", "__version__"]
														
--- a/trendradar/__main__.py
+++ b/trendradar/__main__.py
@@ -214,8 +214,14 @@ class NewsAnalyzer:
 
															         self, stats: List[Dict], new_titles: Optional[Dict] = None
														
 
															     ) -> bool:
														
 
															         """检查是否有有效的新闻内容"""
														
 
															-        if self.report_mode in ["incremental", "current"]:
														
 
															-            # 增量模式和current模式下，只要stats有内容就说明有匹配的新闻
														
 
															+        if self.report_mode == "incremental":
														
 
															+            # 增量模式：必须有新增标题才推送
														
 
															+            has_new_titles = bool(
														
 
															+                new_titles and any(len(titles) > 0 for titles in new_titles.values())
														
 
															+            )
														
 
															+            return has_new_titles
														
 
															+        elif self.report_mode == "current":
														
 
															+            # current模式：只要stats有内容就说明有匹配的新闻
														
 
															             return any(stat["count"] > 0 for stat in stats)
														
 
															         else:
														
 
															             # 当日汇总模式下，检查是否有匹配的频率词新闻或新增新闻
														
@@ -227,15 +233,17 @@ class NewsAnalyzer:
 
															     def _load_analysis_data(
														
 
															         self,
														
 
															+        quiet: bool = False,
														
 
															     ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
														
 
															         """统一的数据加载和预处理，使用当前监控平台列表过滤历史数据"""
														
 
															         try:
														
 
															             # 获取当前配置的监控平台ID列表
														
 
															             current_platform_ids = self.ctx.platform_ids
														
 
															-            print(f"当前监控平台: {current_platform_ids}")
														
 
															+            if not quiet:
														
 
															+                print(f"当前监控平台: {current_platform_ids}")
														
 
															             all_results, id_to_name, title_info = self.ctx.read_today_titles(
														
 
															-                current_platform_ids
														
 
															+                current_platform_ids, quiet=quiet
														
 
															             )
														
 
															             if not all_results:
														
@@ -243,9 +251,10 @@ class NewsAnalyzer:
 
															                 return None
														
 
															             total_titles = sum(len(titles) for titles in all_results.values())
														
 
															-            print(f"读取到 {total_titles} 个标题（已按当前监控平台过滤）")
														
 
															+            if not quiet:
														
 
															+                print(f"读取到 {total_titles} 个标题（已按当前监控平台过滤）")
														
 
															-            new_titles = self.ctx.detect_new_titles(current_platform_ids)
														
 
															+            new_titles = self.ctx.detect_new_titles(current_platform_ids, quiet=quiet)
														
 
															             word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
														
 
															             return (
														
@@ -293,6 +302,7 @@ class NewsAnalyzer:
 
															         failed_ids: Optional[List] = None,
														
 
															         is_daily_summary: bool = False,
														
 
															         global_filters: Optional[List[str]] = None,
														
 
															+        quiet: bool = False,
														
 
															     ) -> Tuple[List[Dict], Optional[str]]:
														
 
															         """统一的分析流水线：数据处理 → 统计计算 → HTML生成"""
														
@@ -306,6 +316,7 @@ class NewsAnalyzer:
 
															             new_titles,
														
 
															             mode=mode,
														
 
															             global_filters=global_filters,
														
 
															+            quiet=quiet,
														
 
															         )
														
 
															         # HTML生成（如果启用）
														
@@ -406,9 +417,12 @@ class NewsAnalyzer:
 
															         ):
														
 
															             mode_strategy = self._get_mode_strategy()
														
 
															             if "实时" in report_type:
														
 
															-                print(
														
 
															-                    f"跳过实时推送通知：{mode_strategy['mode_name']}下未检测到匹配的新闻"
														
 
															-                )
														
 
															+                if self.report_mode == "incremental":
														
 
															+                    print("跳过实时推送通知：增量模式下未检测到新增的新闻")
														
 
															+                else:
														
 
															+                    print(
														
 
															+                        f"跳过实时推送通知：{mode_strategy['mode_name']}下未检测到匹配的新闻"
														
 
															+                    )
														
 
															             else:
														
 
															                 print(
														
 
															                     f"跳过{mode_strategy['summary_report_type']}通知：未匹配到有效的新闻内容"
														
@@ -466,8 +480,8 @@ class NewsAnalyzer:
 
															         summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
														
 
															         print(f"生成{summary_type}HTML...")
														
 
															-        # 加载分析数据
														
 
															-        analysis_data = self._load_analysis_data()
														
 
															+        # 加载分析数据（静默模式，避免重复输出日志）
														
 
															+        analysis_data = self._load_analysis_data(quiet=True)
														
 
															         if not analysis_data:
														
 
															             return None
														
@@ -475,7 +489,7 @@ class NewsAnalyzer:
 
															             analysis_data
														
 
															         )
														
 
															-        # 运行分析流水线
														
 
															+        # 运行分析流水线（静默模式，避免重复输出日志）
														
 
															         _, html_file = self._run_analysis_pipeline(
														
 
															             all_results,
														
 
															             mode,
														
@@ -486,6 +500,7 @@ class NewsAnalyzer:
 
															             id_to_name,
														
 
															             is_daily_summary=True,
														
 
															             global_filters=global_filters,
														
 
															+            quiet=True,
														
 
															         )
														
 
															         if html_file:
														
--- a/trendradar/context.py
+++ b/trendradar/context.py
@@ -167,20 +167,20 @@ class AppContext:
 
															         return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title)
														
 
															     def read_today_titles(
														
 
															-        self, platform_ids: Optional[List[str]] = None
														
 
															+        self, platform_ids: Optional[List[str]] = None, quiet: bool = False
														
 
															     ) -> Tuple[Dict, Dict, Dict]:
														
 
															         """读取当天所有标题"""
														
 
															-        return read_all_today_titles(self.get_storage_manager(), platform_ids)
														
 
															+        return read_all_today_titles(self.get_storage_manager(), platform_ids, quiet=quiet)
														
 
															     def detect_new_titles(
														
 
															-        self, platform_ids: Optional[List[str]] = None
														
 
															+        self, platform_ids: Optional[List[str]] = None, quiet: bool = False
														
 
															     ) -> Dict:
														
 
															         """检测最新批次的新增标题"""
														
 
															-        return detect_latest_new_titles(self.get_storage_manager(), platform_ids)
														
 
															+        return detect_latest_new_titles(self.get_storage_manager(), platform_ids, quiet=quiet)
														
 
															     def is_first_crawl(self) -> bool:
														
 
															         """检测是否是当天第一次爬取"""
														
 
															-        return is_first_crawl_today("output", self.format_date())
														
 
															+        return self.get_storage_manager().is_first_crawl_today()
														
 
															     # === 频率词处理 ===
														
@@ -212,6 +212,7 @@ class AppContext:
 
															         new_titles: Optional[Dict] = None,
														
 
															         mode: str = "daily",
														
 
															         global_filters: Optional[List[str]] = None,
														
 
															+        quiet: bool = False,
														
 
															     ) -> Tuple[List[Dict], int]:
														
 
															         """统计词频"""
														
 
															         return count_word_frequency(
														
@@ -229,6 +230,7 @@ class AppContext:
 
															             sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False),
														
 
															             is_first_crawl_func=self.is_first_crawl,
														
 
															             convert_time_func=self.convert_time_display,
														
 
															+            quiet=quiet,
														
 
															         )
														
 
															     # === 报告生成 ===
														
--- a/trendradar/core/analyzer.py
+++ b/trendradar/core/analyzer.py
@@ -102,6 +102,7 @@ def count_word_frequency(
 
															     sort_by_position_first: bool = False,
														
 
															     is_first_crawl_func: Optional[Callable[[], bool]] = None,
														
 
															     convert_time_func: Optional[Callable[[str], str]] = None,
														
 
															+    quiet: bool = False,
														
 
															 ) -> Tuple[List[Dict], int]:
														
 
															     """
														
 
															     统计词频，支持必须词、频率词、过滤词、全局过滤词，并标记新增标题
														
@@ -121,6 +122,7 @@ def count_word_frequency(
 
															         sort_by_position_first: 是否优先按配置位置排序
														
 
															         is_first_crawl_func: 检测是否是当天第一次爬取的函数
														
 
															         convert_time_func: 时间格式转换函数
														
 
															+        quiet: 是否静默模式（不打印日志）
														
 
															     Returns:
														
 
															         Tuple[List[Dict], int]: (统计结果列表, 总标题数)
														
@@ -461,9 +463,10 @@ def count_word_frequency(
 
															         # 先按热点条数，再按配置位置（原逻辑）
														
 
															         stats.sort(key=lambda x: (-x["count"], x["position"]))
														
 
															-    # 打印过滤后的匹配新闻数（与推送显示一致）
														
 
															+    # 打印过滤后的匹配新闻数
														
 
															     matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
														
 
															-    if mode == "daily":
														
 
															-        print(f"频率词过滤后：{matched_news_count} 条新闻匹配（将显示在推送中）")
														
 
															+    if not quiet and mode == "daily":
														
 
															+        print(f"当日汇总模式：处理 {total_titles} 条新闻，模式：频率词过滤")
														
 
															+        print(f"频率词过滤后：{matched_news_count} 条新闻匹配")
														
 
															     return stats, total_titles
														
--- a/trendradar/core/data.py
+++ b/trendradar/core/data.py
@@ -152,6 +152,7 @@ def read_all_today_titles_from_storage(
 
															 def read_all_today_titles(
														
 
															     storage_manager,
														
 
															     current_platform_ids: Optional[List[str]] = None,
														
 
															+    quiet: bool = False,
														
 
															 ) -> Tuple[Dict, Dict, Dict]:
														
 
															     """
														
 
															     读取当天所有标题（从存储后端）
														
@@ -159,6 +160,7 @@ def read_all_today_titles(
 
															     Args:
														
 
															         storage_manager: 存储管理器实例
														
 
															         current_platform_ids: 当前监控的平台 ID 列表（用于过滤）
														
 
															+        quiet: 是否静默模式（不打印日志）
														
 
															     Returns:
														
 
															         Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
														
@@ -167,11 +169,12 @@ def read_all_today_titles(
 
															         storage_manager, current_platform_ids
														
 
															     )
														
 
															-    if all_results:
														
 
															-        total_count = sum(len(titles) for titles in all_results.values())
														
 
															-        print(f"[存储] 已从存储后端读取 {total_count} 条标题")
														
 
															-    else:
														
 
															-        print("[存储] 当天暂无数据")
														
 
															+    if not quiet:
														
 
															+        if all_results:
														
 
															+            total_count = sum(len(titles) for titles in all_results.values())
														
 
															+            print(f"[存储] 已从存储后端读取 {total_count} 条标题")
														
 
															+        else:
														
 
															+            print("[存储] 当天暂无数据")
														
 
															     return all_results, final_id_to_name, title_info
														
@@ -202,19 +205,35 @@ def detect_latest_new_titles_from_storage(
 
															             # 没有历史数据（第一次抓取），不应该有"新增"标题
														
 
															             return {}
														
 
															-        # 收集历史标题（不包括最新批次的时间）
														
 
															+        # 获取最新批次时间
														
 
															         latest_time = latest_data.crawl_time
														
 
															-        historical_titles = {}
														
 
															+        # 步骤1：收集最新批次的标题（last_crawl_time = latest_time 的标题）
														
 
															+        latest_titles = {}
														
 
															+        for source_id, news_list in latest_data.items.items():
														
 
															+            if current_platform_ids is not None and source_id not in current_platform_ids:
														
 
															+                continue
														
 
															+            latest_titles[source_id] = {}
														
 
															+            for item in news_list:
														
 
															+                latest_titles[source_id][item.title] = {
														
 
															+                    "ranks": [item.rank],
														
 
															+                    "url": item.url or "",
														
 
															+                    "mobileUrl": item.mobile_url or "",
														
 
															+                }
														
 
															+
														
 
															+        # 步骤2：收集历史标题
														
 
															+        # 关键逻辑：一个标题只要其 first_crawl_time < latest_time，就是历史标题
														
 
															+        # 这样即使同一标题有多条记录（URL 不同），只要任何一条是历史的，该标题就算历史
														
 
															+        historical_titles = {}
														
 
															         for source_id, news_list in all_data.items.items():
														
 
															             if current_platform_ids is not None and source_id not in current_platform_ids:
														
 
															                 continue
														
 
															             historical_titles[source_id] = set()
														
 
															             for item in news_list:
														
 
															-                # 只统计非最新批次的标题
														
 
															                 first_time = getattr(item, 'first_time', item.crawl_time)
														
 
															-                if first_time != latest_time:
														
 
															+                # 如果该记录的首次出现时间早于最新批次，则该标题是历史标题
														
 
															+                if first_time < latest_time:
														
 
															                     historical_titles[source_id].add(item.title)
														
 
															         # 检查是否是当天第一次抓取（没有任何历史标题）
														
@@ -223,22 +242,15 @@ def detect_latest_new_titles_from_storage(
 
															         if not has_historical_data:
														
 
															             return {}
														
 
															-        # 找出新增标题
														
 
															+        # 步骤3：找出新增标题 = 最新批次标题 - 历史标题
														
 
															         new_titles = {}
														
 
															-        for source_id, news_list in latest_data.items.items():
														
 
															-            if current_platform_ids is not None and source_id not in current_platform_ids:
														
 
															-                continue
														
 
															-
														
 
															+        for source_id, source_latest_titles in latest_titles.items():
														
 
															             historical_set = historical_titles.get(source_id, set())
														
 
															             source_new_titles = {}
														
 
															-            for item in news_list:
														
 
															-                if item.title not in historical_set:
														
 
															-                    source_new_titles[item.title] = {
														
 
															-                        "ranks": [item.rank],
														
 
															-                        "url": item.url or "",
														
 
															-                        "mobileUrl": item.mobile_url or "",
														
 
															-                    }
														
 
															+            for title, title_data in source_latest_titles.items():
														
 
															+                if title not in historical_set:
														
 
															+                    source_new_titles[title] = title_data
														
 
															             if source_new_titles:
														
 
															                 new_titles[source_id] = source_new_titles
														
@@ -253,6 +265,7 @@ def detect_latest_new_titles_from_storage(
 
															 def detect_latest_new_titles(
														
 
															     storage_manager,
														
 
															     current_platform_ids: Optional[List[str]] = None,
														
 
															+    quiet: bool = False,
														
 
															 ) -> Dict:
														
 
															     """
														
 
															     检测当日最新批次的新增标题（从存储后端）
														
@@ -260,12 +273,13 @@ def detect_latest_new_titles(
 
															     Args:
														
 
															         storage_manager: 存储管理器实例
														
 
															         current_platform_ids: 当前监控的平台 ID 列表（用于过滤）
														
 
															+        quiet: 是否静默模式（不打印日志）
														
 
															     Returns:
														
 
															         Dict: 新增标题 {source_id: {title: title_data}}
														
 
															     """
														
 
															     new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
														
 
															-    if new_titles:
														
 
															+    if new_titles and not quiet:
														
 
															         total_new = sum(len(titles) for titles in new_titles.values())
														
 
															         print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
														
 
															     return new_titles
														
--- a/trendradar/notification/renderer.py
+++ b/trendradar/notification/renderer.py
@@ -6,7 +6,7 @@
 
															 """
														
 
															 from datetime import datetime
														
 
															-from typing import Dict, List, Optional, Callable
														
 
															+from typing import Dict, Optional, Callable
														
 
															 from trendradar.report.formatter import format_title_for_platform
														
--- a/trendradar/storage/base.py
+++ b/trendradar/storage/base.py
@@ -7,9 +7,7 @@
 
															 from abc import ABC, abstractmethod
														
 
															 from dataclasses import dataclass, field
														
 
															-from datetime import datetime
														
 
															 from typing import Dict, List, Optional, Any
														
 
															-import json
														
 
															 @dataclass
														
--- a/trendradar/storage/local.py
+++ b/trendradar/storage/local.py
@@ -6,13 +6,12 @@
 
															 """
														
 
															 import sqlite3
														
 
															-import os
														
 
															 import shutil
														
 
															 import pytz
														
 
															 import re
														
 
															 from datetime import datetime, timedelta
														
 
															 from pathlib import Path
														
 
															-from typing import Dict, List, Optional, Any
														
 
															+from typing import Dict, List, Optional
														
 
															 from trendradar.storage.base import StorageBackend, NewsItem, NewsData
														
 
															 from trendradar.utils.time import (
														
@@ -20,6 +19,7 @@ from trendradar.utils.time import (
 
															     format_date_folder,
														
 
															     format_time_filename,
														
 
															 )
														
 
															+from trendradar.utils.url import normalize_url
														
 
															 class LocalStorageBackend(StorageBackend):
														
@@ -148,12 +148,15 @@ class LocalStorageBackend(StorageBackend):
 
															                 for item in news_list:
														
 
															                     try:
														
 
															-                        # 检查是否已存在（通过 URL + platform_id）
														
 
															-                        if item.url:
														
 
															+                        # 标准化 URL（去除动态参数，如微博的 band_rank）
														
 
															+                        normalized_url = normalize_url(item.url, source_id) if item.url else ""
														
 
															+
														
 
															+                        # 检查是否已存在（通过标准化 URL + platform_id）
														
 
															+                        if normalized_url:
														
 
															                             cursor.execute("""
														
 
															                                 SELECT id, title FROM news_items
														
 
															                                 WHERE url = ? AND platform_id = ?
														
 
															-                            """, (item.url, source_id))
														
 
															+                            """, (normalized_url, source_id))
														
 
															                             existing = cursor.fetchone()
														
 
															                             if existing:
														
@@ -191,14 +194,14 @@ class LocalStorageBackend(StorageBackend):
 
															                                       data.crawl_time, now_str, existing_id))
														
 
															                                 updated_count += 1
														
 
															                             else:
														
 
															-                                # 不存在，插入新记录
														
 
															+                                # 不存在，插入新记录（存储标准化后的 URL）
														
 
															                                 cursor.execute("""
														
 
															                                     INSERT INTO news_items
														
 
															                                     (title, platform_id, rank, url, mobile_url,
														
 
															                                      first_crawl_time, last_crawl_time, crawl_count,
														
 
															                                      created_at, updated_at)
														
 
															                                     VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
														
 
															-                                """, (item.title, source_id, item.rank, item.url,
														
 
															+                                """, (item.title, source_id, item.rank, normalized_url,
														
 
															                                       item.mobile_url, data.crawl_time, data.crawl_time,
														
 
															                                       now_str, now_str))
														
 
															                                 new_id = cursor.lastrowid
														
@@ -217,7 +220,7 @@ class LocalStorageBackend(StorageBackend):
 
															                                  first_crawl_time, last_crawl_time, crawl_count,
														
 
															                                  created_at, updated_at)
														
 
															                                 VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
														
 
															-                            """, (item.title, source_id, item.rank, item.url,
														
 
															+                            """, (item.title, source_id, item.rank, "",
														
 
															                                   item.mobile_url, data.crawl_time, data.crawl_time,
														
 
															                                   now_str, now_str))
														
 
															                             new_id = cursor.lastrowid
														
@@ -524,6 +527,9 @@ class LocalStorageBackend(StorageBackend):
 
															         """
														
 
															         检测新增的标题
														
 
															+        该方法比较当前抓取数据与历史数据，找出新增的标题。
														
 
															+        关键逻辑：只有在历史批次中从未出现过的标题才算新增。
														
 
															+
														
 
															         Args:
														
 
															             current_data: 当前抓取的数据
														
@@ -541,10 +547,24 @@ class LocalStorageBackend(StorageBackend):
 
															                     new_titles[source_id] = {item.title: item for item in news_list}
														
 
															                 return new_titles
														
 
															-            # 收集历史标题
														
 
															+            # 获取当前批次时间
														
 
															+            current_time = current_data.crawl_time
														
 
															+
														
 
															+            # 收集历史标题（first_time < current_time 的标题）
														
 
															+            # 这样可以正确处理同一标题因 URL 变化而产生多条记录的情况
														
 
															             historical_titles: Dict[str, set] = {}
														
 
															             for source_id, news_list in historical_data.items.items():
														
 
															-                historical_titles[source_id] = {item.title for item in news_list}
														
 
															+                historical_titles[source_id] = set()
														
 
															+                for item in news_list:
														
 
															+                    first_time = getattr(item, 'first_time', item.crawl_time)
														
 
															+                    if first_time < current_time:
														
 
															+                        historical_titles[source_id].add(item.title)
														
 
															+
														
 
															+            # 检查是否有历史数据
														
 
															+            has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
														
 
															+            if not has_historical_data:
														
 
															+                # 第一次抓取，没有"新增"概念
														
 
															+                return {}
														
 
															             # 检测新增
														
 
															             new_titles = {}
														
--- a/trendradar/storage/remote.py
+++ b/trendradar/storage/remote.py
@@ -34,6 +34,7 @@ from trendradar.utils.time import (
 
															     format_date_folder,
														
 
															     format_time_filename,
														
 
															 )
														
 
															+from trendradar.utils.url import normalize_url
														
 
															 class RemoteStorageBackend(StorageBackend):
														
@@ -355,12 +356,15 @@ class RemoteStorageBackend(StorageBackend):
 
															                 for item in news_list:
														
 
															                     try:
														
 
															-                        # 检查是否已存在（通过 URL + platform_id）
														
 
															-                        if item.url:
														
 
															+                        # 标准化 URL（去除动态参数，如微博的 band_rank）
														
 
															+                        normalized_url = normalize_url(item.url, source_id) if item.url else ""
														
 
															+
														
 
															+                        # 检查是否已存在（通过标准化 URL + platform_id）
														
 
															+                        if normalized_url:
														
 
															                             cursor.execute("""
														
 
															                                 SELECT id, title FROM news_items
														
 
															                                 WHERE url = ? AND platform_id = ?
														
 
															-                            """, (item.url, source_id))
														
 
															+                            """, (normalized_url, source_id))
														
 
															                             existing = cursor.fetchone()
														
 
															                             if existing:
														
@@ -398,14 +402,14 @@ class RemoteStorageBackend(StorageBackend):
 
															                                       data.crawl_time, now_str, existing_id))
														
 
															                                 updated_count += 1
														
 
															                             else:
														
 
															-                                # 不存在，插入新记录
														
 
															+                                # 不存在，插入新记录（存储标准化后的 URL）
														
 
															                                 cursor.execute("""
														
 
															                                     INSERT INTO news_items
														
 
															                                     (title, platform_id, rank, url, mobile_url,
														
 
															                                      first_crawl_time, last_crawl_time, crawl_count,
														
 
															                                      created_at, updated_at)
														
 
															                                     VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
														
 
															-                                """, (item.title, source_id, item.rank, item.url,
														
 
															+                                """, (item.title, source_id, item.rank, normalized_url,
														
 
															                                       item.mobile_url, data.crawl_time, data.crawl_time,
														
 
															                                       now_str, now_str))
														
 
															                                 new_id = cursor.lastrowid
														
@@ -424,7 +428,7 @@ class RemoteStorageBackend(StorageBackend):
 
															                                  first_crawl_time, last_crawl_time, crawl_count,
														
 
															                                  created_at, updated_at)
														
 
															                                 VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
														
 
															-                            """, (item.title, source_id, item.rank, item.url,
														
 
															+                            """, (item.title, source_id, item.rank, "",
														
 
															                                   item.mobile_url, data.crawl_time, data.crawl_time,
														
 
															                                   now_str, now_str))
														
 
															                             new_id = cursor.lastrowid
														
@@ -693,7 +697,12 @@ class RemoteStorageBackend(StorageBackend):
 
															             return None
														
 
															     def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
														
 
															-        """检测新增的标题"""
														
 
															+        """
														
 
															+        检测新增的标题
														
 
															+
														
 
															+        该方法比较当前抓取数据与历史数据，找出新增的标题。
														
 
															+        关键逻辑：只有在历史批次中从未出现过的标题才算新增。
														
 
															+        """
														
 
															         try:
														
 
															             historical_data = self.get_today_all_data(current_data.date)
														
@@ -703,9 +712,24 @@ class RemoteStorageBackend(StorageBackend):
 
															                     new_titles[source_id] = {item.title: item for item in news_list}
														
 
															                 return new_titles
														
 
															+            # 获取当前批次时间
														
 
															+            current_time = current_data.crawl_time
														
 
															+
														
 
															+            # 收集历史标题（first_time < current_time 的标题）
														
 
															+            # 这样可以正确处理同一标题因 URL 变化而产生多条记录的情况
														
 
															             historical_titles: Dict[str, set] = {}
														
 
															             for source_id, news_list in historical_data.items.items():
														
 
															-                historical_titles[source_id] = {item.title for item in news_list}
														
 
															+                historical_titles[source_id] = set()
														
 
															+                for item in news_list:
														
 
															+                    first_time = getattr(item, 'first_time', item.crawl_time)
														
 
															+                    if first_time < current_time:
														
 
															+                        historical_titles[source_id].add(item.title)
														
 
															+
														
 
															+            # 检查是否有历史数据
														
 
															+            has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
														
 
															+            if not has_historical_data:
														
 
															+                # 第一次抓取，没有"新增"概念
														
 
															+                return {}
														
 
															             new_titles = {}
														
 
															             for source_id, news_list in current_data.items.items():
														
--- a/trendradar/utils/__init__.py
+++ b/trendradar/utils/__init__.py
@@ -10,6 +10,7 @@ from trendradar.utils.time import (
 
															     get_current_time_display,
														
 
															     convert_time_for_display,
														
 
															 )
														
 
															+from trendradar.utils.url import normalize_url, get_url_signature
														
 
															 __all__ = [
														
 
															     "get_configured_time",
														
@@ -17,4 +18,6 @@ __all__ = [
 
															     "format_time_filename",
														
 
															     "get_current_time_display",
														
 
															     "convert_time_for_display",
														
 
															+    "normalize_url",
														
 
															+    "get_url_signature",
														
 
															 ]
														
--- a/trendradar/utils/url.py
+++ b/trendradar/utils/url.py
@@ -0,0 +1,146 @@
 
															+# coding=utf-8
														
 
															+"""
														
 
															+URL 处理工具模块
														
 
															+
														
 
															+提供 URL 标准化功能，用于去重时消除动态参数的影响：
														
 
															+- normalize_url: 标准化 URL，去除动态参数；get_url_signature: 基于标准化 URL 生成签名
														
 
															+"""
														
 
															+
														
 
															+from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
														
 
															+from typing import Dict, Set
														
 
															+
														
 
															+
														
 
															+# 各平台需要移除的特定参数
														
 
															+#   - weibo: 有 band_rank（排名）、Refer（来源）和 t（时间范围）动态参数
														
 
															+#   - 其他平台: URL 为路径格式或简单关键词查询，无需处理
														
 
															+PLATFORM_PARAMS_TO_REMOVE: Dict[str, Set[str]] = {
														
 
															+    # 微博：band_rank 是动态排名参数，Refer 是来源参数，t 是时间范围参数
														
 
															+    # 示例：https://s.weibo.com/weibo?q=xxx&t=31&band_rank=1&Refer=top
														
 
															+    # 保留：q（关键词）
														
 
															+    # 移除：band_rank, Refer, t
														
 
															+    "weibo": {"band_rank", "Refer", "t"},
														
 
															+}
														
 
															+
														
 
															+# 通用追踪参数（适用于所有平台）
														
 
															+# 这些参数通常由分享链接或广告追踪添加，不影响内容识别
														
 
															+COMMON_TRACKING_PARAMS: Set[str] = {
														
 
															+    # UTM 追踪参数
														
 
															+    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
														
 
															+    # 常见追踪参数
														
 
															+    "ref", "referrer", "source", "channel",
														
 
															+    # 时间戳和随机参数
														
 
															+    "_t", "timestamp", "_", "random",
														
 
															+    # 分享相关
														
 
															+    "share_token", "share_id", "share_from",
														
 
															+}
														
 
															+
														
 
															+
														
 
															+def normalize_url(url: str, platform_id: str = "") -> str:
														
 
															+    """
														
 
															+    标准化 URL，去除动态参数
														
 
															+
														
 
															+    用于数据库去重，确保同一条新闻的不同 URL 变体能被正确识别为同一条。
														
 
															+
														
 
															+    处理规则：
														
 
															+    1. 去除平台特定的动态参数（如微博的 band_rank）
														
 
															+    2. 去除通用追踪参数（如 utm_*）
														
 
															+    3. 保留核心查询参数（如搜索关键词 q=, wd=, keyword=）
														
 
															+    4. 对查询参数按字母序排序（确保一致性）
														
 
															+
														
 
															+    Args:
														
 
															+        url: 原始 URL
														
 
															+        platform_id: 平台 ID，用于应用平台特定规则
														
 
															+
														
 
															+    Returns:
														
 
															+        标准化后的 URL
														
 
															+
														
 
															+    Examples:
														
 
															+        >>> normalize_url("https://s.weibo.com/weibo?q=test&band_rank=6&Refer=top", "weibo")
														
 
															+        'https://s.weibo.com/weibo?q=test'
														
 
															+
														
 
															+        >>> normalize_url("https://example.com/page?id=1&utm_source=twitter", "")
														
 
															+        'https://example.com/page?id=1'
														
 
															+    """
														
 
															+    if not url:
														
 
															+        return url
														
 
															+
														
 
															+    try:
														
 
															+        # 解析 URL
														
 
															+        parsed = urlparse(url)
														
 
															+
														
 
															+        # 如果没有查询参数，直接返回
														
 
															+        if not parsed.query:
														
 
															+            return url
														
 
															+
														
 
															+        # 解析查询参数
														
 
															+        params = parse_qs(parsed.query, keep_blank_values=True)
														
 
															+
														
 
															+        # 收集需要移除的参数（使用小写进行比较）
														
 
															+        params_to_remove: Set[str] = set()
														
 
															+
														
 
															+        # 添加通用追踪参数
														
 
															+        params_to_remove.update(COMMON_TRACKING_PARAMS)
														
 
															+
														
 
															+        # 添加平台特定参数
														
 
															+        if platform_id and platform_id in PLATFORM_PARAMS_TO_REMOVE:
														
 
															+            params_to_remove.update(PLATFORM_PARAMS_TO_REMOVE[platform_id])
														
 
															+
														
 
															+        # 过滤参数（参数名转小写进行比较）
														
 
															+        filtered_params = {
														
 
															+            key: values
														
 
															+            for key, values in params.items()
														
 
															+            if key.lower() not in {p.lower() for p in params_to_remove}
														
 
															+        }
														
 
															+
														
 
															+        # 如果过滤后没有参数了，返回不带查询字符串的 URL
														
 
															+        if not filtered_params:
														
 
															+            return urlunparse((
														
 
															+                parsed.scheme,
														
 
															+                parsed.netloc,
														
 
															+                parsed.path,
														
 
															+                parsed.params,
														
 
															+                "",  # 空查询字符串
														
 
															+                ""   # 移除 fragment
														
 
															+            ))
														
 
															+
														
 
															+        # 重建查询字符串（按字母序排序以确保一致性）
														
 
															+        sorted_params = []
														
 
															+        for key in sorted(filtered_params.keys()):
														
 
															+            for value in filtered_params[key]:
														
 
															+                sorted_params.append((key, value))
														
 
															+
														
 
															+        new_query = urlencode(sorted_params)
														
 
															+
														
 
															+        # 重建 URL（移除 fragment）
														
 
															+        normalized = urlunparse((
														
 
															+            parsed.scheme,
														
 
															+            parsed.netloc,
														
 
															+            parsed.path,
														
 
															+            parsed.params,
														
 
															+            new_query,
														
 
															+            ""  # 移除 fragment
														
 
															+        ))
														
 
															+
														
 
															+        return normalized
														
 
															+
														
 
															+    except Exception:
														
 
															+        # 解析失败时返回原始 URL
														
 
															+        return url
														
 
															+
														
 
															+
														
 
															+def get_url_signature(url: str, platform_id: str = "") -> str:
														
 
															+    """
														
 
															+    获取 URL 的签名（用于快速比较）
														
 
															+
														
 
															+    基于标准化 URL 生成签名，可用于：
														
 
															+    - 快速判断两个 URL 是否指向同一内容
														
 
															+    - 作为缓存键
														
 
															+
														
 
															+    Args:
														
 
															+        url: 原始 URL
														
 
															+        platform_id: 平台 ID
														
 
															+
														
 
															+    Returns:
														
 
															+        URL 签名字符串
														
 
															+    """
														
 
															+    return normalize_url(url, platform_id)
														
--- a/version
+++ b/version
@@ -1 +1 @@
 
															-4.0.2
														
 
															+4.0.3