# kekezack / TrendRadar
"""
Intelligent news search tools.

Provides advanced search features such as fuzzy search, link lookup, and
retrieval of related historical news.
"""

import re
from collections import Counter
from datetime import datetime, timedelta
from difflib import SequenceMatcher
from typing import Dict, List, Optional, Tuple, Union

from ..services.data_service import DataService
from ..utils.validators import validate_keyword, validate_limit, validate_threshold
from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError


class SearchTools:
    """智能新闻检索工具类"""

    def __init__(self, project_root: str = None):
        """
        Create the search tool set.

        Args:
            project_root: Project root directory; passed straight through to
                DataService.
        """
        self.data_service = DataService(project_root)

        # Chinese stopwords, grouped by length purely for readability.
        single_char_stopwords = (
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '上', '也', '很', '到', '说', '要', '去', '你', '会', '着',
            '看', '好', '这', '那', '来', '被', '与', '为', '对', '将', '从',
            '以', '及', '等', '但', '或', '而', '于', '中', '由', '可', '已',
            '还', '更', '最', '再',
        )
        multi_char_stopwords = (
            '一个', '没有', '自己', '可以', '已经',
            '因为', '所以', '如果', '虽然', '然而',
        )
        # Stored as a set so membership tests in _extract_keywords are O(1).
        self.stopwords = {*single_char_stopwords, *multi_char_stopwords}

    def search_news_unified(
        self,
        query: str,
        search_mode: str = "keyword",
        date_range: Optional[Union[Dict[str, str], str]] = None,
        platforms: Optional[List[str]] = None,
        limit: int = 50,
        sort_by: str = "relevance",
        threshold: float = 0.6,
        include_url: bool = False
    ) -> Dict:
        """
        Unified news search entry point combining several search modes.

        Args:
            query: Search text (required) - a keyword, a content fragment or
                an entity name.
            search_mode: One of:
                - "keyword": case-insensitive substring match (default)
                - "fuzzy": similarity-based match
                - "entity": case-sensitive substring match on an entity name
            date_range: Optional date range, handed to validate_date_range.
                - Format: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}
                - Example: {"start": "2025-01-01", "end": "2025-01-07"}
                - Default: when omitted, the latest date for which data is
                  available is searched (not necessarily today).
                - start and end may be equal (single-day query).
            platforms: Platform filter list, e.g. ['zhihu', 'weibo'].
            limit: Maximum number of returned items, default 50.
            sort_by: One of:
                - "relevance": by similarity_score, descending (default)
                - "weight": by calculate_news_weight, descending
                - "date": by date string, descending
            threshold: Similarity threshold in [0, 1]; only used in fuzzy
                mode. Default 0.6.
            include_url: Whether to include URL fields in each item. Default
                False (saves tokens).

        Returns:
            A result dict. On success "success" is True and "results" holds
            the matched items; when nothing matched, "results" is empty and a
            "message" explains the searched and available ranges. On failure
            "success" is False and "error" describes the problem. Exceptions
            are never propagated to the caller.

        Examples:
            - search_news_unified(query="人工智能", search_mode="keyword")
            - search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
            - search_news_unified(query="马斯克", search_mode="entity", limit=20)
            - search_news_unified(query="iPhone 16", date_range={"start": "2025-01-01", "end": "2025-01-07"})
        """
        try:
            # Parameter validation (the order decides which error is reported first).
            query = validate_keyword(query)

            if search_mode not in ("keyword", "fuzzy", "entity"):
                raise InvalidParameterError(
                    f"无效的搜索模式: {search_mode}",
                    suggestion="支持的模式: keyword, fuzzy, entity"
                )

            if sort_by not in ("relevance", "weight", "date"):
                raise InvalidParameterError(
                    f"无效的排序方式: {sort_by}",
                    suggestion="支持的排序: relevance, weight, date"
                )

            limit = validate_limit(limit, default=50)
            threshold = validate_threshold(threshold, default=0.6, min_value=0.0, max_value=1.0)

            # Resolve the date range to search.
            if date_range:
                from ..utils.validators import validate_date_range
                start_date, end_date = validate_date_range(date_range)
            else:
                # No range given: use the latest date that actually has data
                # instead of datetime.now(), which may have no data yet.
                earliest, latest = self.data_service.get_available_date_range()

                if latest is None:
                    # No data available at all.
                    return {
                        "success": False,
                        "error": {
                            "code": "NO_DATA_AVAILABLE",
                            "message": "output 目录下没有可用的新闻数据",
                            "suggestion": "请先运行爬虫生成数据，或检查 output 目录"
                        }
                    }

                start_date = end_date = latest

            # Collect matches day by day.
            all_matches = []
            current_date = start_date

            while current_date <= end_date:
                try:
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(
                        date=current_date,
                        platform_ids=platforms
                    )

                    # Dispatch on the search mode.
                    if search_mode == "keyword":
                        matches = self._search_by_keyword_mode(
                            query, all_titles, id_to_name, current_date, include_url
                        )
                    elif search_mode == "fuzzy":
                        matches = self._search_by_fuzzy_mode(
                            query, all_titles, id_to_name, current_date, threshold, include_url
                        )
                    else:  # entity
                        matches = self._search_by_entity_mode(
                            query, all_titles, id_to_name, current_date, include_url
                        )

                    all_matches.extend(matches)

                except DataNotFoundError:
                    # No data for this date; move on to the next day.
                    pass

                current_date += timedelta(days=1)

            if not all_matches:
                # Fetch the available range so the message can tell the caller
                # which dates could be searched instead.
                earliest, latest = self.data_service.get_available_date_range()
                time_desc = self._describe_time_range(start_date, end_date)

                if earliest and latest:
                    available_desc = f"{earliest.strftime('%Y-%m-%d')} 至 {latest.strftime('%Y-%m-%d')}"
                    message = f"未找到匹配的新闻（查询范围: {time_desc}，可用数据: {available_desc}）"
                else:
                    message = f"未找到匹配的新闻（{time_desc}）"

                return {
                    "success": True,
                    "results": [],
                    "total": 0,
                    "query": query,
                    "search_mode": search_mode,
                    "time_range": time_desc,
                    "message": message
                }

            # Sorting (list.sort is stable, so ties keep collection order).
            if sort_by == "relevance":
                all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True)
            elif sort_by == "weight":
                from .analytics import calculate_news_weight
                all_matches.sort(key=calculate_news_weight, reverse=True)
            elif sort_by == "date":
                all_matches.sort(key=lambda x: x.get("date", ""), reverse=True)

            # Apply the limit after sorting.
            results = all_matches[:limit]

            result = {
                "success": True,
                "summary": {
                    "total_found": len(all_matches),
                    "returned_count": len(results),
                    "requested_limit": limit,
                    "search_mode": search_mode,
                    "query": query,
                    "platforms": platforms or "所有平台",
                    "time_range": self._describe_time_range(start_date, end_date),
                    "sort_by": sort_by
                },
                "results": results
            }

            if search_mode == "fuzzy":
                result["summary"]["threshold"] = threshold
                if len(all_matches) < limit:
                    result["note"] = f"模糊搜索模式下，相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    @staticmethod
    def _describe_time_range(start_date: datetime, end_date: datetime) -> str:
        """
        Build the human-readable description of a searched date range.

        Args:
            start_date: First day of the range.
            end_date: Last day of the range.

        Returns:
            "今天" for a single-day range falling on the current local date,
            "YYYY-MM-DD" for any other single-day range, otherwise
            "YYYY-MM-DD 至 YYYY-MM-DD".
        """
        if start_date.date() == datetime.now().date() and start_date == end_date:
            return "今天"
        if start_date == end_date:
            return start_date.strftime("%Y-%m-%d")
        return f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}"

    def _search_by_keyword_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool
    ) -> List[Dict]:
        """
        关键词搜索模式（精确匹配）

        Args:
            query: 搜索关键词
            all_titles: 所有标题字典
            id_to_name: 平台ID到名称映射
            current_date: 当前日期

        Returns:
            匹配的新闻列表
        """
        matches = []
        query_lower = query.lower()

        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)

            for title, info in titles.items():
                # 精确包含判断
                if query_lower in title.lower():
                    news_item = {
                        "title": title,
                        "platform": platform_id,
                        "platform_name": platform_name,
                        "date": current_date.strftime("%Y-%m-%d"),
                        "similarity_score": 1.0,  # 精确匹配，相似度为1
                        "ranks": info.get("ranks", []),
                        "count": len(info.get("ranks", [])),
                        "rank": info["ranks"][0] if info["ranks"] else 999
                    }

                    # 条件性添加 URL 字段
                    if include_url:
                        news_item["url"] = info.get("url", "")
                        news_item["mobileUrl"] = info.get("mobileUrl", "")

                    matches.append(news_item)

        return matches

    def _search_by_fuzzy_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        threshold: float,
        include_url: bool
    ) -> List[Dict]:
        """
        模糊搜索模式（使用相似度算法）

        Args:
            query: 搜索内容
            all_titles: 所有标题字典
            id_to_name: 平台ID到名称映射
            current_date: 当前日期
            threshold: 相似度阈值

        Returns:
            匹配的新闻列表
        """
        matches = []

        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)

            for title, info in titles.items():
                # 模糊匹配
                is_match, similarity = self._fuzzy_match(query, title, threshold)

                if is_match:
                    news_item = {
                        "title": title,
                        "platform": platform_id,
                        "platform_name": platform_name,
                        "date": current_date.strftime("%Y-%m-%d"),
                        "similarity_score": round(similarity, 4),
                        "ranks": info.get("ranks", []),
                        "count": len(info.get("ranks", [])),
                        "rank": info["ranks"][0] if info["ranks"] else 999
                    }

                    # 条件性添加 URL 字段
                    if include_url:
                        news_item["url"] = info.get("url", "")
                        news_item["mobileUrl"] = info.get("mobileUrl", "")

                    matches.append(news_item)

        return matches

    def _search_by_entity_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool
    ) -> List[Dict]:
        """
        实体搜索模式（自动按权重排序）

        Args:
            query: 实体名称
            all_titles: 所有标题字典
            id_to_name: 平台ID到名称映射
            current_date: 当前日期

        Returns:
            匹配的新闻列表
        """
        matches = []

        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)

            for title, info in titles.items():
                # 实体搜索：精确包含实体名称
                if query in title:
                    news_item = {
                        "title": title,
                        "platform": platform_id,
                        "platform_name": platform_name,
                        "date": current_date.strftime("%Y-%m-%d"),
                        "similarity_score": 1.0,
                        "ranks": info.get("ranks", []),
                        "count": len(info.get("ranks", [])),
                        "rank": info["ranks"][0] if info["ranks"] else 999
                    }

                    # 条件性添加 URL 字段
                    if include_url:
                        news_item["url"] = info.get("url", "")
                        news_item["mobileUrl"] = info.get("mobileUrl", "")

                    matches.append(news_item)

        return matches

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """
        计算两个文本的相似度

        Args:
            text1: 文本1
            text2: 文本2

        Returns:
            相似度分数 (0-1之间)
        """
        # 使用 difflib.SequenceMatcher 计算序列相似度
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
        """
        模糊匹配函数

        Args:
            query: 查询文本
            text: 待匹配文本
            threshold: 匹配阈值

        Returns:
            (是否匹配, 相似度分数)
        """
        # 直接包含判断
        if query.lower() in text.lower():
            return True, 1.0

        # 计算整体相似度
        similarity = self._calculate_similarity(query, text)
        if similarity >= threshold:
            return True, similarity

        # 分词后的部分匹配
        query_words = set(self._extract_keywords(query))
        text_words = set(self._extract_keywords(text))

        if not query_words or not text_words:
            return False, 0.0

        # 计算关键词重合度
        common_words = query_words & text_words
        keyword_overlap = len(common_words) / len(query_words)

        if keyword_overlap >= 0.5:  # 50%的关键词重合
            return True, keyword_overlap

        return False, similarity

    def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
        """
        从文本中提取关键词

        Args:
            text: 输入文本
            min_length: 最小词长

        Returns:
            关键词列表
        """
        # 移除URL和特殊字符
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'\[.*?\]', '', text)  # 移除方括号内容

        # 使用正则表达式分词（中文和英文）
        words = re.findall(r'[\w]+', text)

        # 过滤停用词和短词
        keywords = [
            word for word in words
            if word and len(word) >= min_length and word not in self.stopwords
        ]

        return keywords

    def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
        """
        计算两个关键词列表的重合度

        Args:
            keywords1: 关键词列表1
            keywords2: 关键词列表2

        Returns:
            重合度分数 (0-1之间)
        """
        if not keywords1 or not keywords2:
            return 0.0

        set1 = set(keywords1)
        set2 = set(keywords2)

        # Jaccard 相似度
        intersection = len(set1 & set2)
        union = len(set1 | set2)

        if union == 0:
            return 0.0

        return intersection / union

    def _jaccard_similarity(self, list1: List[str], list2: List[str]) -> float:
        """
        计算两个列表的 Jaccard 相似度

        Args:
            list1: 列表1
            list2: 列表2

        Returns:
            Jaccard 相似度 (0-1之间)
        """
        if not list1 or not list2:
            return 0.0

        set1 = set(list1)
        set2 = set(list2)

        intersection = len(set1 & set2)
        union = len(set1 | set2)

        if union == 0:
            return 0.0

        return intersection / union

    def search_related_news_history(
        self,
        reference_title: str,
        time_preset: str = "yesterday",
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        threshold: float = 0.4,
        limit: int = 50,
        include_url: bool = False
    ) -> Dict:
        """
        Search historical data for news related to a reference text.

        Each title is scored as 70% keyword overlap (Jaccard over extracted
        keywords) plus 30% sequence similarity to the reference text; titles
        scoring at least `threshold` are returned.

        Args:
            reference_title: Reference news title or content.
            time_preset: Time range preset, one of:
                - "yesterday": yesterday only
                - "last_week": from 7 days ago through yesterday
                - "last_month": from 30 days ago through yesterday
                - "custom": explicit range (requires start_date and end_date)
            start_date: Custom start date (only used when time_preset="custom").
            end_date: Custom end date (only used when time_preset="custom").
            threshold: Minimum combined score, between 0 and 1. Default 0.4.
            limit: Maximum number of returned items. Default 50.
            include_url: Whether to include URL fields. Default False (saves
                tokens).

        Returns:
            A result dict. On success "success" is True and "results" holds
            the related items sorted by similarity_score (descending), along
            with "summary" and "statistics"; when nothing is related,
            "results" is empty and "message" says so. On failure "success" is
            False and "error" describes the problem. Exceptions are never
            propagated to the caller.

        Example:
            >>> tools = SearchTools()
            >>> result = tools.search_related_news_history(
            ...     reference_title="人工智能技术突破",
            ...     time_preset="last_week",
            ...     threshold=0.4,
            ...     limit=50
            ... )
            >>> for news in result['results']:
            ...     print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})")
        """
        try:
            # Parameter validation.
            reference_title = validate_keyword(reference_title)
            threshold = validate_threshold(threshold, default=0.4, min_value=0.0, max_value=1.0)
            limit = validate_limit(limit, default=50)

            # Resolve the date range; presets are relative to the current time.
            today = datetime.now()

            if time_preset == "yesterday":
                search_start = today - timedelta(days=1)
                search_end = today - timedelta(days=1)
            elif time_preset == "last_week":
                search_start = today - timedelta(days=7)
                search_end = today - timedelta(days=1)
            elif time_preset == "last_month":
                search_start = today - timedelta(days=30)
                search_end = today - timedelta(days=1)
            elif time_preset == "custom":
                if not start_date or not end_date:
                    raise InvalidParameterError(
                        "自定义时间范围需要提供 start_date 和 end_date",
                        suggestion="请提供 start_date 和 end_date 参数"
                    )
                search_start = start_date
                search_end = end_date
            else:
                raise InvalidParameterError(
                    f"不支持的时间范围: {time_preset}",
                    suggestion="请使用 'yesterday', 'last_week', 'last_month' 或 'custom'"
                )

            # Keywords of the reference text drive most of the score.
            reference_keywords = self._extract_keywords(reference_title)

            if not reference_keywords:
                raise InvalidParameterError(
                    "无法从参考文本中提取关键词",
                    suggestion="请提供更详细的文本内容"
                )

            # Built once; reused for every title's common-keyword list.
            reference_keyword_set = set(reference_keywords)

            all_related_news = []
            current_date = search_start

            while current_date <= search_end:
                try:
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)
                    date_str = current_date.strftime("%Y-%m-%d")

                    for platform_id, titles in all_titles.items():
                        platform_name = id_to_name.get(platform_id, platform_id)

                        for title, info in titles.items():
                            title_similarity = self._calculate_similarity(reference_title, title)
                            title_keywords = self._extract_keywords(title)
                            keyword_overlap = self._calculate_keyword_overlap(
                                reference_keywords,
                                title_keywords
                            )

                            # Combined score: 70% keyword overlap + 30% text similarity.
                            combined_score = keyword_overlap * 0.7 + title_similarity * 0.3
                            if combined_score < threshold:
                                continue

                            # Default to [] so an entry without "ranks" does not
                            # raise KeyError and abort the rest of this date.
                            ranks = info.get("ranks", [])
                            news_item = {
                                "title": title,
                                "platform": platform_id,
                                "platform_name": platform_name,
                                "date": date_str,
                                "similarity_score": round(combined_score, 4),
                                "keyword_overlap": round(keyword_overlap, 4),
                                "text_similarity": round(title_similarity, 4),
                                "common_keywords": list(reference_keyword_set & set(title_keywords)),
                                "rank": ranks[0] if ranks else 0
                            }

                            # URL fields are optional to keep responses small.
                            if include_url:
                                news_item["url"] = info.get("url", "")
                                news_item["mobileUrl"] = info.get("mobileUrl", "")

                            all_related_news.append(news_item)

                except DataNotFoundError:
                    # No data for this date; move on to the next day.
                    pass
                except Exception as e:
                    # Best effort: report the failure and keep processing other dates.
                    # NOTE(review): this writes to stdout; confirm that is safe
                    # for how this tool is served.
                    print(f"Warning: 处理日期 {current_date.strftime('%Y-%m-%d')} 时出错: {e}")

                current_date += timedelta(days=1)

            date_range_desc = {
                "start": search_start.strftime("%Y-%m-%d"),
                "end": search_end.strftime("%Y-%m-%d")
            }

            if not all_related_news:
                return {
                    "success": True,
                    "results": [],
                    "total": 0,
                    "query": reference_title,
                    "time_preset": time_preset,
                    "date_range": date_range_desc,
                    "message": "未找到相关新闻"
                }

            # Most similar first, then apply the limit.
            all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True)
            results = all_related_news[:limit]

            # Statistics cover every related item, not only the returned ones.
            platform_distribution = Counter(news["platform"] for news in all_related_news)
            date_distribution = Counter(news["date"] for news in all_related_news)
            # all_related_news is non-empty here, so the division is safe.
            avg_similarity = round(
                sum(news["similarity_score"] for news in all_related_news) / len(all_related_news),
                4
            )

            result = {
                "success": True,
                "summary": {
                    "total_found": len(all_related_news),
                    "returned_count": len(results),
                    "requested_limit": limit,
                    "threshold": threshold,
                    "reference_title": reference_title,
                    "reference_keywords": reference_keywords,
                    "time_preset": time_preset,
                    "date_range": date_range_desc
                },
                "results": results,
                "statistics": {
                    "platform_distribution": dict(platform_distribution),
                    "date_distribution": dict(date_distribution),
                    "avg_similarity": avg_similarity
                }
            }

            if len(all_related_news) < limit:
                result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def find_related_news_unified(
        self,
        reference_title: str,
        date_range: Optional[Union[Dict[str, str], str]] = None,
        threshold: float = 0.5,
        limit: int = 50,
        include_url: bool = False
    ) -> Dict:
        """
        Unified related-news lookup over one or more days.

        Each title is scored as 70% sequence similarity plus 30% keyword
        Jaccard similarity to the reference title (sequence similarity alone
        when the reference yields no keywords). Titles identical to the
        reference are skipped.

        Args:
            reference_title: Reference news title.
            date_range: Optional date range:
                - omitted or "today": today only
                - {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}: explicit range
                - "yesterday": yesterday
                - "last_week": the last 7 days, including today
                - "last_month": the last 30 days, including today
                - "YYYY-MM-DD": that single day
                Unparseable strings and dicts missing "start" or "end" fall
                back to today.
            threshold: Minimum similarity, between 0 and 1. Default 0.5.
            limit: Maximum number of returned items. Default 50.
            include_url: Whether to include the "url" field. Default False.

        Returns:
            A result dict with "results" sorted by similarity (descending),
            plus "summary" and "statistics". On failure "success" is False
            and "error" describes the problem. Exceptions are never
            propagated to the caller.
        """
        try:
            # Parameter validation.
            reference_title = validate_keyword(reference_title)
            threshold = validate_threshold(threshold, default=0.5, min_value=0.0, max_value=1.0)
            limit = validate_limit(limit, default=50)

            today = datetime.now()
            search_dates = self._resolve_related_search_dates(date_range, today)

            reference_keywords = self._extract_keywords(reference_title)

            all_related_news = []

            for search_date in search_dates:
                try:
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(search_date)
                    date_str = search_date.strftime("%Y-%m-%d")

                    for platform_id, titles in all_titles.items():
                        platform_name = id_to_name.get(platform_id, platform_id)

                        for title, info in titles.items():
                            if title == reference_title:
                                continue

                            text_similarity = self._calculate_similarity(reference_title, title)

                            if reference_keywords:
                                title_keywords = self._extract_keywords(title)
                                keyword_similarity = self._jaccard_similarity(reference_keywords, title_keywords)
                                # Blend: 70% text similarity + 30% keyword similarity.
                                similarity = 0.7 * text_similarity + 0.3 * keyword_similarity
                            else:
                                similarity = text_similarity

                            if similarity < threshold:
                                continue

                            # Default to [] so an entry without "ranks" does not
                            # raise KeyError, which the handler below would
                            # swallow and thereby drop the rest of this date.
                            ranks = info.get("ranks", [])
                            news_item = {
                                "title": title,
                                "platform": platform_id,
                                "platform_name": platform_name,
                                "date": date_str,
                                "similarity": round(similarity, 3),
                                "rank": ranks[0] if ranks else 0
                            }

                            if include_url:
                                news_item["url"] = info.get("url", "")

                            all_related_news.append(news_item)

                except Exception:
                    # Best effort: a day whose data cannot be read is skipped.
                    continue

            # Most similar first, then apply the limit.
            all_related_news.sort(key=lambda x: x["similarity"], reverse=True)
            results = all_related_news[:limit]

            # Statistics cover every related item, not only the returned ones.
            platform_dist = Counter(n["platform_name"] for n in all_related_news)
            date_dist = Counter(n["date"] for n in all_related_news)

            return {
                "success": True,
                "summary": {
                    "total_found": len(all_related_news),
                    "returned_count": len(results),
                    "reference_title": reference_title,
                    "threshold": threshold,
                    # search_dates is empty when an explicit range has start > end.
                    "date_range": {
                        "start": min(search_dates).strftime("%Y-%m-%d"),
                        "end": max(search_dates).strftime("%Y-%m-%d")
                    } if search_dates else None
                },
                "results": results,
                "statistics": {
                    "platform_distribution": dict(platform_dist),
                    "date_distribution": dict(date_dist)
                }
            }

        except MCPError as e:
            return {"success": False, "error": e.to_dict()}
        except Exception as e:
            return {"success": False, "error": {"code": "INTERNAL_ERROR", "message": str(e)}}

    @staticmethod
    def _resolve_related_search_dates(
        date_range: Optional[Union[Dict[str, str], str]],
        today: datetime
    ) -> List[datetime]:
        """
        Expand a date_range argument into the list of days to search.

        Args:
            date_range: None, a preset name ("today", "yesterday",
                "last_week", "last_month"), a "YYYY-MM-DD" string, or a dict
                with "start" and "end" in "YYYY-MM-DD" format.
            today: Reference point for the presets and the fallback value.

        Returns:
            List of datetimes to search. Unrecognised input falls back to
            [today]; a dict range with start after end yields an empty list.

        Raises:
            ValueError: If a dict's "start" or "end" is not "YYYY-MM-DD".
        """
        if date_range is None or date_range == "today":
            return [today]

        if isinstance(date_range, str):
            if date_range == "yesterday":
                return [today - timedelta(days=1)]

            # Rolling windows count back from today (today included).
            window_days = {"last_week": 7, "last_month": 30}
            if date_range in window_days:
                return [today - timedelta(days=i) for i in range(window_days[date_range])]

            # Otherwise treat it as a single "YYYY-MM-DD" day.
            try:
                return [datetime.strptime(date_range, "%Y-%m-%d")]
            except ValueError:
                return [today]

        if isinstance(date_range, dict):
            start_str = date_range.get("start")
            end_str = date_range.get("end")
            if not (start_str and end_str):
                return [today]

            range_start = datetime.strptime(start_str, "%Y-%m-%d")
            range_end = datetime.strptime(end_str, "%Y-%m-%d")
            dates = []
            current = range_start
            while current <= range_end:
                dates.append(current)
                current += timedelta(days=1)
            return dates

        return [today]