| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880 |
- """
- 智能新闻检索工具
- 提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。
- """
- import re
- from collections import Counter
- from datetime import datetime, timedelta
- from difflib import SequenceMatcher
- from typing import Dict, List, Optional, Tuple, Union
- from ..services.data_service import DataService
- from ..utils.validators import validate_keyword, validate_limit, validate_threshold
- from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
class SearchTools:
    """Smart news-search tools.

    Provides keyword / fuzzy / entity search over the stored daily news
    titles, plus look-ups of news related to a reference title.  The public
    methods report failures through the returned dict (``"success": False``)
    instead of raising.
    """

    # Patterns used by ``_extract_keywords``; compiled once for the class.
    _URL_PATTERN = re.compile(r'http[s]?://\S+')
    _BRACKET_PATTERN = re.compile(r'\[.*?\]')
    _WORD_PATTERN = re.compile(r'[\w]+')

    def __init__(self, project_root: Optional[str] = None):
        """Create the search tools.

        Args:
            project_root: Project root directory, forwarded to ``DataService``.
        """
        self.data_service = DataService(project_root)
        # Chinese stop words dropped during keyword extraction.
        self.stopwords = {
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
            '看', '好', '自己', '这', '那', '来', '被', '与', '为', '对', '将', '从',
            '以', '及', '等', '但', '或', '而', '于', '中', '由', '可', '可以', '已',
            '已经', '还', '更', '最', '再', '因为', '所以', '如果', '虽然', '然而'
        }

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _start_of_day(value):
        """Return ``value`` truncated to midnight; non-datetime values pass through."""
        if isinstance(value, datetime):
            return value.replace(hour=0, minute=0, second=0, microsecond=0)
        return value

    @staticmethod
    def _describe_time_range(start_date: datetime, end_date: datetime) -> str:
        """Return the human-readable description of a date range used in results."""
        if start_date == end_date:
            if start_date.date() == datetime.now().date():
                return "今天"
            return start_date.strftime("%Y-%m-%d")
        return f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}"

    @staticmethod
    def _resolve_search_dates(date_range: Optional[Union[Dict[str, str], str]]) -> List[datetime]:
        """Expand the ``date_range`` argument of ``find_related_news_unified`` into days.

        Raises:
            InvalidParameterError: when a date string (standalone or inside the
                dict) is not a known preset and not in ``YYYY-MM-DD`` format.
        """
        today = datetime.now()
        if date_range is None or date_range == "today":
            return [today]

        if isinstance(date_range, str):
            if date_range == "yesterday":
                return [today - timedelta(days=1)]
            preset_days = {"last_week": 7, "last_month": 30}
            if date_range in preset_days:
                # Presets count backwards from (and include) today.
                return [today - timedelta(days=offset) for offset in range(preset_days[date_range])]
            try:
                return [datetime.strptime(date_range, "%Y-%m-%d")]
            except ValueError:
                # Previously this silently searched "today", hiding the typo.
                raise InvalidParameterError(
                    f"无效的日期格式: {date_range}",
                    suggestion="请使用 YYYY-MM-DD 格式,或 today / yesterday / last_week / last_month"
                ) from None

        if isinstance(date_range, dict):
            start_str = date_range.get("start")
            end_str = date_range.get("end")
            if not (start_str and end_str):
                # Incomplete range objects keep the historical fallback to today.
                return [today]
            try:
                first_day = datetime.strptime(start_str, "%Y-%m-%d")
                last_day = datetime.strptime(end_str, "%Y-%m-%d")
            except (TypeError, ValueError):
                raise InvalidParameterError(
                    f"无效的日期范围: {date_range}",
                    suggestion='请使用 {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} 格式'
                ) from None
            days = []
            cursor = first_day
            while cursor <= last_day:
                days.append(cursor)
                cursor += timedelta(days=1)
            return days

        # Any other type keeps the historical fallback to today.
        return [today]

    def _collect_matches(
        self,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool,
        score_title
    ) -> List[Dict]:
        """Build result items for every title that ``score_title`` accepts.

        Args:
            all_titles: Mapping ``platform_id -> {title: info}``.
            id_to_name: Mapping of platform id to display name.
            current_date: Date the titles belong to.
            include_url: Whether to add ``url`` / ``mobileUrl`` to each item.
            score_title: Callable ``title -> score``; ``None`` means "no match".

        Returns:
            List of news-item dicts, in iteration order of ``all_titles``.
        """
        date_text = current_date.strftime("%Y-%m-%d")
        matches = []
        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)
            for title, info in titles.items():
                score = score_title(title)
                if score is None:
                    continue
                # ``ranks`` may be absent; never index ``info["ranks"]`` directly.
                ranks = info.get("ranks", [])
                news_item = {
                    "title": title,
                    "platform": platform_id,
                    "platform_name": platform_name,
                    "date": date_text,
                    "similarity_score": score,
                    "ranks": ranks,
                    "count": len(ranks),
                    "rank": ranks[0] if ranks else 999
                }
                if include_url:
                    news_item["url"] = info.get("url", "")
                    news_item["mobileUrl"] = info.get("mobileUrl", "")
                matches.append(news_item)
        return matches

    # ------------------------------------------------------------------
    # Unified search
    # ------------------------------------------------------------------
    def search_news_unified(
        self,
        query: str,
        search_mode: str = "keyword",
        date_range: Optional[Union[Dict[str, str], str]] = None,
        platforms: Optional[List[str]] = None,
        limit: int = 50,
        sort_by: str = "relevance",
        threshold: float = 0.6,
        include_url: bool = False
    ) -> Dict:
        """Unified news search combining several matching modes.

        Args:
            query: Search text (required) - keyword, content fragment or entity name.
            search_mode: One of
                - "keyword": case-insensitive substring match (default)
                - "fuzzy": similarity-based match
                - "entity": case-sensitive substring match on the entity name
            date_range: Optional range, e.g.
                {"start": "2025-01-01", "end": "2025-01-07"}; it is parsed by
                ``validate_date_range``.  When omitted, the latest date for
                which data is available is searched (not necessarily today).
            platforms: Optional list of platform ids, e.g. ['zhihu', 'weibo'].
            limit: Maximum number of returned items, default 50.
            sort_by: One of "relevance" (default), "weight" or "date".
            threshold: Similarity threshold in [0, 1]; only used in fuzzy mode.
            include_url: Whether to include URL fields (off by default to save tokens).

        Returns:
            Result dict.  On success it carries ``summary`` and ``results``
            (or an empty ``results`` list plus ``message`` when nothing
            matched); on failure ``success`` is False and ``error`` describes
            the problem.

        Examples:
            - search_news_unified(query="人工智能", search_mode="keyword")
            - search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
            - search_news_unified(query="马斯克", search_mode="entity", limit=20)
            - search_news_unified(query="iPhone 16", date_range={"start": "2025-01-01", "end": "2025-01-07"})
        """
        try:
            # Parameter validation
            query = validate_keyword(query)

            if search_mode not in ("keyword", "fuzzy", "entity"):
                raise InvalidParameterError(
                    f"无效的搜索模式: {search_mode}",
                    suggestion="支持的模式: keyword, fuzzy, entity"
                )
            if sort_by not in ("relevance", "weight", "date"):
                raise InvalidParameterError(
                    f"无效的排序方式: {sort_by}",
                    suggestion="支持的排序: relevance, weight, date"
                )

            limit = validate_limit(limit, default=50)
            threshold = validate_threshold(threshold, default=0.6, min_value=0.0, max_value=1.0)

            # Resolve the date range
            if date_range:
                from ..utils.validators import validate_date_range
                start_date, end_date = validate_date_range(date_range)
            else:
                # Without an explicit range use the newest available data
                # date rather than datetime.now().
                earliest, latest = self.data_service.get_available_date_range()
                if latest is None:
                    return {
                        "success": False,
                        "error": {
                            "code": "NO_DATA_AVAILABLE",
                            "message": "output 目录下没有可用的新闻数据",
                            "suggestion": "请先运行爬虫生成数据,或检查 output 目录"
                        }
                    }
                start_date = end_date = latest

            # Collect matches day by day
            all_matches = []
            current_date = start_date
            while current_date <= end_date:
                try:
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(
                        date=current_date,
                        platform_ids=platforms
                    )
                except DataNotFoundError:
                    # No data for this day; move on to the next one.
                    pass
                else:
                    if search_mode == "keyword":
                        matches = self._search_by_keyword_mode(
                            query, all_titles, id_to_name, current_date, include_url
                        )
                    elif search_mode == "fuzzy":
                        matches = self._search_by_fuzzy_mode(
                            query, all_titles, id_to_name, current_date, threshold, include_url
                        )
                    else:  # entity
                        matches = self._search_by_entity_mode(
                            query, all_titles, id_to_name, current_date, include_url
                        )
                    all_matches.extend(matches)
                current_date += timedelta(days=1)

            time_range_desc = self._describe_time_range(start_date, end_date)

            if not all_matches:
                # Mention the available data range so the caller can adjust.
                earliest, latest = self.data_service.get_available_date_range()
                if earliest and latest:
                    available_desc = f"{earliest.strftime('%Y-%m-%d')} 至 {latest.strftime('%Y-%m-%d')}"
                    message = f"未找到匹配的新闻(查询范围: {time_range_desc},可用数据: {available_desc})"
                else:
                    message = f"未找到匹配的新闻({time_range_desc})"
                return {
                    "success": True,
                    "results": [],
                    "total": 0,
                    "query": query,
                    "search_mode": search_mode,
                    "time_range": time_range_desc,
                    "message": message
                }

            # Sorting (list.sort is stable, so ties keep collection order)
            if sort_by == "relevance":
                all_matches.sort(key=lambda item: item.get("similarity_score", 1.0), reverse=True)
            elif sort_by == "weight":
                from .analytics import calculate_news_weight
                all_matches.sort(key=calculate_news_weight, reverse=True)
            elif sort_by == "date":
                all_matches.sort(key=lambda item: item.get("date", ""), reverse=True)

            results = all_matches[:limit]

            result = {
                "success": True,
                "summary": {
                    "total_found": len(all_matches),
                    "returned_count": len(results),
                    "requested_limit": limit,
                    "search_mode": search_mode,
                    "query": query,
                    "platforms": platforms or "所有平台",
                    "time_range": time_range_desc,
                    "sort_by": sort_by
                },
                "results": results
            }

            if search_mode == "fuzzy":
                result["summary"]["threshold"] = threshold
                if len(all_matches) < limit:
                    result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def _search_by_keyword_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool
    ) -> List[Dict]:
        """Keyword mode: keep titles containing ``query`` (case-insensitive).

        Args:
            query: Search keyword.
            all_titles: Mapping ``platform_id -> {title: info}``.
            id_to_name: Mapping of platform id to display name.
            current_date: Date the titles belong to.
            include_url: Whether to add URL fields to each item.

        Returns:
            Matching news items, each with ``similarity_score`` 1.0.
        """
        query_lower = query.lower()
        return self._collect_matches(
            all_titles, id_to_name, current_date, include_url,
            lambda title: 1.0 if query_lower in title.lower() else None
        )

    def _search_by_fuzzy_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        threshold: float,
        include_url: bool
    ) -> List[Dict]:
        """Fuzzy mode: keep titles accepted by ``_fuzzy_match``.

        Args:
            query: Search text.
            all_titles: Mapping ``platform_id -> {title: info}``.
            id_to_name: Mapping of platform id to display name.
            current_date: Date the titles belong to.
            threshold: Similarity threshold passed to ``_fuzzy_match``.
            include_url: Whether to add URL fields to each item.

        Returns:
            Matching news items; ``similarity_score`` is rounded to 4 decimals.
        """
        def score_title(title: str) -> Optional[float]:
            is_match, similarity = self._fuzzy_match(query, title, threshold)
            return round(similarity, 4) if is_match else None

        return self._collect_matches(
            all_titles, id_to_name, current_date, include_url, score_title
        )

    def _search_by_entity_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool
    ) -> List[Dict]:
        """Entity mode: keep titles containing ``query`` exactly (case-sensitive).

        No ordering is applied here; the caller sorts the combined results.

        Args:
            query: Entity name.
            all_titles: Mapping ``platform_id -> {title: info}``.
            id_to_name: Mapping of platform id to display name.
            current_date: Date the titles belong to.
            include_url: Whether to add URL fields to each item.

        Returns:
            Matching news items, each with ``similarity_score`` 1.0.
        """
        return self._collect_matches(
            all_titles, id_to_name, current_date, include_url,
            lambda title: 1.0 if query in title else None
        )

    # ------------------------------------------------------------------
    # Similarity primitives
    # ------------------------------------------------------------------
    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Return the case-insensitive ``SequenceMatcher`` ratio of two texts (0-1)."""
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
        """Decide whether ``text`` fuzzily matches ``query``.

        Three stages are tried in order, all case-insensitive:
        1. substring containment -> score 1.0;
        2. whole-text similarity >= ``threshold`` -> that similarity;
        3. at least 50% of the query keywords occur in the text -> the
           overlap ratio.  This cut-off is fixed and independent of
           ``threshold``, so a stage-3 score may be below ``threshold``.

        Args:
            query: Query text.
            text: Candidate text.
            threshold: Similarity threshold for stage 2.

        Returns:
            ``(matched, score)``.
        """
        if query.lower() in text.lower():
            return True, 1.0

        similarity = self._calculate_similarity(query, text)
        if similarity >= threshold:
            return True, similarity

        # Lower-case the keywords so this stage agrees with the two above.
        query_words = {word.lower() for word in self._extract_keywords(query)}
        text_words = {word.lower() for word in self._extract_keywords(text)}
        if not query_words or not text_words:
            return False, 0.0

        keyword_overlap = len(query_words & text_words) / len(query_words)
        if keyword_overlap >= 0.5:
            return True, keyword_overlap

        return False, similarity

    def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
        """Extract keywords from ``text``.

        URLs and ``[...]`` segments are removed, the rest is split into runs
        of word characters, and tokens shorter than ``min_length`` or listed
        in ``self.stopwords`` are dropped.

        Args:
            text: Input text.
            min_length: Minimum token length to keep.

        Returns:
            Keywords in order of appearance (duplicates preserved).
        """
        text = self._URL_PATTERN.sub('', text)
        text = self._BRACKET_PATTERN.sub('', text)
        return [
            word for word in self._WORD_PATTERN.findall(text)
            if len(word) >= min_length and word not in self.stopwords
        ]

    def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
        """Return the Jaccard overlap (0-1) of two keyword lists; 0.0 if either is empty."""
        return self._jaccard_similarity(keywords1, keywords2)

    def _jaccard_similarity(self, list1: List[str], list2: List[str]) -> float:
        """Return ``|A & B| / |A | B|`` for the two lists as sets; 0.0 if either is empty."""
        if not list1 or not list2:
            return 0.0
        set1 = set(list1)
        set2 = set(list2)
        union = len(set1 | set2)
        if union == 0:
            return 0.0
        return len(set1 & set2) / union

    # ------------------------------------------------------------------
    # Related-news look-ups
    # ------------------------------------------------------------------
    def search_related_news_history(
        self,
        reference_title: str,
        time_preset: str = "yesterday",
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        threshold: float = 0.4,
        limit: int = 50,
        include_url: bool = False
    ) -> Dict:
        """Search historical data for news related to a reference text.

        The score of a title is ``0.7 * keyword overlap + 0.3 * text similarity``.

        Args:
            reference_title: Reference news title or content.
            time_preset: One of
                - "yesterday": yesterday only
                - "last_week": from 7 days ago through yesterday
                - "last_month": from 30 days ago through yesterday
                - "custom": ``start_date`` .. ``end_date`` (both required)
            start_date: Custom start date (only for ``time_preset="custom"``).
            end_date: Custom end date, inclusive (only for ``time_preset="custom"``).
            threshold: Minimum combined score in [0, 1], default 0.4.
            limit: Maximum number of returned items, default 50.
            include_url: Whether to include URL fields (off by default to save tokens).

        Returns:
            Result dict with ``summary``, ``results`` and ``statistics``; an
            empty ``results`` list plus ``message`` when nothing is related;
            ``success`` False with ``error`` on failure.

        Example:
            >>> tools = SearchTools()
            >>> result = tools.search_related_news_history(
            ...     reference_title="人工智能技术突破",
            ...     time_preset="last_week",
            ...     threshold=0.4,
            ...     limit=50
            ... )
            >>> for news in result['results']:
            ...     print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})")
        """
        try:
            # Parameter validation
            reference_title = validate_keyword(reference_title)
            threshold = validate_threshold(threshold, default=0.4, min_value=0.0, max_value=1.0)
            limit = validate_limit(limit, default=50)

            # Resolve the date range
            today = datetime.now()
            preset_days_back = {"yesterday": 1, "last_week": 7, "last_month": 30}
            if time_preset in preset_days_back:
                search_start = today - timedelta(days=preset_days_back[time_preset])
                search_end = today - timedelta(days=1)
            elif time_preset == "custom":
                if not start_date or not end_date:
                    raise InvalidParameterError(
                        "自定义时间范围需要提供 start_date 和 end_date",
                        suggestion="请提供 start_date 和 end_date 参数"
                    )
                # Compare whole days: otherwise a start time-of-day later than
                # the end's would make the loop below skip the final day.
                search_start = self._start_of_day(start_date)
                search_end = self._start_of_day(end_date)
            else:
                raise InvalidParameterError(
                    f"不支持的时间范围: {time_preset}",
                    suggestion="请使用 'yesterday', 'last_week', 'last_month' 或 'custom'"
                )

            # Keywords of the reference text
            reference_keywords = self._extract_keywords(reference_title)
            if not reference_keywords:
                raise InvalidParameterError(
                    "无法从参考文本中提取关键词",
                    suggestion="请提供更详细的文本内容"
                )
            reference_keyword_set = set(reference_keywords)

            # Collect related news day by day
            all_related_news = []
            current_date = search_start
            while current_date <= search_end:
                try:
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)
                    date_text = current_date.strftime("%Y-%m-%d")

                    for platform_id, titles in all_titles.items():
                        platform_name = id_to_name.get(platform_id, platform_id)

                        for title, info in titles.items():
                            title_similarity = self._calculate_similarity(reference_title, title)
                            title_keywords = self._extract_keywords(title)
                            keyword_overlap = self._calculate_keyword_overlap(
                                reference_keywords,
                                title_keywords
                            )

                            # Combined score: 70% keyword overlap + 30% text similarity
                            combined_score = keyword_overlap * 0.7 + title_similarity * 0.3
                            if combined_score < threshold:
                                continue

                            # ``ranks`` may be absent or empty.
                            ranks = info.get("ranks") or []
                            news_item = {
                                "title": title,
                                "platform": platform_id,
                                "platform_name": platform_name,
                                "date": date_text,
                                "similarity_score": round(combined_score, 4),
                                "keyword_overlap": round(keyword_overlap, 4),
                                "text_similarity": round(title_similarity, 4),
                                "common_keywords": list(reference_keyword_set & set(title_keywords)),
                                "rank": ranks[0] if ranks else 0
                            }
                            if include_url:
                                news_item["url"] = info.get("url", "")
                                news_item["mobileUrl"] = info.get("mobileUrl", "")

                            all_related_news.append(news_item)

                except DataNotFoundError:
                    # No data for this day; move on to the next one.
                    pass
                except Exception as e:
                    # Best effort: report the problem but keep processing other days.
                    print(f"Warning: 处理日期 {current_date.strftime('%Y-%m-%d')} 时出错: {e}")

                current_date += timedelta(days=1)

            date_range_info = {
                "start": search_start.strftime("%Y-%m-%d"),
                "end": search_end.strftime("%Y-%m-%d")
            }

            if not all_related_news:
                return {
                    "success": True,
                    "results": [],
                    "total": 0,
                    "query": reference_title,
                    "time_preset": time_preset,
                    "date_range": date_range_info,
                    "message": "未找到相关新闻"
                }

            # Highest score first
            all_related_news.sort(key=lambda item: item["similarity_score"], reverse=True)
            results = all_related_news[:limit]

            # Statistics are computed over all matches, not just the returned page.
            platform_distribution = Counter(news["platform"] for news in all_related_news)
            date_distribution = Counter(news["date"] for news in all_related_news)
            score_total = sum(news["similarity_score"] for news in all_related_news)

            result = {
                "success": True,
                "summary": {
                    "total_found": len(all_related_news),
                    "returned_count": len(results),
                    "requested_limit": limit,
                    "threshold": threshold,
                    "reference_title": reference_title,
                    "reference_keywords": reference_keywords,
                    "time_preset": time_preset,
                    "date_range": date_range_info
                },
                "results": results,
                "statistics": {
                    "platform_distribution": dict(platform_distribution),
                    "date_distribution": dict(date_distribution),
                    "avg_similarity": round(score_total / len(all_related_news), 4)
                }
            }

            if len(all_related_news) < limit:
                result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def find_related_news_unified(
        self,
        reference_title: str,
        date_range: Optional[Union[Dict[str, str], str]] = None,
        threshold: float = 0.5,
        limit: int = 50,
        include_url: bool = False
    ) -> Dict:
        """Find news related to a reference title.

        The score of a title is ``0.7 * text similarity + 0.3 * keyword
        Jaccard similarity`` (text similarity only when the reference yields
        no keywords).  Titles identical to the reference are skipped.

        Args:
            reference_title: Reference news title.
            date_range: Optional range:
                - omitted or "today": today only
                - "yesterday": yesterday only
                - "last_week": the last 7 days including today
                - "last_month": the last 30 days including today
                - "YYYY-MM-DD": that single day
                - {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}: inclusive range
              An unparseable date yields an ``InvalidParameterError`` result.
            threshold: Minimum similarity in [0, 1], default 0.5.
            limit: Maximum number of returned items, default 50.
            include_url: Whether to include the ``url`` field, default False.

        Returns:
            Result dict with ``summary``, ``results`` (sorted by similarity,
            highest first) and ``statistics``; ``success`` False with
            ``error`` on failure.
        """
        try:
            # Parameter validation
            reference_title = validate_keyword(reference_title)
            threshold = validate_threshold(threshold, default=0.5, min_value=0.0, max_value=1.0)
            limit = validate_limit(limit, default=50)

            search_dates = self._resolve_search_dates(date_range)
            reference_keywords = self._extract_keywords(reference_title)

            all_related_news = []
            for search_date in search_dates:
                try:
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(search_date)
                    date_text = search_date.strftime("%Y-%m-%d")

                    for platform_id, titles in all_titles.items():
                        platform_name = id_to_name.get(platform_id, platform_id)

                        for title, info in titles.items():
                            if title == reference_title:
                                continue

                            text_similarity = self._calculate_similarity(reference_title, title)
                            if reference_keywords:
                                keyword_similarity = self._jaccard_similarity(
                                    reference_keywords, self._extract_keywords(title)
                                )
                                # Blend: 70% text + 30% keywords
                                similarity = 0.7 * text_similarity + 0.3 * keyword_similarity
                            else:
                                similarity = text_similarity

                            if similarity < threshold:
                                continue

                            # ``ranks`` may be absent; indexing it directly used
                            # to raise and silently drop the whole day.
                            ranks = info.get("ranks") or []
                            news_item = {
                                "title": title,
                                "platform": platform_id,
                                "platform_name": platform_name,
                                "date": date_text,
                                "similarity": round(similarity, 3),
                                "rank": ranks[0] if ranks else 0
                            }
                            if include_url:
                                news_item["url"] = info.get("url", "")

                            all_related_news.append(news_item)

                except Exception:
                    # Deliberate best effort: a day that cannot be read is skipped.
                    continue

            # Highest similarity first
            all_related_news.sort(key=lambda item: item["similarity"], reverse=True)
            results = all_related_news[:limit]

            # Statistics are computed over all matches, not just the returned page.
            platform_dist = Counter(item["platform_name"] for item in all_related_news)
            date_dist = Counter(item["date"] for item in all_related_news)

            return {
                "success": True,
                "summary": {
                    "total_found": len(all_related_news),
                    "returned_count": len(results),
                    "reference_title": reference_title,
                    "threshold": threshold,
                    "date_range": {
                        "start": min(search_dates).strftime("%Y-%m-%d"),
                        "end": max(search_dates).strftime("%Y-%m-%d")
                    } if search_dates else None
                },
                "results": results,
                "statistics": {
                    "platform_distribution": dict(platform_dist),
                    "date_distribution": dict(date_dist)
                }
            }

        except MCPError as e:
            return {"success": False, "error": e.to_dict()}
        except Exception as e:
            return {"success": False, "error": {"code": "INTERNAL_ERROR", "message": str(e)}}
|