search_tools.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907
  1. """
  2. 智能新闻检索工具
  3. 提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。
  4. """
  5. import re
  6. from collections import Counter
  7. from datetime import datetime, timedelta
  8. from difflib import SequenceMatcher
  9. from typing import Dict, List, Optional, Tuple, Union
  10. from ..services.data_service import DataService
  11. from ..utils.validators import validate_keyword, validate_limit, validate_threshold, normalize_date_range
  12. from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
  13. class SearchTools:
  14. """智能新闻检索工具类"""
  def __init__(self, project_root: Optional[str] = None):
      """
      Initialize the search tool set.

      Args:
          project_root: Project root directory; forwarded unchanged to
              DataService (None is passed through as-is).
      """
      self.data_service = DataService(project_root)
  def search_news_unified(
      self,
      query: str,
      search_mode: str = "keyword",
      date_range: Optional[Union[Dict[str, str], str]] = None,
      platforms: Optional[List[str]] = None,
      limit: int = 50,
      sort_by: str = "relevance",
      threshold: float = 0.6,
      include_url: bool = False,
      include_rss: bool = False,
      rss_limit: int = 20
  ) -> Dict:
      """
      Unified news search over hot-list data, optionally also over RSS data.

      Args:
          query: Required search text (keyword, content fragment or entity name).
          search_mode: One of:
              - "keyword": case-insensitive substring match (default)
              - "fuzzy": similarity-based match
              - "entity": case-sensitive substring match
          date_range: Optional date range, e.g.
              {"start": "2025-01-01", "end": "2025-01-07"}; start and end
              may be equal (single-day query). When omitted, the latest
              date reported by the data service is searched (not
              necessarily today).
          platforms: Optional platform id filter, e.g. ['zhihu', 'weibo'].
          limit: Maximum number of hot-list results returned, default 50.
          sort_by: One of:
              - "relevance": by similarity score (default)
              - "weight": by news weight
              - "date": by date, newest first
          threshold: Similarity threshold (fuzzy mode only), 0-1, default 0.6.
          include_url: Whether to include URL fields, default False (saves tokens).
          include_rss: Whether to also search RSS data, default False.
          rss_limit: Maximum number of RSS results returned, default 20.

      Returns:
          A dict. With hot-list matches it holds "success", "summary" and
          "data" (plus "note" in fuzzy mode when fewer matches than
          ``limit`` were found). Without hot-list matches it holds
          "success", "results", "total", "query", "search_mode",
          "time_range" and "message". When ``include_rss`` is set, "rss"
          and "rss_total" are attached in both cases. On failure it holds
          "success": False and "error".

      Examples:
          - search_news_unified(query="人工智能", search_mode="keyword")
          - search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
          - search_news_unified(query="马斯克", search_mode="entity", limit=20)
          - search_news_unified(query="AI", include_rss=True)  # hot-list and RSS together
          - search_news_unified(query="iPhone 16", date_range={"start": "2025-01-01", "end": "2025-01-07"})
      """
      try:
          # Parameter validation
          query = validate_keyword(query)

          if search_mode not in ["keyword", "fuzzy", "entity"]:
              raise InvalidParameterError(
                  f"无效的搜索模式: {search_mode}",
                  suggestion="支持的模式: keyword, fuzzy, entity"
              )

          if sort_by not in ["relevance", "weight", "date"]:
              raise InvalidParameterError(
                  f"无效的排序方式: {sort_by}",
                  suggestion="支持的排序: relevance, weight, date"
              )

          limit = validate_limit(limit, default=50)
          threshold = validate_threshold(threshold, default=0.6, min_value=0.0, max_value=1.0)

          # Resolve the date range
          if date_range:
              from ..utils.validators import validate_date_range
              start_date, end_date = validate_date_range(date_range)
          else:
              # No range given: use the latest available data date rather
              # than datetime.now(), so a day without a crawl still works.
              earliest, latest = self.data_service.get_available_date_range()
              if latest is None:
                  # No data at all
                  return {
                      "success": False,
                      "error": {
                          "code": "NO_DATA_AVAILABLE",
                          "message": "output 目录下没有可用的新闻数据",
                          "suggestion": "请先运行爬虫生成数据,或检查 output 目录"
                      }
                  }
              start_date = end_date = latest

          # Human-readable description of the searched range; computed once
          # and shared by the empty and the non-empty response.
          if start_date.date() == datetime.now().date() and start_date == end_date:
              time_desc = "今天"
          elif start_date == end_date:
              time_desc = start_date.strftime("%Y-%m-%d")
          else:
              time_desc = f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}"

          # Collect matches day by day
          all_matches = []
          current_date = start_date
          while current_date <= end_date:
              try:
                  all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(
                      date=current_date,
                      platform_ids=platforms
                  )

                  if search_mode == "keyword":
                      matches = self._search_by_keyword_mode(
                          query, all_titles, id_to_name, current_date, include_url
                      )
                  elif search_mode == "fuzzy":
                      matches = self._search_by_fuzzy_mode(
                          query, all_titles, id_to_name, current_date, threshold, include_url
                      )
                  else:  # entity
                      matches = self._search_by_entity_mode(
                          query, all_titles, id_to_name, current_date, include_url
                      )

                  all_matches.extend(matches)
              except DataNotFoundError:
                  # No data for this date; move on to the next day
                  pass

              current_date += timedelta(days=1)

          # RSS is searched independently of the hot-list outcome, so an
          # empty hot-list result no longer hides RSS matches.
          rss_results = None
          if include_rss:
              rss_results = self._search_rss_by_keyword(
                  query=query,
                  start_date=start_date,
                  end_date=end_date,
                  limit=rss_limit,
                  include_url=include_url
              )

          if not all_matches:
              # Report the available date range to help the caller adjust the query
              earliest, latest = self.data_service.get_available_date_range()

              if earliest and latest:
                  available_desc = f"{earliest.strftime('%Y-%m-%d')} 至 {latest.strftime('%Y-%m-%d')}"
                  message = f"未找到匹配的新闻(查询范围: {time_desc},可用数据: {available_desc})"
              else:
                  message = f"未找到匹配的新闻({time_desc})"

              result = {
                  "success": True,
                  "results": [],
                  "total": 0,
                  "query": query,
                  "search_mode": search_mode,
                  "time_range": time_desc,
                  "message": message
              }
              if rss_results is not None:
                  result["rss"] = rss_results["items"]
                  result["rss_total"] = rss_results["total"]
              return result

          # Unified sorting
          if sort_by == "relevance":
              all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True)
          elif sort_by == "weight":
              from .analytics import calculate_news_weight
              all_matches.sort(key=calculate_news_weight, reverse=True)
          elif sort_by == "date":
              all_matches.sort(key=lambda x: x.get("date", ""), reverse=True)

          # Apply the result limit
          results = all_matches[:limit]

          result = {
              "success": True,
              "summary": {
                  "description": f"新闻搜索结果({search_mode}模式)",
                  "total_found": len(all_matches),
                  "returned": len(results),
                  "requested_limit": limit,
                  "search_mode": search_mode,
                  "query": query,
                  "platforms": platforms or "所有平台",
                  "time_range": time_desc,
                  "sort_by": sort_by
              },
              "data": results
          }

          if search_mode == "fuzzy":
              result["summary"]["threshold"] = threshold
              if len(all_matches) < limit:
                  result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"

          if rss_results is not None:
              result["rss"] = rss_results["items"]
              result["rss_total"] = rss_results["total"]
              result["summary"]["include_rss"] = True
              result["summary"]["rss_found"] = rss_results["total"]
              result["summary"]["rss_returned"] = len(rss_results["items"])

          return result

      except MCPError as e:
          return {
              "success": False,
              "error": e.to_dict()
          }
      except Exception as e:
          return {
              "success": False,
              "error": {
                  "code": "INTERNAL_ERROR",
                  "message": str(e)
              }
          }
  219. def _search_titles(
  220. self,
  221. all_titles: Dict,
  222. id_to_name: Dict,
  223. current_date: datetime,
  224. include_url: bool,
  225. match_func,
  226. ) -> List[Dict]:
  227. """
  228. 通用标题搜索方法
  229. Args:
  230. all_titles: 所有标题字典
  231. id_to_name: 平台ID到名称映射
  232. current_date: 当前日期
  233. include_url: 是否包含URL
  234. match_func: 匹配函数,接收 (title, info),返回 (is_match, similarity_score) 或 None
  235. Returns:
  236. 匹配的新闻列表
  237. """
  238. matches = []
  239. for platform_id, titles in all_titles.items():
  240. platform_name = id_to_name.get(platform_id, platform_id)
  241. for title, info in titles.items():
  242. result = match_func(title, info)
  243. if result is None:
  244. continue
  245. is_match, similarity = result
  246. if not is_match:
  247. continue
  248. news_item = {
  249. "title": title,
  250. "platform": platform_id,
  251. "platform_name": platform_name,
  252. "date": current_date.strftime("%Y-%m-%d"),
  253. "similarity_score": round(similarity, 4),
  254. "ranks": info.get("ranks", []),
  255. "count": len(info.get("ranks", [])),
  256. "rank": info["ranks"][0] if info["ranks"] else 999
  257. }
  258. if include_url:
  259. news_item["url"] = info.get("url", "")
  260. news_item["mobileUrl"] = info.get("mobileUrl", "")
  261. matches.append(news_item)
  262. return matches
  263. def _search_by_keyword_mode(
  264. self, query: str, all_titles: Dict, id_to_name: Dict,
  265. current_date: datetime, include_url: bool
  266. ) -> List[Dict]:
  267. """关键词搜索模式(精确匹配)"""
  268. query_lower = query.lower()
  269. return self._search_titles(
  270. all_titles, id_to_name, current_date, include_url,
  271. match_func=lambda title, info: (True, 1.0) if query_lower in title.lower() else (False, 0),
  272. )
  273. def _search_by_fuzzy_mode(
  274. self, query: str, all_titles: Dict, id_to_name: Dict,
  275. current_date: datetime, threshold: float, include_url: bool
  276. ) -> List[Dict]:
  277. """模糊搜索模式(使用相似度算法)"""
  278. return self._search_titles(
  279. all_titles, id_to_name, current_date, include_url,
  280. match_func=lambda title, info: self._fuzzy_match(query, title, threshold),
  281. )
  282. def _search_by_entity_mode(
  283. self, query: str, all_titles: Dict, id_to_name: Dict,
  284. current_date: datetime, include_url: bool
  285. ) -> List[Dict]:
  286. """实体搜索模式(精确包含实体名称)"""
  287. return self._search_titles(
  288. all_titles, id_to_name, current_date, include_url,
  289. match_func=lambda title, info: (True, 1.0) if query in title else (False, 0),
  290. )
  291. def _calculate_similarity(self, text1: str, text2: str) -> float:
  292. """
  293. 计算两个文本的相似度
  294. Args:
  295. text1: 文本1
  296. text2: 文本2
  297. Returns:
  298. 相似度分数 (0-1之间)
  299. """
  300. # 使用 difflib.SequenceMatcher 计算序列相似度
  301. return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
  302. def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
  303. """
  304. 模糊匹配函数
  305. Args:
  306. query: 查询文本
  307. text: 待匹配文本
  308. threshold: 匹配阈值
  309. Returns:
  310. (是否匹配, 相似度分数)
  311. """
  312. # 直接包含判断
  313. if query.lower() in text.lower():
  314. return True, 1.0
  315. # 计算整体相似度
  316. similarity = self._calculate_similarity(query, text)
  317. if similarity >= threshold:
  318. return True, similarity
  319. # 分词后的部分匹配
  320. query_words = set(self._extract_keywords(query))
  321. text_words = set(self._extract_keywords(text))
  322. if not query_words or not text_words:
  323. return False, 0.0
  324. # 计算关键词重合度
  325. common_words = query_words & text_words
  326. keyword_overlap = len(common_words) / len(query_words)
  327. if keyword_overlap >= 0.5: # 50%的关键词重合
  328. return True, keyword_overlap
  329. return False, similarity
  330. def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
  331. """
  332. 从文本中提取关键词
  333. Args:
  334. text: 输入文本
  335. min_length: 最小词长
  336. Returns:
  337. 关键词列表
  338. """
  339. # 移除URL和特殊字符
  340. text = re.sub(r'http[s]?://\S+', '', text)
  341. text = re.sub(r'\[.*?\]', '', text) # 移除方括号内容
  342. # 使用正则表达式分词(中文和英文)
  343. words = re.findall(r'[\w]+', text)
  344. # 过滤短词
  345. keywords = [word for word in words if word and len(word) >= min_length]
  346. return keywords
  347. def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
  348. """
  349. 计算两个关键词列表的重合度
  350. Args:
  351. keywords1: 关键词列表1
  352. keywords2: 关键词列表2
  353. Returns:
  354. 重合度分数 (0-1之间)
  355. """
  356. if not keywords1 or not keywords2:
  357. return 0.0
  358. set1 = set(keywords1)
  359. set2 = set(keywords2)
  360. # Jaccard 相似度
  361. intersection = len(set1 & set2)
  362. union = len(set1 | set2)
  363. if union == 0:
  364. return 0.0
  365. return intersection / union
  366. def _jaccard_similarity(self, list1: List[str], list2: List[str]) -> float:
  367. """
  368. 计算两个列表的 Jaccard 相似度
  369. Args:
  370. list1: 列表1
  371. list2: 列表2
  372. Returns:
  373. Jaccard 相似度 (0-1之间)
  374. """
  375. if not list1 or not list2:
  376. return 0.0
  377. set1 = set(list1)
  378. set2 = set(list2)
  379. intersection = len(set1 & set2)
  380. union = len(set1 | set2)
  381. if union == 0:
  382. return 0.0
  383. return intersection / union
  def search_related_news_history(
      self,
      reference_title: str,
      time_preset: str = "yesterday",
      start_date: Optional[datetime] = None,
      end_date: Optional[datetime] = None,
      threshold: float = 0.4,
      limit: int = 50,
      include_url: bool = False
  ) -> Dict:
      """
      Search historical data for news related to a reference title.

      Each candidate title gets a combined score of 70% keyword overlap
      (Jaccard) plus 30% text similarity; titles scoring at least
      ``threshold`` are returned, best first.

      Args:
          reference_title: Reference news title or content.
          time_preset: Time range preset, one of:
              - "yesterday": yesterday only
              - "last_week": the 7 days ending yesterday
              - "last_month": the 30 days ending yesterday
              - "custom": explicit range (requires start_date and end_date)
          start_date: Custom start date (only used when time_preset="custom").
          end_date: Custom end date (only used when time_preset="custom").
          threshold: Similarity threshold (0-1), default 0.4.
          limit: Maximum number of results returned, default 50.
          include_url: Whether to include URL fields, default False (saves tokens).

      Returns:
          A dict. With matches it holds "success", "summary", "data" and
          "statistics" (plus "note" when fewer matches than ``limit``).
          Without matches it holds "success", "results" (empty), "total",
          "query", "time_preset", "date_range" and "message". On failure
          it holds "success": False and "error".

      Example:
          >>> tools = SearchTools()
          >>> result = tools.search_related_news_history(
          ...     reference_title="人工智能技术突破",
          ...     time_preset="last_week",
          ...     threshold=0.4,
          ...     limit=50
          ... )
          >>> for news in result.get('data', []):
          ...     print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})")
      """
      # Local import: warnings go through logging (stderr by default)
      # instead of print(), because stdout may carry protocol traffic.
      import logging
      logger = logging.getLogger(__name__)

      try:
          # Parameter validation
          reference_title = validate_keyword(reference_title)
          threshold = validate_threshold(threshold, default=0.4, min_value=0.0, max_value=1.0)
          limit = validate_limit(limit, default=50)

          # Resolve the date range; every preset ends yesterday
          today = datetime.now()
          preset_days = {"yesterday": 1, "last_week": 7, "last_month": 30}

          if time_preset in preset_days:
              search_start = today - timedelta(days=preset_days[time_preset])
              search_end = today - timedelta(days=1)
          elif time_preset == "custom":
              if not start_date or not end_date:
                  raise InvalidParameterError(
                      "自定义时间范围需要提供 start_date 和 end_date",
                      suggestion="请提供 start_date 和 end_date 参数"
                  )
              search_start = start_date
              search_end = end_date
          else:
              raise InvalidParameterError(
                  f"不支持的时间范围: {time_preset}",
                  suggestion="请使用 'yesterday', 'last_week', 'last_month' 或 'custom'"
              )

          # Extract keywords from the reference text
          reference_keywords = self._extract_keywords(reference_title)
          if not reference_keywords:
              raise InvalidParameterError(
                  "无法从参考文本中提取关键词",
                  suggestion="请提供更详细的文本内容"
              )
          # Loop-invariant: built once instead of once per candidate title
          reference_keyword_set = set(reference_keywords)

          # Collect related news day by day
          all_related_news = []
          current_date = search_start

          while current_date <= search_end:
              date_str = current_date.strftime("%Y-%m-%d")
              try:
                  all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)

                  for platform_id, titles in all_titles.items():
                      platform_name = id_to_name.get(platform_id, platform_id)

                      for title, info in titles.items():
                          title_similarity = self._calculate_similarity(reference_title, title)
                          title_keywords = self._extract_keywords(title)
                          keyword_overlap = self._calculate_keyword_overlap(
                              reference_keywords,
                              title_keywords
                          )

                          # Combined score: 70% keyword overlap + 30% text similarity
                          combined_score = keyword_overlap * 0.7 + title_similarity * 0.3
                          if combined_score < threshold:
                              continue

                          # Tolerate a missing "ranks" entry instead of
                          # aborting the rest of this date with a KeyError.
                          ranks = info.get("ranks") or []

                          news_item = {
                              "title": title,
                              "platform": platform_id,
                              "platform_name": platform_name,
                              "date": date_str,
                              "similarity_score": round(combined_score, 4),
                              "keyword_overlap": round(keyword_overlap, 4),
                              "text_similarity": round(title_similarity, 4),
                              "common_keywords": list(reference_keyword_set & set(title_keywords)),
                              "rank": ranks[0] if ranks else 0
                          }

                          # URL fields are opt-in
                          if include_url:
                              news_item["url"] = info.get("url", "")
                              news_item["mobileUrl"] = info.get("mobileUrl", "")

                          all_related_news.append(news_item)

              except DataNotFoundError:
                  # No data for this date; move on to the next day
                  pass
              except Exception as e:
                  # Best effort: record the problem and keep processing other dates
                  logger.warning("处理日期 %s 时出错: %s", date_str, e)

              current_date += timedelta(days=1)

          date_range_info = {
              "start": search_start.strftime("%Y-%m-%d"),
              "end": search_end.strftime("%Y-%m-%d")
          }

          if not all_related_news:
              return {
                  "success": True,
                  "results": [],
                  "total": 0,
                  "query": reference_title,
                  "time_preset": time_preset,
                  "date_range": date_range_info,
                  "message": "未找到相关新闻"
              }

          # Best matches first, then apply the limit
          all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True)
          results = all_related_news[:limit]

          # Statistics cover all matches, not only the returned slice
          platform_distribution = Counter(news["platform"] for news in all_related_news)
          date_distribution = Counter(news["date"] for news in all_related_news)
          avg_similarity = round(
              sum(news["similarity_score"] for news in all_related_news) / len(all_related_news),
              4
          )

          result = {
              "success": True,
              "summary": {
                  "description": "历史相关新闻搜索结果",
                  "total_found": len(all_related_news),
                  "returned": len(results),
                  "requested_limit": limit,
                  "threshold": threshold,
                  "reference_title": reference_title,
                  "reference_keywords": reference_keywords,
                  "time_preset": time_preset,
                  "date_range": date_range_info
              },
              "data": results,
              "statistics": {
                  "platform_distribution": dict(platform_distribution),
                  "date_distribution": dict(date_distribution),
                  "avg_similarity": avg_similarity
              }
          }

          if len(all_related_news) < limit:
              result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"

          return result

      except MCPError as e:
          return {
              "success": False,
              "error": e.to_dict()
          }
      except Exception as e:
          return {
              "success": False,
              "error": {
                  "code": "INTERNAL_ERROR",
                  "message": str(e)
              }
          }
  def find_related_news_unified(
      self,
      reference_title: str,
      date_range: Optional[Union[Dict[str, str], str]] = None,
      threshold: float = 0.5,
      limit: int = 50,
      include_url: bool = False
  ) -> Dict:
      """
      Find news related to a reference title over one or more days.

      Each candidate gets a blended score of 70% text similarity plus 30%
      keyword Jaccard similarity (text similarity alone when no keywords
      can be extracted from the reference). Titles identical to the
      reference are skipped.

      Args:
          reference_title: Reference news title.
          date_range: Optional date range:
              - omitted or "today": today only
              - {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}: explicit range
              - "yesterday": yesterday
              - "last_week": the last 7 days including today
              - "last_month": the last 30 days including today
              - "YYYY-MM-DD": a single day
              An unparseable string, or a dict missing "start"/"end",
              falls back to today.
          threshold: Similarity threshold, 0-1, default 0.5.
          limit: Maximum number of results returned, default 50.
          include_url: Whether to include the "url" field, default False.

      Returns:
          A dict with "success", "summary", "data" (sorted by similarity,
          best first) and "statistics"; on failure "success": False and
          "error".
      """
      try:
          # Parameter validation
          reference_title = validate_keyword(reference_title)
          threshold = validate_threshold(threshold, default=0.5, min_value=0.0, max_value=1.0)
          limit = validate_limit(limit, default=50)

          today = datetime.now()

          # Normalize date_range (handles the JSON-string serialization case)
          date_range = normalize_date_range(date_range)

          if date_range is None or date_range == "today":
              search_dates = [today]
          elif isinstance(date_range, str):
              # Preset ranges
              if date_range == "yesterday":
                  search_dates = [today - timedelta(days=1)]
              elif date_range == "last_week":
                  search_dates = [today - timedelta(days=i) for i in range(7)]
              elif date_range == "last_month":
                  search_dates = [today - timedelta(days=i) for i in range(30)]
              else:
                  # Single-day string; deliberately falls back to today
                  # when the string is not a YYYY-MM-DD date.
                  try:
                      search_dates = [datetime.strptime(date_range, "%Y-%m-%d")]
                  except ValueError:
                      search_dates = [today]
          elif isinstance(date_range, dict):
              start_str = date_range.get("start")
              end_str = date_range.get("end")
              if start_str and end_str:
                  start_date = datetime.strptime(start_str, "%Y-%m-%d")
                  end_date = datetime.strptime(end_str, "%Y-%m-%d")
                  search_dates = []
                  current = start_date
                  while current <= end_date:
                      search_dates.append(current)
                      current += timedelta(days=1)
              else:
                  search_dates = [today]
          else:
              search_dates = [today]

          # Keywords of the reference title (may be empty)
          reference_keywords = self._extract_keywords(reference_title)

          # Collect related news day by day
          all_related_news = []

          for search_date in search_dates:
              date_str = search_date.strftime("%Y-%m-%d")
              try:
                  all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(search_date)

                  for platform_id, titles in all_titles.items():
                      platform_name = id_to_name.get(platform_id, platform_id)

                      for title, info in titles.items():
                          if title == reference_title:
                              continue

                          text_similarity = self._calculate_similarity(reference_title, title)

                          if reference_keywords:
                              title_keywords = self._extract_keywords(title)
                              keyword_similarity = self._jaccard_similarity(reference_keywords, title_keywords)
                              # Blend: 70% text + 30% keywords
                              similarity = 0.7 * text_similarity + 0.3 * keyword_similarity
                          else:
                              similarity = text_similarity

                          if similarity < threshold:
                              continue

                          # Tolerate a missing "ranks" entry instead of
                          # dropping the rest of the day via KeyError.
                          ranks = info.get("ranks") or []

                          news_item = {
                              "title": title,
                              "platform": platform_id,
                              "platform_name": platform_name,
                              "date": date_str,
                              "similarity": round(similarity, 3),
                              "rank": ranks[0] if ranks else 0
                          }

                          if include_url:
                              news_item["url"] = info.get("url", "")

                          all_related_news.append(news_item)

              except DataNotFoundError:
                  # A day without data must not abort a multi-day search;
                  # skip it, as the other search methods do.
                  continue
              except (OSError, KeyError, TypeError, ValueError):
                  # Reading this day's data failed; skip it
                  continue

          # Best matches first, then apply the limit
          all_related_news.sort(key=lambda x: x["similarity"], reverse=True)
          results = all_related_news[:limit]

          # Statistics cover all matches, not only the returned slice
          platform_dist = Counter(n["platform_name"] for n in all_related_news)
          date_dist = Counter(n["date"] for n in all_related_news)

          return {
              "success": True,
              "summary": {
                  "description": "相关新闻搜索结果",
                  "total_found": len(all_related_news),
                  "returned": len(results),
                  "reference_title": reference_title,
                  "threshold": threshold,
                  # search_dates is empty when a dict range has start > end
                  "date_range": {
                      "start": min(search_dates).strftime("%Y-%m-%d"),
                      "end": max(search_dates).strftime("%Y-%m-%d")
                  } if search_dates else None
              },
              "data": results,
              "statistics": {
                  "platform_distribution": dict(platform_dist),
                  "date_distribution": dict(date_dist)
              }
          }

      except MCPError as e:
          return {"success": False, "error": e.to_dict()}
      except Exception as e:
          return {"success": False, "error": {"code": "INTERNAL_ERROR", "message": str(e)}}
  702. def _search_rss_by_keyword(
  703. self,
  704. query: str,
  705. start_date: datetime,
  706. end_date: datetime,
  707. limit: int = 20,
  708. include_url: bool = False
  709. ) -> Dict:
  710. """
  711. 在 RSS 数据中搜索关键词
  712. Args:
  713. query: 搜索关键词
  714. start_date: 开始日期
  715. end_date: 结束日期
  716. limit: 返回条数限制
  717. include_url: 是否包含 URL
  718. Returns:
  719. RSS 搜索结果字典
  720. """
  721. all_rss_matches = []
  722. query_lower = query.lower()
  723. current_date = start_date
  724. while current_date <= end_date:
  725. try:
  726. # 读取该日期的 RSS 数据
  727. all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(
  728. date=current_date,
  729. platform_ids=None,
  730. db_type="rss"
  731. )
  732. for feed_id, items in all_titles.items():
  733. feed_name = id_to_name.get(feed_id, feed_id)
  734. for title, info in items.items():
  735. # 关键词匹配(标题或摘要)
  736. title_match = query_lower in title.lower()
  737. summary = info.get("summary", "")
  738. summary_match = query_lower in summary.lower() if summary else False
  739. if title_match or summary_match:
  740. rss_item = {
  741. "title": title,
  742. "feed_id": feed_id,
  743. "feed_name": feed_name,
  744. "date": current_date.strftime("%Y-%m-%d"),
  745. "published_at": info.get("published_at", ""),
  746. "author": info.get("author", ""),
  747. "match_in": "title" if title_match else "summary"
  748. }
  749. if include_url:
  750. rss_item["url"] = info.get("url", "")
  751. all_rss_matches.append(rss_item)
  752. except DataNotFoundError:
  753. # 该日期没有 RSS 数据,继续下一天
  754. pass
  755. except (OSError, KeyError, TypeError, ValueError):
  756. # 其他错误,跳过
  757. pass
  758. current_date += timedelta(days=1)
  759. # 按发布时间排序(最新的在前)
  760. all_rss_matches.sort(key=lambda x: x.get("published_at", ""), reverse=True)
  761. return {
  762. "items": all_rss_matches[:limit],
  763. "total": len(all_rss_matches)
  764. }