search_tools.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. """
  2. 智能新闻检索工具
  3. 提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。
  4. """
  5. import re
  6. from collections import Counter
  7. from datetime import datetime, timedelta
  8. from difflib import SequenceMatcher
  9. from typing import Dict, List, Optional, Tuple
  10. from ..services.data_service import DataService
  11. from ..utils.validators import validate_keyword, validate_limit
  12. from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
  13. class SearchTools:
  14. """智能新闻检索工具类"""
  def __init__(self, project_root: str = None):
      """
      Set up the search tool.

      Args:
          project_root: Project root directory; passed unchanged to
              ``DataService``.
      """
      # Common Chinese function words; ``_extract_keywords`` filters out any
      # token that appears in this collection.
      stopword_entries = (
          '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
          '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
          '看', '好', '自己', '这', '那', '来', '被', '与', '为', '对', '将', '从',
          '以', '及', '等', '但', '或', '而', '于', '中', '由', '可', '可以', '已',
          '已经', '还', '更', '最', '再', '因为', '所以', '如果', '虽然', '然而'
      )

      self.data_service = DataService(project_root)
      self.stopwords = set(stopword_entries)
  def search_news_unified(
      self,
      query: str,
      search_mode: str = "keyword",
      date_range: Optional[Dict[str, str]] = None,
      platforms: Optional[List[str]] = None,
      limit: int = 50,
      sort_by: str = "relevance",
      threshold: float = 0.6,
      include_url: bool = False
  ) -> Dict:
      """
      Unified news search combining keyword, fuzzy and entity matching.

      Each day in the requested range is loaded through
      ``self.data_service.parser.read_all_titles_for_date``; days that raise
      ``DataNotFoundError`` are skipped. Matches from all days are merged,
      sorted, and truncated to ``limit``.

      Args:
          query: Search text (required) - a keyword, a content fragment or
              an entity name. Normalised by ``validate_keyword``.
          search_mode: One of:
              - "keyword": case-insensitive substring match (default)
              - "fuzzy": similarity-based match, see ``_fuzzy_match``
              - "entity": case-sensitive substring match on the entity name
          date_range: ``{"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}``.
              When omitted, only the current day is searched.
          platforms: Platform ids to restrict the search to, e.g.
              ``['zhihu', 'weibo']``; forwarded to the parser as-is.
          limit: Maximum number of results returned (default 50),
              normalised by ``validate_limit``.
          sort_by: One of:
              - "relevance": descending ``similarity_score`` (default)
              - "weight": descending ``calculate_news_weight``
              - "date": descending date string
          threshold: Similarity threshold, clamped to [0, 1]; only used in
              fuzzy mode. Default 0.6.
          include_url: Whether to add ``url`` / ``mobileUrl`` to each result
              (default False, to keep responses small).

      Returns:
          A dict. With matches: ``success``, ``summary`` and ``results``
          (plus ``note`` in fuzzy mode when fewer than ``limit`` items
          matched). Without matches: ``success``, empty ``results``,
          ``total``, ``query``, ``search_mode``, ``time_range`` and
          ``message``. On failure: ``success=False`` and an ``error`` dict;
          no exception is propagated to the caller.

      Examples:
          - search_news_unified(query="人工智能", search_mode="keyword")
          - search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
          - search_news_unified(query="马斯克", search_mode="entity", limit=20)
          - search_news_unified(query="iPhone 16发布", search_mode="keyword")
      """
      try:
          # --- argument validation ---
          query = validate_keyword(query)

          if search_mode not in ["keyword", "fuzzy", "entity"]:
              raise InvalidParameterError(
                  f"无效的搜索模式: {search_mode}",
                  suggestion="支持的模式: keyword, fuzzy, entity"
              )

          if sort_by not in ["relevance", "weight", "date"]:
              raise InvalidParameterError(
                  f"无效的排序方式: {sort_by}",
                  suggestion="支持的排序: relevance, weight, date"
              )

          limit = validate_limit(limit, default=50)
          threshold = max(0.0, min(1.0, threshold))

          # --- resolve the date window ---
          if date_range:
              from ..utils.validators import validate_date_range
              start_date, end_date = validate_date_range(date_range)
          else:
              # No range given: search the current day only.
              start_date = end_date = datetime.now()

          # --- collect matches day by day ---
          all_matches = []
          current_date = start_date
          while current_date <= end_date:
              try:
                  # The third element of the returned tuple is not needed here.
                  all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(
                      date=current_date,
                      platform_ids=platforms
                  )

                  if search_mode == "keyword":
                      matches = self._search_by_keyword_mode(
                          query, all_titles, id_to_name, current_date, include_url
                      )
                  elif search_mode == "fuzzy":
                      matches = self._search_by_fuzzy_mode(
                          query, all_titles, id_to_name, current_date, threshold, include_url
                      )
                  else:  # entity
                      matches = self._search_by_entity_mode(
                          query, all_titles, id_to_name, current_date, include_url
                      )

                  all_matches.extend(matches)
              except DataNotFoundError:
                  # No data stored for this day; move on to the next one.
                  pass

              current_date += timedelta(days=1)

          # Human-readable description of the searched window.
          start_label = start_date.strftime("%Y-%m-%d")
          if start_date == end_date:
              time_range_desc = start_label
          else:
              time_range_desc = f"{start_label} 至 {end_date.strftime('%Y-%m-%d')}"

          if not all_matches:
              # "今天" (today) is only accurate when the caller relied on the
              # default window; an explicit single-day range for another
              # date must be reported with its real date.
              time_desc = time_range_desc if date_range else "今天"
              return {
                  "success": True,
                  "results": [],
                  "total": 0,
                  "query": query,
                  "search_mode": search_mode,
                  "time_range": time_desc,
                  "message": f"未找到匹配的新闻({time_desc})"
              }

          # --- ordering ---
          if sort_by == "relevance":
              all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True)
          elif sort_by == "weight":
              from .analytics import calculate_news_weight
              all_matches.sort(key=lambda x: calculate_news_weight(x), reverse=True)
          elif sort_by == "date":
              all_matches.sort(key=lambda x: x.get("date", ""), reverse=True)

          # Truncate after sorting so the best-ranked items are kept.
          results = all_matches[:limit]

          result = {
              "success": True,
              "summary": {
                  "total_found": len(all_matches),
                  "returned_count": len(results),
                  "requested_limit": limit,
                  "search_mode": search_mode,
                  "query": query,
                  "platforms": platforms or "所有平台",
                  "time_range": time_range_desc,
                  "sort_by": sort_by
              },
              "results": results
          }

          if search_mode == "fuzzy":
              result["summary"]["threshold"] = threshold
              if len(all_matches) < limit:
                  result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"

          return result

      except MCPError as e:
          # Project-defined errors carry their own serialisable payload.
          return {
              "success": False,
              "error": e.to_dict()
          }
      except Exception as e:
          # Boundary handler: report any other failure as a structured error.
          return {
              "success": False,
              "error": {
                  "code": "INTERNAL_ERROR",
                  "message": str(e)
              }
          }
  175. def _search_by_keyword_mode(
  176. self,
  177. query: str,
  178. all_titles: Dict,
  179. id_to_name: Dict,
  180. current_date: datetime,
  181. include_url: bool
  182. ) -> List[Dict]:
  183. """
  184. 关键词搜索模式(精确匹配)
  185. Args:
  186. query: 搜索关键词
  187. all_titles: 所有标题字典
  188. id_to_name: 平台ID到名称映射
  189. current_date: 当前日期
  190. Returns:
  191. 匹配的新闻列表
  192. """
  193. matches = []
  194. query_lower = query.lower()
  195. for platform_id, titles in all_titles.items():
  196. platform_name = id_to_name.get(platform_id, platform_id)
  197. for title, info in titles.items():
  198. # 精确包含判断
  199. if query_lower in title.lower():
  200. news_item = {
  201. "title": title,
  202. "platform": platform_id,
  203. "platform_name": platform_name,
  204. "date": current_date.strftime("%Y-%m-%d"),
  205. "similarity_score": 1.0, # 精确匹配,相似度为1
  206. "ranks": info.get("ranks", []),
  207. "count": len(info.get("ranks", [])),
  208. "rank": info["ranks"][0] if info["ranks"] else 999
  209. }
  210. # 条件性添加 URL 字段
  211. if include_url:
  212. news_item["url"] = info.get("url", "")
  213. news_item["mobileUrl"] = info.get("mobileUrl", "")
  214. matches.append(news_item)
  215. return matches
  216. def _search_by_fuzzy_mode(
  217. self,
  218. query: str,
  219. all_titles: Dict,
  220. id_to_name: Dict,
  221. current_date: datetime,
  222. threshold: float,
  223. include_url: bool
  224. ) -> List[Dict]:
  225. """
  226. 模糊搜索模式(使用相似度算法)
  227. Args:
  228. query: 搜索内容
  229. all_titles: 所有标题字典
  230. id_to_name: 平台ID到名称映射
  231. current_date: 当前日期
  232. threshold: 相似度阈值
  233. Returns:
  234. 匹配的新闻列表
  235. """
  236. matches = []
  237. for platform_id, titles in all_titles.items():
  238. platform_name = id_to_name.get(platform_id, platform_id)
  239. for title, info in titles.items():
  240. # 模糊匹配
  241. is_match, similarity = self._fuzzy_match(query, title, threshold)
  242. if is_match:
  243. news_item = {
  244. "title": title,
  245. "platform": platform_id,
  246. "platform_name": platform_name,
  247. "date": current_date.strftime("%Y-%m-%d"),
  248. "similarity_score": round(similarity, 4),
  249. "ranks": info.get("ranks", []),
  250. "count": len(info.get("ranks", [])),
  251. "rank": info["ranks"][0] if info["ranks"] else 999
  252. }
  253. # 条件性添加 URL 字段
  254. if include_url:
  255. news_item["url"] = info.get("url", "")
  256. news_item["mobileUrl"] = info.get("mobileUrl", "")
  257. matches.append(news_item)
  258. return matches
  259. def _search_by_entity_mode(
  260. self,
  261. query: str,
  262. all_titles: Dict,
  263. id_to_name: Dict,
  264. current_date: datetime,
  265. include_url: bool
  266. ) -> List[Dict]:
  267. """
  268. 实体搜索模式(自动按权重排序)
  269. Args:
  270. query: 实体名称
  271. all_titles: 所有标题字典
  272. id_to_name: 平台ID到名称映射
  273. current_date: 当前日期
  274. Returns:
  275. 匹配的新闻列表
  276. """
  277. matches = []
  278. for platform_id, titles in all_titles.items():
  279. platform_name = id_to_name.get(platform_id, platform_id)
  280. for title, info in titles.items():
  281. # 实体搜索:精确包含实体名称
  282. if query in title:
  283. news_item = {
  284. "title": title,
  285. "platform": platform_id,
  286. "platform_name": platform_name,
  287. "date": current_date.strftime("%Y-%m-%d"),
  288. "similarity_score": 1.0,
  289. "ranks": info.get("ranks", []),
  290. "count": len(info.get("ranks", [])),
  291. "rank": info["ranks"][0] if info["ranks"] else 999
  292. }
  293. # 条件性添加 URL 字段
  294. if include_url:
  295. news_item["url"] = info.get("url", "")
  296. news_item["mobileUrl"] = info.get("mobileUrl", "")
  297. matches.append(news_item)
  298. return matches
  299. def _calculate_similarity(self, text1: str, text2: str) -> float:
  300. """
  301. 计算两个文本的相似度
  302. Args:
  303. text1: 文本1
  304. text2: 文本2
  305. Returns:
  306. 相似度分数 (0-1之间)
  307. """
  308. # 使用 difflib.SequenceMatcher 计算序列相似度
  309. return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
  310. def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
  311. """
  312. 模糊匹配函数
  313. Args:
  314. query: 查询文本
  315. text: 待匹配文本
  316. threshold: 匹配阈值
  317. Returns:
  318. (是否匹配, 相似度分数)
  319. """
  320. # 直接包含判断
  321. if query.lower() in text.lower():
  322. return True, 1.0
  323. # 计算整体相似度
  324. similarity = self._calculate_similarity(query, text)
  325. if similarity >= threshold:
  326. return True, similarity
  327. # 分词后的部分匹配
  328. query_words = set(self._extract_keywords(query))
  329. text_words = set(self._extract_keywords(text))
  330. if not query_words or not text_words:
  331. return False, 0.0
  332. # 计算关键词重合度
  333. common_words = query_words & text_words
  334. keyword_overlap = len(common_words) / len(query_words)
  335. if keyword_overlap >= 0.5: # 50%的关键词重合
  336. return True, keyword_overlap
  337. return False, similarity
  338. def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
  339. """
  340. 从文本中提取关键词
  341. Args:
  342. text: 输入文本
  343. min_length: 最小词长
  344. Returns:
  345. 关键词列表
  346. """
  347. # 移除URL和特殊字符
  348. text = re.sub(r'http[s]?://\S+', '', text)
  349. text = re.sub(r'\[.*?\]', '', text) # 移除方括号内容
  350. # 使用正则表达式分词(中文和英文)
  351. words = re.findall(r'[\w]+', text)
  352. # 过滤停用词和短词
  353. keywords = [
  354. word for word in words
  355. if word and len(word) >= min_length and word not in self.stopwords
  356. ]
  357. return keywords
  358. def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
  359. """
  360. 计算两个关键词列表的重合度
  361. Args:
  362. keywords1: 关键词列表1
  363. keywords2: 关键词列表2
  364. Returns:
  365. 重合度分数 (0-1之间)
  366. """
  367. if not keywords1 or not keywords2:
  368. return 0.0
  369. set1 = set(keywords1)
  370. set2 = set(keywords2)
  371. # Jaccard 相似度
  372. intersection = len(set1 & set2)
  373. union = len(set1 | set2)
  374. if union == 0:
  375. return 0.0
  376. return intersection / union
  def search_related_news_history(
      self,
      reference_text: str,
      time_range: str = "yesterday",
      start_date: Optional[datetime] = None,
      end_date: Optional[datetime] = None,
      threshold: float = 0.4,
      limit: int = 50,
      include_url: bool = False
  ) -> Dict:
      """
      Search historical data for news related to a reference text.

      Each title in the window is scored as
      ``0.7 * keyword_overlap + 0.3 * text_similarity`` (see
      ``_calculate_keyword_overlap`` and ``_calculate_similarity``) and kept
      when the score reaches ``threshold``.

      Args:
          reference_text: Reference news title or content.
          time_range: Preset window, one of:
              - "yesterday": the previous day
              - "last_week": the 7 days before today
              - "last_month": the 30 days before today
              - "custom": use ``start_date`` and ``end_date``
          start_date: Window start (only used when ``time_range="custom"``).
          end_date: Window end (only used when ``time_range="custom"``).
          threshold: Minimum combined score, clamped to [0, 1]; default 0.4.
          limit: Maximum number of results returned; default 50.
          include_url: Whether to add ``url`` / ``mobileUrl`` to each result
              (default False, to keep responses small).

      Returns:
          A dict. With matches: ``success``, ``summary``, ``results``
          (sorted by descending score, truncated to ``limit``) and
          ``statistics``, plus ``note`` when fewer than ``limit`` items were
          found. Without matches: ``success``, empty ``results``, ``total``,
          ``query``, ``time_range``, ``date_range`` and ``message``. On
          failure: ``success=False`` and an ``error`` dict; no exception is
          propagated to the caller.

      Example:
          >>> tools = SearchTools()
          >>> result = tools.search_related_news_history(
          ...     reference_text="人工智能技术突破",
          ...     time_range="last_week",
          ...     threshold=0.4,
          ...     limit=50
          ... )
          >>> for news in result['results']:
          ...     print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})")
      """
      # Imported locally so this method does not depend on a module-level
      # import being present.
      import logging

      try:
          # --- argument validation ---
          reference_text = validate_keyword(reference_text)
          threshold = max(0.0, min(1.0, threshold))
          limit = validate_limit(limit, default=50)

          # --- resolve the date window ---
          today = datetime.now()

          if time_range == "yesterday":
              search_start = today - timedelta(days=1)
              search_end = today - timedelta(days=1)
          elif time_range == "last_week":
              search_start = today - timedelta(days=7)
              search_end = today - timedelta(days=1)
          elif time_range == "last_month":
              search_start = today - timedelta(days=30)
              search_end = today - timedelta(days=1)
          elif time_range == "custom":
              if not start_date or not end_date:
                  raise InvalidParameterError(
                      "自定义时间范围需要提供 start_date 和 end_date",
                      suggestion="请提供 start_date 和 end_date 参数"
                  )
              search_start = start_date
              search_end = end_date
          else:
              raise InvalidParameterError(
                  f"不支持的时间范围: {time_range}",
                  suggestion="请使用 'yesterday', 'last_week', 'last_month' 或 'custom'"
              )

          # --- keywords of the reference text ---
          reference_keywords = self._extract_keywords(reference_text)
          if not reference_keywords:
              raise InvalidParameterError(
                  "无法从参考文本中提取关键词",
                  suggestion="请提供更详细的文本内容"
              )
          # Built once; it is intersected with every title's keywords below.
          reference_keyword_set = set(reference_keywords)

          # --- scan the window day by day ---
          all_related_news = []
          current_date = search_start

          while current_date <= search_end:
              date_label = current_date.strftime("%Y-%m-%d")
              try:
                  all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)

                  for platform_id, titles in all_titles.items():
                      platform_name = id_to_name.get(platform_id, platform_id)

                      for title, info in titles.items():
                          title_similarity = self._calculate_similarity(reference_text, title)
                          title_keywords = self._extract_keywords(title)
                          keyword_overlap = self._calculate_keyword_overlap(
                              reference_keywords,
                              title_keywords
                          )

                          # Combined score: 70% keyword overlap + 30% text similarity.
                          combined_score = keyword_overlap * 0.7 + title_similarity * 0.3
                          if combined_score < threshold:
                              continue

                          # Defensive read: a missing or None "ranks" entry
                          # must not abort the rest of this day's titles.
                          ranks = info.get("ranks") or []
                          news_item = {
                              "title": title,
                              "platform": platform_id,
                              "platform_name": platform_name,
                              "date": date_label,
                              "similarity_score": round(combined_score, 4),
                              "keyword_overlap": round(keyword_overlap, 4),
                              "text_similarity": round(title_similarity, 4),
                              # Sorted so the output is deterministic.
                              "common_keywords": sorted(reference_keyword_set & set(title_keywords)),
                              # NOTE(review): unranked items get 0 here while the
                              # _search_by_*_mode helpers use 999 - confirm which
                              # sentinel downstream consumers expect.
                              "rank": ranks[0] if ranks else 0
                          }

                          if include_url:
                              news_item["url"] = info.get("url", "")
                              news_item["mobileUrl"] = info.get("mobileUrl", "")

                          all_related_news.append(news_item)

              except DataNotFoundError:
                  # No data stored for this day; move on to the next one.
                  pass
              except Exception as e:
                  # Best effort: report the failure and keep processing the
                  # remaining days. Logged rather than printed so the
                  # diagnostic does not end up on stdout.
                  logging.getLogger(__name__).warning(
                      "处理日期 %s 时出错: %s", date_label, e
                  )

              current_date += timedelta(days=1)

          date_range_info = {
              "start": search_start.strftime("%Y-%m-%d"),
              "end": search_end.strftime("%Y-%m-%d")
          }

          if not all_related_news:
              return {
                  "success": True,
                  "results": [],
                  "total": 0,
                  "query": reference_text,
                  "time_range": time_range,
                  "date_range": date_range_info,
                  "message": "未找到相关新闻"
              }

          # --- rank, truncate and summarise ---
          all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True)
          results = all_related_news[:limit]

          # Statistics cover every related item, not only the returned slice.
          platform_distribution = Counter(news["platform"] for news in all_related_news)
          date_distribution = Counter(news["date"] for news in all_related_news)
          # all_related_news is non-empty here, so the division is safe.
          avg_similarity = round(
              sum(news["similarity_score"] for news in all_related_news) / len(all_related_news),
              4
          )

          result = {
              "success": True,
              "summary": {
                  "total_found": len(all_related_news),
                  "returned_count": len(results),
                  "requested_limit": limit,
                  "threshold": threshold,
                  "reference_text": reference_text,
                  "reference_keywords": reference_keywords,
                  "time_range": time_range,
                  "date_range": date_range_info
              },
              "results": results,
              "statistics": {
                  "platform_distribution": dict(platform_distribution),
                  "date_distribution": dict(date_distribution),
                  "avg_similarity": avg_similarity
              }
          }

          if len(all_related_news) < limit:
              result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"

          return result

      except MCPError as e:
          # Project-defined errors carry their own serialisable payload.
          return {
              "success": False,
              "error": e.to_dict()
          }
      except Exception as e:
          # Boundary handler: report any other failure as a structured error.
          return {
              "success": False,
              "error": {
                  "code": "INTERNAL_ERROR",
                  "message": str(e)
              }
          }