# article_reader.py
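"""
Article content reader.

Converts a URL into LLM-friendly Markdown through the Jina AI Reader API.
Supports reading a single article or a small batch; requests are issued
sequentially with a minimum interval between them.
"""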
import time
from typing import Dict, List, Optional

import requests

from ..utils.errors import MCPError, InvalidParameterError
  10. # Jina Reader 配置
  11. JINA_READER_BASE = "https://r.jina.ai"
  12. DEFAULT_TIMEOUT = 30 # 秒
  13. MAX_BATCH_SIZE = 5 # 单次批量最大篇数
  14. BATCH_INTERVAL = 5.0 # 批量请求间隔(秒)
  15. class ArticleReaderTools:
  16. """文章内容读取工具类"""
  17. def __init__(self, project_root: str = None, jina_api_key: str = None):
  18. """
  19. 初始化文章读取工具
  20. Args:
  21. project_root: 项目根目录
  22. jina_api_key: Jina API Key(可选,有 Key 可提升速率限制)
  23. """
  24. self.project_root = project_root
  25. self.jina_api_key = jina_api_key
  26. self._last_request_time = 0.0
  27. def _build_headers(self) -> Dict[str, str]:
  28. """构建请求头"""
  29. headers = {
  30. "Accept": "text/markdown",
  31. "X-Return-Format": "markdown",
  32. "X-No-Cache": "true",
  33. }
  34. if self.jina_api_key:
  35. headers["Authorization"] = f"Bearer {self.jina_api_key}"
  36. return headers
  37. def _throttle(self):
  38. """速率控制:确保请求间隔 5 秒"""
  39. now = time.time()
  40. elapsed = now - self._last_request_time
  41. if elapsed < BATCH_INTERVAL:
  42. time.sleep(BATCH_INTERVAL - elapsed)
  43. self._last_request_time = time.time()
  44. def read_article(
  45. self,
  46. url: str,
  47. timeout: int = DEFAULT_TIMEOUT
  48. ) -> Dict:
  49. """
  50. 读取单篇文章内容(Markdown 格式)
  51. Args:
  52. url: 文章链接
  53. timeout: 请求超时时间(秒),默认 30
  54. Returns:
  55. 文章内容字典
  56. """
  57. try:
  58. if not url or not url.startswith(("http://", "https://")):
  59. raise InvalidParameterError(
  60. f"无效的 URL: {url}",
  61. suggestion="URL 必须以 http:// 或 https:// 开头"
  62. )
  63. self._throttle()
  64. response = requests.get(
  65. f"{JINA_READER_BASE}/{url}",
  66. headers=self._build_headers(),
  67. timeout=timeout
  68. )
  69. if response.status_code == 200:
  70. return {
  71. "success": True,
  72. "data": {
  73. "url": url,
  74. "content": response.text,
  75. "format": "markdown",
  76. "content_length": len(response.text)
  77. }
  78. }
  79. elif response.status_code == 429:
  80. return {
  81. "success": False,
  82. "error": {
  83. "code": "RATE_LIMITED",
  84. "message": "Jina Reader 速率限制,请稍后重试",
  85. "suggestion": "免费限制: 100 RPM / 2 并发,可配置 API Key 提升限额"
  86. }
  87. }
  88. else:
  89. return {
  90. "success": False,
  91. "error": {
  92. "code": "FETCH_FAILED",
  93. "message": f"HTTP {response.status_code}: {response.reason}",
  94. "url": url
  95. }
  96. }
  97. except requests.Timeout:
  98. return {
  99. "success": False,
  100. "error": {
  101. "code": "TIMEOUT",
  102. "message": f"请求超时({timeout}秒)",
  103. "url": url,
  104. "suggestion": "可尝试增加 timeout 参数"
  105. }
  106. }
  107. except MCPError as e:
  108. return {"success": False, "error": e.to_dict()}
  109. except Exception as e:
  110. return {
  111. "success": False,
  112. "error": {
  113. "code": "REQUEST_ERROR",
  114. "message": str(e),
  115. "url": url
  116. }
  117. }
  118. def read_articles_batch(
  119. self,
  120. urls: List[str],
  121. timeout: int = DEFAULT_TIMEOUT
  122. ) -> Dict:
  123. """
  124. 批量读取多篇文章内容(最多 5 篇,间隔 5 秒)
  125. Args:
  126. urls: 文章链接列表
  127. timeout: 每篇的请求超时时间(秒)
  128. Returns:
  129. 批量读取结果
  130. """
  131. try:
  132. if not urls:
  133. raise InvalidParameterError(
  134. "URL 列表不能为空",
  135. suggestion="请提供至少一个 URL"
  136. )
  137. # 限制最多 5 篇
  138. actual_urls = urls[:MAX_BATCH_SIZE]
  139. skipped = len(urls) - len(actual_urls)
  140. results = []
  141. succeeded = 0
  142. failed = 0
  143. for i, url in enumerate(actual_urls):
  144. result = self.read_article(url=url, timeout=timeout)
  145. results.append({
  146. "index": i + 1,
  147. "url": url,
  148. "success": result["success"],
  149. "data": result.get("data"),
  150. "error": result.get("error")
  151. })
  152. if result["success"]:
  153. succeeded += 1
  154. else:
  155. failed += 1
  156. return {
  157. "success": True,
  158. "summary": {
  159. "description": "批量文章读取结果",
  160. "requested": len(urls),
  161. "processed": len(actual_urls),
  162. "succeeded": succeeded,
  163. "failed": failed,
  164. "skipped": skipped,
  165. "interval_seconds": BATCH_INTERVAL,
  166. },
  167. "articles": results,
  168. "note": f"已跳过 {skipped} 篇(单次上限 {MAX_BATCH_SIZE} 篇)" if skipped > 0 else None
  169. }
  170. except MCPError as e:
  171. return {"success": False, "error": e.to_dict()}
  172. except Exception as e:
  173. return {
  174. "success": False,
  175. "error": {
  176. "code": "BATCH_ERROR",
  177. "message": str(e)
  178. }
  179. }