| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 |
- """
- 文章内容读取工具
- 通过 Jina AI Reader API 将 URL 转换为 LLM 友好的 Markdown 格式。
- 支持单篇和批量读取,内置速率限制和并发控制。
- """
- import time
- from typing import Dict, List
- import requests
- from ..utils.errors import MCPError, InvalidParameterError
- # Jina Reader 配置
- JINA_READER_BASE = "https://r.jina.ai"
- DEFAULT_TIMEOUT = 30 # 秒
- MAX_BATCH_SIZE = 5 # 单次批量最大篇数
- BATCH_INTERVAL = 5.0 # 批量请求间隔(秒)
- class ArticleReaderTools:
- """文章内容读取工具类"""
- def __init__(self, project_root: str = None, jina_api_key: str = None):
- """
- 初始化文章读取工具
- Args:
- project_root: 项目根目录
- jina_api_key: Jina API Key(可选,有 Key 可提升速率限制)
- """
- self.project_root = project_root
- self.jina_api_key = jina_api_key
- self._last_request_time = 0.0
- def _build_headers(self) -> Dict[str, str]:
- """构建请求头"""
- headers = {
- "Accept": "text/markdown",
- "X-Return-Format": "markdown",
- "X-No-Cache": "true",
- }
- if self.jina_api_key:
- headers["Authorization"] = f"Bearer {self.jina_api_key}"
- return headers
- def _throttle(self):
- """速率控制:确保请求间隔 5 秒"""
- now = time.time()
- elapsed = now - self._last_request_time
- if elapsed < BATCH_INTERVAL:
- time.sleep(BATCH_INTERVAL - elapsed)
- self._last_request_time = time.time()
- def read_article(
- self,
- url: str,
- timeout: int = DEFAULT_TIMEOUT
- ) -> Dict:
- """
- 读取单篇文章内容(Markdown 格式)
- Args:
- url: 文章链接
- timeout: 请求超时时间(秒),默认 30
- Returns:
- 文章内容字典
- """
- try:
- if not url or not url.startswith(("http://", "https://")):
- raise InvalidParameterError(
- f"无效的 URL: {url}",
- suggestion="URL 必须以 http:// 或 https:// 开头"
- )
- self._throttle()
- response = requests.get(
- f"{JINA_READER_BASE}/{url}",
- headers=self._build_headers(),
- timeout=timeout
- )
- if response.status_code == 200:
- return {
- "success": True,
- "data": {
- "url": url,
- "content": response.text,
- "format": "markdown",
- "content_length": len(response.text)
- }
- }
- elif response.status_code == 429:
- return {
- "success": False,
- "error": {
- "code": "RATE_LIMITED",
- "message": "Jina Reader 速率限制,请稍后重试",
- "suggestion": "免费限制: 100 RPM / 2 并发,可配置 API Key 提升限额"
- }
- }
- else:
- return {
- "success": False,
- "error": {
- "code": "FETCH_FAILED",
- "message": f"HTTP {response.status_code}: {response.reason}",
- "url": url
- }
- }
- except requests.Timeout:
- return {
- "success": False,
- "error": {
- "code": "TIMEOUT",
- "message": f"请求超时({timeout}秒)",
- "url": url,
- "suggestion": "可尝试增加 timeout 参数"
- }
- }
- except MCPError as e:
- return {"success": False, "error": e.to_dict()}
- except Exception as e:
- return {
- "success": False,
- "error": {
- "code": "REQUEST_ERROR",
- "message": str(e),
- "url": url
- }
- }
- def read_articles_batch(
- self,
- urls: List[str],
- timeout: int = DEFAULT_TIMEOUT
- ) -> Dict:
- """
- 批量读取多篇文章内容(最多 5 篇,间隔 5 秒)
- Args:
- urls: 文章链接列表
- timeout: 每篇的请求超时时间(秒)
- Returns:
- 批量读取结果
- """
- try:
- if not urls:
- raise InvalidParameterError(
- "URL 列表不能为空",
- suggestion="请提供至少一个 URL"
- )
- # 限制最多 5 篇
- actual_urls = urls[:MAX_BATCH_SIZE]
- skipped = len(urls) - len(actual_urls)
- results = []
- succeeded = 0
- failed = 0
- for i, url in enumerate(actual_urls):
- result = self.read_article(url=url, timeout=timeout)
- results.append({
- "index": i + 1,
- "url": url,
- "success": result["success"],
- "data": result.get("data"),
- "error": result.get("error")
- })
- if result["success"]:
- succeeded += 1
- else:
- failed += 1
- return {
- "success": True,
- "summary": {
- "description": "批量文章读取结果",
- "requested": len(urls),
- "processed": len(actual_urls),
- "succeeded": succeeded,
- "failed": failed,
- "skipped": skipped,
- "interval_seconds": BATCH_INTERVAL,
- },
- "articles": results,
- "note": f"已跳过 {skipped} 篇(单次上限 {MAX_BATCH_SIZE} 篇)" if skipped > 0 else None
- }
- except MCPError as e:
- return {"success": False, "error": e.to_dict()}
- except Exception as e:
- return {
- "success": False,
- "error": {
- "code": "BATCH_ERROR",
- "message": str(e)
- }
- }
|