manager.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. # coding=utf-8
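"""
Storage manager - unified management of storage backends.

Automatically selects a suitable storage backend based on the runtime
environment and the configuration.
"""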
  6. import os
  7. from typing import Optional
  8. from trendradar.storage.base import StorageBackend, NewsData, RSSData
# Storage manager singleton: None until get_storage_manager() creates an instance.
_storage_manager: Optional["StorageManager"] = None
  11. class StorageManager:
  12. """
  13. 存储管理器
  14. 功能:
  15. - 自动检测运行环境(GitHub Actions / Docker / 本地)
  16. - 根据配置选择存储后端(local / remote / auto)
  17. - 提供统一的存储接口
  18. - 支持从远程拉取数据到本地
  19. """
  20. def __init__(
  21. self,
  22. backend_type: str = "auto",
  23. data_dir: str = "output",
  24. enable_txt: bool = True,
  25. enable_html: bool = True,
  26. remote_config: Optional[dict] = None,
  27. local_retention_days: int = 0,
  28. remote_retention_days: int = 0,
  29. pull_enabled: bool = False,
  30. pull_days: int = 0,
  31. timezone: str = "Asia/Shanghai",
  32. ):
  33. """
  34. 初始化存储管理器
  35. Args:
  36. backend_type: 存储后端类型 (local / remote / auto)
  37. data_dir: 本地数据目录
  38. enable_txt: 是否启用 TXT 快照
  39. enable_html: 是否启用 HTML 报告
  40. remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id 等)
  41. local_retention_days: 本地数据保留天数(0 = 无限制)
  42. remote_retention_days: 远程数据保留天数(0 = 无限制)
  43. pull_enabled: 是否启用启动时自动拉取
  44. pull_days: 拉取最近 N 天的数据
  45. timezone: 时区配置(默认 Asia/Shanghai)
  46. """
  47. self.backend_type = backend_type
  48. self.data_dir = data_dir
  49. self.enable_txt = enable_txt
  50. self.enable_html = enable_html
  51. self.remote_config = remote_config or {}
  52. self.local_retention_days = local_retention_days
  53. self.remote_retention_days = remote_retention_days
  54. self.pull_enabled = pull_enabled
  55. self.pull_days = pull_days
  56. self.timezone = timezone
  57. self._backend: Optional[StorageBackend] = None
  58. self._remote_backend: Optional[StorageBackend] = None
  59. @staticmethod
  60. def is_github_actions() -> bool:
  61. """检测是否在 GitHub Actions 环境中运行"""
  62. return os.environ.get("GITHUB_ACTIONS") == "true"
  63. @staticmethod
  64. def is_docker() -> bool:
  65. """检测是否在 Docker 容器中运行"""
  66. # 方法1: 检查 /.dockerenv 文件
  67. if os.path.exists("/.dockerenv"):
  68. return True
  69. # 方法2: 检查 cgroup(Linux)
  70. try:
  71. with open("/proc/1/cgroup", "r") as f:
  72. return "docker" in f.read()
  73. except (FileNotFoundError, PermissionError):
  74. pass
  75. # 方法3: 检查环境变量
  76. return os.environ.get("DOCKER_CONTAINER") == "true"
  77. def _resolve_backend_type(self) -> str:
  78. """解析实际使用的后端类型"""
  79. if self.backend_type == "auto":
  80. if self.is_github_actions():
  81. # GitHub Actions 环境,检查是否配置了远程存储
  82. if self._has_remote_config():
  83. return "remote"
  84. else:
  85. print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
  86. return "local"
  87. else:
  88. return "local"
  89. return self.backend_type
  90. def _has_remote_config(self) -> bool:
  91. """检查是否有有效的远程存储配置"""
  92. # 检查配置或环境变量
  93. bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
  94. access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
  95. secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
  96. endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
  97. # 调试日志
  98. has_config = bool(bucket_name and access_key and secret_key and endpoint)
  99. if not has_config:
  100. print(f"[存储管理器] 远程存储配置检查失败:")
  101. print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}")
  102. print(f" - access_key_id: {'已配置' if access_key else '未配置'}")
  103. print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}")
  104. print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}")
  105. return has_config
  106. def _create_remote_backend(self) -> Optional[StorageBackend]:
  107. """创建远程存储后端"""
  108. try:
  109. from trendradar.storage.remote import RemoteStorageBackend
  110. return RemoteStorageBackend(
  111. bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
  112. access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
  113. secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
  114. endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
  115. region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
  116. enable_txt=self.enable_txt,
  117. enable_html=self.enable_html,
  118. timezone=self.timezone,
  119. )
  120. except ImportError as e:
  121. print(f"[存储管理器] 远程后端导入失败: {e}")
  122. print("[存储管理器] 请确保已安装 boto3: pip install boto3")
  123. return None
  124. except Exception as e:
  125. print(f"[存储管理器] 远程后端初始化失败: {e}")
  126. return None
  127. def get_backend(self) -> StorageBackend:
  128. """获取存储后端实例"""
  129. if self._backend is None:
  130. resolved_type = self._resolve_backend_type()
  131. if resolved_type == "remote":
  132. self._backend = self._create_remote_backend()
  133. if self._backend:
  134. print(f"[存储管理器] 使用远程存储后端")
  135. else:
  136. print("[存储管理器] 回退到本地存储")
  137. resolved_type = "local"
  138. if resolved_type == "local" or self._backend is None:
  139. from trendradar.storage.local import LocalStorageBackend
  140. self._backend = LocalStorageBackend(
  141. data_dir=self.data_dir,
  142. enable_txt=self.enable_txt,
  143. enable_html=self.enable_html,
  144. timezone=self.timezone,
  145. )
  146. print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
  147. return self._backend
  148. def pull_from_remote(self) -> int:
  149. """
  150. 从远程拉取数据到本地
  151. Returns:
  152. 成功拉取的文件数量
  153. """
  154. if not self.pull_enabled or self.pull_days <= 0:
  155. return 0
  156. if not self._has_remote_config():
  157. print("[存储管理器] 未配置远程存储,无法拉取")
  158. return 0
  159. # 创建远程后端(如果还没有)
  160. if self._remote_backend is None:
  161. self._remote_backend = self._create_remote_backend()
  162. if self._remote_backend is None:
  163. print("[存储管理器] 无法创建远程后端,拉取失败")
  164. return 0
  165. # 调用拉取方法
  166. return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
  167. def save_news_data(self, data: NewsData) -> bool:
  168. """保存新闻数据"""
  169. return self.get_backend().save_news_data(data)
  170. def save_rss_data(self, data: RSSData) -> bool:
  171. """保存 RSS 数据"""
  172. return self.get_backend().save_rss_data(data)
  173. def get_rss_data(self, date: Optional[str] = None) -> Optional[RSSData]:
  174. """获取指定日期的所有 RSS 数据(当日汇总模式)"""
  175. return self.get_backend().get_rss_data(date)
  176. def get_latest_rss_data(self, date: Optional[str] = None) -> Optional[RSSData]:
  177. """获取最新一次抓取的 RSS 数据(当前榜单模式)"""
  178. return self.get_backend().get_latest_rss_data(date)
  179. def detect_new_rss_items(self, current_data: RSSData) -> dict:
  180. """检测新增的 RSS 条目(增量模式)"""
  181. return self.get_backend().detect_new_rss_items(current_data)
  182. def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
  183. """获取当天所有数据"""
  184. return self.get_backend().get_today_all_data(date)
  185. def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
  186. """获取最新抓取数据"""
  187. return self.get_backend().get_latest_crawl_data(date)
  188. def detect_new_titles(self, current_data: NewsData) -> dict:
  189. """检测新增标题"""
  190. return self.get_backend().detect_new_titles(current_data)
  191. def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
  192. """保存 TXT 快照"""
  193. return self.get_backend().save_txt_snapshot(data)
  194. def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
  195. """保存 HTML 报告"""
  196. return self.get_backend().save_html_report(html_content, filename, is_summary)
  197. def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
  198. """检查是否是当天第一次抓取"""
  199. return self.get_backend().is_first_crawl_today(date)
  200. def cleanup(self) -> None:
  201. """清理资源"""
  202. if self._backend:
  203. self._backend.cleanup()
  204. if self._remote_backend:
  205. self._remote_backend.cleanup()
  206. def cleanup_old_data(self) -> int:
  207. """
  208. 清理过期数据
  209. Returns:
  210. 删除的日期目录数量
  211. """
  212. total_deleted = 0
  213. # 清理本地数据
  214. if self.local_retention_days > 0:
  215. total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)
  216. # 清理远程数据(如果配置了)
  217. if self.remote_retention_days > 0 and self._has_remote_config():
  218. if self._remote_backend is None:
  219. self._remote_backend = self._create_remote_backend()
  220. if self._remote_backend:
  221. total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)
  222. return total_deleted
  223. @property
  224. def backend_name(self) -> str:
  225. """获取当前后端名称"""
  226. return self.get_backend().backend_name
  227. @property
  228. def supports_txt(self) -> bool:
  229. """是否支持 TXT 快照"""
  230. return self.get_backend().supports_txt
  231. # === 推送记录相关方法 ===
  232. def has_pushed_today(self, date: Optional[str] = None) -> bool:
  233. """
  234. 检查指定日期是否已推送过
  235. Args:
  236. date: 日期字符串(YYYY-MM-DD),默认为今天
  237. Returns:
  238. 是否已推送
  239. """
  240. return self.get_backend().has_pushed_today(date)
  241. def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
  242. """
  243. 记录推送
  244. Args:
  245. report_type: 报告类型
  246. date: 日期字符串(YYYY-MM-DD),默认为今天
  247. Returns:
  248. 是否记录成功
  249. """
  250. return self.get_backend().record_push(report_type, date)
  251. def has_ai_analyzed_today(self, date: Optional[str] = None) -> bool:
  252. """
  253. 检查指定日期是否已进行过 AI 分析
  254. Args:
  255. date: 日期字符串(YYYY-MM-DD),默认为今天
  256. Returns:
  257. 是否已分析
  258. """
  259. return self.get_backend().has_ai_analyzed_today(date)
  260. def record_ai_analysis(self, analysis_mode: str, date: Optional[str] = None) -> bool:
  261. """
  262. 记录 AI 分析
  263. Args:
  264. analysis_mode: 分析模式(daily/current/incremental)
  265. date: 日期字符串(YYYY-MM-DD),默认为今天
  266. Returns:
  267. 是否记录成功
  268. """
  269. return self.get_backend().record_ai_analysis(analysis_mode, date)
  270. def get_storage_manager(
  271. backend_type: str = "auto",
  272. data_dir: str = "output",
  273. enable_txt: bool = True,
  274. enable_html: bool = True,
  275. remote_config: Optional[dict] = None,
  276. local_retention_days: int = 0,
  277. remote_retention_days: int = 0,
  278. pull_enabled: bool = False,
  279. pull_days: int = 0,
  280. timezone: str = "Asia/Shanghai",
  281. force_new: bool = False,
  282. ) -> StorageManager:
  283. """
  284. 获取存储管理器单例
  285. Args:
  286. backend_type: 存储后端类型
  287. data_dir: 本地数据目录
  288. enable_txt: 是否启用 TXT 快照
  289. enable_html: 是否启用 HTML 报告
  290. remote_config: 远程存储配置
  291. local_retention_days: 本地数据保留天数(0 = 无限制)
  292. remote_retention_days: 远程数据保留天数(0 = 无限制)
  293. pull_enabled: 是否启用启动时自动拉取
  294. pull_days: 拉取最近 N 天的数据
  295. timezone: 时区配置(默认 Asia/Shanghai)
  296. force_new: 是否强制创建新实例
  297. Returns:
  298. StorageManager 实例
  299. """
  300. global _storage_manager
  301. if _storage_manager is None or force_new:
  302. _storage_manager = StorageManager(
  303. backend_type=backend_type,
  304. data_dir=data_dir,
  305. enable_txt=enable_txt,
  306. enable_html=enable_html,
  307. remote_config=remote_config,
  308. local_retention_days=local_retention_days,
  309. remote_retention_days=remote_retention_days,
  310. pull_enabled=pull_enabled,
  311. pull_days=pull_days,
  312. timezone=timezone,
  313. )
  314. return _storage_manager