manage.py 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 新闻爬虫容器管理工具 - supercronic
  5. """
  6. import os
  7. import sys
  8. import subprocess
  9. import time
  10. import signal
  11. from pathlib import Path
  12. from datetime import datetime
# Web server configuration
# TCP port for the static file server; overridable via the WEBSERVER_PORT env var.
WEBSERVER_PORT = int(os.environ.get("WEBSERVER_PORT", "8080"))
# Directory served by the web server.
WEBSERVER_DIR = "/app/output"
# File that stores the PID of the spawned web server process.
WEBSERVER_PID_FILE = "/tmp/webserver.pid"
# Marker file; its presence means the server was stopped manually.
WEBSERVER_MANUAL_STOP_FILE = "/tmp/webserver.manual_stop"
  18. def _env_bool(name: str, default: bool) -> bool:
  19. """读取布尔环境变量,兼容 true/1/yes/on。"""
  20. value = os.environ.get(name)
  21. if value is None:
  22. return default
  23. return value.strip().lower() in {"1", "true", "yes", "on"}
# When true, webserver_autofix also prints a line for the "healthy" and
# "manually stopped" cases; off by default.
WEBSERVER_AUTOFIX_LOG_HEALTHY = _env_bool("WEBSERVER_AUTOFIX_LOG_HEALTHY", False)
  25. def get_timestamp():
  26. """获取当前时间戳字符串"""
  27. return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  28. def run_command(cmd, shell=True, capture_output=True):
  29. """执行系统命令"""
  30. try:
  31. result = subprocess.run(
  32. cmd, shell=shell, capture_output=capture_output, text=True
  33. )
  34. return result.returncode == 0, result.stdout, result.stderr
  35. except Exception as e:
  36. return False, "", str(e)
  37. def manual_run():
  38. """手动执行一次爬虫"""
  39. print("🔄 手动执行爬虫...")
  40. try:
  41. result = subprocess.run(
  42. ["python", "-m", "trendradar"], cwd="/app", capture_output=False, text=True
  43. )
  44. if result.returncode == 0:
  45. print("✅ 执行完成")
  46. else:
  47. print(f"❌ 执行失败,退出码: {result.returncode}")
  48. except Exception as e:
  49. print(f"❌ 执行出错: {e}")
  50. def parse_cron_schedule(cron_expr):
  51. """解析cron表达式并返回人类可读的描述"""
  52. if not cron_expr or cron_expr == "未设置":
  53. return "未设置"
  54. try:
  55. parts = cron_expr.strip().split()
  56. if len(parts) != 5:
  57. return f"原始表达式: {cron_expr}"
  58. minute, hour, day, month, weekday = parts
  59. # 分析分钟
  60. if minute == "*":
  61. minute_desc = "每分钟"
  62. elif minute.startswith("*/"):
  63. interval = minute[2:]
  64. minute_desc = f"每{interval}分钟"
  65. elif "," in minute:
  66. minute_desc = f"在第{minute}分钟"
  67. else:
  68. minute_desc = f"在第{minute}分钟"
  69. # 分析小时
  70. if hour == "*":
  71. hour_desc = "每小时"
  72. elif hour.startswith("*/"):
  73. interval = hour[2:]
  74. hour_desc = f"每{interval}小时"
  75. elif "," in hour:
  76. hour_desc = f"在{hour}点"
  77. else:
  78. hour_desc = f"在{hour}点"
  79. # 分析日期
  80. if day == "*":
  81. day_desc = "每天"
  82. elif day.startswith("*/"):
  83. interval = day[2:]
  84. day_desc = f"每{interval}天"
  85. else:
  86. day_desc = f"每月{day}号"
  87. # 分析月份
  88. if month == "*":
  89. month_desc = "每月"
  90. else:
  91. month_desc = f"在{month}月"
  92. # 分析星期
  93. weekday_names = {
  94. "0": "周日", "1": "周一", "2": "周二", "3": "周三",
  95. "4": "周四", "5": "周五", "6": "周六", "7": "周日"
  96. }
  97. if weekday == "*":
  98. weekday_desc = ""
  99. else:
  100. weekday_desc = f"在{weekday_names.get(weekday, weekday)}"
  101. # 组合描述
  102. if minute.startswith("*/") and hour == "*" and day == "*" and month == "*" and weekday == "*":
  103. # 简单的间隔模式,如 */30 * * * *
  104. return f"每{minute[2:]}分钟执行一次"
  105. elif hour != "*" and minute != "*" and day == "*" and month == "*" and weekday == "*":
  106. # 每天特定时间,如 0 9 * * *
  107. return f"每天{hour}:{minute.zfill(2)}执行"
  108. elif weekday != "*" and day == "*":
  109. # 每周特定时间
  110. return f"{weekday_desc}{hour}:{minute.zfill(2)}执行"
  111. else:
  112. # 复杂模式,显示详细信息
  113. desc_parts = [part for part in [month_desc, day_desc, weekday_desc, hour_desc, minute_desc] if part and part != "每月" and part != "每天" and part != "每小时"]
  114. if desc_parts:
  115. return " ".join(desc_parts) + "执行"
  116. else:
  117. return f"复杂表达式: {cron_expr}"
  118. except Exception as e:
  119. return f"解析失败: {cron_expr}"
  120. def show_status():
  121. """显示容器状态"""
  122. print("📊 容器状态:")
  123. # 检查 PID 1 状态
  124. supercronic_is_pid1 = False
  125. pid1_cmdline = ""
  126. try:
  127. with open('/proc/1/cmdline', 'r') as f:
  128. pid1_cmdline = f.read().replace('\x00', ' ').strip()
  129. print(f" 🔍 PID 1 进程: {pid1_cmdline}")
  130. if "supercronic" in pid1_cmdline.lower():
  131. print(" ✅ supercronic 正确运行为 PID 1")
  132. supercronic_is_pid1 = True
  133. else:
  134. print(" ❌ PID 1 不是 supercronic")
  135. print(f" 📋 实际的 PID 1: {pid1_cmdline}")
  136. except Exception as e:
  137. print(f" ❌ 无法读取 PID 1 信息: {e}")
  138. # 检查环境变量
  139. cron_schedule = os.environ.get("CRON_SCHEDULE", "未设置")
  140. run_mode = os.environ.get("RUN_MODE", "未设置")
  141. immediate_run = os.environ.get("IMMEDIATE_RUN", "未设置")
  142. print(f" ⚙️ 运行配置:")
  143. print(f" CRON_SCHEDULE: {cron_schedule}")
  144. # 解析并显示cron表达式的含义
  145. cron_description = parse_cron_schedule(cron_schedule)
  146. print(f" ⏰ 执行频率: {cron_description}")
  147. print(f" RUN_MODE: {run_mode}")
  148. print(f" IMMEDIATE_RUN: {immediate_run}")
  149. # 检查配置文件
  150. config_files = ["/app/config/config.yaml", "/app/config/frequency_words.txt"]
  151. print(" 📁 配置文件:")
  152. for file_path in config_files:
  153. if Path(file_path).exists():
  154. print(f" ✅ {Path(file_path).name}")
  155. else:
  156. print(f" ❌ {Path(file_path).name} 缺失")
  157. # 检查关键文件
  158. key_files = [
  159. ("/usr/local/bin/supercronic-linux-amd64", "supercronic二进制文件"),
  160. ("/usr/local/bin/supercronic", "supercronic软链接"),
  161. ("/tmp/crontab", "crontab文件"),
  162. ("/entrypoint.sh", "启动脚本")
  163. ]
  164. print(" 📂 关键文件检查:")
  165. for file_path, description in key_files:
  166. if Path(file_path).exists():
  167. print(f" ✅ {description}: 存在")
  168. # 对于crontab文件,显示内容
  169. if file_path == "/tmp/crontab":
  170. try:
  171. with open(file_path, 'r') as f:
  172. crontab_content = f.read().strip()
  173. print(f" 内容: {crontab_content}")
  174. except:
  175. pass
  176. else:
  177. print(f" ❌ {description}: 不存在")
  178. # 检查容器运行时间
  179. print(" ⏱️ 容器时间信息:")
  180. try:
  181. # 检查 PID 1 的启动时间
  182. with open('/proc/1/stat', 'r') as f:
  183. stat_content = f.read().strip().split()
  184. if len(stat_content) >= 22:
  185. # starttime 是第22个字段(索引21)
  186. starttime_ticks = int(stat_content[21])
  187. # 读取系统启动时间
  188. with open('/proc/stat', 'r') as stat_f:
  189. for line in stat_f:
  190. if line.startswith('btime'):
  191. boot_time = int(line.split()[1])
  192. break
  193. else:
  194. boot_time = 0
  195. # 读取系统时钟频率
  196. clock_ticks = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
  197. if boot_time > 0:
  198. pid1_start_time = boot_time + (starttime_ticks / clock_ticks)
  199. current_time = time.time()
  200. uptime_seconds = int(current_time - pid1_start_time)
  201. uptime_minutes = uptime_seconds // 60
  202. uptime_hours = uptime_minutes // 60
  203. if uptime_hours > 0:
  204. print(f" PID 1 运行时间: {uptime_hours} 小时 {uptime_minutes % 60} 分钟")
  205. else:
  206. print(f" PID 1 运行时间: {uptime_minutes} 分钟 ({uptime_seconds} 秒)")
  207. else:
  208. print(f" PID 1 运行时间: 无法精确计算")
  209. else:
  210. print(" ❌ 无法解析 PID 1 统计信息")
  211. except Exception as e:
  212. print(f" ❌ 时间检查失败: {e}")
  213. # 状态总结和建议
  214. print(" 📊 状态总结:")
  215. if supercronic_is_pid1:
  216. print(" ✅ supercronic 正确运行为 PID 1")
  217. print(" ✅ 定时任务应该正常工作")
  218. # 显示当前的调度信息
  219. if cron_schedule != "未设置":
  220. print(f" ⏰ 当前调度: {cron_description}")
  221. # 提供一些常见的调度建议
  222. if "分钟" in cron_description and "每30分钟" not in cron_description and "每60分钟" not in cron_description:
  223. print(" 💡 频繁执行模式,适合实时监控")
  224. elif "小时" in cron_description:
  225. print(" 💡 按小时执行模式,适合定期汇总")
  226. elif "天" in cron_description:
  227. print(" 💡 每日执行模式,适合日报生成")
  228. print(" 💡 如果定时任务不执行,检查:")
  229. print(" • crontab 格式是否正确")
  230. print(" • 时区设置是否正确")
  231. print(" • 应用程序是否有错误")
  232. else:
  233. print(" ❌ supercronic 状态异常")
  234. if pid1_cmdline:
  235. print(f" 📋 当前 PID 1: {pid1_cmdline}")
  236. print(" 💡 建议操作:")
  237. print(" • 重启容器: docker restart trendradar")
  238. print(" • 检查容器日志: docker logs trendradar")
  239. # 显示日志检查建议
  240. print(" 📋 运行状态检查:")
  241. print(" • 查看完整容器日志: docker logs trendradar")
  242. print(" • 查看实时日志: docker logs -f trendradar")
  243. print(" • 手动执行测试: python manage.py run")
  244. print(" • 重启容器服务: docker restart trendradar")
  245. def show_config():
  246. """显示当前配置"""
  247. print("⚙️ 当前配置:")
  248. env_vars = [
  249. # 运行配置
  250. "CRON_SCHEDULE",
  251. "RUN_MODE",
  252. "IMMEDIATE_RUN",
  253. # 通知渠道
  254. "FEISHU_WEBHOOK_URL",
  255. "DINGTALK_WEBHOOK_URL",
  256. "WEWORK_WEBHOOK_URL",
  257. "WEWORK_MSG_TYPE",
  258. "TELEGRAM_BOT_TOKEN",
  259. "TELEGRAM_CHAT_ID",
  260. "NTFY_SERVER_URL",
  261. "NTFY_TOPIC",
  262. "NTFY_TOKEN",
  263. "BARK_URL",
  264. "SLACK_WEBHOOK_URL",
  265. # AI 分析配置
  266. "AI_ANALYSIS_ENABLED",
  267. "AI_API_KEY",
  268. "AI_PROVIDER",
  269. "AI_MODEL",
  270. "AI_BASE_URL",
  271. # 远程存储配置
  272. "S3_BUCKET_NAME",
  273. "S3_ACCESS_KEY_ID",
  274. "S3_ENDPOINT_URL",
  275. "S3_REGION",
  276. ]
  277. for var in env_vars:
  278. value = os.environ.get(var, "未设置")
  279. # 隐藏敏感信息
  280. if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY", "SECRET"]):
  281. if value and value != "未设置":
  282. masked_value = value[:10] + "***" if len(value) > 10 else "***"
  283. print(f" {var}: {masked_value}")
  284. else:
  285. print(f" {var}: {value}")
  286. else:
  287. print(f" {var}: {value}")
  288. crontab_file = "/tmp/crontab"
  289. if Path(crontab_file).exists():
  290. print(" 📅 Crontab内容:")
  291. try:
  292. with open(crontab_file, "r") as f:
  293. content = f.read().strip()
  294. print(f" {content}")
  295. except Exception as e:
  296. print(f" 读取失败: {e}")
  297. else:
  298. print(" 📅 Crontab文件不存在")
  299. def show_files():
  300. """显示输出文件"""
  301. print("📁 输出文件:")
  302. output_dir = Path("/app/output")
  303. if not output_dir.exists():
  304. print(" 📭 输出目录不存在")
  305. return
  306. # 新结构:扁平化目录
  307. # - output/news/*.db
  308. # - output/rss/*.db
  309. # - output/txt/{date}/*.txt
  310. # - output/html/{date}/*.html
  311. # 检查 news 数据库
  312. news_dir = output_dir / "news"
  313. if news_dir.exists():
  314. db_files = sorted(news_dir.glob("*.db"), key=lambda x: x.name, reverse=True)
  315. if db_files:
  316. print(f" 💾 热榜数据库 (news/): {len(db_files)} 个")
  317. for db_file in db_files[:5]:
  318. mtime = time.ctime(db_file.stat().st_mtime)
  319. size_kb = db_file.stat().st_size // 1024
  320. print(f" 📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})")
  321. if len(db_files) > 5:
  322. print(f" ... 还有 {len(db_files) - 5} 个")
  323. # 检查 RSS 数据库
  324. rss_dir = output_dir / "rss"
  325. if rss_dir.exists():
  326. db_files = sorted(rss_dir.glob("*.db"), key=lambda x: x.name, reverse=True)
  327. if db_files:
  328. print(f" 📰 RSS 数据库 (rss/): {len(db_files)} 个")
  329. for db_file in db_files[:5]:
  330. mtime = time.ctime(db_file.stat().st_mtime)
  331. size_kb = db_file.stat().st_size // 1024
  332. print(f" 📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})")
  333. if len(db_files) > 5:
  334. print(f" ... 还有 {len(db_files) - 5} 个")
  335. # 检查 TXT 快照目录
  336. txt_dir = output_dir / "txt"
  337. if txt_dir.exists():
  338. date_dirs = sorted([d for d in txt_dir.iterdir() if d.is_dir()], reverse=True)
  339. if date_dirs:
  340. print(f" 📄 TXT 快照 (txt/): {len(date_dirs)} 天")
  341. for date_dir in date_dirs[:3]:
  342. txt_files = list(date_dir.glob("*.txt"))
  343. if txt_files:
  344. recent = sorted(txt_files, key=lambda x: x.stat().st_mtime, reverse=True)[0]
  345. mtime = time.ctime(recent.stat().st_mtime)
  346. print(f" 📅 {date_dir.name}: {len(txt_files)} 个文件 (最新: {mtime.split()[3][:5]})")
  347. # 检查 HTML 报告目录
  348. html_dir = output_dir / "html"
  349. if html_dir.exists():
  350. date_dirs = sorted([d for d in html_dir.iterdir() if d.is_dir()], reverse=True)
  351. if date_dirs:
  352. print(f" 🌐 HTML 报告 (html/): {len(date_dirs)} 天")
  353. for date_dir in date_dirs[:3]:
  354. html_files = list(date_dir.glob("*.html"))
  355. if html_files:
  356. recent = sorted(html_files, key=lambda x: x.stat().st_mtime, reverse=True)[0]
  357. mtime = time.ctime(recent.stat().st_mtime)
  358. print(f" 📅 {date_dir.name}: {len(html_files)} 个文件 (最新: {mtime.split()[3][:5]})")
  359. def show_logs():
  360. """显示实时日志"""
  361. print("📋 实时日志 (按 Ctrl+C 退出):")
  362. print("💡 提示: 这将显示 PID 1 进程的输出")
  363. try:
  364. # 尝试多种方法查看日志
  365. log_files = [
  366. "/proc/1/fd/1", # PID 1 的标准输出
  367. "/proc/1/fd/2", # PID 1 的标准错误
  368. ]
  369. for log_file in log_files:
  370. if Path(log_file).exists():
  371. print(f"📄 尝试读取: {log_file}")
  372. subprocess.run(["tail", "-f", log_file], check=True)
  373. break
  374. else:
  375. print("📋 无法找到标准日志文件,建议使用: docker logs trendradar")
  376. except KeyboardInterrupt:
  377. print("\n👋 退出日志查看")
  378. except Exception as e:
  379. print(f"❌ 查看日志失败: {e}")
  380. print("💡 建议使用: docker logs trendradar")
  381. def restart_supercronic():
  382. """重启supercronic进程"""
  383. print("🔄 重启supercronic...")
  384. print("⚠️ 注意: supercronic 是 PID 1,无法直接重启")
  385. # 检查当前 PID 1
  386. try:
  387. with open('/proc/1/cmdline', 'r') as f:
  388. pid1_cmdline = f.read().replace('\x00', ' ').strip()
  389. print(f" 🔍 当前 PID 1: {pid1_cmdline}")
  390. if "supercronic" in pid1_cmdline.lower():
  391. print(" ✅ PID 1 是 supercronic")
  392. print(" 💡 要重启 supercronic,需要重启整个容器:")
  393. print(" docker restart trendradar")
  394. else:
  395. print(" ❌ PID 1 不是 supercronic,这是异常状态")
  396. print(" 💡 建议重启容器以修复问题:")
  397. print(" docker restart trendradar")
  398. except Exception as e:
  399. print(f" ❌ 无法检查 PID 1: {e}")
  400. print(" 💡 建议重启容器: docker restart trendradar")
  401. def _read_proc_cmdline(pid: int) -> str:
  402. """读取进程 cmdline,失败时返回空字符串。"""
  403. proc_cmdline = Path(f"/proc/{pid}/cmdline")
  404. if not proc_cmdline.exists():
  405. return ""
  406. try:
  407. with open(proc_cmdline, "rb") as f:
  408. return f.read().replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
  409. except Exception:
  410. return ""
  411. def _is_expected_webserver_process(pid: int) -> bool:
  412. """检查 pid 是否是当前端口的 http.server 进程。"""
  413. cmdline = _read_proc_cmdline(pid)
  414. if not cmdline:
  415. return False
  416. return "http.server" in cmdline and str(WEBSERVER_PORT) in cmdline
  417. def _is_manual_stop_requested() -> bool:
  418. """是否处于手动停服状态。"""
  419. return Path(WEBSERVER_MANUAL_STOP_FILE).exists()
  420. def _set_manual_stop_marker():
  421. """写入手动停服标记,防止 watchdog 自动拉起。"""
  422. try:
  423. with open(WEBSERVER_MANUAL_STOP_FILE, "w", encoding="utf-8") as f:
  424. f.write(get_timestamp())
  425. except Exception:
  426. pass
  427. def _clear_manual_stop_marker():
  428. """清理手动停服标记。"""
  429. try:
  430. if Path(WEBSERVER_MANUAL_STOP_FILE).exists():
  431. os.remove(WEBSERVER_MANUAL_STOP_FILE)
  432. except Exception:
  433. pass
  434. def _terminate_webserver_process(pid: int, require_expected: bool = True) -> bool:
  435. """尝试终止 Web 服务器进程。
  436. require_expected=True 时,仅终止确认是 http.server 的进程,避免误杀。
  437. """
  438. try:
  439. os.kill(pid, 0)
  440. except OSError:
  441. return True
  442. if require_expected and not _is_expected_webserver_process(pid):
  443. print(f" ⚠️ PID {pid} 存在但并非 Web 服务器进程,跳过终止")
  444. return False
  445. try:
  446. os.kill(pid, signal.SIGTERM)
  447. time.sleep(0.5)
  448. try:
  449. os.kill(pid, 0)
  450. os.kill(pid, signal.SIGKILL)
  451. print(f" ⚠️ 强制停止 Web 服务器 (PID: {pid})")
  452. except OSError:
  453. print(f" ✅ Web 服务器已停止 (PID: {pid})")
  454. return True
  455. except OSError:
  456. return True
  457. def _is_webserver_running(pid: int) -> bool:
  458. """检查 Web 服务器进程是否真正在运行。"""
  459. try:
  460. os.kill(pid, 0)
  461. except OSError:
  462. return False
  463. if not _is_expected_webserver_process(pid):
  464. return False
  465. try:
  466. import urllib.request
  467. req = urllib.request.Request(f"http://127.0.0.1:{WEBSERVER_PORT}/", method="HEAD")
  468. urllib.request.urlopen(req, timeout=3)
  469. return True
  470. except Exception:
  471. try:
  472. time.sleep(1)
  473. import urllib.request
  474. req = urllib.request.Request(f"http://127.0.0.1:{WEBSERVER_PORT}/", method="HEAD")
  475. urllib.request.urlopen(req, timeout=3)
  476. return True
  477. except Exception:
  478. return False
  479. def _cleanup_stale_pid():
  480. """清理失效的 PID 文件"""
  481. if not Path(WEBSERVER_PID_FILE).exists():
  482. return False
  483. try:
  484. with open(WEBSERVER_PID_FILE, 'r') as f:
  485. old_pid = int(f.read().strip())
  486. os.remove(WEBSERVER_PID_FILE)
  487. print(f" 🧹 清理失效 PID 文件 (PID: {old_pid})")
  488. return True
  489. except Exception:
  490. return False
def start_webserver(force: bool = False) -> None:
    """Start the static web server that serves the output directory.

    Args:
        force: When True, any manual-stop marker is cleared first. When
            False, an existing marker makes the function return without
            starting anything.

    The server is ``python -m http.server`` run with WEBSERVER_DIR as its
    working directory. On success its PID is written to WEBSERVER_PID_FILE.
    Progress and errors are reported on stdout; nothing is returned.
    """
    print(f"🌐 启动 Web 服务器 (端口: {WEBSERVER_PORT})...")
    print(f" 🔒 安全提示:仅提供静态文件访问,限制在 {WEBSERVER_DIR} 目录")
    if force:
        _clear_manual_stop_marker()
    elif _is_manual_stop_requested():
        print(" ℹ️ 检测到手动停服标记,跳过自动启动")
        return
    # Is a server already running?
    if Path(WEBSERVER_PID_FILE).exists():
        try:
            with open(WEBSERVER_PID_FILE, 'r') as f:
                old_pid = int(f.read().strip())
            # Full check: process alive, expected command line, HTTP answers.
            if _is_webserver_running(old_pid):
                print(f" ⚠️ Web 服务器已在运行 (PID: {old_pid})")
                print(f" 💡 访问: http://localhost:{WEBSERVER_PORT}")
                print(" 💡 停止服务: python manage.py stop_webserver")
                return
            # The recorded process is unhealthy: stop it first so it does
            # not keep the port occupied and make the restart fail.
            _terminate_webserver_process(old_pid, require_expected=True)
            _cleanup_stale_pid()
            print(f" ℹ️ 检测到失效的 PID 文件,已清理")
        except Exception as e:
            print(f" ⚠️ 清理旧的 PID 文件: {e}")
            _cleanup_stale_pid()
    # The directory to serve must exist.
    if not Path(WEBSERVER_DIR).exists():
        print(f" ❌ 目录不存在: {WEBSERVER_DIR}")
        return
    try:
        # Start the HTTP server.
        # --bind 0.0.0.0 makes it listen on all interfaces.
        # cwd is WEBSERVER_DIR, so that directory is what gets served.
        process = subprocess.Popen(
            [sys.executable, '-m', 'http.server', str(WEBSERVER_PORT), '--bind', '0.0.0.0'],
            cwd=WEBSERVER_DIR,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True
        )
        # Give the server a moment to come up (or to fail).
        time.sleep(1)
        # poll() returns None while the child is still running.
        if process.poll() is None:
            # Record the PID for later status/stop commands.
            with open(WEBSERVER_PID_FILE, 'w') as f:
                f.write(str(process.pid))
            _clear_manual_stop_marker()
            print(f" ✅ Web 服务器已启动 (PID: {process.pid})")
            print(f" 📁 服务目录: {WEBSERVER_DIR} (只读,仅静态文件)")
            print(f" 🌐 访问地址: http://localhost:{WEBSERVER_PORT}")
            print(f" 📄 首页: http://localhost:{WEBSERVER_PORT}/index.html")
            print(" 💡 停止服务: python manage.py stop_webserver")
        else:
            print(f" ❌ Web 服务器启动失败")
    except Exception as e:
        print(f" ❌ 启动失败: {e}")
  550. def stop_webserver():
  551. """停止 Web 服务器"""
  552. print("🛑 停止 Web 服务器...")
  553. _set_manual_stop_marker()
  554. if not Path(WEBSERVER_PID_FILE).exists():
  555. print(" ℹ️ Web 服务器未运行")
  556. print(" ℹ️ 已写入手动停服标记,watchdog 不会自动拉起")
  557. return
  558. try:
  559. with open(WEBSERVER_PID_FILE, 'r') as f:
  560. pid = int(f.read().strip())
  561. _terminate_webserver_process(pid, require_expected=True)
  562. if Path(WEBSERVER_PID_FILE).exists():
  563. os.remove(WEBSERVER_PID_FILE)
  564. print(" ℹ️ 已写入手动停服标记,watchdog 不会自动拉起")
  565. except Exception as e:
  566. print(f" ❌ 停止失败: {e}")
  567. # 尝试清理 PID 文件
  568. try:
  569. os.remove(WEBSERVER_PID_FILE)
  570. except:
  571. pass
  572. def webserver_status():
  573. """查看 Web 服务器状态"""
  574. print("🌐 Web 服务器状态:")
  575. if not Path(WEBSERVER_PID_FILE).exists():
  576. print(" ⭕ 未运行")
  577. if _is_manual_stop_requested():
  578. print(" ℹ️ 当前为手动停服状态,watchdog 不会自动拉起")
  579. print(f" 💡 启动服务: python manage.py start_webserver")
  580. return
  581. try:
  582. with open(WEBSERVER_PID_FILE, 'r') as f:
  583. pid = int(f.read().strip())
  584. # 使用增强的进程检查
  585. if _is_webserver_running(pid):
  586. print(f" ✅ 运行中 (PID: {pid})")
  587. print(f" 📁 服务目录: {WEBSERVER_DIR}")
  588. print(f" 🌐 访问地址: http://localhost:{WEBSERVER_PORT}")
  589. print(f" 📄 首页: http://localhost:{WEBSERVER_PORT}/index.html")
  590. print(" 💡 停止服务: python manage.py stop_webserver")
  591. else:
  592. print(f" ⭕ 未运行 (PID 文件存在但进程不可用)")
  593. _cleanup_stale_pid()
  594. print(" 💡 启动服务: python manage.py start_webserver")
  595. except Exception as e:
  596. print(f" ❌ 状态检查失败: {e}")
def webserver_autofix() -> None:
    """Health-check the web server and restart it when needed.

    Meant to be called by a watchdog or scheduled job. Every line it prints
    itself is prefixed with a ``[timestamp]`` so that external monitoring
    can parse the output. Nothing is done while the manual-stop marker is
    set. The "healthy" and "manually stopped" lines are printed only when
    WEBSERVER_AUTOFIX_LOG_HEALTHY is true.
    """
    if _is_manual_stop_requested():
        if WEBSERVER_AUTOFIX_LOG_HEALTHY:
            print(f"[{get_timestamp()}] ℹ️ 手动停服状态,跳过自动修复")
        return
    # No PID file: the server was never started or was cleaned up; start it.
    if not Path(WEBSERVER_PID_FILE).exists():
        print(f"[{get_timestamp()}] ℹ️ Web 服务器未运行,启动中...")
        start_webserver(force=False)
        return
    try:
        with open(WEBSERVER_PID_FILE, 'r') as f:
            pid = int(f.read().strip())
        # Full check: process alive, expected command line, HTTP answers.
        if not _is_webserver_running(pid):
            print(f"[{get_timestamp()}] ⚠️ Web 服务器不可用 (PID: {pid}),尝试重启...")
            # Stop the old process and drop its PID file before starting a
            # fresh server.
            _terminate_webserver_process(pid, require_expected=True)
            _cleanup_stale_pid()
            start_webserver(force=False)
            return
        if WEBSERVER_AUTOFIX_LOG_HEALTHY:
            print(f"[{get_timestamp()}] ✅ Web 服务器健康 (PID: {pid})")
    except Exception as e:
        # E.g. an unreadable or malformed PID file: clean up and try to
        # bring the server up anyway.
        print(f"[{get_timestamp()}] ❌ 健康检查异常: {e}")
        _cleanup_stale_pid()
        start_webserver(force=False)
  626. def show_help():
  627. """显示帮助信息"""
  628. help_text = """
  629. 🐳 TrendRadar 容器管理工具
  630. 📋 命令列表:
  631. run - 手动执行一次爬虫
  632. status - 显示容器运行状态
  633. config - 显示当前配置
  634. files - 显示输出文件
  635. logs - 实时查看日志
  636. restart - 重启说明
  637. start_webserver - 启动 Web 服务器托管 output 目录
  638. stop_webserver - 停止 Web 服务器
  639. webserver_status - 查看 Web 服务器状态
  640. help - 显示此帮助
  641. 📖 使用示例:
  642. # 在容器中执行
  643. python manage.py run
  644. python manage.py status
  645. python manage.py logs
  646. python manage.py start_webserver
  647. # 在宿主机执行
  648. docker exec -it trendradar python manage.py run
  649. docker exec -it trendradar python manage.py status
  650. docker exec -it trendradar python manage.py start_webserver
  651. docker logs trendradar
  652. 💡 常用操作指南:
  653. 1. 检查运行状态: status
  654. - 查看 supercronic 是否为 PID 1
  655. - 检查配置文件和关键文件
  656. - 查看 cron 调度设置
  657. 2. 手动执行测试: run
  658. - 立即执行一次新闻爬取
  659. - 测试程序是否正常工作
  660. 3. 查看日志: logs
  661. - 实时监控运行情况
  662. - 也可使用: docker logs trendradar
  663. 4. 重启服务: restart
  664. - 由于 supercronic 是 PID 1,需要重启整个容器
  665. - 使用: docker restart trendradar
  666. 5. Web 服务器管理:
  667. - 启动: start_webserver
  668. - 停止: stop_webserver(写入手动停服标记,watchdog 不自动拉起)
  669. - 状态: webserver_status
  670. - 访问: http://localhost:8080
  671. """
  672. print(help_text)
  673. def main():
  674. if len(sys.argv) < 2:
  675. show_help()
  676. return
  677. command = sys.argv[1]
  678. commands = {
  679. "run": manual_run,
  680. "status": show_status,
  681. "config": show_config,
  682. "files": show_files,
  683. "logs": show_logs,
  684. "restart": restart_supercronic,
  685. "start_webserver": lambda: start_webserver(force=True),
  686. "stop_webserver": stop_webserver,
  687. "webserver_status": webserver_status,
  688. "webserver_autofix": webserver_autofix,
  689. "help": show_help,
  690. }
  691. if command in commands:
  692. try:
  693. commands[command]()
  694. except KeyboardInterrupt:
  695. print("\n👋 操作已取消")
  696. except Exception as e:
  697. print(f"❌ 执行出错: {e}")
  698. else:
  699. print(f"❌ 未知命令: {command}")
  700. print("运行 'python manage.py help' 查看可用命令")
  701. if __name__ == "__main__":
  702. main()