sansan 5 месяцев назад
Родитель
Сommit
c7bacdfff7
61 измененных файлов с 12415 добавлено и 5897 удалено
  1. 0 2
      .github/ISSUE_TEMPLATE/01-bug-report.yml
  2. 0 2
      .github/ISSUE_TEMPLATE/02-feature-request.yml
  3. 0 2
      .github/ISSUE_TEMPLATE/03-config-help.yml
  4. 28 0
      .github/workflows/clean-crawler.yml
  5. 163 0
      .github/workflows/crawler.yml
  6. 388 64
      README-EN.md
  7. 83 1
      README-MCP-FAQ-EN.md
  8. 83 1
      README-MCP-FAQ.md
  9. 319 73
      README.md
  10. 49 2
      config/config.yaml
  11. 33 2
      docker/.env
  12. 1 1
      docker/Dockerfile
  13. 2 0
      docker/Dockerfile.mcp
  14. 16 2
      docker/docker-compose-build.yml
  15. 16 2
      docker/docker-compose.yml
  16. 3 3
      docker/entrypoint.sh
  17. 25 2
      docker/manage.py
  18. 0 5431
      main.py
  19. 1 1
      mcp_server/__init__.py
  20. 128 0
      mcp_server/server.py
  21. 51 32
      mcp_server/services/data_service.py
  22. 317 69
      mcp_server/services/parser_service.py
  23. 0 1
      mcp_server/tools/analytics.py
  24. 468 0
      mcp_server/tools/storage_sync.py
  25. 102 199
      mcp_server/tools/system.py
  26. 3 3
      mcp_server/utils/date_parser.py
  27. 1 1
      pyproject.toml
  28. 1 0
      requirements.txt
  29. 13 0
      trendradar/__init__.py
  30. 719 0
      trendradar/__main__.py
  31. 388 0
      trendradar/context.py
  32. 47 0
      trendradar/core/__init__.py
  33. 469 0
      trendradar/core/analyzer.py
  34. 152 0
      trendradar/core/config.py
  35. 291 0
      trendradar/core/data.py
  36. 194 0
      trendradar/core/frequency.py
  37. 332 0
      trendradar/core/loader.py
  38. 8 0
      trendradar/crawler/__init__.py
  39. 184 0
      trendradar/crawler/fetcher.py
  40. 81 0
      trendradar/notification/__init__.py
  41. 115 0
      trendradar/notification/batch.py
  42. 420 0
      trendradar/notification/dispatcher.py
  43. 80 0
      trendradar/notification/formatters.py
  44. 109 0
      trendradar/notification/push_manager.py
  45. 260 0
      trendradar/notification/renderer.py
  46. 1033 0
      trendradar/notification/senders.py
  47. 580 0
      trendradar/notification/splitter.py
  48. 40 0
      trendradar/report/__init__.py
  49. 223 0
      trendradar/report/formatter.py
  50. 235 0
      trendradar/report/generator.py
  51. 125 0
      trendradar/report/helpers.py
  52. 1050 0
      trendradar/report/html.py
  53. 44 0
      trendradar/storage/__init__.py
  54. 457 0
      trendradar/storage/base.py
  55. 869 0
      trendradar/storage/local.py
  56. 316 0
      trendradar/storage/manager.py
  57. 1071 0
      trendradar/storage/remote.py
  58. 117 0
      trendradar/storage/schema.sql
  59. 20 0
      trendradar/utils/__init__.py
  60. 91 0
      trendradar/utils/time.py
  61. 1 1
      version

+ 0 - 2
.github/ISSUE_TEMPLATE/01-bug-report.yml

@@ -4,8 +4,6 @@ name: 🐛 遇到问题了
 description: 程序运行不正常或出现错误
 description: 程序运行不正常或出现错误
 title: "[问题] "
 title: "[问题] "
 labels: ["bug"]
 labels: ["bug"]
-assignees:
-  - sansan0
 body:
 body:
   - type: markdown
   - type: markdown
     attributes:
     attributes:

+ 0 - 2
.github/ISSUE_TEMPLATE/02-feature-request.yml

@@ -4,8 +4,6 @@ name: 💡 我有个想法
 description: 建议新功能或改进现有功能
 description: 建议新功能或改进现有功能
 title: "[建议] "
 title: "[建议] "
 labels: ["enhancement"]
 labels: ["enhancement"]
-assignees:
-  - sansan0
 body:
 body:
   - type: markdown
   - type: markdown
     attributes:
     attributes:

+ 0 - 2
.github/ISSUE_TEMPLATE/03-config-help.yml

@@ -4,8 +4,6 @@ name: ⚙️ 设置遇到困难
 description: 配置相关的问题或需要帮助
 description: 配置相关的问题或需要帮助
 title: "[设置] "
 title: "[设置] "
 labels: ["配置", "帮助"]
 labels: ["配置", "帮助"]
-assignees:
-  - sansan0
 body:
 body:
   - type: markdown
   - type: markdown
     attributes:
     attributes:

+ 28 - 0
.github/workflows/clean-crawler.yml

@@ -0,0 +1,28 @@
+name: Check In
+
+# ✅ 签到续期:运行此 workflow 可重置 7 天计时,保持 "Get Hot News" 正常运行
+# ✅ Renewal: Run this workflow to reset the 7-day timer and keep "Get Hot News" active
+#
+# 📌 操作方法 / How to use:
+#   1. 点击 "Run workflow" 按钮 / Click "Run workflow" button
+#   2. 每 7 天内至少运行一次 / Run at least once every 7 days
+
+on:
+  workflow_dispatch:
+
+jobs:
+  del_runs:
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write
+      contents: read
+    steps:
+      - name: Delete all workflow runs
+        uses: Mattraks/delete-workflow-runs@v2
+        with:
+          token: ${{ github.token }}
+          repository: ${{ github.repository }}
+          retain_days: 0
+          keep_minimum_runs: 0
+          delete_workflow_by_state_pattern: "ALL"
+          delete_run_by_conclusion_pattern: "ALL"

+ 163 - 0
.github/workflows/crawler.yml

@@ -0,0 +1,163 @@
+name: Get Hot News
+
+on:
+  schedule:
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    # ⚠️ 试用版说明 / Trial Mode
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    #
+    # 🔄 运行机制 / How it works:
+    #    - 每个周期为 7 天,届时自动停止
+    #    - 运行 "Check In" 会重置周期(重新开始 7 天倒计时,而非累加)
+    #    - Each cycle is 7 days, then auto-stops
+    #    - "Check In" resets the cycle (restarts 7-day countdown, not cumulative)
+    #
+    # 💡 设计初衷 / Why this design:
+    #    如果 7 天都忘了签到,或许这些资讯对你来说并非刚需
+    #    适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间
+    #    If you forget for 7 days, maybe you don't really need it
+    #    A timely pause helps you detach from the stream and gives your mind space
+    #
+    # 🙏 珍惜资源 / Respect shared resources:
+    #    GitHub Actions 是平台提供的公共资源,每次运行都会消耗算力
+    #    签到机制确保资源分配给真正需要的用户,感谢你的理解与配合
+    #    GitHub Actions is a shared public resource provided by the platform
+    #    Check-in ensures resources go to those who truly need them — thank you
+    #
+    # 🚀 长期使用请部署 Docker 版本 / For long-term use, deploy Docker version
+    #
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    #
+    # 📝 修改运行时间:只改第一个数字(0-59),表示每小时第几分钟运行
+    # 📝 Change time: Only modify the first number (0-59) = minute of each hour
+    #
+    # 示例 / Examples:
+    #   "15 * * * *"     → 每小时第15分钟 / minute 15 every hour
+    #   "30 0-14 * * *"  → 北京时间 8:00-22:00 每小时第30分钟 / Beijing 8am-10pm
+    #
+    - cron: "33 * * * *"
+
+  workflow_dispatch:
+
+concurrency:
+  group: crawler-${{ github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  actions: write
+
+jobs:
+  crawl:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          clean: true
+
+      - name: Check Expiration
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          WORKFLOW_FILE="crawler.yml"
+          API_URL="repos/${{ github.repository }}/actions/workflows/$WORKFLOW_FILE/runs"
+
+          TOTAL=$(gh api "$API_URL" --jq '.total_count')
+          if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
+            echo "No previous runs found, skipping expiration check"
+            exit 0
+          fi
+
+          LAST_PAGE=$(( (TOTAL + 99) / 100 ))
+          FIRST_RUN_DATE=$(gh api "$API_URL?per_page=100&page=$LAST_PAGE" --jq '.workflow_runs[-1].created_at')
+
+          if [ -n "$FIRST_RUN_DATE" ]; then
+            CURRENT_TIMESTAMP=$(date +%s)
+            FIRST_RUN_TIMESTAMP=$(date -d "$FIRST_RUN_DATE" +%s)
+            DIFF_SECONDS=$((CURRENT_TIMESTAMP - FIRST_RUN_TIMESTAMP))
+            LIMIT_SECONDS=604800
+
+            if [ $DIFF_SECONDS -gt $LIMIT_SECONDS ]; then
+              echo "⚠️ 试用期已结束,请运行 'Check In' 签到续期"
+              echo "⚠️ Trial expired. Run 'Check In' to renew."
+              gh workflow disable "$WORKFLOW_FILE"
+              exit 1
+            else
+              DAYS_LEFT=$(( (LIMIT_SECONDS - DIFF_SECONDS) / 86400 ))
+              echo "✅ 试用期剩余 ${DAYS_LEFT} 天,到期前请运行 'Check In' 签到续期"
+              echo "✅ Trial: ${DAYS_LEFT} days left. Run 'Check In' before expiry to renew."
+            fi
+          fi
+
+
+      # --------------------------------------------------------------------------------
+      # 🚦 TRAFFIC CONTROL / 流量控制
+      # --------------------------------------------------------------------------------
+      # EN: Generates a random delay between 1 and 300 seconds (5 minutes).
+      #     Critical for load balancing.
+      #
+      # CN: 生成 1 到 300 秒(5分钟)之间的随机延迟。
+      #     这对负载均衡至关重要。
+      - name: Random Delay (Traffic Control)
+        if: success()
+        run: |
+          echo "🎲 Traffic Control: Generating random delay..."
+          DELAY=$(( ( RANDOM % 300 )  + 1 ))
+          echo "⏸️  Sleeping for ${DELAY} seconds to spread the load..."
+          sleep ${DELAY}s
+          echo "▶️  Delay finished. Starting crawler..."
+
+      - name: Set up Python
+        if: success()
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+          cache: "pip"
+
+      - name: Install dependencies
+        if: success()
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Verify required files
+        if: success()
+        run: |
+          if [ ! -f config/config.yaml ]; then
+            echo "Error: Config missing"
+            exit 1
+          fi
+
+      - name: Run crawler
+        if: success()
+        env:
+          FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }}
+          TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
+          TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
+          DINGTALK_WEBHOOK_URL: ${{ secrets.DINGTALK_WEBHOOK_URL }}
+          WEWORK_WEBHOOK_URL: ${{ secrets.WEWORK_WEBHOOK_URL }}
+          WEWORK_MSG_TYPE: ${{ secrets.WEWORK_MSG_TYPE }}
+          EMAIL_FROM: ${{ secrets.EMAIL_FROM }}
+          EMAIL_PASSWORD: ${{ secrets.EMAIL_PASSWORD }}
+          EMAIL_TO: ${{ secrets.EMAIL_TO }}
+          EMAIL_SMTP_SERVER: ${{ secrets.EMAIL_SMTP_SERVER }}
+          EMAIL_SMTP_PORT: ${{ secrets.EMAIL_SMTP_PORT }}
+          NTFY_TOPIC: ${{ secrets.NTFY_TOPIC }}
+          NTFY_SERVER_URL: ${{ secrets.NTFY_SERVER_URL }}
+          NTFY_TOKEN: ${{ secrets.NTFY_TOKEN }}
+          BARK_URL: ${{ secrets.BARK_URL }}
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+          STORAGE_BACKEND: auto
+          LOCAL_RETENTION_DAYS: ${{ secrets.LOCAL_RETENTION_DAYS }}
+          REMOTE_RETENTION_DAYS: ${{ secrets.REMOTE_RETENTION_DAYS }}
+          S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
+          S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
+          S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
+          S3_ENDPOINT_URL: ${{ secrets.S3_ENDPOINT_URL }}
+          S3_REGION: ${{ secrets.S3_REGION }}
+          GITHUB_ACTIONS: true
+        run: python -m trendradar

+ 388 - 64
README-EN.md

@@ -1,6 +1,6 @@
 <div align="center" id="trendradar">
 <div align="center" id="trendradar">
 
 
-> **📢 Announcement:** After communicating with GitHub officials, "One-Click Fork Deployment" will be restored after compliance adjustments are completed. Please stay tuned for **v4.0.0** update
+> **📢 Announcement:** **v4.0.0** has been released, including storage architecture refactoring, database optimization, modularization improvements, and other major updates
 
 
 <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
 <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
   <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
   <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@@ -16,8 +16,8 @@
 [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
 [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
 [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
 [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
 [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
-[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar)
-[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar)
+[![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
+[![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
 
 
 [![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
 [![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
 [![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/)
 [![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/)
@@ -48,62 +48,61 @@
 <br>
 <br>
 
 
 <details>
 <details>
-<summary>🚨 <strong>【MUST READ】Important Announcement: The Correct Way to Deploy This Project</strong></summary>
+<summary>🚨 <strong>【Must Read】Important Announcement: v4.0.0 Deployment & Storage Architecture Changes</strong></summary>
 
 
 <br>
 <br>
 
 
-> **⚠️ December 2025 Urgent Notice**
->
-> Due to a surge in Fork numbers causing excessive load on GitHub servers, **GitHub Actions and GitHub Pages deployments are currently restricted**. Please read the following instructions carefully to ensure successful deployment.
+### 🛠️ Choose the Deployment Method That Fits You
+
+#### 🅰️ Option 1: Docker Deployment (Recommended 🔥)
 
 
-### 1. ✅ Only Recommended Deployment Method: Docker
+* **Features**: Most stable and simplest. Data is stored in **local SQLite**, fully under your control.
 
 
-**This is currently the most stable solution, free from GitHub restrictions.** Data is stored locally and won't be affected by GitHub policy changes.
+* **Best for**: Users with their own server, NAS, or an always-on PC.
 
 
 * 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment)
 * 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment)
 
 
 ---
 ---
 
 
-### 2. If You Were Planning to Fork This Project...
+#### 🅱️ Option 2: GitHub Actions Deployment (Restored ✅)
 
 
-To reduce pressure on GitHub servers, **please DO NOT directly click the "Fork" button!**
+* **Features**: Data is no longer committed directly to the repo. Instead, it is stored in **Remote Cloud Storage** (supports S3-compatible protocols: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, etc.).
 
 
-Please use the **"Use this template"** feature instead of Fork:
+* **Requirement**: You **must** configure an S3-compatible object storage service (Cloudflare R2 recommended, it's free).
 
 
-1.  **Click** the green **[Use this template]** button in the top right corner of the original repository page.
-2.  **Select** "Create a new repository".
+> **⚠️ Note**: If you choose this option, you must complete the following two configuration steps:
 
 
-**Why do this?**
-* **❌ Fork**: Copies complete history records. Many forks running simultaneously will trigger GitHub risk control.
-* **✅ Use this template**: Creates a completely new independent repository without historical baggage, more server-friendly.
+#### 1. 🚀 Recommended Start: Use this template
 
 
----
+To keep the repository clean and avoid inheriting redundant history, I **recommend** using Template mode:
 
 
-### 3. About New Data Storage
+1.  **Click** the green **[Use this template]** button at the top right of the original repository page.
 
 
-The new version will use **Cloudflare R2** to store news data, ensuring data persistence.
+2.  **Select** "Create a new repository".
 
 
-**⚠️ Configuration Prerequisites:**
+> **💡 Why do this?**
+> * **Use this template**: Creates a brand new, clean repository with no historical baggage.
+> * **Fork**: Retains the complete commit history and relationships, consuming more GitHub resources.
 
 
-According to Cloudflare platform rules, activating R2 requires binding a payment method.
+#### 2. ☁️ About the Mandatory Remote Storage for GitHub Actions
 
 
-- **Purpose:** Identity verification only (Verify Only), no charges will be incurred.
-- **Payment:** Supports credit cards or PayPal (China region).
-- **Usage:** R2's free tier is sufficient to cover this project's daily operation, no payment required.
+If you choose **Option 2 (GitHub Actions)**, you must configure an S3-compatible object storage service.
 
 
----
+**Supported Storage Services:**
+- **Cloudflare R2** (Recommended, generous free tier)
+- Other S3-compatible services
+
+**⚠️ Configuration Prerequisites (Using Cloudflare R2 as Example):**
 
 
-### 4. 📅 Future Plans & Documentation Reading Notes
+According to Cloudflare platform rules, enabling R2 requires binding a payment method.
 
 
-> **Future Plans:**
-> - Exploring new approach: keep Actions for fetching and pushing, but no longer save data to repository, use external storage instead.
+* **Purpose**: Identity verification only (Verify Only). **No charges will be incurred**.
 
 
-**⚠️ Reading Note:**
-Given that the above plans mean **Fork deployment mode may return in a new form in the future**, and the workload to fully revise documentation is massive, we have temporarily retained the old descriptions.
+* **Payment**: Supports international credit cards or PayPal.
 
 
-**At the current stage, if "Fork" related expressions still appear in subsequent tutorials, please ignore them or understand them as "Use this template"**.
+* **Usage**: The R2 free tier (10GB storage/month) is sufficient to cover the daily operation of this project. No need to worry about costs.
 
 
-👉 **[Click here to view TrendRadar's latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
+👉 **[Click to View Detailed Configuration Tutorial](#-quick-start)**
 
 
 </details>
 </details>
 
 
@@ -287,10 +286,32 @@ Supports **WeWork** (+ WeChat push solution), **Feishu**, **DingTalk**, **Telegr
 - ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values)
 - ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values)
 - ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated
 - ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated
 
 
-### **Multi-Platform Support**
-- **GitHub Pages**: Auto-generate beautiful web reports, PC/mobile adapted
+### **Flexible Storage Architecture (v4.0.0 Major Update)**
+
+**Multi-Backend Support**:
+- ☁️ **Remote Cloud Storage**: Default in the GitHub Actions environment; supports S3-compatible protocols (R2/OSS/COS, etc.), storing data in the cloud and keeping the repository clean
+- 💾 **Local SQLite**: Traditional SQLite database, stable and efficient (Docker/local deployment)
+- 🔀 **Auto Selection**: Auto-selects appropriate backend based on runtime environment
+
+**Data Format Hierarchy**:
+
+| Format | Role | Description |
+|--------|------|-------------|
+| **SQLite** | Primary storage | Complete data with statistics information |
+| **TXT** | Human-readable backup | Optional text records for manual viewing |
+| **HTML** | Web report | Beautiful visual report (GitHub Pages) |
+
+**Data Management Features**:
+- Auto data cleanup (configurable retention period)
+- Timezone support (configurable IANA time zone)
+- Cloud/local seamless switching
+
+> 💡 For storage configuration details, see [Configuration Details - Storage Configuration](#11-storage-configuration-v400-new)
+
+### **Multi-Platform Deployment**
+- **GitHub Actions**: Cloud automated operations (7-day check-in cycle + remote cloud storage)
 - **Docker Deployment**: Supports multi-architecture containerized operation
 - **Docker Deployment**: Supports multi-architecture containerized operation
-- **Data Persistence**: HTML/TXT multi-format history saving
+- **Local Running**: Python environment direct execution
 
 
 
 
 ### **AI Smart Analysis (v3.0.0 New)**
 ### **AI Smart Analysis (v3.0.0 New)**
@@ -341,10 +362,32 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 >**Upgrade Instructions**:
 >**Upgrade Instructions**:
 - **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)**
 - **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)**
 - **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features]
 - **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features]
-- **Minor Version Update**: Upgrading from v2.x to v2.y, replace `main.py` in your forked repo with the latest version
 - **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts
 - **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts
 
 
 
 
+### 2025/12/13 - v4.0.0
+
+**🎉 Major Update: Comprehensive Refactoring of Storage and Core Architecture**
+
+- **Multi-Storage Backend Support**: Introduced a brand new storage module supporting local SQLite and remote cloud storage (S3-compatible protocols, Cloudflare R2 recommended for free tier), adaptable to GitHub Actions, Docker, and local environments.
+- **Database Structure Optimization**: Refactored SQLite database table structures to improve data efficiency and query performance.
+- **Enhanced Features**: Implemented date format standardization, data retention policies, timezone configuration support, and optimized time display. Fixed remote storage data persistence issues to ensure accurate data merging.
+- **Cleanup and Compatibility**: Removed most legacy compatibility code and unified data storage and retrieval methods.
+
+### 2025/12/13 - mcp-v1.1.0
+
+**MCP Module Update:**
+- Adapted for v4.0.0, while maintaining compatibility with v3.x data.
+- Added storage sync tools:
+  - `sync_from_remote`: Pull data from remote storage to local
+  - `get_storage_status`: Get storage configuration and status
+  - `list_available_dates`: List available dates in local/remote storage
+
+
+<details>
+<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
+
+
 ### 2025/12/03 - v3.5.0
 ### 2025/12/03 - v3.5.0
 
 
 **🎉 Core Feature Enhancements**
 **🎉 Core Feature Enhancements**
@@ -397,7 +440,7 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 
 
 **🔧 Upgrade Instructions**:
 **🔧 Upgrade Instructions**:
 - **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected)
 - **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected)
-- **Docker Users**: Update `.env`, `docker compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL`
+- **Docker Users**: Update `.env`, `docker-compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL`
 - **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected
 - **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected
 
 
 
 
@@ -431,10 +474,6 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
   - Tool count increased from 13 to 14
   - Tool count increased from 13 to 14
 
 
 
 
-<details>
-<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
-
-
 ### 2025/11/25 - v3.4.0
 ### 2025/11/25 - v3.4.0
 
 
 **🎉 Added Slack Push Support**
 **🎉 Added Slack Push Support**
@@ -819,11 +858,44 @@ frequency_words.txt file added **required word** feature, using + sign
 
 
 > **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date.
 > **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date.
 
 
+### ⚠️ GitHub Actions Usage Instructions
+
+**v4.0.0 Important Change**: Introduced "Activity Detection" mechanism—GitHub Actions now requires periodic check-in to maintain operation.
+
+#### 🔄 Check-In Renewal Mechanism
+
+- **Running Cycle**: Valid for **7 days**—service will automatically suspend when countdown ends.
+- **Renewal Method**: Manually trigger the "Check In" workflow on the Actions page to reset the 7-day validity period.
+- **Operation Path**: `Actions` → `Check In` → `Run workflow`
+- **Design Philosophy**:
+    - If you forget for 7 days, maybe you don't really need it. Letting it stop is a digital detox that helps you step away from the constant stream of information.
+    - GitHub Actions is a valuable public computing resource. The check-in mechanism aims to prevent wasted computing cycles, ensuring resources are allocated to truly active users who need them. Thank you for your understanding and support.
+
+#### 📦 Data Storage (Required Configuration)
+
+In GitHub Actions environment, data is stored in **Remote Cloud Storage** (supports S3-compatible protocols, Cloudflare R2 recommended for free tier), keeping your repository clean (see **Required Configuration: Remote Cloud Storage** below).
+
+#### 🚀 Recommended: Docker Deployment
+
+For long-term stable operation, we recommend [Docker Deployment](#6-docker-deployment), with data stored locally and no check-in required—though it does require an always-on machine of your own (e.g., a cloud server, NAS, or PC).
+
+<br>
+
+> 🎉 **Now Supported: Multi-Cloud Storage Options**
+>
+> This project now supports S3-compatible protocols. You can choose:
+> - **Cloudflare R2** (Recommended, generous free tier)
+> - Other S3-compatible storage services
+>
+> Simply configure the corresponding `S3_ENDPOINT_URL`, `S3_BUCKET_NAME` and other environment variables to switch.
+
+---
+
 1. **Fork this project** to your GitHub account
 1. **Fork this project** to your GitHub account
 
 
    - Click the "Fork" button at the top right of this page
    - Click the "Fork" button at the top right of this page
 
 
-2. **Setup GitHub Secrets (Choose your needed platforms)**:
+2. **Setup GitHub Secrets (Required + Optional Platforms)**:
 
 
    In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
    In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
 
 
@@ -862,6 +934,35 @@ frequency_words.txt file added **required word** feature, using + sign
 
 
    <br>
    <br>
 
 
+   <details>
+   <summary>⚠️ <strong>Required Configuration: Remote Cloud Storage</strong> (Required for GitHub Actions Environment, Cloudflare R2 Recommended)</summary>
+   <br>
+
+   **GitHub Secret Configuration (⚠️ All 4 configuration items below are required):**
+
+   | Name | Secret (Value) Description |
+   |------|----------------------------|
+   | `S3_BUCKET_NAME` | Bucket name (e.g., `trendradar-data`) |
+   | `S3_ACCESS_KEY_ID` | Access key ID |
+   | `S3_SECRET_ACCESS_KEY` | Secret access key |
+   | `S3_ENDPOINT_URL` | S3 API endpoint (e.g., R2: `https://<account-id>.r2.cloudflarestorage.com`) |
+
+   <br>
+
+   **How to Get Credentials (Using Cloudflare R2 as Example):**
+
+   1. Visit [Cloudflare Dashboard](https://dash.cloudflare.com/) and log in
+   2. Select `R2` in left menu → Click `Create Bucket` → Enter name (e.g., `trendradar-data`)
+   3. Click `Manage R2 API Tokens` at top right → `Create API Token`
+   4. Select `Object Read & Write` permission → After creation, it will display `Access Key ID` and `Secret Access Key`
+   5. Endpoint URL can be found in bucket details page (format: `https://<account-id>.r2.cloudflarestorage.com`)
+
+   **Notes**:
+   - R2 free tier: 10GB storage + 1 million reads per month, sufficient for this project
+   - Activation requires binding a payment method (identity verification only, no charges)
+   - Data stored in cloud, keeps GitHub repository clean
+
+   </details>
 
 
    <details>
    <details>
    <summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary>
    <summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary>
@@ -2041,7 +2142,7 @@ TrendRadar provides two independent Docker images, deploy according to your need
 
 
    # Download docker compose config
    # Download docker compose config
    wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
    wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
-   wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/
+   wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
    ```
    ```
 
 
    > 💡 **Note**: Key directory structure required for Docker deployment:
    > 💡 **Note**: Key directory structure required for Docker deployment:
@@ -2052,7 +2153,7 @@ current directory/
 │   └── frequency_words.txt
 │   └── frequency_words.txt
 └── docker/
 └── docker/
     ├── .env
     ├── .env
-    └── docker compose.yml
+    └── docker-compose.yml
 ```
 ```
 
 
 2. **Config File Description**:
 2. **Config File Description**:
@@ -2146,7 +2247,7 @@ vim config/frequency_words.txt
 
 
 # Use build version docker compose
 # Use build version docker compose
 cd docker
 cd docker
-cp docker compose-build.yml docker compose.yml
+cp docker-compose-build.yml docker-compose.yml
 ```
 ```
 
 
 **Build and Start Services**:
 **Build and Start Services**:
@@ -2232,7 +2333,7 @@ docker rm trend-radar
 
 
 > 💡 **Web Server Notes**:
 > 💡 **Web Server Notes**:
 > - After starting, access latest report at `http://localhost:8080`
 > - After starting, access latest report at `http://localhost:8080`
-> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025年xx月xx日/`)
+> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025-xx-xx/`)
 > - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter
 > - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter
 > - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env`
 > - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env`
 > - Security: Static files only, limited to output directory, localhost binding only
 > - Security: Static files only, limited to output directory, localhost binding only
@@ -2249,7 +2350,7 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
 |--------------|---------------|----------|
 |--------------|---------------|----------|
 | `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) |
 | `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) |
 | `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) |
 | `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) |
-| `output/YYYY年MM月DD日/html/当日汇总.html` | Historical reports | All environments (archived by date) |
+| `output/YYYY-MM-DD/html/当日汇总.html` | Historical reports | All environments (archived by date) |
 
 
 **Local Access Examples**:
 **Local Access Examples**:
 ```bash
 ```bash
@@ -2258,8 +2359,8 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
 docker exec -it trend-radar python manage.py start_webserver
 docker exec -it trend-radar python manage.py start_webserver
 # 2. Access in browser
 # 2. Access in browser
 http://localhost:8080                           # Access latest report (default index.html)
 http://localhost:8080                           # Access latest report (default index.html)
-http://localhost:8080/2025年xx月xx日/            # Access reports for specific date
-http://localhost:8080/2025年xx月xx日/html/       # Browse all HTML files for that date
+http://localhost:8080/2025-xx-xx/               # Access reports for specific date
+http://localhost:8080/2025-xx-xx/html/          # Browse all HTML files for that date
 
 
 # Method 2: Direct file access (local environment)
 # Method 2: Direct file access (local environment)
 open ./output/index.html             # macOS
 open ./output/index.html             # macOS
@@ -2267,7 +2368,7 @@ start ./output/index.html            # Windows
 xdg-open ./output/index.html         # Linux
 xdg-open ./output/index.html         # Linux
 
 
 # Method 3: Access historical archives
 # Method 3: Access historical archives
-open ./output/2025年xx月xx日/html/当日汇总.html
+open ./output/2025-xx-xx/html/当日汇总.html
 ```
 ```
 
 
 **Why two index.html files?**
 **Why two index.html files?**
@@ -2324,10 +2425,20 @@ flowchart TB
 Use docker compose to start both news push and MCP services:
 Use docker compose to start both news push and MCP services:
 
 
 ```bash
 ```bash
-# Download latest docker compose.yml (includes MCP service config)
-wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml
+# Method 1: Clone project (Recommended)
+git clone https://github.com/sansan0/TrendRadar.git
+cd TrendRadar/docker
+docker compose up -d
 
 
-# Start all services
+# Method 2: Download docker-compose.yml separately
+mkdir trendradar && cd trendradar
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env
+mkdir -p config output
+# Download config files
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/
+# Modify volume paths in docker-compose.yml: ../config -> ./config, ../output -> ./output
 docker compose up -d
 docker compose up -d
 
 
 # Check running status
 # Check running status
@@ -2337,18 +2448,29 @@ docker ps | grep trend-radar
 **Start MCP Service Separately**:
 **Start MCP Service Separately**:
 
 
 ```bash
 ```bash
+# Linux/Mac
 docker run -d --name trend-radar-mcp \
 docker run -d --name trend-radar-mcp \
   -p 127.0.0.1:3333:3333 \
   -p 127.0.0.1:3333:3333 \
-  -v ./config:/app/config:ro \
-  -v ./output:/app/output:ro \
+  -v $(pwd)/config:/app/config:ro \
+  -v $(pwd)/output:/app/output:ro \
   -e TZ=Asia/Shanghai \
   -e TZ=Asia/Shanghai \
   wantcat/trendradar-mcp:latest
   wantcat/trendradar-mcp:latest
+
+# Windows PowerShell
+docker run -d --name trend-radar-mcp `
+  -p 127.0.0.1:3333:3333 `
+  -v ${PWD}/config:/app/config:ro `
+  -v ${PWD}/output:/app/output:ro `
+  -e TZ=Asia/Shanghai `
+  wantcat/trendradar-mcp:latest
 ```
 ```
 
 
+> ⚠️ **Note**: Before running, make sure the `config/` and `output/` folders exist in the current directory and contain the config files and news data.
+
 **Verify Service**:
 **Verify Service**:
 
 
 ```bash
 ```bash
-# Check if MCP service is running properly
+# Check MCP service health
 curl http://127.0.0.1:3333/mcp
 curl http://127.0.0.1:3333/mcp
 
 
 # View MCP service logs
 # View MCP service logs
@@ -2357,14 +2479,20 @@ docker logs -f trend-radar-mcp
 
 
 **Configure in AI Clients**:
 **Configure in AI Clients**:
 
 
-After MCP service starts, configure in Claude Desktop, Cherry Studio, Cursor, etc.:
+After MCP service starts, configure based on your client:
+
+**Cherry Studio** (Recommended, GUI config):
+- Settings → MCP Server → Add
+- Type: `streamableHttp`
+- URL: `http://127.0.0.1:3333/mcp`
 
 
+**Claude Desktop / Cline** (JSON config):
 ```json
 ```json
 {
 {
   "mcpServers": {
   "mcpServers": {
     "trendradar": {
     "trendradar": {
       "url": "http://127.0.0.1:3333/mcp",
       "url": "http://127.0.0.1:3333/mcp",
-      "description": "TrendRadar News Trending Analysis"
+      "type": "streamableHttp"
     }
     }
   }
   }
 }
 }
@@ -2452,7 +2580,6 @@ notification:
       start: "20:00"                  # Start time (Beijing time)
       start: "20:00"                  # Start time (Beijing time)
       end: "22:00"                    # End time (Beijing time)
       end: "22:00"                    # End time (Beijing time)
     once_per_day: true                # Push only once per day
     once_per_day: true                # Push only once per day
-    push_record_retention_days: 7     # Push record retention days
 ```
 ```
 
 
 #### Configuration Details
 #### Configuration Details
@@ -2463,7 +2590,6 @@ notification:
 | `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) |
 | `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) |
 | `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) |
 | `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) |
 | `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window |
 | `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window |
-| `push_record_retention_days` | int | `7` | Push record retention days (used to determine if already pushed) |
 
 
 #### Use Cases
 #### Use Cases
 
 
@@ -2487,7 +2613,6 @@ PUSH_WINDOW_ENABLED=true
 PUSH_WINDOW_START=09:00
 PUSH_WINDOW_START=09:00
 PUSH_WINDOW_END=18:00
 PUSH_WINDOW_END=18:00
 PUSH_WINDOW_ONCE_PER_DAY=false
 PUSH_WINDOW_ONCE_PER_DAY=false
-PUSH_WINDOW_RETENTION_DAYS=7
 ```
 ```
 
 
 #### Complete Configuration Examples
 #### Complete Configuration Examples
@@ -2502,7 +2627,6 @@ notification:
       start: "20:00"
       start: "20:00"
       end: "22:00"
       end: "22:00"
     once_per_day: true
     once_per_day: true
-    push_record_retention_days: 7
 ```
 ```
 
 
 **Scenario: Push every hour during working hours**
 **Scenario: Push every hour during working hours**
@@ -2515,7 +2639,6 @@ notification:
       start: "09:00"
       start: "09:00"
       end: "18:00"
       end: "18:00"
     once_per_day: false
     once_per_day: false
-    push_record_retention_days: 7
 ```
 ```
 
 
 </details>
 </details>
@@ -2811,6 +2934,207 @@ notification:
 
 
 <br>
 <br>
 
 
+### 11. Storage Configuration (v4.0.0 New)
+
+<details>
+<summary>👉 Click to expand: <strong>Storage Configuration Guide</strong></summary>
+<br>
+
+#### Storage Backend Selection
+
+TrendRadar v4.0.0 introduces **multi-backend storage architecture**, supporting automatic backend selection or manual specification:
+
+| Configuration Value | Description | Applicable Scenarios |
+|---------------------|-------------|---------------------|
+| `auto` (default) | Auto-select backend: GitHub Actions→R2, other environments→Local | Most users (recommended) |
+| `local` | Force use of local SQLite | Docker/local deployment |
+| `r2` | Force use of Cloudflare R2 | Cloud storage required |
+
+**Configuration Location**:
+- GitHub Actions: Set `STORAGE_BACKEND` environment variable in GitHub Secrets
+- Docker: Configure `STORAGE_BACKEND=local` in `.env` file
+- Local: Add `STORAGE_BACKEND` in environment variables or use auto mode
+
+---
+
+#### Database Structure Optimization (v4.0.0)
+
+v4.0.0 made significant optimizations to database structure, removing redundant fields and improving data normalization:
+
+##### 1. Removed Redundant Fields
+
+Removed the following redundant fields from `news` table:
+
+| Field Name | Removal Reason | Alternative |
+|------------|----------------|------------|
+| `source_name` | Duplicate with platform name | Get via `platforms` table JOIN query |
+| `crawl_date` | Duplicate with file path date | Infer from file path timestamp |
+
+**Migration Notes**: Old databases are incompatible, see [Breaking Changes](#breaking-changes-v400) section
+
+##### 2. New Platforms Table
+
+Added `platforms` table for unified management of platform information:
+
+```sql
+CREATE TABLE IF NOT EXISTS platforms (
+    id TEXT PRIMARY KEY,     -- Platform ID (immutable, e.g., 'zhihu', 'weibo')
+    name TEXT NOT NULL,      -- Platform display name (mutable, e.g., 'Zhihu', 'Weibo')
+    enabled INTEGER DEFAULT 1 -- Whether enabled (1=enabled, 0=disabled)
+);
+```
+
+**Design Advantages**:
+- `id` field is immutable, maintains data consistency
+- `name` field is mutable, supports internationalization and customization
+- Historical data remains valid when modifying platform names
+
+##### 3. Crawl Source Status Normalization
+
+Replaced the original comma-separated `successful_sources` string field with a normalized `crawl_source_status` table:
+
+```sql
+CREATE TABLE IF NOT EXISTS crawl_source_status (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    file_path TEXT NOT NULL,           -- File path (e.g., 'output/2025-12-09/news.db')
+    platform_id TEXT NOT NULL,         -- Platform ID (foreign key to platforms.id)
+    success INTEGER NOT NULL,          -- Whether crawl succeeded (1=success, 0=failed)
+    crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (platform_id) REFERENCES platforms(id)
+);
+```
+
+**Design Advantages**:
+- Supports efficient SQL queries (e.g., calculate success rate by platform)
+- Easy statistics and analysis (no string splitting required)
+- Normalized structure, avoids data redundancy
+
+##### 4. File Path Format Standardization
+
+**Old Format**: `output/2025年12月09日/news_14-30.txt`
+**New Format**: `output/2025-12-09/news.db`
+
+**Changes**:
+- Date format: Chinese format → ISO 8601 standard format
+- Filename: Multiple time-stamped TXT files → single SQLite database file
+- Extension: `.txt` → `.db`
+
+**Advantages**:
+- Cross-platform compatibility (avoids Chinese path issues)
+- Easier programmatic parsing
+- International standard, better maintainability
+
+---
+
+#### Remote Cloud Storage Configuration
+
+When using remote cloud storage (required for GitHub Actions environment), configure the following environment variables:
+
+| Environment Variable | Description | Required | Example Value |
+|----------------------|-------------|----------|--------------|
+| `S3_BUCKET_NAME` | Bucket name | ✅ Yes | `trendradar-data` |
+| `S3_ACCESS_KEY_ID` | Access key ID | ✅ Yes | `abc123...` |
+| `S3_SECRET_ACCESS_KEY` | Secret access key | ✅ Yes | `xyz789...` |
+| `S3_ENDPOINT_URL` | S3 API endpoint | ✅ Yes | `https://<account-id>.r2.cloudflarestorage.com` |
+| `S3_REGION` | Region (optional) | ❌ No | `auto` |
+
+**Configuration Method**:
+- GitHub Actions: Configure in GitHub Secrets (see [Quick Start - Remote Storage Configuration](#2-setup-github-secrets-required--optional-platforms))
+- Docker/Local: Configure in `.env` file (remote storage is optional)
+
+---
+
+#### Data Cleanup Strategy
+
+v4.0.0 added automatic data cleanup feature, supporting scheduled cleanup of old data:
+
+**Configuration Items**: `LOCAL_RETENTION_DAYS` and `REMOTE_RETENTION_DAYS`
+
+| Configuration Value | Description |
+|---------------------|-------------|
+| `0` (default) | Disable cleanup, keep all data |
+| Positive integer (e.g., `30`) | Only keep recent N days of data, auto-delete old data |
+
+**Configuration Method**:
+```bash
+# GitHub Actions: Configure in GitHub Secrets
+LOCAL_RETENTION_DAYS=30
+REMOTE_RETENTION_DAYS=30
+
+# Docker: Configure in .env file
+LOCAL_RETENTION_DAYS=30
+REMOTE_RETENTION_DAYS=30
+
+# Local: Add to environment variables
+export LOCAL_RETENTION_DAYS=30
+```
+
+**Cleanup Rules**:
+- Cleanup executes during each crawl task
+- Local: Deletes `output/YYYY-MM-DD/` directories older than N days
+- Remote: Deletes cloud objects older than N days (e.g., `news/2025-11-10.db`)
+
+---
+
+#### Timezone Configuration
+
+v4.0.0 added timezone configuration support, using IANA standard time zone names:
+
+**Configuration Item**: `TIMEZONE`
+
+| Configuration Value | Description | Example |
+|---------------------|-------------|---------|
+| Not set (default) | Use UTC+0 | - |
+| IANA time zone name | Specify time zone | `Asia/Shanghai`, `America/New_York`, `Europe/London` |
+
+**Configuration Method**:
+```bash
+# GitHub Actions: Configure in GitHub Secrets
+TIMEZONE=Asia/Shanghai
+
+# Docker: Configure in .env file
+TIMEZONE=Asia/Shanghai
+
+# Local: Add to environment variables
+export TIMEZONE=Asia/Shanghai
+```
+
+**Common IANA Time Zones**:
+- China: `Asia/Shanghai`
+- United States East: `America/New_York`
+- United States West: `America/Los_Angeles`
+- United Kingdom: `Europe/London`
+- Japan: `Asia/Tokyo`
+
+---
+
+#### Breaking Changes (v4.0.0)
+
+**⚠️ Important Notice**: v4.0.0 introduces breaking changes to the database structure; **old databases are incompatible**.
+
+**Impact**:
+- Cannot directly read v3.x version data
+- Need to re-crawl data to build new database
+- **No automatic migration tool provided**
+
+**Recommendations**:
+1. **Fresh Start**: We recommend starting from scratch and accumulating new data.
+2. **Keep Historical Data**: If you need to preserve v3.x historical data, rename the old `output/` directory (e.g., to `output_v3_backup/`) before running the new version.
+
+**Data Format Comparison**:
+
+| Item | v3.x | v4.0.0 |
+|------|------|--------|
+| File path format | `output/2025年12月09日/` | `output/2025-12-09/` |
+| Data file | Multiple `news_HH-MM.txt` files | Single `news.db` file |
+| Database fields | Contains `source_name`, `crawl_date` | Removed redundant fields |
+| Platform management | No independent table | Added `platforms` table |
+| Crawl status | Comma-separated string | Normalized `crawl_source_status` table |
+
+</details>
+
+<br>
+
 ## 🤖 AI Analysis
 ## 🤖 AI Analysis
 
 
 TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis.
 TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis.

+ 83 - 1
README-MCP-FAQ-EN.md

@@ -450,7 +450,89 @@ AI: (date_range={"start": "2024-12-01", "end": "2024-12-31"})
 
 
 ---
 ---
 
 
-### Q14: How to parse natural language date expressions? (Recommended to use first)
+## Storage Sync
+
+### Q14: How to sync data from remote storage to local?
+
+**You can ask like this:**
+
+- "Sync last 7 days data from remote"
+- "Pull data from remote storage to local"
+- "Sync last 30 days of news data"
+
+**Tool called:** `sync_from_remote`
+
+**Use cases:**
+
+- Crawler deployed in the cloud (e.g., GitHub Actions), data stored remotely (e.g., Cloudflare R2)
+- MCP Server deployed locally, needs to pull data from remote for analysis
+
+**Return information:**
+
+- synced_files: Number of successfully synced files
+- synced_dates: List of successfully synced dates
+- skipped_dates: Skipped dates (already exist locally)
+- failed_dates: Failed dates and error information
+
+**Prerequisites:**
+
+You need to configure remote storage in `config/config.yaml` or set the following environment variables:
+- `S3_ENDPOINT_URL`: Service endpoint
+- `S3_BUCKET_NAME`: Bucket name
+- `S3_ACCESS_KEY_ID`: Access key ID
+- `S3_SECRET_ACCESS_KEY`: Secret access key
+
+---
+
+### Q15: How to view storage status?
+
+**You can ask like this:**
+
+- "View current storage status"
+- "What's the storage configuration"
+- "How much data is stored locally"
+- "Is remote storage configured"
+
+**Tool called:** `get_storage_status`
+
+**Return information:**
+
+| Category | Information |
+|----------|-------------|
+| **Local Storage** | Data directory, total size, date count, date range |
+| **Remote Storage** | Whether configured, endpoint URL, bucket name, date count |
+| **Pull Config** | Whether auto-pull enabled, pull days |
+
+---
+
+### Q16: How to view available data dates?
+
+**You can ask like this:**
+
+- "What dates are available locally"
+- "What dates are in remote storage"
+- "Compare local and remote data dates"
+- "Which dates only exist remotely"
+
+**Tool called:** `list_available_dates`
+
+**Three query modes:**
+
+| Mode | Description | Example Question |
+|------|-------------|------------------|
+| **local** | View local only | "What dates are available locally" |
+| **remote** | View remote only | "What dates are in remote" |
+| **both** | Compare both (default) | "Compare local and remote data" |
+
+**Return information (both mode):**
+
+- only_local: Dates only existing locally
+- only_remote: Dates only existing remotely (useful for deciding which dates to sync)
+- both: Dates existing in both places
+
+---
+
+### Q17: How to parse natural language date expressions? (Recommended to use first)
 
 
 **You can ask like this:**
 **You can ask like this:**
 
 

+ 83 - 1
README-MCP-FAQ.md

@@ -450,7 +450,89 @@ AI:(date_range={"start": "2024-12-01", "end": "2024-12-31"})
 
 
 ---
 ---
 
 
-### Q14: 如何解析自然语言日期表达式?(推荐优先使用)
+## 存储同步
+
+### Q14: 如何从远程存储同步数据到本地?
+
+**你可以这样问:**
+
+- "从远程同步最近 7 天的数据"
+- "拉取远程存储的数据到本地"
+- "同步最近 30 天的新闻数据"
+
+**调用的工具:** `sync_from_remote`
+
+**使用场景:**
+
+- 爬虫部署在云端(如 GitHub Actions),数据存储到远程(如 Cloudflare R2)
+- MCP Server 部署在本地,需要从远程拉取数据进行分析
+
+**返回信息:**
+
+- synced_files: 成功同步的文件数量
+- synced_dates: 成功同步的日期列表
+- skipped_dates: 跳过的日期(本地已存在)
+- failed_dates: 失败的日期及错误信息
+
+**前提条件:**
+
+需要在 `config/config.yaml` 中配置远程存储或设置环境变量:
+- `S3_ENDPOINT_URL`: 服务端点
+- `S3_BUCKET_NAME`: 存储桶名称
+- `S3_ACCESS_KEY_ID`: 访问密钥 ID
+- `S3_SECRET_ACCESS_KEY`: 访问密钥
+
+---
+
+### Q15: 如何查看存储状态?
+
+**你可以这样问:**
+
+- "查看当前存储状态"
+- "存储配置是什么"
+- "本地有多少数据"
+- "远程存储配置了吗"
+
+**调用的工具:** `get_storage_status`
+
+**返回信息:**
+
+| 类别 | 信息 |
+|------|------|
+| **本地存储** | 数据目录、总大小、日期数量、日期范围 |
+| **远程存储** | 是否配置、端点地址、存储桶名称、日期数量 |
+| **拉取配置** | 是否启用自动拉取、拉取天数 |
+
+---
+
+### Q16: 如何查看可用的数据日期?
+
+**你可以这样问:**
+
+- "本地有哪些日期的数据"
+- "远程存储有哪些日期"
+- "对比本地和远程的数据日期"
+- "哪些日期只在远程有"
+
+**调用的工具:** `list_available_dates`
+
+**三种查询模式:**
+
+| 模式 | 说明 | 示例问法 |
+|------|------|---------|
+| **local** | 仅查看本地 | "本地有哪些日期" |
+| **remote** | 仅查看远程 | "远程有哪些日期" |
+| **both** | 对比两者(默认) | "对比本地和远程的数据" |
+
+**返回信息(both 模式):**
+
+- only_local: 仅本地存在的日期
+- only_remote: 仅远程存在的日期(可用于决定同步哪些日期)
+- both: 两边都存在的日期
+
+---
+
+### Q17: 如何解析自然语言日期表达式?(推荐优先使用)
 
 
 **你可以这样问:**
 **你可以这样问:**
 
 

+ 319 - 73
README.md

@@ -1,6 +1,6 @@
 <div align="center" id="trendradar">
 <div align="center" id="trendradar">
 
 
-> **📢 公告:** 经与 GitHub 官方沟通,完成合规调整后将恢复"一键 Fork 部署",请关注 **v4.0.0** 版本的更新
+> **📢 公告:** **v4.0.0** 版本已发布!包含存储架构重构、数据库优化、模块化改进等重大更新
 
 
 <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
 <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
   <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
   <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@@ -16,8 +16,8 @@
 [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
 [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
 [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
 [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
 [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
-[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar)
-[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar)
+[![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
+[![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
 
 
 [![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
 [![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
 [![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/)
 [![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/)
@@ -48,62 +48,61 @@
 <br>
 <br>
 
 
 <details>
 <details>
-<summary>🚨 <strong>【必读】重要公告:本项目的正确部署姿势</strong></summary>
+<summary>🚨 <strong>【必读】重要公告:v4.0.0 部署方式与存储架构变更</strong></summary>
 
 
 <br>
 <br>
 
 
-> **⚠️ 2025年12月紧急通知**
->
-> 由于 Fork 数量激增导致 GitHub 服务器压力过大,**GitHub Actions 及 GitHub Pages 部署目前已受限**。为确保顺利部署,请务必阅读以下说明。
+### 🛠️ 请选择适合你的部署方式
+
+#### 🅰️ 方案一:Docker 部署(推荐 🔥)
 
 
-### 1. ✅ 唯一推荐部署方式:Docker
+* **特点**:最稳定、最简单,数据存储在 **本地 SQLite**,完全自主可控。
 
 
-**这是目前最稳定、不受 GitHub 限制的方案。** 数据存储在本地,不会因为 GitHub 策略调整而失效
+* **适用**:有自己的服务器、NAS 或长期运行的电脑
 
 
 * 👉 [跳转到 Docker 部署教程](#6-docker-部署)
 * 👉 [跳转到 Docker 部署教程](#6-docker-部署)
 
 
 ---
 ---
 
 
-### 2. 如果你本打算 Fork 本项目...
+#### 🅱️ 方案二:GitHub Actions 部署(已恢复 ✅)
 
 
-为了减少对 GitHub 服务器的压力,**请千万不要直接点击 "Fork" 按钮!**
+* **特点**:数据不再直接写入仓库(Git Commit),而是存储在 **远程云存储**(支持 S3 兼容协议:Cloudflare R2、阿里云 OSS、腾讯云 COS 等)。
 
 
-请务必使用 **"Use this template"** 功能来替代 Fork:
+* **门槛**:**必须**配置一个 S3 兼容的对象存储服务(推荐免费的 Cloudflare R2)。
 
 
-1.  **点击**原仓库页面右上角的绿色的 **[Use this template]** 按钮。
-2.  **选择** "Create a new repository"。
+> **⚠️ 注意**:选择此方案,请务必执行以下两步配置:
 
 
-**为什么要这样做?**
-* **❌ Fork**:复制完整历史记录,大量 Fork 同时运行会触发 GitHub 风控。
-* **✅ Use this template**:创建的是一个全新的独立仓库,没有历史包袱,对服务器更友好。
+#### 1. 🚀 推荐的开始方式:Use this template
 
 
----
+为了保持仓库整洁,避免继承冗余的历史记录,我**建议**你使用 Template 模式:
 
 
-### 3. 关于新版数据存储的说明
+1.  **点击**原仓库页面右上角的绿色 **[Use this template]** 按钮。
 
 
-新版将使用 **Cloudflare R2** 存储新闻数据,以保证持久化
+2.  **选择** "Create a new repository"
 
 
-**⚠️ 配置前置条件:**
+> **💡 为什么要这样做?**
+> * **Use this template**:创建一个全新的、干净的仓库,没有历史包袱。
+> * **Fork**:会保留完整的提交历史和关联关系,占用 GitHub 更多资源。
 
 
-根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。
+#### 2. ☁️ 关于 GitHub Actions 必配的远程存储
 
 
-- **目的:** 仅作身份验证(Verify Only),不产生扣费。
-- **支付:** 支持信用卡或国区 PayPal。
-- **用量:** R2 的免费额度足以覆盖本项目日常运行,无需付费。
+如果你选择 **方案二 (GitHub Actions)**,则必须配置一个 S3 兼容的对象存储服务。
 
 
----
+**支持的存储服务:**
+- **Cloudflare R2**(推荐,免费额度充足)
+- 其他 S3 兼容服务
 
 
-### 4. 📅 后续计划与文档阅读说明
+**⚠️ 以 Cloudflare R2 为例的配置前置条件:**
 
 
-> **后续计划:**
-> - 探索新方案:保留 Actions 用于抓取和推送,但不再将数据保存到仓库,改用外部存储。
+根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。
 
 
-**⚠️ 阅读注意:**
-鉴于上述计划意味着 **Fork 部署模式未来可能会以新形式回归**,且当前全面修改文档工作量巨大,我们暂时保留了旧版描述。
+* **目的**:仅作身份验证(Verify Only),**不产生扣费**。
 
 
-**在当前阶段,若后续教程中仍出现 "Fork" 相关表述,请一律忽略或将其理解为 "Use this template"**
+* **支付**:支持双币信用卡或国区 PayPal
 
 
-👉 **[点击此处查看 TrendRadar 最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
+* **用量**:R2 的免费额度(10GB存储/月)足以覆盖本项目日常运行,无需担心付费。
+
+👉 **[点击查看详细配置教程](#-快速开始)**
 
 
 </details>
 </details>
 
 
@@ -335,10 +334,30 @@
 - ⚠️ **配对配置**:Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个)
 - ⚠️ **配对配置**:Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个)
 - ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断
 - ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断
 
 
-### **多端适配**
-- **GitHub Pages**:自动生成精美网页报告,PC/移动端适配
-- **Docker部署**:支持多架构容器化运行
-- **数据持久化**:HTML/TXT多格式历史记录保存
+### **灵活存储架构**(v4.0.0 重大更新)
+
+**多存储后端支持**:
+- ☁️ **远程云存储**:GitHub Actions 环境默认,支持 S3 兼容协议(R2/OSS/COS 等),数据存储在云端,不污染仓库
+- 💾 **本地 SQLite 数据库**:Docker/本地环境默认,数据完全可控
+- 🔄 **自动后端选择**:根据运行环境智能切换存储方式
+
+**数据格式**:
+| 格式 | 用途 | 说明 |
+|------|------|------|
+| **SQLite** | 主存储 | 单文件数据库,查询快速,支持 MCP AI 分析 |
+| **TXT** | 可选快照 | 可读文本格式,方便直接查看 |
+| **HTML** | 报告展示 | 精美可视化页面,PC/移动端适配 |
+
+**数据管理**:
+- ✅ 自动清理过期数据(可配置保留天数)
+- ✅ 时区配置支持(全球时区)
+
+> 💡 详细说明见 [配置详解 - 存储配置](#9-存储配置)
+
+### **多端部署**
+- **GitHub Actions**:定时自动爬取 + 远程云存储(需签到续期)
+- **Docker 部署**:支持多架构容器化运行,数据本地存储
+- **本地运行**:Windows/Mac/Linux 直接运行
 
 
 
 
 ### **AI 智能分析(v3.0.0 新增)**
 ### **AI 智能分析(v3.0.0 新增)**
@@ -389,10 +408,34 @@ GitHub 一键 Fork 即可使用,无需编程基础。
 >**升级说明**:
 >**升级说明**:
 - **📌 查看最新更新**:**[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)**
 - **📌 查看最新更新**:**[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)**
 - **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】
 - **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】
-- **小版本更新**:从 v2.x 升级到 v2.y,用本项目的 `main.py` 代码替换你 fork 仓库中的对应文件
 - **大版本升级**:从 v1.x 升级到 v2.y,建议删除现有 fork 后重新 fork,这样更省力且避免配置冲突
 - **大版本升级**:从 v1.x 升级到 v2.y,建议删除现有 fork 后重新 fork,这样更省力且避免配置冲突
 
 
 
 
+### 2025/12/13 - v4.0.0
+
+**🎉 重大更新:全面重构存储和核心架构**
+
+- **多存储后端支持**:引入全新的存储模块,支持本地 SQLite 和远程云存储(S3 兼容协议,推荐免费的 Cloudflare R2),适应 GitHub Actions、Docker 和本地环境。
+- **数据库结构优化**:重构 SQLite 数据库表结构,提升数据效率和查询能力。
+- **核心代码模块化**:将主程序逻辑拆分为 trendradar 包的多个模块,显著提升代码可维护性。
+- **增强功能**:实现日期格式标准化、数据保留策略、时区配置支持、时间显示优化,并修复远程存储数据持久化问题,确保数据合并的准确性。
+- **清理和兼容**:移除了大部分历史兼容代码,统一了数据存储和读取方式。
+
+
+### 2025/12/13 - mcp-v1.1.0
+
+  **MCP 模块更新:**
+  - 适配 v4.0.0,同时也兼容 v3.x 的数据
+  - 新增存储同步工具:
+    - `sync_from_remote`: 从远程存储拉取数据到本地
+    - `get_storage_status`: 获取存储配置和状态
+    - `list_available_dates`: 列出本地/远程可用日期范围
+
+
+<details>
+<summary>👉 点击展开:<strong>历史更新</strong></summary>
+
+
 ### 2025/12/03 - v3.5.0
 ### 2025/12/03 - v3.5.0
 
 
 **🎉 核心功能增强**
 **🎉 核心功能增强**
@@ -456,10 +499,6 @@ GitHub 一键 Fork 即可使用,无需编程基础。
   - 工具总数从 13 个增加到 14 个
   - 工具总数从 13 个增加到 14 个
 
 
 
 
-<details>
-<summary>👉 点击展开:<strong>历史更新</strong></summary>
-
-
 ### 2025/11/28 - v3.4.1
 ### 2025/11/28 - v3.4.1
 
 
 **🔧 格式优化**
 **🔧 格式优化**
@@ -857,11 +896,44 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
 
 
 > **📖 提醒**:Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。
 > **📖 提醒**:Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。
 
 
+### ⚠️ GitHub Actions 使用说明
+
+**v4.0.0 重要变更**:引入「活跃度检测」机制,GitHub Actions 需定期签到以维持运行。
+
+#### 🔄 签到续期机制
+
+- **运行周期**:有效期为 **7 天**,倒计时结束后服务将自动挂起。
+- **续期方式**:在 Actions 页面手动触发 "Check In" workflow,即可重置 7 天有效期。
+- **操作路径**:`Actions` → `Check In` → `Run workflow`
+- **设计理念**:
+    - 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需。适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间。
+    - GitHub Actions 是宝贵的公共计算资源。引入签到机制旨在避免算力的无效空转,确保资源能分配给真正活跃且需要的用户。感谢你的理解与支持。
+
+#### 📦 数据存储(必需配置)
+
+GitHub Actions 环境下,数据存储在 **远程云存储**(支持 S3 兼容协议,推荐免费的 Cloudflare R2),不会污染仓库(见下方 **必需配置:远程云存储**)
+
+#### 🚀 推荐:Docker 部署
+
+如需长期稳定运行,建议使用 [Docker 部署](#6-docker-部署),数据存储在本地,无需签到,不过需要额外付费购买云服务器。
+
+<br>
+
+> 🎉 **已支持:多云存储方案**
+>
+> 本项目现已支持 S3 兼容协议,你可以选择:
+> - **Cloudflare R2**(推荐,免费额度充足)
+> - 其他 S3 兼容存储服务
+>
+> 只需配置对应的 `S3_ENDPOINT_URL`、`S3_BUCKET_NAME` 等环境变量即可切换。
+
+---
+
 1. **Fork 本项目**到你的 GitHub 账户
 1. **Fork 本项目**到你的 GitHub 账户
 
 
    - 点击本页面右上角的"Fork"按钮
    - 点击本页面右上角的"Fork"按钮
 
 
-2. **设置 GitHub Secrets(选择你需要的平台)**:
+2. **设置 GitHub Secrets(必需 + 可选平台)**:
 
 
    在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
    在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
 
 
@@ -900,6 +972,53 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
 
 
    <br>
    <br>
 
 
+   <details>
+   <summary>⚠️ <strong>必需配置:远程云存储</strong>(GitHub Actions 环境必需,推荐 Cloudflare R2)</summary>
+   <br>
+
+    **GitHub Secret 配置(⚠️ 以下 4 个配置项都是必需的):**
+
+    | Name(名称) | Secret(值)说明 |
+    |-------------|-----------------|
+    | `S3_BUCKET_NAME` | 存储桶名称(如 `trendradar-data`) |
+    | `S3_ACCESS_KEY_ID` | 访问密钥 ID(Access Key ID) |
+    | `S3_SECRET_ACCESS_KEY` | 访问密钥(Secret Access Key) |
+    | `S3_ENDPOINT_URL` | S3 API 端点(如 R2:`https://<account-id>.r2.cloudflarestorage.com`) |
+
+    <br>
+
+    **如何获取凭据(以 Cloudflare R2 为例):**
+
+    1. **进入 R2 概览**:
+    - 登录 [Cloudflare Dashboard](https://dash.cloudflare.com/)。
+    - 在左侧侧边栏找到并点击 `R2对象存储`。
+
+    <br>
+
+    2. **创建存储桶**:
+    - 点击`概述`
+    - 点击右上角的 `创建存储桶` (Create bucket)。
+    - 输入名称(例如 `trendradar-data`),点击 `创建存储桶`。
+
+    <br>
+
+    3. **创建 API 令牌**:
+    - 回到 **概述**页面。
+    - 点击**右下角** `Account Details`,找到并点击 `Manage` (Manage R2 API Tokens)。
+    - 同时你会看到 `S3 API`:`https://<account-id>.r2.cloudflarestorage.com`(这就是 S3_ENDPOINT_URL)
+    - 点击 `创建 Account API 令牌`。
+    - **⚠️ 关键设置**:
+        - **令牌名称**:随意填写(如 `github-action-write`)。
+        - **权限**:选择 `管理员读和写` 。
+        - **指定存储桶**:为了安全,建议选择 `仅适用于指定存储桶` 并选中你的桶(如 `trendradar-data`)。
+    - 点击 `创建 API 令牌`,**立即复制** 显示的 `Access Key ID` 和 `Secret Access Key`(只显示一次!)。
+
+    <br>
+
+    - **R2 免费额度**:每月 10GB 存储 + 100万次读取,对本项目来说非常充足。
+    - **支付验证**:开通 R2 即使是免费额度,Cloudflare 也要求绑定 PayPal 或信用卡进行身份验证(不会实际扣费,除非超过额度)。
+
+   </details>
 
 
    <details>
    <details>
    <summary>👉 点击展开:<strong>企业微信机器人</strong>(配置最简单最迅速)</summary>
    <summary>👉 点击展开:<strong>企业微信机器人</strong>(配置最简单最迅速)</summary>
@@ -1489,10 +1608,11 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
 
 
    **测试步骤**:
    **测试步骤**:
    1. 进入你项目的 Actions 页面
    1. 进入你项目的 Actions 页面
-   2. 找到 **"Hot News Crawler"** 点进去
+   2. 找到 **"Get Hot News"**(必须是这个名称)点进去,点击右侧的 **"Run workflow"** 按钮运行
       - 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决
       - 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决
-   3. 点击右侧的 **"Run workflow"** 按钮运行
-   4. 等待 1 分钟左右,消息会推送到你配置的平台
+   3. 等待 3 分钟左右,消息会推送到你配置的平台
+
+   <br>
 
 
    > ⏱️ **测试提示**:
    > ⏱️ **测试提示**:
    > - 手动测试不要太频繁,避免触发 GitHub Actions 限制
    > - 手动测试不要太频繁,避免触发 GitHub Actions 限制
@@ -2069,7 +2189,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署:
 
 
    # 下载 docker compose 配置
    # 下载 docker compose 配置
    wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env  -P docker/
    wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env  -P docker/
-   wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml  -P docker/
+   wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml  -P docker/
    ```
    ```
 
 
    > 💡 **说明**:Docker 部署需要的关键目录结构如下:
    > 💡 **说明**:Docker 部署需要的关键目录结构如下:
@@ -2080,7 +2200,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署:
 │   └── frequency_words.txt
 │   └── frequency_words.txt
 └── docker/
 └── docker/
     ├── .env
     ├── .env
-    └── docker compose.yml
+    └── docker-compose.yml
 ```
 ```
 
 
 2. **配置文件说明**:
 2. **配置文件说明**:
@@ -2174,7 +2294,7 @@ vim config/frequency_words.txt
 
 
 # 使用构建版本的 docker compose
 # 使用构建版本的 docker compose
 cd docker
 cd docker
-cp docker compose-build.yml docker compose.yml
+cp docker-compose-build.yml docker-compose.yml
 ```
 ```
 
 
 **构建并启动服务**:
 **构建并启动服务**:
@@ -2260,7 +2380,7 @@ docker rm trend-radar
 
 
 > 💡 **Web 服务器说明**:
 > 💡 **Web 服务器说明**:
 > - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告
 > - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告
-> - 通过目录导航访问历史报告(如:`http://localhost:8080/2025年xx月xx日/`)
+> - 通过目录导航访问历史报告(如:`http://localhost:8080/2025-xx-xx/`)
 > - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数
 > - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数
 > - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true`
 > - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true`
 > - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问
 > - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问
@@ -2277,7 +2397,7 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置:
 |---------|---------|---------|
 |---------|---------|---------|
 | `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) |
 | `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) |
 | `index.html` | 根目录访问 | **GitHub Pages**(仓库根目录,Pages 自动识别) |
 | `index.html` | 根目录访问 | **GitHub Pages**(仓库根目录,Pages 自动识别) |
-| `output/YYYY年MM月DD日/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) |
+| `output/YYYY-MM-DD/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) |
 
 
 **本地访问示例**:
 **本地访问示例**:
 ```bash
 ```bash
@@ -2286,8 +2406,8 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置:
 docker exec -it trend-radar python manage.py start_webserver
 docker exec -it trend-radar python manage.py start_webserver
 # 2. 在浏览器访问
 # 2. 在浏览器访问
 http://localhost:8080                           # 访问最新报告(默认 index.html)
 http://localhost:8080                           # 访问最新报告(默认 index.html)
-http://localhost:8080/2025年xx月xx日/            # 访问指定日期的报告
-http://localhost:8080/2025年xx月xx日/html/       # 浏览该日期下的所有 HTML 文件
+http://localhost:8080/2025-xx-xx/               # 访问指定日期的报告
+http://localhost:8080/2025-xx-xx/html/          # 浏览该日期下的所有 HTML 文件
 
 
 # 方式 2:直接打开文件(本地环境)
 # 方式 2:直接打开文件(本地环境)
 open ./output/index.html             # macOS
 open ./output/index.html             # macOS
@@ -2295,7 +2415,7 @@ start ./output/index.html            # Windows
 xdg-open ./output/index.html         # Linux
 xdg-open ./output/index.html         # Linux
 
 
 # 方式 3:访问历史归档
 # 方式 3:访问历史归档
-open ./output/2025年xx月xx日/html/当日汇总.html
+open ./output/2025-xx-xx/html/当日汇总.html
 ```
 ```
 
 
 **为什么有两个 index.html?**
 **为什么有两个 index.html?**
@@ -2349,34 +2469,42 @@ flowchart TB
 
 
 **快速启动**:
 **快速启动**:
 
 
-使用 docker compose 同时启动新闻推送和 MCP 服务:
+如果已按照 [方式一:使用 docker compose](#方式一使用-docker-compose推荐) 完成部署,只需启动 MCP 服务:
 
 
 ```bash
 ```bash
-# 下载最新的 docker compose.yml(已包含 MCP 服务配置)
-wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml
-
-# 启动所有服务
-docker compose up -d
+cd TrendRadar/docker
+docker compose up -d trend-radar-mcp
 
 
 # 查看运行状态
 # 查看运行状态
-docker ps | grep trend-radar
+docker ps | grep trend-radar-mcp
 ```
 ```
 
 
-**单独启动 MCP 服务**:
+**单独启动 MCP 服务**(不使用 docker compose):
 
 
 ```bash
 ```bash
+# Linux/Mac
 docker run -d --name trend-radar-mcp \
 docker run -d --name trend-radar-mcp \
   -p 127.0.0.1:3333:3333 \
   -p 127.0.0.1:3333:3333 \
-  -v ./config:/app/config:ro \
-  -v ./output:/app/output:ro \
+  -v $(pwd)/config:/app/config:ro \
+  -v $(pwd)/output:/app/output:ro \
   -e TZ=Asia/Shanghai \
   -e TZ=Asia/Shanghai \
   wantcat/trendradar-mcp:latest
   wantcat/trendradar-mcp:latest
+
+# Windows PowerShell
+docker run -d --name trend-radar-mcp `
+  -p 127.0.0.1:3333:3333 `
+  -v ${PWD}/config:/app/config:ro `
+  -v ${PWD}/output:/app/output:ro `
+  -e TZ=Asia/Shanghai `
+  wantcat/trendradar-mcp:latest
 ```
 ```
 
 
+> ⚠️ **注意**:单独运行时,确保当前目录下有 `config/` 和 `output/` 文件夹,且包含配置文件和新闻数据。
+
 **验证服务**:
 **验证服务**:
 
 
 ```bash
 ```bash
-# 检查 MCP 服务是否正常运行
+# 检查 MCP 服务健康状态
 curl http://127.0.0.1:3333/mcp
 curl http://127.0.0.1:3333/mcp
 
 
 # 查看 MCP 服务日志
 # 查看 MCP 服务日志
@@ -2385,14 +2513,20 @@ docker logs -f trend-radar-mcp
 
 
 **在 AI 客户端中配置**:
 **在 AI 客户端中配置**:
 
 
-MCP 服务启动后,在 Claude Desktop、Cherry Studio、Cursor 等客户端中配置:
+MCP 服务启动后,根据不同客户端进行配置:
 
 
+**Cherry Studio**(推荐,GUI 配置):
+- 设置 → MCP 服务器 → 添加
+- 类型:`streamableHttp`
+- URL:`http://127.0.0.1:3333/mcp`
+
+**Claude Desktop / Cline**(JSON 配置):
 ```json
 ```json
 {
 {
   "mcpServers": {
   "mcpServers": {
     "trendradar": {
     "trendradar": {
       "url": "http://127.0.0.1:3333/mcp",
       "url": "http://127.0.0.1:3333/mcp",
-      "description": "TrendRadar 新闻热点分析"
+      "type": "streamableHttp"
     }
     }
   }
   }
 }
 }
@@ -2480,7 +2614,6 @@ notification:
       start: "20:00"                  # 开始时间(北京时间)
       start: "20:00"                  # 开始时间(北京时间)
       end: "22:00"                    # 结束时间(北京时间)
       end: "22:00"                    # 结束时间(北京时间)
     once_per_day: true                # 每天只推送一次
     once_per_day: true                # 每天只推送一次
-    push_record_retention_days: 7     # 推送记录保留天数
 ```
 ```
 
 
 #### 配置项详解
 #### 配置项详解
@@ -2491,7 +2624,6 @@ notification:
 | `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间(北京时间,HH:MM 格式) |
 | `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间(北京时间,HH:MM 格式) |
 | `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间(北京时间,HH:MM 格式) |
 | `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间(北京时间,HH:MM 格式) |
 | `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 |
 | `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 |
-| `push_record_retention_days` | int | `7` | 推送记录保留天数(用于判断是否已推送) |
 
 
 #### 使用场景
 #### 使用场景
 
 
@@ -2515,7 +2647,6 @@ PUSH_WINDOW_ENABLED=true
 PUSH_WINDOW_START=09:00
 PUSH_WINDOW_START=09:00
 PUSH_WINDOW_END=18:00
 PUSH_WINDOW_END=18:00
 PUSH_WINDOW_ONCE_PER_DAY=false
 PUSH_WINDOW_ONCE_PER_DAY=false
-PUSH_WINDOW_RETENTION_DAYS=7
 ```
 ```
 
 
 #### 完整配置示例
 #### 完整配置示例
@@ -2530,7 +2661,6 @@ notification:
       start: "20:00"
       start: "20:00"
       end: "22:00"
       end: "22:00"
     once_per_day: true
     once_per_day: true
-    push_record_retention_days: 7
 ```
 ```
 
 
 **场景:工作时间内每小时推送**
 **场景:工作时间内每小时推送**
@@ -2543,7 +2673,6 @@ notification:
       start: "09:00"
       start: "09:00"
       end: "18:00"
       end: "18:00"
     once_per_day: false
     once_per_day: false
-    push_record_retention_days: 7
 ```
 ```
 
 
 </details>
 </details>
@@ -2829,6 +2958,123 @@ notification:
 
 
 </details>
 </details>
 
 
+### 11. 存储配置
+
+<details id="storage-config">
+<summary>👉 点击展开:<strong>存储架构配置详解</strong></summary>
+<br>
+
+#### 存储后端选择
+
+**配置位置**:`config/config.yaml` 的 `storage` 部分
+
+v4.0.0 版本重构了存储架构,支持多种存储后端:
+
+```yaml
+storage:
+  backend: auto  # 存储后端:auto(自动选择)/ local(本地SQLite)/ remote(远程云存储)
+
+  formats:
+    sqlite: true   # 是否启用SQLite存储
+    txt: true      # 是否生成TXT快照
+    html: true     # 是否生成HTML报告
+
+  local:
+    data_dir: "output"    # 本地存储目录
+    retention_days: 0     # 本地数据保留天数,0表示永久保留
+
+  remote:
+    endpoint_url: ""      # S3 API 端点
+    bucket_name: ""       # 存储桶名称
+    access_key_id: ""     # 访问密钥ID
+    secret_access_key: "" # 访问密钥
+    region: ""            # 区域(可选)
+    retention_days: 0     # 远程数据保留天数,0表示永久保留
+
+  pull:
+    enabled: false        # 是否启用启动时从远程拉取数据
+    days: 7               # 拉取最近N天的数据
+```
+
+#### 后端选择策略
+
+| backend 值 | 说明 | 适用场景 |
+|-----------|------|---------|
+| `auto` | **自动选择**(推荐) | 根据运行环境智能选择:<br>• GitHub Actions 且已配置远程存储 → Remote<br>• 其他情况(Docker/本地等)→ Local |
+| `local` | 本地 SQLite 数据库 | Docker 部署、本地开发 |
+| `remote` | 远程云存储(S3 兼容,如 Cloudflare R2) | GitHub Actions、多机器同步 |
+
+
+#### 远程云存储配置
+
+**环境变量**(推荐方式):
+
+```bash
+# GitHub Actions / Docker 环境变量
+STORAGE_BACKEND=remote  # 或 auto
+
+# 本地/远程数据保留天数(0 表示永久保留)
+LOCAL_RETENTION_DAYS=0
+REMOTE_RETENTION_DAYS=0
+
+# S3 兼容存储配置(以 Cloudflare R2 为例)
+S3_BUCKET_NAME=your-bucket-name
+S3_ACCESS_KEY_ID=your-access-key-id
+S3_SECRET_ACCESS_KEY=your-secret-access-key
+S3_ENDPOINT_URL=https://<account-id>.r2.cloudflarestorage.com
+S3_REGION=auto
+
+# 数据拉取配置(可选,从远程同步到本地)
+PULL_ENABLED=false
+PULL_DAYS=7
+```
+
+**获取凭据**:参见 [快速开始 - 远程存储配置](#-快速开始)
+
+#### 数据清理策略
+
+**自动清理**:每次运行结束时检查并删除超过保留天数的数据。
+
+```yaml
+storage:
+  local:
+    retention_days: 30  # 本地保留最近30天数据
+  remote:
+    retention_days: 30  # 远程保留最近30天数据
+```
+
+**清理逻辑**:
+- 本地存储:删除过期日期的文件夹(如 `output/2025-11-10/`)
+- 远程存储:批量删除过期的云端对象(如 `news/2025-11-10.db`)
+
+#### 时区配置(v4.0.0 新增)
+
+**全球时区支持**:解决非中国用户推送时间窗口问题。
+
+```yaml
+app:
+  timezone: "Asia/Shanghai"  # 默认中国时区
+  # 其他示例:
+  # timezone: "America/Los_Angeles"  # 美西时间
+  # timezone: "Europe/London"        # 英国时间
+```
+
+**支持所有 IANA 时区名称**:[时区列表](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
+
+
+#### 不兼容变更
+
+⚠️ **v4.0.0 不兼容 v3.x 数据**:
+
+1. 数据库结构完全重构,无法读取旧数据
+2. 文件路径格式变更(ISO 格式)
+
+**迁移建议**:
+- 从 v4.0.0 开始重新收集数据
+- 旧数据如需保留,请手动重命名目录格式(不推荐)
+
+</details>
+
 <br>
 <br>
 
 
 ## 🤖 AI 智能分析
 ## 🤖 AI 智能分析
@@ -2846,7 +3092,7 @@ AI 分析功能**不是**直接查询网络实时数据,而是分析你**本
 
 
 #### 使用说明:
 #### 使用说明:
 
 
-1. **项目自带测试数据**:`output` 目录默认包含 **2025年11月1日~11月15日** 的新闻数据,可用于快速体验 AI 功能
+1. **项目自带测试数据**:`output` 目录默认包含 **2025-11-01~2025-11-15** 的新闻数据,可用于快速体验 AI 功能
 
 
 2. **查询限制**:
 2. **查询限制**:
    - ✅ 只能查询已有日期范围内的数据(11月1-15日)
    - ✅ 只能查询已有日期范围内的数据(11月1-15日)

+ 49 - 2
config/config.yaml

@@ -1,12 +1,60 @@
 app:
 app:
   version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version"
   version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version"
   show_version_update: true # 控制显示版本更新提示,如果 false,则不接受新版本提示
   show_version_update: true # 控制显示版本更新提示,如果 false,则不接受新版本提示
+  # 时区配置(影响所有时间显示、推送窗口判断、数据存储)
+  # 常用时区:
+  #   - Asia/Shanghai (北京时间 UTC+8)
+  #   - America/New_York (美东时间 UTC-5/-4)
+  #   - Europe/London (伦敦时间 UTC+0/+1)
+  # 完整时区列表: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+  timezone: "Asia/Shanghai"
+
+# 存储配置
+storage:
+  # 存储后端选择: local / remote / auto
+  # - local: 本地 SQLite + TXT/HTML 文件
+  # - remote: 远程云存储(S3 兼容协议,支持 R2/OSS/COS 等)
+  # - auto: 自动选择(GitHub Actions 环境且配置了远程存储则用 remote,否则用 local)
+  backend: "auto"
+
+  # 数据格式选项
+  formats:
+    sqlite: true       # 主存储(必须启用)
+    txt: false         # 是否生成 TXT 快照
+    html: false        # 是否生成 HTML 报告
+
+  # 本地存储配置
+  local:
+    data_dir: "output"        # 数据目录
+    retention_days: 0         # 本地数据保留天数(0 = 不清理)
+
+  # 远程存储配置(S3 兼容协议)
+  # 支持: Cloudflare R2, 阿里云 OSS, 腾讯云 COS, AWS S3, MinIO 等
+  # 建议将敏感信息配置在 GitHub Secrets 或环境变量中
+  remote:
+    # 数据保留天数(0 = 不清理远程数据)
+    retention_days: 0
+    # S3 兼容配置
+    endpoint_url: ""          # 服务端点(或环境变量 S3_ENDPOINT_URL)
+                              # Cloudflare R2: https://<account_id>.r2.cloudflarestorage.com
+                              # 阿里云 OSS: https://oss-cn-hangzhou.aliyuncs.com
+                              # 腾讯云 COS: https://cos.ap-guangzhou.myqcloud.com
+    bucket_name: ""           # 存储桶名称(或环境变量 S3_BUCKET_NAME)
+    access_key_id: ""         # 访问密钥 ID(或环境变量 S3_ACCESS_KEY_ID)
+    secret_access_key: ""     # 访问密钥(或环境变量 S3_SECRET_ACCESS_KEY)
+    region: ""                # 区域(可选,部分服务商需要,或环境变量 S3_REGION)
+
+  # 数据拉取配置(从远程同步到本地)
+  # 用于 MCP Server 等场景:爬虫存到远程,MCP 拉取到本地分析
+  pull:
+    enabled: false            # 是否启用启动时自动拉取
+    days: 7                   # 拉取最近 N 天的数据(0 = 不拉取)
 
 
 crawler:
 crawler:
   request_interval: 1000 # 请求间隔(毫秒)
   request_interval: 1000 # 请求间隔(毫秒)
   enable_crawler: true # 是否启用爬取新闻功能,如果 false,则直接停止程序
   enable_crawler: true # 是否启用爬取新闻功能,如果 false,则直接停止程序
   use_proxy: false # 是否启用代理,false 时为关闭
   use_proxy: false # 是否启用代理,false 时为关闭
-  default_proxy: "http://127.0.0.1:10086"
+  default_proxy: "http://127.0.0.1:10801"
 
 
 # 🔸 daily(当日汇总模式)
 # 🔸 daily(当日汇总模式)
 #   • 推送时机:按时推送(默认每小时推送一次)
 #   • 推送时机:按时推送(默认每小时推送一次)
@@ -55,7 +103,6 @@ notification:
       start: "20:00"  # 推送时间窗口开始(北京时间)
       start: "20:00"  # 推送时间窗口开始(北京时间)
       end: "22:00"    # 推送时间窗口结束(北京时间)
       end: "22:00"    # 推送时间窗口结束(北京时间)
     once_per_day: true  # 每天在时间窗口内只推送一次,如果 false,则窗口内每次执行都推送
     once_per_day: true  # 每天在时间窗口内只推送一次,如果 false,则窗口内每次执行都推送
-    push_record_retention_days: 7  # 推送记录保留天数
 
 
   # ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️
   # ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️
   #
   #

+ 33 - 2
docker/.env

@@ -40,8 +40,6 @@ PUSH_WINDOW_START=
 PUSH_WINDOW_END=
 PUSH_WINDOW_END=
 # 每天只推送一次 (true/false)
 # 每天只推送一次 (true/false)
 PUSH_WINDOW_ONCE_PER_DAY=
 PUSH_WINDOW_ONCE_PER_DAY=
-# 推送记录保留天数 (数字,如 7)
-PUSH_WINDOW_RETENTION_DAYS=
 
 
 # ============================================
 # ============================================
 # 多账号配置
 # 多账号配置
@@ -87,6 +85,39 @@ BARK_URL=
 # Slack 推送配置(多账号用 ; 分隔)
 # Slack 推送配置(多账号用 ; 分隔)
 SLACK_WEBHOOK_URL=
 SLACK_WEBHOOK_URL=
 
 
+# ============================================
+# 存储配置
+# ============================================
+
+# 存储后端选择 (local/remote/auto)
+# - local: 本地 SQLite + TXT/HTML 文件
+# - remote: 远程云存储(S3 兼容协议)
+# - auto: 自动选择(GitHub Actions 环境且配置了远程存储则用 remote,否则用 local)
+STORAGE_BACKEND=auto
+
+# 本地数据保留天数(0 = 无限制,不清理历史数据)
+LOCAL_RETENTION_DAYS=0
+
+# 远程数据保留天数(0 = 无限制,不清理历史数据)
+REMOTE_RETENTION_DAYS=0
+
+# 是否生成 TXT 快照 (true/false)
+STORAGE_TXT_ENABLED=
+
+# 是否生成 HTML 报告 (true/false)
+STORAGE_HTML_ENABLED=
+
+# 远程存储配置(S3 兼容协议,支持 R2/OSS/COS/S3 等)
+S3_ENDPOINT_URL=
+S3_BUCKET_NAME=
+S3_ACCESS_KEY_ID=
+S3_SECRET_ACCESS_KEY=
+S3_REGION=
+
+# 数据拉取配置(从远程同步到本地)
+PULL_ENABLED=false
+PULL_DAYS=7
+
 # ============================================
 # ============================================
 # 运行配置
 # 运行配置
 # ============================================
 # ============================================

+ 1 - 1
docker/Dockerfile

@@ -53,8 +53,8 @@ RUN set -ex && \
 COPY requirements.txt .
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 
-COPY main.py .
 COPY docker/manage.py .
 COPY docker/manage.py .
+COPY trendradar/ ./trendradar/
 
 
 # 复制 entrypoint.sh 并强制转换为 LF 格式
 # 复制 entrypoint.sh 并强制转换为 LF 格式
 COPY docker/entrypoint.sh /entrypoint.sh.tmp
 COPY docker/entrypoint.sh /entrypoint.sh.tmp

+ 2 - 0
docker/Dockerfile.mcp

@@ -8,6 +8,8 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 
 # 复制 MCP 服务器代码
 # 复制 MCP 服务器代码
 COPY mcp_server/ ./mcp_server/
 COPY mcp_server/ ./mcp_server/
+# 复制 trendradar 模块(MCP 服务需要读取 SQLite 数据)
+COPY trendradar/ ./trendradar/
 
 
 # 创建必要目录
 # 创建必要目录
 RUN mkdir -p /app/config /app/output
 RUN mkdir -p /app/config /app/output

+ 16 - 2
docker/docker-compose-build.yml

@@ -32,7 +32,6 @@ services:
       - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
       - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
       - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
       - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
       - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
       - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
-      - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
       # 通知渠道
       # 通知渠道
       - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
       - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@@ -54,6 +53,21 @@ services:
       - BARK_URL=${BARK_URL:-}
       - BARK_URL=${BARK_URL:-}
       # Slack配置
       # Slack配置
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
+      # 存储配置
+      - STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
+      - LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
+      - REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
+      - STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
+      - STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
+      # 远程存储配置(S3 兼容协议)
+      - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
+      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
+      - S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
+      - S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
+      - S3_REGION=${S3_REGION:-}
+      # 数据拉取配置
+      - PULL_ENABLED=${PULL_ENABLED:-false}
+      - PULL_DAYS=${PULL_DAYS:-7}
       # 运行模式
       # 运行模式
       - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
       - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
       - RUN_MODE=${RUN_MODE:-cron}
       - RUN_MODE=${RUN_MODE:-cron}
@@ -71,7 +85,7 @@ services:
 
 
     volumes:
     volumes:
       - ../config:/app/config:ro
       - ../config:/app/config:ro
-      - ../output:/app/output:ro
+      - ../output:/app/output
 
 
     environment:
     environment:
       - TZ=Asia/Shanghai
       - TZ=Asia/Shanghai

+ 16 - 2
docker/docker-compose.yml

@@ -30,7 +30,6 @@ services:
       - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
       - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
       - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
       - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
       - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
       - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
-      - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
       # 通知渠道
       # 通知渠道
       - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
       - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@@ -52,6 +51,21 @@ services:
       - BARK_URL=${BARK_URL:-}
       - BARK_URL=${BARK_URL:-}
       # Slack配置
       # Slack配置
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
+      # 存储配置
+      - STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
+      - LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
+      - REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
+      - STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
+      - STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
+      # 远程存储配置(S3 兼容协议)
+      - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
+      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
+      - S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
+      - S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
+      - S3_REGION=${S3_REGION:-}
+      # 数据拉取配置
+      - PULL_ENABLED=${PULL_ENABLED:-false}
+      - PULL_DAYS=${PULL_DAYS:-7}
       # 运行模式
       # 运行模式
       - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
       - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
       - RUN_MODE=${RUN_MODE:-cron}
       - RUN_MODE=${RUN_MODE:-cron}
@@ -67,7 +81,7 @@ services:
 
 
     volumes:
     volumes:
       - ../config:/app/config:ro
       - ../config:/app/config:ro
-      - ../output:/app/output:ro
+      - ../output:/app/output
 
 
     environment:
     environment:
       - TZ=Asia/Shanghai
       - TZ=Asia/Shanghai

+ 3 - 3
docker/entrypoint.sh

@@ -13,11 +13,11 @@ env >> /etc/environment
 case "${RUN_MODE:-cron}" in
 case "${RUN_MODE:-cron}" in
 "once")
 "once")
     echo "🔄 单次执行"
     echo "🔄 单次执行"
-    exec /usr/local/bin/python main.py
+    exec /usr/local/bin/python -m trendradar
     ;;
     ;;
 "cron")
 "cron")
     # 生成 crontab
     # 生成 crontab
-    echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python main.py" > /tmp/crontab
+    echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python -m trendradar" > /tmp/crontab
     
     
     echo "📅 生成的crontab内容:"
     echo "📅 生成的crontab内容:"
     cat /tmp/crontab
     cat /tmp/crontab
@@ -30,7 +30,7 @@ case "${RUN_MODE:-cron}" in
     # 立即执行一次(如果配置了)
     # 立即执行一次(如果配置了)
     if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then
     if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then
         echo "▶️ 立即执行一次"
         echo "▶️ 立即执行一次"
-        /usr/local/bin/python main.py
+        /usr/local/bin/python -m trendradar
     fi
     fi
 
 
     # 启动 Web 服务器(如果配置了)
     # 启动 Web 服务器(如果配置了)

+ 25 - 2
docker/manage.py

@@ -33,7 +33,7 @@ def manual_run():
     print("🔄 手动执行爬虫...")
     print("🔄 手动执行爬虫...")
     try:
     try:
         result = subprocess.run(
         result = subprocess.run(
-            ["python", "main.py"], cwd="/app", capture_output=False, text=True
+            ["python", "-m", "trendradar"], cwd="/app", capture_output=False, text=True
         )
         )
         if result.returncode == 0:
         if result.returncode == 0:
             print("✅ 执行完成")
             print("✅ 执行完成")
@@ -285,12 +285,24 @@ def show_config():
         "TELEGRAM_CHAT_ID",
         "TELEGRAM_CHAT_ID",
         "CONFIG_PATH",
         "CONFIG_PATH",
         "FREQUENCY_WORDS_PATH",
         "FREQUENCY_WORDS_PATH",
+        # 存储配置
+        "STORAGE_BACKEND",
+        "LOCAL_RETENTION_DAYS",
+        "REMOTE_RETENTION_DAYS",
+        "STORAGE_TXT_ENABLED",
+        "STORAGE_HTML_ENABLED",
+        "S3_BUCKET_NAME",
+        "S3_ACCESS_KEY_ID",
+        "S3_ENDPOINT_URL",
+        "S3_REGION",
+        "PULL_ENABLED",
+        "PULL_DAYS",
     ]
     ]
 
 
     for var in env_vars:
     for var in env_vars:
         value = os.environ.get(var, "未设置")
         value = os.environ.get(var, "未设置")
         # 隐藏敏感信息
         # 隐藏敏感信息
-        if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY"]):
+        if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY", "SECRET"]):
             if value and value != "未设置":
             if value and value != "未设置":
                 masked_value = value[:10] + "***" if len(value) > 10 else "***"
                 masked_value = value[:10] + "***" if len(value) > 10 else "***"
                 print(f"  {var}: {masked_value}")
                 print(f"  {var}: {masked_value}")
@@ -331,6 +343,17 @@ def show_files():
     # 显示最近2天的文件
     # 显示最近2天的文件
     for date_dir in date_dirs[:2]:
     for date_dir in date_dirs[:2]:
         print(f"  📅 {date_dir.name}:")
         print(f"  📅 {date_dir.name}:")
+
+        # 检查 SQLite 数据库文件
+        db_files = list(date_dir.glob("*.db"))
+        if db_files:
+            print(f"    💾 SQLite: {len(db_files)} 个数据库")
+            for db_file in db_files[:3]:
+                mtime = time.ctime(db_file.stat().st_mtime)
+                size_kb = db_file.stat().st_size // 1024
+                print(f"      📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})")
+
+        # 检查子目录(html, txt)
         for subdir in ["html", "txt"]:
         for subdir in ["html", "txt"]:
             sub_path = date_dir / subdir
             sub_path = date_dir / subdir
             if sub_path.exists():
             if sub_path.exists():

+ 0 - 5431
main.py

@@ -1,5431 +0,0 @@
-# coding=utf-8
-
-import json
-import os
-import random
-import re
-import time
-import webbrowser
-import smtplib
-from email.mime.text import MIMEText
-from email.mime.multipart import MIMEMultipart
-from email.header import Header
-from email.utils import formataddr, formatdate, make_msgid
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Tuple, Optional, Union
-
-import pytz
-import requests
-import yaml
-
-
-VERSION = "3.5.0"
-
-
-# === SMTP邮件配置 ===
-SMTP_CONFIGS = {
-    # Gmail(使用 STARTTLS)
-    "gmail.com": {"server": "smtp.gmail.com", "port": 587, "encryption": "TLS"},
-    # QQ邮箱(使用 SSL,更稳定)
-    "qq.com": {"server": "smtp.qq.com", "port": 465, "encryption": "SSL"},
-    # Outlook(使用 STARTTLS)
-    "outlook.com": {
-        "server": "smtp-mail.outlook.com",
-        "port": 587,
-        "encryption": "TLS",
-    },
-    "hotmail.com": {
-        "server": "smtp-mail.outlook.com",
-        "port": 587,
-        "encryption": "TLS",
-    },
-    "live.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"},
-    # 网易邮箱(使用 SSL,更稳定)
-    "163.com": {"server": "smtp.163.com", "port": 465, "encryption": "SSL"},
-    "126.com": {"server": "smtp.126.com", "port": 465, "encryption": "SSL"},
-    # 新浪邮箱(使用 SSL)
-    "sina.com": {"server": "smtp.sina.com", "port": 465, "encryption": "SSL"},
-    # 搜狐邮箱(使用 SSL)
-    "sohu.com": {"server": "smtp.sohu.com", "port": 465, "encryption": "SSL"},
-    # 天翼邮箱(使用 SSL)
-    "189.cn": {"server": "smtp.189.cn", "port": 465, "encryption": "SSL"},
-    # 阿里云邮箱(使用 TLS)
-    "aliyun.com": {"server": "smtp.aliyun.com", "port": 465, "encryption": "TLS"},
-}
-
-
-# === 多账号推送工具函数 ===
-def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
-    """
-    解析多账号配置,返回账号列表
-
-    Args:
-        config_value: 配置值字符串,多个账号用分隔符分隔
-        separator: 分隔符,默认为 ;
-
-    Returns:
-        账号列表,空字符串会被保留(用于占位)
-    """
-    if not config_value:
-        return []
-    # 保留空字符串用于占位(如 ";token2" 表示第一个账号无token)
-    accounts = [acc.strip() for acc in config_value.split(separator)]
-    # 过滤掉全部为空的情况
-    if all(not acc for acc in accounts):
-        return []
-    return accounts
-
-
-def validate_paired_configs(
-    configs: Dict[str, List[str]],
-    channel_name: str,
-    required_keys: Optional[List[str]] = None
-) -> Tuple[bool, int]:
-    """
-    验证配对配置的数量是否一致
-
-    Args:
-        configs: 配置字典,key 为配置名,value 为账号列表
-        channel_name: 渠道名称,用于日志输出
-        required_keys: 必须有值的配置项列表
-
-    Returns:
-        (是否验证通过, 账号数量)
-    """
-    # 过滤掉空列表
-    non_empty_configs = {k: v for k, v in configs.items() if v}
-
-    if not non_empty_configs:
-        return True, 0
-
-    # 检查必须项
-    if required_keys:
-        for key in required_keys:
-            if key not in non_empty_configs or not non_empty_configs[key]:
-                return True, 0  # 必须项为空,视为未配置
-
-    # 获取所有非空配置的长度
-    lengths = {k: len(v) for k, v in non_empty_configs.items()}
-    unique_lengths = set(lengths.values())
-
-    if len(unique_lengths) > 1:
-        print(f"❌ {channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
-        for key, length in lengths.items():
-            print(f"   - {key}: {length} 个")
-        return False, 0
-
-    return True, list(unique_lengths)[0] if unique_lengths else 0
-
-
-def limit_accounts(
-    accounts: List[str],
-    max_count: int,
-    channel_name: str
-) -> List[str]:
-    """
-    限制账号数量
-
-    Args:
-        accounts: 账号列表
-        max_count: 最大账号数量
-        channel_name: 渠道名称,用于日志输出
-
-    Returns:
-        限制后的账号列表
-    """
-    if len(accounts) > max_count:
-        print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个")
-        print(f"   ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
-        return accounts[:max_count]
-    return accounts
-
-
-def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
-    """
-    安全获取指定索引的账号值
-
-    Args:
-        accounts: 账号列表
-        index: 索引
-        default: 默认值
-
-    Returns:
-        账号值或默认值
-    """
-    if index < len(accounts):
-        return accounts[index] if accounts[index] else default
-    return default
-
-
-# === 配置管理 ===
-def load_config():
-    """加载配置文件"""
-    config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
-
-    if not Path(config_path).exists():
-        raise FileNotFoundError(f"配置文件 {config_path} 不存在")
-
-    with open(config_path, "r", encoding="utf-8") as f:
-        config_data = yaml.safe_load(f)
-
-    print(f"配置文件加载成功: {config_path}")
-
-    # 构建配置
-    config = {
-        "VERSION_CHECK_URL": config_data["app"]["version_check_url"],
-        "SHOW_VERSION_UPDATE": config_data["app"]["show_version_update"],
-        "REQUEST_INTERVAL": config_data["crawler"]["request_interval"],
-        "REPORT_MODE": os.environ.get("REPORT_MODE", "").strip()
-        or config_data["report"]["mode"],
-        "RANK_THRESHOLD": config_data["report"]["rank_threshold"],
-        "SORT_BY_POSITION_FIRST": os.environ.get("SORT_BY_POSITION_FIRST", "").strip().lower()
-        in ("true", "1")
-        if os.environ.get("SORT_BY_POSITION_FIRST", "").strip()
-        else config_data["report"].get("sort_by_position_first", False),
-        "MAX_NEWS_PER_KEYWORD": int(
-            os.environ.get("MAX_NEWS_PER_KEYWORD", "").strip() or "0"
-        )
-        or config_data["report"].get("max_news_per_keyword", 0),
-        "REVERSE_CONTENT_ORDER": os.environ.get("REVERSE_CONTENT_ORDER", "").strip().lower()
-        in ("true", "1")
-        if os.environ.get("REVERSE_CONTENT_ORDER", "").strip()
-        else config_data["report"].get("reverse_content_order", False),
-        "USE_PROXY": config_data["crawler"]["use_proxy"],
-        "DEFAULT_PROXY": config_data["crawler"]["default_proxy"],
-        "ENABLE_CRAWLER": os.environ.get("ENABLE_CRAWLER", "").strip().lower()
-        in ("true", "1")
-        if os.environ.get("ENABLE_CRAWLER", "").strip()
-        else config_data["crawler"]["enable_crawler"],
-        "ENABLE_NOTIFICATION": os.environ.get("ENABLE_NOTIFICATION", "").strip().lower()
-        in ("true", "1")
-        if os.environ.get("ENABLE_NOTIFICATION", "").strip()
-        else config_data["notification"]["enable_notification"],
-        "MESSAGE_BATCH_SIZE": config_data["notification"]["message_batch_size"],
-        "DINGTALK_BATCH_SIZE": config_data["notification"].get(
-            "dingtalk_batch_size", 20000
-        ),
-        "FEISHU_BATCH_SIZE": config_data["notification"].get("feishu_batch_size", 29000),
-        "BARK_BATCH_SIZE": config_data["notification"].get("bark_batch_size", 3600),
-        "SLACK_BATCH_SIZE": config_data["notification"].get("slack_batch_size", 4000),
-        "BATCH_SEND_INTERVAL": config_data["notification"]["batch_send_interval"],
-        "FEISHU_MESSAGE_SEPARATOR": config_data["notification"][
-            "feishu_message_separator"
-        ],
-        # 多账号配置
-        "MAX_ACCOUNTS_PER_CHANNEL": int(
-            os.environ.get("MAX_ACCOUNTS_PER_CHANNEL", "").strip() or "0"
-        )
-        or config_data["notification"].get("max_accounts_per_channel", 3),
-        "PUSH_WINDOW": {
-            "ENABLED": os.environ.get("PUSH_WINDOW_ENABLED", "").strip().lower()
-            in ("true", "1")
-            if os.environ.get("PUSH_WINDOW_ENABLED", "").strip()
-            else config_data["notification"]
-            .get("push_window", {})
-            .get("enabled", False),
-            "TIME_RANGE": {
-                "START": os.environ.get("PUSH_WINDOW_START", "").strip()
-                or config_data["notification"]
-                .get("push_window", {})
-                .get("time_range", {})
-                .get("start", "08:00"),
-                "END": os.environ.get("PUSH_WINDOW_END", "").strip()
-                or config_data["notification"]
-                .get("push_window", {})
-                .get("time_range", {})
-                .get("end", "22:00"),
-            },
-            "ONCE_PER_DAY": os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip().lower()
-            in ("true", "1")
-            if os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip()
-            else config_data["notification"]
-            .get("push_window", {})
-            .get("once_per_day", True),
-            "RECORD_RETENTION_DAYS": int(
-                os.environ.get("PUSH_WINDOW_RETENTION_DAYS", "").strip() or "0"
-            )
-            or config_data["notification"]
-            .get("push_window", {})
-            .get("push_record_retention_days", 7),
-        },
-        "WEIGHT_CONFIG": {
-            "RANK_WEIGHT": config_data["weight"]["rank_weight"],
-            "FREQUENCY_WEIGHT": config_data["weight"]["frequency_weight"],
-            "HOTNESS_WEIGHT": config_data["weight"]["hotness_weight"],
-        },
-        "PLATFORMS": config_data["platforms"],
-    }
-
-    # 通知渠道配置(环境变量优先)
-    notification = config_data.get("notification", {})
-    webhooks = notification.get("webhooks", {})
-
-    config["FEISHU_WEBHOOK_URL"] = os.environ.get(
-        "FEISHU_WEBHOOK_URL", ""
-    ).strip() or webhooks.get("feishu_url", "")
-    config["DINGTALK_WEBHOOK_URL"] = os.environ.get(
-        "DINGTALK_WEBHOOK_URL", ""
-    ).strip() or webhooks.get("dingtalk_url", "")
-    config["WEWORK_WEBHOOK_URL"] = os.environ.get(
-        "WEWORK_WEBHOOK_URL", ""
-    ).strip() or webhooks.get("wework_url", "")
-    config["WEWORK_MSG_TYPE"] = os.environ.get(
-        "WEWORK_MSG_TYPE", ""
-    ).strip() or webhooks.get("wework_msg_type", "markdown")
-    config["TELEGRAM_BOT_TOKEN"] = os.environ.get(
-        "TELEGRAM_BOT_TOKEN", ""
-    ).strip() or webhooks.get("telegram_bot_token", "")
-    config["TELEGRAM_CHAT_ID"] = os.environ.get(
-        "TELEGRAM_CHAT_ID", ""
-    ).strip() or webhooks.get("telegram_chat_id", "")
-
-    # 邮件配置
-    config["EMAIL_FROM"] = os.environ.get("EMAIL_FROM", "").strip() or webhooks.get(
-        "email_from", ""
-    )
-    config["EMAIL_PASSWORD"] = os.environ.get(
-        "EMAIL_PASSWORD", ""
-    ).strip() or webhooks.get("email_password", "")
-    config["EMAIL_TO"] = os.environ.get("EMAIL_TO", "").strip() or webhooks.get(
-        "email_to", ""
-    )
-    config["EMAIL_SMTP_SERVER"] = os.environ.get(
-        "EMAIL_SMTP_SERVER", ""
-    ).strip() or webhooks.get("email_smtp_server", "")
-    config["EMAIL_SMTP_PORT"] = os.environ.get(
-        "EMAIL_SMTP_PORT", ""
-    ).strip() or webhooks.get("email_smtp_port", "")
-
-    # ntfy配置
-    config["NTFY_SERVER_URL"] = (
-        os.environ.get("NTFY_SERVER_URL", "").strip()
-        or webhooks.get("ntfy_server_url")
-        or "https://ntfy.sh"
-    )
-    config["NTFY_TOPIC"] = os.environ.get("NTFY_TOPIC", "").strip() or webhooks.get(
-        "ntfy_topic", ""
-    )
-    config["NTFY_TOKEN"] = os.environ.get("NTFY_TOKEN", "").strip() or webhooks.get(
-        "ntfy_token", ""
-    )
-
-    # Bark配置
-    config["BARK_URL"] = os.environ.get("BARK_URL", "").strip() or webhooks.get(
-        "bark_url", ""
-    )
-
-    # Slack配置
-    config["SLACK_WEBHOOK_URL"] = os.environ.get("SLACK_WEBHOOK_URL", "").strip() or webhooks.get(
-        "slack_webhook_url", ""
-    )
-
-    # 输出配置来源信息
-    notification_sources = []
-    max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
-
-    if config["FEISHU_WEBHOOK_URL"]:
-        accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
-        count = min(len(accounts), max_accounts)
-        source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
-        notification_sources.append(f"飞书({source}, {count}个账号)")
-    if config["DINGTALK_WEBHOOK_URL"]:
-        accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
-        count = min(len(accounts), max_accounts)
-        source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
-        notification_sources.append(f"钉钉({source}, {count}个账号)")
-    if config["WEWORK_WEBHOOK_URL"]:
-        accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
-        count = min(len(accounts), max_accounts)
-        source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
-        notification_sources.append(f"企业微信({source}, {count}个账号)")
-    if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
-        tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
-        chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
-        # 验证数量一致性
-        valid, count = validate_paired_configs(
-            {"bot_token": tokens, "chat_id": chat_ids},
-            "Telegram",
-            required_keys=["bot_token", "chat_id"]
-        )
-        if valid and count > 0:
-            count = min(count, max_accounts)
-            token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
-            notification_sources.append(f"Telegram({token_source}, {count}个账号)")
-    if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
-        from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件"
-        notification_sources.append(f"邮件({from_source})")
-
-    if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
-        topics = parse_multi_account_config(config["NTFY_TOPIC"])
-        tokens = parse_multi_account_config(config["NTFY_TOKEN"])
-        # ntfy 的 token 是可选的,但如果配置了,数量必须与 topic 一致
-        if tokens:
-            valid, count = validate_paired_configs(
-                {"topic": topics, "token": tokens},
-                "ntfy"
-            )
-            if valid and count > 0:
-                count = min(count, max_accounts)
-                server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
-                notification_sources.append(f"ntfy({server_source}, {count}个账号)")
-        else:
-            count = min(len(topics), max_accounts)
-            server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
-            notification_sources.append(f"ntfy({server_source}, {count}个账号)")
-
-    if config["BARK_URL"]:
-        accounts = parse_multi_account_config(config["BARK_URL"])
-        count = min(len(accounts), max_accounts)
-        bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件"
-        notification_sources.append(f"Bark({bark_source}, {count}个账号)")
-
-    if config["SLACK_WEBHOOK_URL"]:
-        accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
-        count = min(len(accounts), max_accounts)
-        slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件"
-        notification_sources.append(f"Slack({slack_source}, {count}个账号)")
-
-    if notification_sources:
-        print(f"通知渠道配置来源: {', '.join(notification_sources)}")
-        print(f"每个渠道最大账号数: {max_accounts}")
-    else:
-        print("未配置任何通知渠道")
-
-    return config
-
-
-print("正在加载配置...")
-CONFIG = load_config()
-print(f"TrendRadar v{VERSION} 配置加载完成")
-print(f"监控平台数量: {len(CONFIG['PLATFORMS'])}")
-
-
-# === 工具函数 ===
-def get_beijing_time():
-    """获取北京时间"""
-    return datetime.now(pytz.timezone("Asia/Shanghai"))
-
-
-def format_date_folder():
-    """格式化日期文件夹"""
-    return get_beijing_time().strftime("%Y年%m月%d日")
-
-
-def format_time_filename():
-    """格式化时间文件名"""
-    return get_beijing_time().strftime("%H时%M分")
-
-
-def clean_title(title: str) -> str:
-    """清理标题中的特殊字符"""
-    if not isinstance(title, str):
-        title = str(title)
-    cleaned_title = title.replace("\n", " ").replace("\r", " ")
-    cleaned_title = re.sub(r"\s+", " ", cleaned_title)
-    cleaned_title = cleaned_title.strip()
-    return cleaned_title
-
-
-def ensure_directory_exists(directory: str):
-    """确保目录存在"""
-    Path(directory).mkdir(parents=True, exist_ok=True)
-
-
-def get_output_path(subfolder: str, filename: str) -> str:
-    """获取输出路径"""
-    date_folder = format_date_folder()
-    output_dir = Path("output") / date_folder / subfolder
-    ensure_directory_exists(str(output_dir))
-    return str(output_dir / filename)
-
-
-def check_version_update(
-    current_version: str, version_url: str, proxy_url: Optional[str] = None
-) -> Tuple[bool, Optional[str]]:
-    """检查版本更新"""
-    try:
-        proxies = None
-        if proxy_url:
-            proxies = {"http": proxy_url, "https": proxy_url}
-
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-            "Accept": "text/plain, */*",
-            "Cache-Control": "no-cache",
-        }
-
-        response = requests.get(
-            version_url, proxies=proxies, headers=headers, timeout=10
-        )
-        response.raise_for_status()
-
-        remote_version = response.text.strip()
-        print(f"当前版本: {current_version}, 远程版本: {remote_version}")
-
-        # 比较版本
-        def parse_version(version_str):
-            try:
-                parts = version_str.strip().split(".")
-                if len(parts) != 3:
-                    raise ValueError("版本号格式不正确")
-                return int(parts[0]), int(parts[1]), int(parts[2])
-            except:
-                return 0, 0, 0
-
-        current_tuple = parse_version(current_version)
-        remote_tuple = parse_version(remote_version)
-
-        need_update = current_tuple < remote_tuple
-        return need_update, remote_version if need_update else None
-
-    except Exception as e:
-        print(f"版本检查失败: {e}")
-        return False, None
-
-
-def is_first_crawl_today() -> bool:
-    """检测是否是当天第一次爬取"""
-    date_folder = format_date_folder()
-    txt_dir = Path("output") / date_folder / "txt"
-
-    if not txt_dir.exists():
-        return True
-
-    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
-    return len(files) <= 1
-
-
-def html_escape(text: str) -> str:
-    """HTML转义"""
-    if not isinstance(text, str):
-        text = str(text)
-
-    return (
-        text.replace("&", "&amp;")
-        .replace("<", "&lt;")
-        .replace(">", "&gt;")
-        .replace('"', "&quot;")
-        .replace("'", "&#x27;")
-    )
-
-
-# === 推送记录管理 ===
-class PushRecordManager:
-    """推送记录管理器"""
-
-    def __init__(self):
-        self.record_dir = Path("output") / ".push_records"
-        self.ensure_record_dir()
-        self.cleanup_old_records()
-
-    def ensure_record_dir(self):
-        """确保记录目录存在"""
-        self.record_dir.mkdir(parents=True, exist_ok=True)
-
-    def get_today_record_file(self) -> Path:
-        """获取今天的记录文件路径"""
-        today = get_beijing_time().strftime("%Y%m%d")
-        return self.record_dir / f"push_record_{today}.json"
-
-    def cleanup_old_records(self):
-        """清理过期的推送记录"""
-        retention_days = CONFIG["PUSH_WINDOW"]["RECORD_RETENTION_DAYS"]
-        current_time = get_beijing_time()
-
-        for record_file in self.record_dir.glob("push_record_*.json"):
-            try:
-                date_str = record_file.stem.replace("push_record_", "")
-                file_date = datetime.strptime(date_str, "%Y%m%d")
-                file_date = pytz.timezone("Asia/Shanghai").localize(file_date)
-
-                if (current_time - file_date).days > retention_days:
-                    record_file.unlink()
-                    print(f"清理过期推送记录: {record_file.name}")
-            except Exception as e:
-                print(f"清理记录文件失败 {record_file}: {e}")
-
-    def has_pushed_today(self) -> bool:
-        """检查今天是否已经推送过"""
-        record_file = self.get_today_record_file()
-
-        if not record_file.exists():
-            return False
-
-        try:
-            with open(record_file, "r", encoding="utf-8") as f:
-                record = json.load(f)
-            return record.get("pushed", False)
-        except Exception as e:
-            print(f"读取推送记录失败: {e}")
-            return False
-
-    def record_push(self, report_type: str):
-        """记录推送"""
-        record_file = self.get_today_record_file()
-        now = get_beijing_time()
-
-        record = {
-            "pushed": True,
-            "push_time": now.strftime("%Y-%m-%d %H:%M:%S"),
-            "report_type": report_type,
-        }
-
-        try:
-            with open(record_file, "w", encoding="utf-8") as f:
-                json.dump(record, f, ensure_ascii=False, indent=2)
-            print(f"推送记录已保存: {report_type} at {now.strftime('%H:%M:%S')}")
-        except Exception as e:
-            print(f"保存推送记录失败: {e}")
-
-    def is_in_time_range(self, start_time: str, end_time: str) -> bool:
-        """检查当前时间是否在指定时间范围内"""
-        now = get_beijing_time()
-        current_time = now.strftime("%H:%M")
-    
-        def normalize_time(time_str: str) -> str:
-            """将时间字符串标准化为 HH:MM 格式"""
-            try:
-                parts = time_str.strip().split(":")
-                if len(parts) != 2:
-                    raise ValueError(f"时间格式错误: {time_str}")
-            
-                hour = int(parts[0])
-                minute = int(parts[1])
-            
-                if not (0 <= hour <= 23 and 0 <= minute <= 59):
-                    raise ValueError(f"时间范围错误: {time_str}")
-            
-                return f"{hour:02d}:{minute:02d}"
-            except Exception as e:
-                print(f"时间格式化错误 '{time_str}': {e}")
-                return time_str
-    
-        normalized_start = normalize_time(start_time)
-        normalized_end = normalize_time(end_time)
-        normalized_current = normalize_time(current_time)
-    
-        result = normalized_start <= normalized_current <= normalized_end
-    
-        if not result:
-            print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}")
-    
-        return result
-
-
-# === 数据获取 ===
-class DataFetcher:
-    """数据获取器"""
-
-    def __init__(self, proxy_url: Optional[str] = None):
-        self.proxy_url = proxy_url
-
-    def fetch_data(
-        self,
-        id_info: Union[str, Tuple[str, str]],
-        max_retries: int = 2,
-        min_retry_wait: int = 3,
-        max_retry_wait: int = 5,
-    ) -> Tuple[Optional[str], str, str]:
-        """获取指定ID数据,支持重试"""
-        if isinstance(id_info, tuple):
-            id_value, alias = id_info
-        else:
-            id_value = id_info
-            alias = id_value
-
-        url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
-
-        proxies = None
-        if self.proxy_url:
-            proxies = {"http": self.proxy_url, "https": self.proxy_url}
-
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-            "Connection": "keep-alive",
-            "Cache-Control": "no-cache",
-        }
-
-        retries = 0
-        while retries <= max_retries:
-            try:
-                response = requests.get(
-                    url, proxies=proxies, headers=headers, timeout=10
-                )
-                response.raise_for_status()
-
-                data_text = response.text
-                data_json = json.loads(data_text)
-
-                status = data_json.get("status", "未知")
-                if status not in ["success", "cache"]:
-                    raise ValueError(f"响应状态异常: {status}")
-
-                status_info = "最新数据" if status == "success" else "缓存数据"
-                print(f"获取 {id_value} 成功({status_info})")
-                return data_text, id_value, alias
-
-            except Exception as e:
-                retries += 1
-                if retries <= max_retries:
-                    base_wait = random.uniform(min_retry_wait, max_retry_wait)
-                    additional_wait = (retries - 1) * random.uniform(1, 2)
-                    wait_time = base_wait + additional_wait
-                    print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
-                    time.sleep(wait_time)
-                else:
-                    print(f"请求 {id_value} 失败: {e}")
-                    return None, id_value, alias
-        return None, id_value, alias
-
-    def crawl_websites(
-        self,
-        ids_list: List[Union[str, Tuple[str, str]]],
-        request_interval: int = CONFIG["REQUEST_INTERVAL"],
-    ) -> Tuple[Dict, Dict, List]:
-        """爬取多个网站数据"""
-        results = {}
-        id_to_name = {}
-        failed_ids = []
-
-        for i, id_info in enumerate(ids_list):
-            if isinstance(id_info, tuple):
-                id_value, name = id_info
-            else:
-                id_value = id_info
-                name = id_value
-
-            id_to_name[id_value] = name
-            response, _, _ = self.fetch_data(id_info)
-
-            if response:
-                try:
-                    data = json.loads(response)
-                    results[id_value] = {}
-                    for index, item in enumerate(data.get("items", []), 1):
-                        title = item.get("title")
-                        # 跳过无效标题(None、float、空字符串)
-                        if title is None or isinstance(title, float) or not str(title).strip():
-                            continue
-                        title = str(title).strip()
-                        url = item.get("url", "")
-                        mobile_url = item.get("mobileUrl", "")
-
-                        if title in results[id_value]:
-                            results[id_value][title]["ranks"].append(index)
-                        else:
-                            results[id_value][title] = {
-                                "ranks": [index],
-                                "url": url,
-                                "mobileUrl": mobile_url,
-                            }
-                except json.JSONDecodeError:
-                    print(f"解析 {id_value} 响应失败")
-                    failed_ids.append(id_value)
-                except Exception as e:
-                    print(f"处理 {id_value} 数据出错: {e}")
-                    failed_ids.append(id_value)
-            else:
-                failed_ids.append(id_value)
-
-            if i < len(ids_list) - 1:
-                actual_interval = request_interval + random.randint(-10, 20)
-                actual_interval = max(50, actual_interval)
-                time.sleep(actual_interval / 1000)
-
-        print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
-        return results, id_to_name, failed_ids
-
-
-# === 数据处理 ===
-def save_titles_to_file(results: Dict, id_to_name: Dict, failed_ids: List) -> str:
-    """保存标题到文件"""
-    file_path = get_output_path("txt", f"{format_time_filename()}.txt")
-
-    with open(file_path, "w", encoding="utf-8") as f:
-        for id_value, title_data in results.items():
-            # id | name 或 id
-            name = id_to_name.get(id_value)
-            if name and name != id_value:
-                f.write(f"{id_value} | {name}\n")
-            else:
-                f.write(f"{id_value}\n")
-
-            # 按排名排序标题
-            sorted_titles = []
-            for title, info in title_data.items():
-                cleaned_title = clean_title(title)
-                if isinstance(info, dict):
-                    ranks = info.get("ranks", [])
-                    url = info.get("url", "")
-                    mobile_url = info.get("mobileUrl", "")
-                else:
-                    ranks = info if isinstance(info, list) else []
-                    url = ""
-                    mobile_url = ""
-
-                rank = ranks[0] if ranks else 1
-                sorted_titles.append((rank, cleaned_title, url, mobile_url))
-
-            sorted_titles.sort(key=lambda x: x[0])
-
-            for rank, cleaned_title, url, mobile_url in sorted_titles:
-                line = f"{rank}. {cleaned_title}"
-
-                if url:
-                    line += f" [URL:{url}]"
-                if mobile_url:
-                    line += f" [MOBILE:{mobile_url}]"
-                f.write(line + "\n")
-
-            f.write("\n")
-
-        if failed_ids:
-            f.write("==== 以下ID请求失败 ====\n")
-            for id_value in failed_ids:
-                f.write(f"{id_value}\n")
-
-    return file_path
-
-
-def load_frequency_words(
-    frequency_file: Optional[str] = None,
-) -> Tuple[List[Dict], List[str], List[str]]:
-    """
-    加载频率词配置
-
-    Returns:
-        (词组列表, 词组内过滤词, 全局过滤词)
-    """
-    if frequency_file is None:
-        frequency_file = os.environ.get(
-            "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
-        )
-
-    frequency_path = Path(frequency_file)
-    if not frequency_path.exists():
-        raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
-
-    with open(frequency_path, "r", encoding="utf-8") as f:
-        content = f.read()
-
-    word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
-
-    processed_groups = []
-    filter_words = []
-    global_filters = []  # 新增:全局过滤词列表
-
-    # 默认区域(向后兼容)
-    current_section = "WORD_GROUPS"
-
-    for group in word_groups:
-        lines = [line.strip() for line in group.split("\n") if line.strip()]
-
-        if not lines:
-            continue
-
-        # 检查是否为区域标记
-        if lines[0].startswith("[") and lines[0].endswith("]"):
-            section_name = lines[0][1:-1].upper()
-            if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
-                current_section = section_name
-                lines = lines[1:]  # 移除标记行
-
-        # 处理全局过滤区域
-        if current_section == "GLOBAL_FILTER":
-            # 直接添加所有非空行到全局过滤列表
-            for line in lines:
-                # 忽略特殊语法前缀,只提取纯文本
-                if line.startswith(("!", "+", "@")):
-                    continue  # 全局过滤区不支持特殊语法
-                if line:
-                    global_filters.append(line)
-            continue
-
-        # 处理词组区域(保持现有逻辑)
-        words = lines
-
-        group_required_words = []
-        group_normal_words = []
-        group_filter_words = []
-        group_max_count = 0  # 默认不限制
-
-        for word in words:
-            if word.startswith("@"):
-                # 解析最大显示数量(只接受正整数)
-                try:
-                    count = int(word[1:])
-                    if count > 0:
-                        group_max_count = count
-                except (ValueError, IndexError):
-                    pass  # 忽略无效的@数字格式
-            elif word.startswith("!"):
-                filter_words.append(word[1:])
-                group_filter_words.append(word[1:])
-            elif word.startswith("+"):
-                group_required_words.append(word[1:])
-            else:
-                group_normal_words.append(word)
-
-        if group_required_words or group_normal_words:
-            if group_normal_words:
-                group_key = " ".join(group_normal_words)
-            else:
-                group_key = " ".join(group_required_words)
-
-            processed_groups.append(
-                {
-                    "required": group_required_words,
-                    "normal": group_normal_words,
-                    "group_key": group_key,
-                    "max_count": group_max_count,  # 新增字段
-                }
-            )
-
-    return processed_groups, filter_words, global_filters
-
-
-def parse_file_titles(file_path: Path) -> Tuple[Dict, Dict]:
-    """解析单个txt文件的标题数据,返回(titles_by_id, id_to_name)"""
-    titles_by_id = {}
-    id_to_name = {}
-
-    with open(file_path, "r", encoding="utf-8") as f:
-        content = f.read()
-        sections = content.split("\n\n")
-
-        for section in sections:
-            if not section.strip() or "==== 以下ID请求失败 ====" in section:
-                continue
-
-            lines = section.strip().split("\n")
-            if len(lines) < 2:
-                continue
-
-            # id | name 或 id
-            header_line = lines[0].strip()
-            if " | " in header_line:
-                parts = header_line.split(" | ", 1)
-                source_id = parts[0].strip()
-                name = parts[1].strip()
-                id_to_name[source_id] = name
-            else:
-                source_id = header_line
-                id_to_name[source_id] = source_id
-
-            titles_by_id[source_id] = {}
-
-            for line in lines[1:]:
-                if line.strip():
-                    try:
-                        title_part = line.strip()
-                        rank = None
-
-                        # 提取排名
-                        if ". " in title_part and title_part.split(". ")[0].isdigit():
-                            rank_str, title_part = title_part.split(". ", 1)
-                            rank = int(rank_str)
-
-                        # 提取 MOBILE URL
-                        mobile_url = ""
-                        if " [MOBILE:" in title_part:
-                            title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1)
-                            if mobile_part.endswith("]"):
-                                mobile_url = mobile_part[:-1]
-
-                        # 提取 URL
-                        url = ""
-                        if " [URL:" in title_part:
-                            title_part, url_part = title_part.rsplit(" [URL:", 1)
-                            if url_part.endswith("]"):
-                                url = url_part[:-1]
-
-                        title = clean_title(title_part.strip())
-                        ranks = [rank] if rank is not None else [1]
-
-                        titles_by_id[source_id][title] = {
-                            "ranks": ranks,
-                            "url": url,
-                            "mobileUrl": mobile_url,
-                        }
-
-                    except Exception as e:
-                        print(f"解析标题行出错: {line}, 错误: {e}")
-
-    return titles_by_id, id_to_name
-
-
-def read_all_today_titles(
-    current_platform_ids: Optional[List[str]] = None,
-) -> Tuple[Dict, Dict, Dict]:
-    """读取当天所有标题文件,支持按当前监控平台过滤"""
-    date_folder = format_date_folder()
-    txt_dir = Path("output") / date_folder / "txt"
-
-    if not txt_dir.exists():
-        return {}, {}, {}
-
-    all_results = {}
-    final_id_to_name = {}
-    title_info = {}
-
-    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
-
-    for file_path in files:
-        time_info = file_path.stem
-
-        titles_by_id, file_id_to_name = parse_file_titles(file_path)
-
-        if current_platform_ids is not None:
-            filtered_titles_by_id = {}
-            filtered_id_to_name = {}
-
-            for source_id, title_data in titles_by_id.items():
-                if source_id in current_platform_ids:
-                    filtered_titles_by_id[source_id] = title_data
-                    if source_id in file_id_to_name:
-                        filtered_id_to_name[source_id] = file_id_to_name[source_id]
-
-            titles_by_id = filtered_titles_by_id
-            file_id_to_name = filtered_id_to_name
-
-        final_id_to_name.update(file_id_to_name)
-
-        for source_id, title_data in titles_by_id.items():
-            process_source_data(
-                source_id, title_data, time_info, all_results, title_info
-            )
-
-    return all_results, final_id_to_name, title_info
-
-
-def process_source_data(
-    source_id: str,
-    title_data: Dict,
-    time_info: str,
-    all_results: Dict,
-    title_info: Dict,
-) -> None:
-    """处理来源数据,合并重复标题"""
-    if source_id not in all_results:
-        all_results[source_id] = title_data
-
-        if source_id not in title_info:
-            title_info[source_id] = {}
-
-        for title, data in title_data.items():
-            ranks = data.get("ranks", [])
-            url = data.get("url", "")
-            mobile_url = data.get("mobileUrl", "")
-
-            title_info[source_id][title] = {
-                "first_time": time_info,
-                "last_time": time_info,
-                "count": 1,
-                "ranks": ranks,
-                "url": url,
-                "mobileUrl": mobile_url,
-            }
-    else:
-        for title, data in title_data.items():
-            ranks = data.get("ranks", [])
-            url = data.get("url", "")
-            mobile_url = data.get("mobileUrl", "")
-
-            if title not in all_results[source_id]:
-                all_results[source_id][title] = {
-                    "ranks": ranks,
-                    "url": url,
-                    "mobileUrl": mobile_url,
-                }
-                title_info[source_id][title] = {
-                    "first_time": time_info,
-                    "last_time": time_info,
-                    "count": 1,
-                    "ranks": ranks,
-                    "url": url,
-                    "mobileUrl": mobile_url,
-                }
-            else:
-                existing_data = all_results[source_id][title]
-                existing_ranks = existing_data.get("ranks", [])
-                existing_url = existing_data.get("url", "")
-                existing_mobile_url = existing_data.get("mobileUrl", "")
-
-                merged_ranks = existing_ranks.copy()
-                for rank in ranks:
-                    if rank not in merged_ranks:
-                        merged_ranks.append(rank)
-
-                all_results[source_id][title] = {
-                    "ranks": merged_ranks,
-                    "url": existing_url or url,
-                    "mobileUrl": existing_mobile_url or mobile_url,
-                }
-
-                title_info[source_id][title]["last_time"] = time_info
-                title_info[source_id][title]["ranks"] = merged_ranks
-                title_info[source_id][title]["count"] += 1
-                if not title_info[source_id][title].get("url"):
-                    title_info[source_id][title]["url"] = url
-                if not title_info[source_id][title].get("mobileUrl"):
-                    title_info[source_id][title]["mobileUrl"] = mobile_url
-
-
-def detect_latest_new_titles(current_platform_ids: Optional[List[str]] = None) -> Dict:
-    """检测当日最新批次的新增标题,支持按当前监控平台过滤"""
-    date_folder = format_date_folder()
-    txt_dir = Path("output") / date_folder / "txt"
-
-    if not txt_dir.exists():
-        return {}
-
-    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
-    if len(files) < 2:
-        return {}
-
-    # 解析最新文件
-    latest_file = files[-1]
-    latest_titles, _ = parse_file_titles(latest_file)
-
-    # 如果指定了当前平台列表,过滤最新文件数据
-    if current_platform_ids is not None:
-        filtered_latest_titles = {}
-        for source_id, title_data in latest_titles.items():
-            if source_id in current_platform_ids:
-                filtered_latest_titles[source_id] = title_data
-        latest_titles = filtered_latest_titles
-
-    # 汇总历史标题(按平台过滤)
-    historical_titles = {}
-    for file_path in files[:-1]:
-        historical_data, _ = parse_file_titles(file_path)
-
-        # 过滤历史数据
-        if current_platform_ids is not None:
-            filtered_historical_data = {}
-            for source_id, title_data in historical_data.items():
-                if source_id in current_platform_ids:
-                    filtered_historical_data[source_id] = title_data
-            historical_data = filtered_historical_data
-
-        for source_id, titles_data in historical_data.items():
-            if source_id not in historical_titles:
-                historical_titles[source_id] = set()
-            for title in titles_data.keys():
-                historical_titles[source_id].add(title)
-
-    # 找出新增标题
-    new_titles = {}
-    for source_id, latest_source_titles in latest_titles.items():
-        historical_set = historical_titles.get(source_id, set())
-        source_new_titles = {}
-
-        for title, title_data in latest_source_titles.items():
-            if title not in historical_set:
-                source_new_titles[title] = title_data
-
-        if source_new_titles:
-            new_titles[source_id] = source_new_titles
-
-    return new_titles
-
-
-# === 统计和分析 ===
-def calculate_news_weight(
-    title_data: Dict, rank_threshold: int = CONFIG["RANK_THRESHOLD"]
-) -> float:
-    """计算新闻权重,用于排序"""
-    ranks = title_data.get("ranks", [])
-    if not ranks:
-        return 0.0
-
-    count = title_data.get("count", len(ranks))
-    weight_config = CONFIG["WEIGHT_CONFIG"]
-
-    # 排名权重:Σ(11 - min(rank, 10)) / 出现次数
-    rank_scores = []
-    for rank in ranks:
-        score = 11 - min(rank, 10)
-        rank_scores.append(score)
-
-    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
-
-    # 频次权重:min(出现次数, 10) × 10
-    frequency_weight = min(count, 10) * 10
-
-    # 热度加成:高排名次数 / 总出现次数 × 100
-    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
-    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
-    hotness_weight = hotness_ratio * 100
-
-    total_weight = (
-        rank_weight * weight_config["RANK_WEIGHT"]
-        + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
-        + hotness_weight * weight_config["HOTNESS_WEIGHT"]
-    )
-
-    return total_weight
-
-
def matches_word_groups(
    title: str, word_groups: List[Dict], filter_words: List[str], global_filters: Optional[List[str]] = None
) -> bool:
    """Check whether a title passes the filters and matches a word group.

    All comparisons are case-insensitive substring tests. The checks run in
    this order:

    1. A title containing any global filter word is rejected.
    2. With no word groups configured, every remaining title matches
       (``filter_words`` is not consulted in that case).
    3. A title containing any filter word is rejected.
    4. The title matches if, for at least one group, all of its
       ``"required"`` words and (when the list is non-empty) at least one of
       its ``"normal"`` words occur in the title.

    Args:
        title: Title to test. Non-string values are converted with ``str``;
            ``None`` and blank titles never match.
        word_groups: Dicts with ``"required"`` and ``"normal"`` word lists.
        filter_words: Words that reject a title; ``None`` means no filter.
        global_filters: Words that reject a title before anything else.

    Returns:
        ``True`` if the title should be kept, ``False`` otherwise.
    """
    # Defensive conversion: callers may hand over non-string titles.
    if not isinstance(title, str):
        title = "" if title is None else str(title)
    if not title.strip():
        return False

    title_lower = title.lower()

    def contains_any(words) -> bool:
        # An empty string is a substring of every title, so a blank filter
        # entry would reject everything; blank entries are ignored instead.
        return any(
            word.lower() in title_lower
            for word in (words or ())
            if word and word.strip()
        )

    # Global filters take priority over everything else.
    if contains_any(global_filters):
        return False

    # No groups configured: show every title.
    if not word_groups:
        return True

    if contains_any(filter_words):
        return False

    for group in word_groups:
        required_words = group["required"]
        normal_words = group["normal"]

        # Every required word must be present.
        if required_words and not all(
            word.lower() in title_lower for word in required_words
        ):
            continue

        # At least one normal word must be present when any are configured.
        if normal_words and not any(
            word.lower() in title_lower for word in normal_words
        ):
            continue

        return True

    return False
-
-
def format_time_display(first_time: str, last_time: str) -> str:
    """Format the first/last seen times for display.

    Returns ``"[first ~ last]"`` when both times are set and differ, the
    first time alone when the last time is missing or identical, and an
    empty string when there is no first time.
    """
    if first_time and last_time and first_time != last_time:
        return f"[{first_time} ~ {last_time}]"
    return first_time or ""
-
-
def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
    """Render a rank or rank range, highlighted when it reaches the threshold.

    Args:
        ranks: Recorded ranks; duplicates are irrelevant, only the best
            (lowest) and worst (highest) values are shown.
        rank_threshold: The label is highlighted when the best rank is at or
            below this value.
        format_type: Target markup. ``"html"``, ``"feishu"``, ``"telegram"``
            and ``"slack"`` have dedicated highlight markers; every other
            value (including ``"dingtalk"`` and ``"wework"``) uses ``**``.

    Returns:
        ``"[n]"`` or ``"[lo - hi]"``, wrapped in the highlight markers when
        applicable; an empty string when ``ranks`` is empty.
    """
    if not ranks:
        return ""

    # Platform-specific bold/emphasis markers; the fallback is Markdown bold.
    highlight_markup = {
        "html": ("<font color='red'><strong>", "</strong></font>"),
        "feishu": ("<font color='red'>**", "**</font>"),
        "telegram": ("<b>", "</b>"),
        "slack": ("*", "*"),
    }
    opening, closing = highlight_markup.get(format_type, ("**", "**"))

    best, worst = min(ranks), max(ranks)
    label = f"[{best}]" if best == worst else f"[{best} - {worst}]"

    if best <= rank_threshold:
        return f"{opening}{label}{closing}"
    return label
-
-
def _select_current_batch(results: Dict, title_info: Optional[Dict]) -> Dict:
    """Keep only the titles whose ``last_time`` equals the newest one in ``title_info``.

    ``results`` is returned unchanged when ``title_info`` is empty or holds
    no ``last_time`` value at all.
    """
    if not title_info:
        return results

    latest_time = None
    for source_titles in title_info.values():
        for title_data in source_titles.values():
            last_time = title_data.get("last_time", "")
            if last_time and (latest_time is None or last_time > latest_time):
                latest_time = last_time

    if not latest_time:
        return results

    current_batch = {}
    for source_id, source_titles in results.items():
        if source_id not in title_info:
            continue
        known_titles = title_info[source_id]
        kept = {
            title: title_data
            for title, title_data in source_titles.items()
            if title in known_titles
            and known_titles[title].get("last_time") == latest_time
        }
        if kept:
            current_batch[source_id] = kept

    print(
        f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in current_batch.values())} 条当前榜单新闻"
    )
    return current_batch


def _first_matching_group(
    title_lower: str, word_groups: List[Dict], show_all: bool
) -> Optional[Dict]:
    """Return the first group whose required/normal words occur in the title, or ``None``."""
    for group in word_groups:
        required_words = group["required"]
        normal_words = group["normal"]

        # In "show everything" mode the single virtual group takes every title.
        if show_all:
            return group

        if required_words and not all(
            word.lower() in title_lower for word in required_words
        ):
            continue

        if normal_words and not any(
            word.lower() in title_lower for word in normal_words
        ):
            continue

        return group

    return None


def count_word_frequency(
    results: Dict,
    word_groups: List[Dict],
    filter_words: List[str],
    id_to_name: Dict,
    title_info: Optional[Dict] = None,
    rank_threshold: int = CONFIG["RANK_THRESHOLD"],
    new_titles: Optional[Dict] = None,
    mode: str = "daily",
    global_filters: Optional[List[str]] = None,
) -> Tuple[List[Dict], int]:
    """Group matching titles by word group and build per-group statistics.

    Each title that passes ``matches_word_groups`` is assigned to the first
    word group it matches. Which titles are examined depends on ``mode``:

    * ``"incremental"``: all of ``results`` on the first crawl of the day,
      otherwise only ``new_titles``; every examined title is flagged as new.
    * ``"current"``: only titles whose ``last_time`` in ``title_info`` equals
      the newest ``last_time`` found there (all of ``results`` if that cannot
      be determined).
    * anything else (daily summary): all of ``results``.

    Progress/summary lines are printed along the way.

    Args:
        results: ``{source_id: {title: title_data}}`` for the crawl.
        word_groups: Groups with ``"required"``, ``"normal"``, ``"group_key"``
            and optionally ``"max_count"``. When empty, a single virtual
            group collecting every title is used and ``filter_words`` is
            ignored.
        filter_words: Words that exclude a title.
        id_to_name: Maps a source id to its display name.
        title_info: Optional ``{source_id: {title: info}}`` history supplying
            ``first_time``, ``last_time``, ``count``, ``ranks``, ``url`` and
            ``mobileUrl`` for a title.
        rank_threshold: Stored on each entry and used for weighting.
        new_titles: Optional ``{source_id: {title: title_data}}`` of titles
            detected as new.
        mode: ``"daily"``, ``"current"`` or ``"incremental"``.
        global_filters: Words that exclude a title with top priority.

    Returns:
        ``(stats, total_titles)``. ``stats`` holds one dict per word group
        with the keys ``word``, ``count``, ``position``, ``titles`` and
        ``percentage``, ordered according to
        ``CONFIG["SORT_BY_POSITION_FIRST"]``; ``total_titles`` is the number
        of titles examined.
    """
    # Without configured groups, fall back to one virtual group holding all news.
    if not word_groups:
        print("频率词配置为空,将显示所有新闻")
        word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
        filter_words = []  # drop the filter words so that everything is shown

    show_all = len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
    is_first_today = is_first_crawl_today()

    # Decide which titles are examined and whether all of them count as new.
    if mode == "incremental":
        if is_first_today:
            results_to_process = results
        else:
            results_to_process = new_titles if new_titles else {}
        all_news_are_new = True
    elif mode == "current":
        results_to_process = _select_current_batch(results, title_info)
        all_news_are_new = False
    else:
        results_to_process = results
        all_news_are_new = False
        total_input_news = sum(len(titles) for titles in results.values())
        filter_status = "全部显示" if show_all else "频率词过滤"
        print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")

    if title_info is None:
        title_info = {}
    if new_titles is None:
        new_titles = {}

    word_stats = {
        group["group_key"]: {"count": 0, "titles": {}} for group in word_groups
    }
    total_titles = 0
    matched_new_count = 0

    # Matched titles are counted as "new" in incremental mode and on the
    # first crawl of the day in current mode.
    count_as_new = mode == "incremental" or (mode == "current" and is_first_today)

    for source_id, titles_data in results_to_process.items():
        total_titles += len(titles_data)

        for title, title_data in titles_data.items():
            if not matches_word_groups(
                title, word_groups, filter_words, global_filters
            ):
                continue

            if count_as_new:
                matched_new_count += 1

            source_ranks = title_data.get("ranks", [])
            source_url = title_data.get("url", "")
            source_mobile_url = title_data.get("mobileUrl", "")

            # Defensive conversion in case the title is not a string.
            title_lower = title.lower() if isinstance(title, str) else str(title).lower()
            group = _first_matching_group(title_lower, word_groups, show_all)
            if group is None:
                continue

            group_key = group["group_key"]
            word_stats[group_key]["count"] += 1
            source_entries = word_stats[group_key]["titles"].setdefault(source_id, [])

            # Start from the crawl data, then prefer the history when it
            # knows this title.
            first_time = ""
            last_time = ""
            count_info = 1
            ranks = source_ranks if source_ranks else []
            url = source_url
            mobile_url = source_mobile_url

            if (
                title_info
                and source_id in title_info
                and title in title_info[source_id]
            ):
                info = title_info[source_id][title]
                first_time = info.get("first_time", "")
                last_time = info.get("last_time", "")
                count_info = info.get("count", 1)
                if info.get("ranks"):
                    ranks = info["ranks"]
                url = info.get("url", source_url)
                mobile_url = info.get("mobileUrl", source_mobile_url)

            # Placeholder rank for titles without any rank information.
            if not ranks:
                ranks = [99]

            is_new = all_news_are_new or (
                bool(new_titles)
                and source_id in new_titles
                and title in new_titles[source_id]
            )

            source_entries.append(
                {
                    "title": title,
                    "source_name": id_to_name.get(source_id, source_id),
                    "first_time": first_time,
                    "last_time": last_time,
                    "time_display": format_time_display(first_time, last_time),
                    "count": count_info,
                    "ranks": ranks,
                    "rank_threshold": rank_threshold,
                    "url": url,
                    "mobileUrl": mobile_url,
                    "is_new": is_new,
                }
            )

    # Print the summary for the incremental and current modes.
    if mode == "incremental":
        if is_first_today:
            total_input_news = sum(len(titles) for titles in results.values())
            filter_status = "全部显示" if show_all else "频率词匹配"
            print(
                f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
            )
        elif new_titles:
            total_new_count = sum(len(titles) for titles in new_titles.values())
            filter_status = "全部显示" if show_all else "匹配频率词"
            print(
                f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
            )
            if matched_new_count == 0 and len(word_groups) > 1:
                print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
        else:
            print("增量模式:未检测到新增新闻")
    elif mode == "current":
        total_input_news = sum(len(titles) for titles in results_to_process.values())
        filter_status = "全部显示" if show_all else "频率词匹配"
        if is_first_today:
            print(
                f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
            )
        else:
            matched_count = sum(stat["count"] for stat in word_stats.values())
            print(
                f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
            )

    # Configured position and per-group display limit, keyed by group_key.
    group_key_to_position = {
        group["group_key"]: idx for idx, group in enumerate(word_groups)
    }
    group_key_to_max_count = {
        group["group_key"]: group.get("max_count", 0) for group in word_groups
    }

    stats = []
    for group_key, data in word_stats.items():
        all_titles = []
        for title_list in data["titles"].values():
            all_titles.extend(title_list)

        # Heaviest first; ties broken by best rank, then by appearance count.
        sorted_titles = sorted(
            all_titles,
            key=lambda x: (
                -calculate_news_weight(x, rank_threshold),
                min(x["ranks"]) if x["ranks"] else 999,
                -x["count"],
            ),
        )

        # Display limit: the group's own setting wins over the global one.
        group_max_count = group_key_to_max_count.get(group_key, 0)
        if group_max_count == 0:
            group_max_count = CONFIG.get("MAX_NEWS_PER_KEYWORD", 0)
        if group_max_count > 0:
            sorted_titles = sorted_titles[:group_max_count]

        stats.append(
            {
                "word": group_key,
                "count": data["count"],
                "position": group_key_to_position.get(group_key, 999),
                "titles": sorted_titles,
                "percentage": (
                    round(data["count"] / total_titles * 100, 2)
                    if total_titles > 0
                    else 0
                ),
            }
        )

    if CONFIG.get("SORT_BY_POSITION_FIRST", False):
        # Configured position first, then number of hits.
        stats.sort(key=lambda x: (x["position"], -x["count"]))
    else:
        # Number of hits first, then configured position.
        stats.sort(key=lambda x: (-x["count"], x["position"]))

    return stats, total_titles
-
-
-# === 报告生成 ===
def prepare_report_data(
    stats: List[Dict],
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
) -> Dict:
    """Assemble the data structure consumed by the report renderers.

    Args:
        stats: Per-group statistics; groups whose ``"count"`` is not positive
            are dropped.
        failed_ids: Ids of sources that failed; ``None`` becomes ``[]``.
        new_titles: ``{source_id: {title: title_data}}`` of new titles. They
            are filtered with the word groups returned by
            ``load_frequency_words()`` and listed per source.
        id_to_name: Maps a source id to its display name; without it no new
            titles are listed.
        mode: In ``"incremental"`` mode the new-titles section is left empty.

    Returns:
        A dict with the keys ``"stats"``, ``"new_titles"``, ``"failed_ids"``
        and ``"total_new_count"``.
    """
    new_title_sections = []

    # The separate "new titles" section is hidden in incremental mode.
    if mode != "incremental" and new_titles and id_to_name:
        word_groups, filter_words, global_filters = load_frequency_words()

        for source_id, titles_data in new_titles.items():
            source_name = id_to_name.get(source_id, source_id)
            entries = [
                {
                    "title": title,
                    "source_name": source_name,
                    "time_display": "",
                    "count": 1,
                    "ranks": title_data.get("ranks", []),
                    "rank_threshold": CONFIG["RANK_THRESHOLD"],
                    "url": title_data.get("url", ""),
                    "mobile_url": title_data.get("mobileUrl", ""),
                    "is_new": True,
                }
                for title, title_data in titles_data.items()
                if matches_word_groups(title, word_groups, filter_words, global_filters)
            ]
            if entries:
                new_title_sections.append(
                    {
                        "source_id": source_id,
                        "source_name": source_name,
                        "titles": entries,
                    }
                )

    # Normalise the per-title keys (note "mobileUrl" -> "mobile_url").
    report_stats = [
        {
            "word": stat["word"],
            "count": stat["count"],
            "percentage": stat.get("percentage", 0),
            "titles": [
                {
                    "title": entry["title"],
                    "source_name": entry["source_name"],
                    "time_display": entry["time_display"],
                    "count": entry["count"],
                    "ranks": entry["ranks"],
                    "rank_threshold": entry["rank_threshold"],
                    "url": entry.get("url", ""),
                    "mobile_url": entry.get("mobileUrl", ""),
                    "is_new": entry.get("is_new", False),
                }
                for entry in stat["titles"]
            ],
        }
        for stat in stats
        if stat["count"] > 0
    ]

    return {
        "stats": report_stats,
        "new_titles": new_title_sections,
        "failed_ids": failed_ids or [],
        "total_new_count": sum(
            len(section["titles"]) for section in new_title_sections
        ),
    }
-
-
# Per-platform templates for the trailing time and count fragments:
# platform -> (time template, count template).
_TITLE_SUFFIX_TEMPLATES = {
    "feishu": (" <font color='grey'>- {}</font>", " <font color='green'>({}次)</font>"),
    "dingtalk": (" - {}", " ({}次)"),
    "wework": (" - {}", " ({}次)"),
    "bark": (" - {}", " ({}次)"),
    "telegram": (" <code>- {}</code>", " <code>({}次)</code>"),
    "ntfy": (" `- {}`", " `({}次)`"),
    "slack": (" `- {}`", " `({}次)`"),
}


def _format_html_title(
    title_data: Dict, cleaned_title: str, link_url: str, rank_display: str
) -> str:
    """Render a title for the HTML report; all dynamic text is HTML-escaped."""
    escaped_title = html_escape(cleaned_title)
    escaped_source_name = html_escape(title_data["source_name"])

    if link_url:
        escaped_url = html_escape(link_url)
        formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
    else:
        formatted_title = (
            f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
        )

    if rank_display:
        formatted_title += f" {rank_display}"
    if title_data["time_display"]:
        escaped_time = html_escape(title_data["time_display"])
        formatted_title += f" <font color='grey'>- {escaped_time}</font>"
    if title_data["count"] > 1:
        formatted_title += f" <font color='green'>({title_data['count']}次)</font>"

    if title_data.get("is_new"):
        formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"

    return formatted_title


def format_title_for_platform(
    platform: str, title_data: Dict, show_source: bool = True
) -> str:
    """Format one news title for a notification platform or the HTML report.

    The result is composed of, in order: the source name (optional), a
    ``🆕`` marker for new titles, the title itself (as a link when a URL is
    available, preferring the mobile URL), the rank display, the time
    display and the appearance count when it exceeds 1.

    Args:
        platform: ``"feishu"``, ``"dingtalk"``, ``"wework"``, ``"bark"``,
            ``"telegram"``, ``"ntfy"``, ``"slack"`` or ``"html"``. For any
            other value only the cleaned title is returned.
        title_data: Dict with the keys ``title``, ``source_name``,
            ``time_display``, ``count``, ``ranks``, ``rank_threshold``,
            ``url``, ``mobile_url`` and optionally ``is_new``.
        show_source: Whether to prefix the source name. The ``"html"``
            output always includes it.

    Returns:
        The formatted title string.
    """
    rank_display = format_rank_display(
        title_data["ranks"], title_data["rank_threshold"], platform
    )
    link_url = title_data["mobile_url"] or title_data["url"]
    cleaned_title = clean_title(title_data["title"])

    if platform == "html":
        return _format_html_title(title_data, cleaned_title, link_url, rank_display)

    templates = _TITLE_SUFFIX_TEMPLATES.get(platform)
    if templates is None:
        return cleaned_title
    time_template, count_template = templates

    if platform == "telegram":
        # Telegram messages use HTML markup, so the title text must be
        # escaped whether or not it is wrapped in a link.
        safe_title = html_escape(cleaned_title)
        formatted_title = (
            f'<a href="{link_url}">{safe_title}</a>' if link_url else safe_title
        )
    elif not link_url:
        formatted_title = cleaned_title
    elif platform == "slack":
        # Slack mrkdwn link syntax: <url|text>
        formatted_title = f"<{link_url}|{cleaned_title}>"
    else:
        formatted_title = f"[{cleaned_title}]({link_url})"

    title_prefix = "🆕 " if title_data.get("is_new") else ""
    result = f"{title_prefix}{formatted_title}"

    if show_source:
        source_label = f"[{title_data['source_name']}]"
        if platform == "feishu":
            source_label = f"<font color='grey'>{source_label}</font>"
        result = f"{source_label} {result}"

    if rank_display:
        result += f" {rank_display}"
    if title_data["time_display"]:
        result += time_template.format(title_data["time_display"])
    if title_data["count"] > 1:
        result += count_template.format(title_data["count"])

    return result
-
-
def generate_html_report(
    stats: List[Dict],
    total_titles: int,
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    is_daily_summary: bool = False,
    update_info: Optional[Dict] = None,
) -> str:
    """Render the HTML report and write it to disk.

    The report is written to the path returned by
    ``get_output_path("html", filename)``. Daily summaries use a fixed,
    mode-dependent file name and are additionally written to ``index.html``
    in the working directory and to ``output/index.html``; other reports are
    named after ``format_time_filename()``.

    Returns:
        The path of the report file obtained from ``get_output_path``.
    """

    def write_report(target) -> None:
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(html_content)

    if not is_daily_summary:
        filename = f"{format_time_filename()}.html"
    else:
        summary_names = {
            "current": "当前榜单汇总.html",
            "incremental": "当日增量.html",
        }
        filename = summary_names.get(mode, "当日汇总.html")

    file_path = get_output_path("html", filename)

    report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode)
    html_content = render_html_content(
        report_data, total_titles, is_daily_summary, mode, update_info
    )

    write_report(file_path)

    if is_daily_summary:
        # Copy in the working directory (served by GitHub Pages).
        write_report(Path("index.html"))

        # Copy in the output directory (reachable through a Docker volume).
        output_index_path = Path("output") / "index.html"
        ensure_directory_exists("output")
        write_report(output_index_path)

    return file_path
-
-
-def render_html_content(
-    report_data: Dict,
-    total_titles: int,
-    is_daily_summary: bool = False,
-    mode: str = "daily",
-    update_info: Optional[Dict] = None,
-) -> str:
-    """渲染HTML内容"""
-    html = """
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <meta charset="UTF-8">
-        <meta name="viewport" content="width=device-width, initial-scale=1.0">
-        <title>热点新闻分析</title>
-        <script src="https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.4.1/html2canvas.min.js" integrity="sha512-BNaRQnYJYiPSqHHDb58B0yaPfCu+Wgds8Gp/gU33kqBtgNS4tSPHuGibyoeqMV/TJlSKda6FXzoEyYGjTe+vXA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
-        <style>
-            * { box-sizing: border-box; }
-            body { 
-                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif;
-                margin: 0; 
-                padding: 16px; 
-                background: #fafafa;
-                color: #333;
-                line-height: 1.5;
-            }
-            
-            .container {
-                max-width: 600px;
-                margin: 0 auto;
-                background: white;
-                border-radius: 12px;
-                overflow: hidden;
-                box-shadow: 0 2px 16px rgba(0,0,0,0.06);
-            }
-            
-            .header {
-                background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
-                color: white;
-                padding: 32px 24px;
-                text-align: center;
-                position: relative;
-            }
-            
-            .save-buttons {
-                position: absolute;
-                top: 16px;
-                right: 16px;
-                display: flex;
-                gap: 8px;
-            }
-            
-            .save-btn {
-                background: rgba(255, 255, 255, 0.2);
-                border: 1px solid rgba(255, 255, 255, 0.3);
-                color: white;
-                padding: 8px 16px;
-                border-radius: 6px;
-                cursor: pointer;
-                font-size: 13px;
-                font-weight: 500;
-                transition: all 0.2s ease;
-                backdrop-filter: blur(10px);
-                white-space: nowrap;
-            }
-            
-            .save-btn:hover {
-                background: rgba(255, 255, 255, 0.3);
-                border-color: rgba(255, 255, 255, 0.5);
-                transform: translateY(-1px);
-            }
-            
-            .save-btn:active {
-                transform: translateY(0);
-            }
-            
-            .save-btn:disabled {
-                opacity: 0.6;
-                cursor: not-allowed;
-            }
-            
-            .header-title {
-                font-size: 22px;
-                font-weight: 700;
-                margin: 0 0 20px 0;
-            }
-            
-            .header-info {
-                display: grid;
-                grid-template-columns: 1fr 1fr;
-                gap: 16px;
-                font-size: 14px;
-                opacity: 0.95;
-            }
-            
-            .info-item {
-                text-align: center;
-            }
-            
-            .info-label {
-                display: block;
-                font-size: 12px;
-                opacity: 0.8;
-                margin-bottom: 4px;
-            }
-            
-            .info-value {
-                font-weight: 600;
-                font-size: 16px;
-            }
-            
-            .content {
-                padding: 24px;
-            }
-            
-            .word-group {
-                margin-bottom: 40px;
-            }
-            
-            .word-group:first-child {
-                margin-top: 0;
-            }
-            
-            .word-header {
-                display: flex;
-                align-items: center;
-                justify-content: space-between;
-                margin-bottom: 20px;
-                padding-bottom: 8px;
-                border-bottom: 1px solid #f0f0f0;
-            }
-            
-            .word-info {
-                display: flex;
-                align-items: center;
-                gap: 12px;
-            }
-            
-            .word-name {
-                font-size: 17px;
-                font-weight: 600;
-                color: #1a1a1a;
-            }
-            
-            .word-count {
-                color: #666;
-                font-size: 13px;
-                font-weight: 500;
-            }
-            
-            .word-count.hot { color: #dc2626; font-weight: 600; }
-            .word-count.warm { color: #ea580c; font-weight: 600; }
-            
-            .word-index {
-                color: #999;
-                font-size: 12px;
-            }
-            
-            .news-item {
-                margin-bottom: 20px;
-                padding: 16px 0;
-                border-bottom: 1px solid #f5f5f5;
-                position: relative;
-                display: flex;
-                gap: 12px;
-                align-items: center;
-            }
-            
-            .news-item:last-child {
-                border-bottom: none;
-            }
-            
-            .news-item.new::after {
-                content: "NEW";
-                position: absolute;
-                top: 12px;
-                right: 0;
-                background: #fbbf24;
-                color: #92400e;
-                font-size: 9px;
-                font-weight: 700;
-                padding: 3px 6px;
-                border-radius: 4px;
-                letter-spacing: 0.5px;
-            }
-            
-            .news-number {
-                color: #999;
-                font-size: 13px;
-                font-weight: 600;
-                min-width: 20px;
-                text-align: center;
-                flex-shrink: 0;
-                background: #f8f9fa;
-                border-radius: 50%;
-                width: 24px;
-                height: 24px;
-                display: flex;
-                align-items: center;
-                justify-content: center;
-                align-self: flex-start;
-                margin-top: 8px;
-            }
-            
-            .news-content {
-                flex: 1;
-                min-width: 0;
-                padding-right: 40px;
-            }
-            
-            .news-item.new .news-content {
-                padding-right: 50px;
-            }
-            
-            .news-header {
-                display: flex;
-                align-items: center;
-                gap: 8px;
-                margin-bottom: 8px;
-                flex-wrap: wrap;
-            }
-            
-            .source-name {
-                color: #666;
-                font-size: 12px;
-                font-weight: 500;
-            }
-            
-            .rank-num {
-                color: #fff;
-                background: #6b7280;
-                font-size: 10px;
-                font-weight: 700;
-                padding: 2px 6px;
-                border-radius: 10px;
-                min-width: 18px;
-                text-align: center;
-            }
-            
-            .rank-num.top { background: #dc2626; }
-            .rank-num.high { background: #ea580c; }
-            
-            .time-info {
-                color: #999;
-                font-size: 11px;
-            }
-            
-            .count-info {
-                color: #059669;
-                font-size: 11px;
-                font-weight: 500;
-            }
-            
-            .news-title {
-                font-size: 15px;
-                line-height: 1.4;
-                color: #1a1a1a;
-                margin: 0;
-            }
-            
-            .news-link {
-                color: #2563eb;
-                text-decoration: none;
-            }
-            
-            .news-link:hover {
-                text-decoration: underline;
-            }
-            
-            .news-link:visited {
-                color: #7c3aed;
-            }
-            
-            .new-section {
-                margin-top: 40px;
-                padding-top: 24px;
-                border-top: 2px solid #f0f0f0;
-            }
-            
-            .new-section-title {
-                color: #1a1a1a;
-                font-size: 16px;
-                font-weight: 600;
-                margin: 0 0 20px 0;
-            }
-            
-            .new-source-group {
-                margin-bottom: 24px;
-            }
-            
-            .new-source-title {
-                color: #666;
-                font-size: 13px;
-                font-weight: 500;
-                margin: 0 0 12px 0;
-                padding-bottom: 6px;
-                border-bottom: 1px solid #f5f5f5;
-            }
-            
-            .new-item {
-                display: flex;
-                align-items: center;
-                gap: 12px;
-                padding: 8px 0;
-                border-bottom: 1px solid #f9f9f9;
-            }
-            
-            .new-item:last-child {
-                border-bottom: none;
-            }
-            
-            .new-item-number {
-                color: #999;
-                font-size: 12px;
-                font-weight: 600;
-                min-width: 18px;
-                text-align: center;
-                flex-shrink: 0;
-                background: #f8f9fa;
-                border-radius: 50%;
-                width: 20px;
-                height: 20px;
-                display: flex;
-                align-items: center;
-                justify-content: center;
-            }
-            
-            .new-item-rank {
-                color: #fff;
-                background: #6b7280;
-                font-size: 10px;
-                font-weight: 700;
-                padding: 3px 6px;
-                border-radius: 8px;
-                min-width: 20px;
-                text-align: center;
-                flex-shrink: 0;
-            }
-            
-            .new-item-rank.top { background: #dc2626; }
-            .new-item-rank.high { background: #ea580c; }
-            
-            .new-item-content {
-                flex: 1;
-                min-width: 0;
-            }
-            
-            .new-item-title {
-                font-size: 14px;
-                line-height: 1.4;
-                color: #1a1a1a;
-                margin: 0;
-            }
-            
-            .error-section {
-                background: #fef2f2;
-                border: 1px solid #fecaca;
-                border-radius: 8px;
-                padding: 16px;
-                margin-bottom: 24px;
-            }
-            
-            .error-title {
-                color: #dc2626;
-                font-size: 14px;
-                font-weight: 600;
-                margin: 0 0 8px 0;
-            }
-            
-            .error-list {
-                list-style: none;
-                padding: 0;
-                margin: 0;
-            }
-            
-            .error-item {
-                color: #991b1b;
-                font-size: 13px;
-                padding: 2px 0;
-                font-family: 'SF Mono', Consolas, monospace;
-            }
-            
-            .footer {
-                margin-top: 32px;
-                padding: 20px 24px;
-                background: #f8f9fa;
-                border-top: 1px solid #e5e7eb;
-                text-align: center;
-            }
-            
-            .footer-content {
-                font-size: 13px;
-                color: #6b7280;
-                line-height: 1.6;
-            }
-            
-            .footer-link {
-                color: #4f46e5;
-                text-decoration: none;
-                font-weight: 500;
-                transition: color 0.2s ease;
-            }
-            
-            .footer-link:hover {
-                color: #7c3aed;
-                text-decoration: underline;
-            }
-            
-            .project-name {
-                font-weight: 600;
-                color: #374151;
-            }
-            
-            @media (max-width: 480px) {
-                body { padding: 12px; }
-                .header { padding: 24px 20px; }
-                .content { padding: 20px; }
-                .footer { padding: 16px 20px; }
-                .header-info { grid-template-columns: 1fr; gap: 12px; }
-                .news-header { gap: 6px; }
-                .news-content { padding-right: 45px; }
-                .news-item { gap: 8px; }
-                .new-item { gap: 8px; }
-                .news-number { width: 20px; height: 20px; font-size: 12px; }
-                .save-buttons {
-                    position: static;
-                    margin-bottom: 16px;
-                    display: flex;
-                    gap: 8px;
-                    justify-content: center;
-                    flex-direction: column;
-                    width: 100%;
-                }
-                .save-btn {
-                    width: 100%;
-                }
-            }
-        </style>
-    </head>
-    <body>
-        <div class="container">
-            <div class="header">
-                <div class="save-buttons">
-                    <button class="save-btn" onclick="saveAsImage()">保存为图片</button>
-                    <button class="save-btn" onclick="saveAsMultipleImages()">分段保存</button>
-                </div>
-                <div class="header-title">热点新闻分析</div>
-                <div class="header-info">
-                    <div class="info-item">
-                        <span class="info-label">报告类型</span>
-                        <span class="info-value">"""
-
-    # 处理报告类型显示
-    if is_daily_summary:
-        if mode == "current":
-            html += "当前榜单"
-        elif mode == "incremental":
-            html += "增量模式"
-        else:
-            html += "当日汇总"
-    else:
-        html += "实时分析"
-
-    html += """</span>
-                    </div>
-                    <div class="info-item">
-                        <span class="info-label">新闻总数</span>
-                        <span class="info-value">"""
-
-    html += f"{total_titles} 条"
-
-    # 计算筛选后的热点新闻数量
-    hot_news_count = sum(len(stat["titles"]) for stat in report_data["stats"])
-
-    html += """</span>
-                    </div>
-                    <div class="info-item">
-                        <span class="info-label">热点新闻</span>
-                        <span class="info-value">"""
-
-    html += f"{hot_news_count} 条"
-
-    html += """</span>
-                    </div>
-                    <div class="info-item">
-                        <span class="info-label">生成时间</span>
-                        <span class="info-value">"""
-
-    now = get_beijing_time()
-    html += now.strftime("%m-%d %H:%M")
-
-    html += """</span>
-                    </div>
-                </div>
-            </div>
-            
-            <div class="content">"""
-
-    # 处理失败ID错误信息
-    if report_data["failed_ids"]:
-        html += """
-                <div class="error-section">
-                    <div class="error-title">⚠️ 请求失败的平台</div>
-                    <ul class="error-list">"""
-        for id_value in report_data["failed_ids"]:
-            html += f'<li class="error-item">{html_escape(id_value)}</li>'
-        html += """
-                    </ul>
-                </div>"""
-
-    # 生成热点词汇统计部分的HTML
-    stats_html = ""
-    if report_data["stats"]:
-        total_count = len(report_data["stats"])
-
-        for i, stat in enumerate(report_data["stats"], 1):
-            count = stat["count"]
-
-            # 确定热度等级
-            if count >= 10:
-                count_class = "hot"
-            elif count >= 5:
-                count_class = "warm"
-            else:
-                count_class = ""
-
-            escaped_word = html_escape(stat["word"])
-
-            stats_html += f"""
-                <div class="word-group">
-                    <div class="word-header">
-                        <div class="word-info">
-                            <div class="word-name">{escaped_word}</div>
-                            <div class="word-count {count_class}">{count} 条</div>
-                        </div>
-                        <div class="word-index">{i}/{total_count}</div>
-                    </div>"""
-
-            # 处理每个词组下的新闻标题,给每条新闻标上序号
-            for j, title_data in enumerate(stat["titles"], 1):
-                is_new = title_data.get("is_new", False)
-                new_class = "new" if is_new else ""
-
-                stats_html += f"""
-                    <div class="news-item {new_class}">
-                        <div class="news-number">{j}</div>
-                        <div class="news-content">
-                            <div class="news-header">
-                                <span class="source-name">{html_escape(title_data["source_name"])}</span>"""
-
-                # 处理排名显示
-                ranks = title_data.get("ranks", [])
-                if ranks:
-                    min_rank = min(ranks)
-                    max_rank = max(ranks)
-                    rank_threshold = title_data.get("rank_threshold", 10)
-
-                    # 确定排名等级
-                    if min_rank <= 3:
-                        rank_class = "top"
-                    elif min_rank <= rank_threshold:
-                        rank_class = "high"
-                    else:
-                        rank_class = ""
-
-                    if min_rank == max_rank:
-                        rank_text = str(min_rank)
-                    else:
-                        rank_text = f"{min_rank}-{max_rank}"
-
-                    stats_html += f'<span class="rank-num {rank_class}">{rank_text}</span>'
-
-                # 处理时间显示
-                time_display = title_data.get("time_display", "")
-                if time_display:
-                    # 简化时间显示格式,将波浪线替换为~
-                    simplified_time = (
-                        time_display.replace(" ~ ", "~")
-                        .replace("[", "")
-                        .replace("]", "")
-                    )
-                    stats_html += (
-                        f'<span class="time-info">{html_escape(simplified_time)}</span>'
-                    )
-
-                # 处理出现次数
-                count_info = title_data.get("count", 1)
-                if count_info > 1:
-                    stats_html += f'<span class="count-info">{count_info}次</span>'
-
-                stats_html += """
-                            </div>
-                            <div class="news-title">"""
-
-                # 处理标题和链接
-                escaped_title = html_escape(title_data["title"])
-                link_url = title_data.get("mobile_url") or title_data.get("url", "")
-
-                if link_url:
-                    escaped_url = html_escape(link_url)
-                    stats_html += f'<a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
-                else:
-                    stats_html += escaped_title
-
-                stats_html += """
-                            </div>
-                        </div>
-                    </div>"""
-
-            stats_html += """
-                </div>"""
-
-    # 生成新增新闻区域的HTML
-    new_titles_html = ""
-    if report_data["new_titles"]:
-        new_titles_html += f"""
-                <div class="new-section">
-                    <div class="new-section-title">本次新增热点 (共 {report_data['total_new_count']} 条)</div>"""
-
-        for source_data in report_data["new_titles"]:
-            escaped_source = html_escape(source_data["source_name"])
-            titles_count = len(source_data["titles"])
-
-            new_titles_html += f"""
-                    <div class="new-source-group">
-                        <div class="new-source-title">{escaped_source} · {titles_count}条</div>"""
-
-            # 为新增新闻也添加序号
-            for idx, title_data in enumerate(source_data["titles"], 1):
-                ranks = title_data.get("ranks", [])
-
-                # 处理新增新闻的排名显示
-                rank_class = ""
-                if ranks:
-                    min_rank = min(ranks)
-                    if min_rank <= 3:
-                        rank_class = "top"
-                    elif min_rank <= title_data.get("rank_threshold", 10):
-                        rank_class = "high"
-
-                    if len(ranks) == 1:
-                        rank_text = str(ranks[0])
-                    else:
-                        rank_text = f"{min(ranks)}-{max(ranks)}"
-                else:
-                    rank_text = "?"
-
-                new_titles_html += f"""
-                        <div class="new-item">
-                            <div class="new-item-number">{idx}</div>
-                            <div class="new-item-rank {rank_class}">{rank_text}</div>
-                            <div class="new-item-content">
-                                <div class="new-item-title">"""
-
-                # 处理新增新闻的链接
-                escaped_title = html_escape(title_data["title"])
-                link_url = title_data.get("mobile_url") or title_data.get("url", "")
-
-                if link_url:
-                    escaped_url = html_escape(link_url)
-                    new_titles_html += f'<a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
-                else:
-                    new_titles_html += escaped_title
-
-                new_titles_html += """
-                                </div>
-                            </div>
-                        </div>"""
-
-            new_titles_html += """
-                    </div>"""
-
-        new_titles_html += """
-                </div>"""
-
-    # 根据配置决定内容顺序
-    if CONFIG.get("REVERSE_CONTENT_ORDER", False):
-        # 新增热点在前,热点词汇统计在后
-        html += new_titles_html + stats_html
-    else:
-        # 默认:热点词汇统计在前,新增热点在后
-        html += stats_html + new_titles_html
-
-    html += """
-            </div>
-            
-            <div class="footer">
-                <div class="footer-content">
-                    由 <span class="project-name">TrendRadar</span> 生成 · 
-                    <a href="https://github.com/sansan0/TrendRadar" target="_blank" class="footer-link">
-                        GitHub 开源项目
-                    </a>"""
-
-    if update_info:
-        html += f"""
-                    <br>
-                    <span style="color: #ea580c; font-weight: 500;">
-                        发现新版本 {update_info['remote_version']},当前版本 {update_info['current_version']}
-                    </span>"""
-
-    html += """
-                </div>
-            </div>
-        </div>
-        
-        <script>
-            async function saveAsImage() {
-                const button = event.target;
-                const originalText = button.textContent;
-                
-                try {
-                    button.textContent = '生成中...';
-                    button.disabled = true;
-                    window.scrollTo(0, 0);
-                    
-                    // 等待页面稳定
-                    await new Promise(resolve => setTimeout(resolve, 200));
-                    
-                    // 截图前隐藏按钮
-                    const buttons = document.querySelector('.save-buttons');
-                    buttons.style.visibility = 'hidden';
-                    
-                    // 再次等待确保按钮完全隐藏
-                    await new Promise(resolve => setTimeout(resolve, 100));
-                    
-                    const container = document.querySelector('.container');
-                    
-                    const canvas = await html2canvas(container, {
-                        backgroundColor: '#ffffff',
-                        scale: 1.5,
-                        useCORS: true,
-                        allowTaint: false,
-                        imageTimeout: 10000,
-                        removeContainer: false,
-                        foreignObjectRendering: false,
-                        logging: false,
-                        width: container.offsetWidth,
-                        height: container.offsetHeight,
-                        x: 0,
-                        y: 0,
-                        scrollX: 0,
-                        scrollY: 0,
-                        windowWidth: window.innerWidth,
-                        windowHeight: window.innerHeight
-                    });
-                    
-                    buttons.style.visibility = 'visible';
-                    
-                    const link = document.createElement('a');
-                    const now = new Date();
-                    const filename = `TrendRadar_热点新闻分析_${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}.png`;
-                    
-                    link.download = filename;
-                    link.href = canvas.toDataURL('image/png', 1.0);
-                    
-                    // 触发下载
-                    document.body.appendChild(link);
-                    link.click();
-                    document.body.removeChild(link);
-                    
-                    button.textContent = '保存成功!';
-                    setTimeout(() => {
-                        button.textContent = originalText;
-                        button.disabled = false;
-                    }, 2000);
-                    
-                } catch (error) {
-                    const buttons = document.querySelector('.save-buttons');
-                    buttons.style.visibility = 'visible';
-                    button.textContent = '保存失败';
-                    setTimeout(() => {
-                        button.textContent = originalText;
-                        button.disabled = false;
-                    }, 2000);
-                }
-            }
-            
-            async function saveAsMultipleImages() {
-                const button = event.target;
-                const originalText = button.textContent;
-                const container = document.querySelector('.container');
-                const scale = 1.5; 
-                const maxHeight = 5000 / scale;
-                
-                try {
-                    button.textContent = '分析中...';
-                    button.disabled = true;
-                    
-                    // 获取所有可能的分割元素
-                    const newsItems = Array.from(container.querySelectorAll('.news-item'));
-                    const wordGroups = Array.from(container.querySelectorAll('.word-group'));
-                    const newSection = container.querySelector('.new-section');
-                    const errorSection = container.querySelector('.error-section');
-                    const header = container.querySelector('.header');
-                    const footer = container.querySelector('.footer');
-                    
-                    // 计算元素位置和高度
-                    const containerRect = container.getBoundingClientRect();
-                    const elements = [];
-                    
-                    // 添加header作为必须包含的元素
-                    elements.push({
-                        type: 'header',
-                        element: header,
-                        top: 0,
-                        bottom: header.offsetHeight,
-                        height: header.offsetHeight
-                    });
-                    
-                    // 添加错误信息(如果存在)
-                    if (errorSection) {
-                        const rect = errorSection.getBoundingClientRect();
-                        elements.push({
-                            type: 'error',
-                            element: errorSection,
-                            top: rect.top - containerRect.top,
-                            bottom: rect.bottom - containerRect.top,
-                            height: rect.height
-                        });
-                    }
-                    
-                    // 按word-group分组处理news-item
-                    wordGroups.forEach(group => {
-                        const groupRect = group.getBoundingClientRect();
-                        const groupNewsItems = group.querySelectorAll('.news-item');
-                        
-                        // 添加word-group的header部分
-                        const wordHeader = group.querySelector('.word-header');
-                        if (wordHeader) {
-                            const headerRect = wordHeader.getBoundingClientRect();
-                            elements.push({
-                                type: 'word-header',
-                                element: wordHeader,
-                                parent: group,
-                                top: groupRect.top - containerRect.top,
-                                bottom: headerRect.bottom - containerRect.top,
-                                height: headerRect.height
-                            });
-                        }
-                        
-                        // 添加每个news-item
-                        groupNewsItems.forEach(item => {
-                            const rect = item.getBoundingClientRect();
-                            elements.push({
-                                type: 'news-item',
-                                element: item,
-                                parent: group,
-                                top: rect.top - containerRect.top,
-                                bottom: rect.bottom - containerRect.top,
-                                height: rect.height
-                            });
-                        });
-                    });
-                    
-                    // 添加新增新闻部分
-                    if (newSection) {
-                        const rect = newSection.getBoundingClientRect();
-                        elements.push({
-                            type: 'new-section',
-                            element: newSection,
-                            top: rect.top - containerRect.top,
-                            bottom: rect.bottom - containerRect.top,
-                            height: rect.height
-                        });
-                    }
-                    
-                    // 添加footer
-                    const footerRect = footer.getBoundingClientRect();
-                    elements.push({
-                        type: 'footer',
-                        element: footer,
-                        top: footerRect.top - containerRect.top,
-                        bottom: footerRect.bottom - containerRect.top,
-                        height: footer.offsetHeight
-                    });
-                    
-                    // 计算分割点
-                    const segments = [];
-                    let currentSegment = { start: 0, end: 0, height: 0, includeHeader: true };
-                    let headerHeight = header.offsetHeight;
-                    currentSegment.height = headerHeight;
-                    
-                    for (let i = 1; i < elements.length; i++) {
-                        const element = elements[i];
-                        const potentialHeight = element.bottom - currentSegment.start;
-                        
-                        // 检查是否需要创建新分段
-                        if (potentialHeight > maxHeight && currentSegment.height > headerHeight) {
-                            // 在前一个元素结束处分割
-                            currentSegment.end = elements[i - 1].bottom;
-                            segments.push(currentSegment);
-                            
-                            // 开始新分段
-                            currentSegment = {
-                                start: currentSegment.end,
-                                end: 0,
-                                height: element.bottom - currentSegment.end,
-                                includeHeader: false
-                            };
-                        } else {
-                            currentSegment.height = potentialHeight;
-                            currentSegment.end = element.bottom;
-                        }
-                    }
-                    
-                    // 添加最后一个分段
-                    if (currentSegment.height > 0) {
-                        currentSegment.end = container.offsetHeight;
-                        segments.push(currentSegment);
-                    }
-                    
-                    button.textContent = `生成中 (0/${segments.length})...`;
-                    
-                    // 隐藏保存按钮
-                    const buttons = document.querySelector('.save-buttons');
-                    buttons.style.visibility = 'hidden';
-                    
-                    // 为每个分段生成图片
-                    const images = [];
-                    for (let i = 0; i < segments.length; i++) {
-                        const segment = segments[i];
-                        button.textContent = `生成中 (${i + 1}/${segments.length})...`;
-                        
-                        // 创建临时容器用于截图
-                        const tempContainer = document.createElement('div');
-                        tempContainer.style.cssText = `
-                            position: absolute;
-                            left: -9999px;
-                            top: 0;
-                            width: ${container.offsetWidth}px;
-                            background: white;
-                        `;
-                        tempContainer.className = 'container';
-                        
-                        // 克隆容器内容
-                        const clonedContainer = container.cloneNode(true);
-                        
-                        // 移除克隆内容中的保存按钮
-                        const clonedButtons = clonedContainer.querySelector('.save-buttons');
-                        if (clonedButtons) {
-                            clonedButtons.style.display = 'none';
-                        }
-                        
-                        tempContainer.appendChild(clonedContainer);
-                        document.body.appendChild(tempContainer);
-                        
-                        // 等待DOM更新
-                        await new Promise(resolve => setTimeout(resolve, 100));
-                        
-                        // 使用html2canvas截取特定区域
-                        const canvas = await html2canvas(clonedContainer, {
-                            backgroundColor: '#ffffff',
-                            scale: scale,
-                            useCORS: true,
-                            allowTaint: false,
-                            imageTimeout: 10000,
-                            logging: false,
-                            width: container.offsetWidth,
-                            height: segment.end - segment.start,
-                            x: 0,
-                            y: segment.start,
-                            windowWidth: window.innerWidth,
-                            windowHeight: window.innerHeight
-                        });
-                        
-                        images.push(canvas.toDataURL('image/png', 1.0));
-                        
-                        // 清理临时容器
-                        document.body.removeChild(tempContainer);
-                    }
-                    
-                    // 恢复按钮显示
-                    buttons.style.visibility = 'visible';
-                    
-                    // 下载所有图片
-                    const now = new Date();
-                    const baseFilename = `TrendRadar_热点新闻分析_${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}`;
-                    
-                    for (let i = 0; i < images.length; i++) {
-                        const link = document.createElement('a');
-                        link.download = `${baseFilename}_part${i + 1}.png`;
-                        link.href = images[i];
-                        document.body.appendChild(link);
-                        link.click();
-                        document.body.removeChild(link);
-                        
-                        // 延迟一下避免浏览器阻止多个下载
-                        await new Promise(resolve => setTimeout(resolve, 100));
-                    }
-                    
-                    button.textContent = `已保存 ${segments.length} 张图片!`;
-                    setTimeout(() => {
-                        button.textContent = originalText;
-                        button.disabled = false;
-                    }, 2000);
-                    
-                } catch (error) {
-                    console.error('分段保存失败:', error);
-                    const buttons = document.querySelector('.save-buttons');
-                    buttons.style.visibility = 'visible';
-                    button.textContent = '保存失败';
-                    setTimeout(() => {
-                        button.textContent = originalText;
-                        button.disabled = false;
-                    }, 2000);
-                }
-            }
-            
-            document.addEventListener('DOMContentLoaded', function() {
-                window.scrollTo(0, 0);
-            });
-        </script>
-    </body>
-    </html>
-    """
-
-    return html
-
-
def render_feishu_content(
    report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily"
) -> str:
    """Render the report as a single Feishu message body.

    Args:
        report_data: Report dict. This function reads the keys ``stats``
            (items with ``word``, ``count``, ``titles``), ``new_titles``
            (items with ``source_name``, ``titles``), ``total_new_count``
            and ``failed_ids``.
        update_info: Optional dict with ``remote_version`` and
            ``current_version``; when truthy a new-version notice is appended.
        mode: ``"incremental"`` or ``"current"`` select a mode-specific
            wording for the empty-report placeholder; any other value uses
            the generic wording.

    Returns:
        The rendered message text, always ending with an update-time footer.
    """

    def separator() -> str:
        # Looked up lazily so the config key is only required when a
        # separator is actually emitted.
        return f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"

    # Keyword statistics section.
    stats = report_data["stats"]
    stats_parts = []
    if stats:
        stats_parts.append("📊 **热点词汇统计**\n\n")
        total_count = len(stats)

        for i, stat in enumerate(stats):
            word = stat["word"]
            count = stat["count"]
            sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"

            # Counts of 10+ and 5+ get a different icon and a coloured number.
            if count >= 10:
                stats_parts.append(
                    f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
                )
            elif count >= 5:
                stats_parts.append(
                    f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
                )
            else:
                stats_parts.append(f"📌 {sequence_display} **{word}** : {count} 条\n\n")

            titles = stat["titles"]
            for j, title_data in enumerate(titles, 1):
                formatted_title = format_title_for_platform(
                    "feishu", title_data, show_source=True
                )
                stats_parts.append(f"  {j}. {formatted_title}\n")
                # Blank line between titles, but not after the last one.
                if j < len(titles):
                    stats_parts.append("\n")

            # Separator between keyword groups, but not after the last group.
            if i < total_count - 1:
                stats_parts.append(separator())
    stats_content = "".join(stats_parts)

    # Newly appeared titles section.
    new_parts = []
    if report_data["new_titles"]:
        new_parts.append(
            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        )

        for source_data in report_data["new_titles"]:
            new_parts.append(
                f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
            )

            for j, title_data in enumerate(source_data["titles"], 1):
                # Format a copy with is_new cleared; presumably this keeps the
                # formatter from adding a per-title "new" marker inside a
                # section that is new by definition (confirm against
                # format_title_for_platform).
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False
                formatted_title = format_title_for_platform(
                    "feishu", title_data_copy, show_source=False
                )
                new_parts.append(f"  {j}. {formatted_title}\n")

            new_parts.append("\n")
    new_titles_content = "".join(new_parts)

    # Section order is configurable: by default statistics come first, with
    # REVERSE_CONTENT_ORDER the new titles come first.
    if CONFIG.get("REVERSE_CONTENT_ORDER", False):
        ordered_sections = [new_titles_content, stats_content]
    else:
        ordered_sections = [stats_content, new_titles_content]
    present_sections = [section for section in ordered_sections if section]

    if len(present_sections) > 1:
        text_content = separator().join(present_sections)
    else:
        text_content = "".join(present_sections)

    # Remember whether real content exists *before* the placeholder is added.
    # The previous implementation searched the rendered text for a sentinel
    # phrase, which misfired when a title contained that phrase and when the
    # incremental-mode placeholder (which does not contain it) was used.
    has_content = bool(text_content)

    if not has_content:
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        text_content = f"📭 {mode_text}\n\n"

    # Platforms whose data could not be fetched.
    if report_data["failed_ids"]:
        if has_content:
            text_content += separator()

        text_content += "⚠️ **数据获取失败的平台:**\n\n"
        text_content += "".join(
            f"  • <font color='red'>{id_value}</font>\n"
            for id_value in report_data["failed_ids"]
        )

    # Footer: timestamp plus optional new-version notice.
    now = get_beijing_time()
    text_content += (
        f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
    )

    if update_info:
        text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"

    return text_content
-
-
def render_dingtalk_content(
    report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily"
) -> str:
    """Render the report as a single DingTalk markdown message body.

    Args:
        report_data: Report dict. This function reads the keys ``stats``
            (items with ``word``, ``count``, ``titles``), ``new_titles``
            (items with ``source_name``, ``titles``), ``total_new_count``
            and ``failed_ids``.
        update_info: Optional dict with ``remote_version`` and
            ``current_version``; when truthy a new-version notice is appended.
        mode: ``"incremental"`` or ``"current"`` select a mode-specific
            wording for the empty-report placeholder; any other value uses
            the generic wording.

    Returns:
        The rendered message text: header, sections, optional failed-platform
        list and an update-time footer.
    """
    separator = "\n---\n\n"

    # Only keyword groups with a positive count contribute to the total.
    total_titles = sum(
        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
    )
    now = get_beijing_time()

    # Header block.
    header_content = (
        f"**总新闻数:** {total_titles}\n\n"
        f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        "**类型:** 热点分析报告\n\n"
        "---\n\n"
    )

    # Keyword statistics section.
    stats = report_data["stats"]
    stats_parts = []
    if stats:
        stats_parts.append("📊 **热点词汇统计**\n\n")
        total_count = len(stats)

        for i, stat in enumerate(stats):
            word = stat["word"]
            count = stat["count"]
            sequence_display = f"[{i + 1}/{total_count}]"

            # Counts of 10+ and 5+ get a different icon and a bold number.
            if count >= 10:
                stats_parts.append(
                    f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
                )
            elif count >= 5:
                stats_parts.append(
                    f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
                )
            else:
                stats_parts.append(f"📌 {sequence_display} **{word}** : {count} 条\n\n")

            titles = stat["titles"]
            for j, title_data in enumerate(titles, 1):
                formatted_title = format_title_for_platform(
                    "dingtalk", title_data, show_source=True
                )
                stats_parts.append(f"  {j}. {formatted_title}\n")
                # Blank line between titles, but not after the last one.
                if j < len(titles):
                    stats_parts.append("\n")

            # Rule between keyword groups, but not after the last group.
            if i < total_count - 1:
                stats_parts.append(separator)
    stats_content = "".join(stats_parts)

    # Newly appeared titles section.
    new_parts = []
    if report_data["new_titles"]:
        new_parts.append(
            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        )

        for source_data in report_data["new_titles"]:
            new_parts.append(
                f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
            )

            for j, title_data in enumerate(source_data["titles"], 1):
                # Format a copy with is_new cleared; presumably this keeps the
                # formatter from adding a per-title "new" marker inside a
                # section that is new by definition (confirm against
                # format_title_for_platform).
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False
                formatted_title = format_title_for_platform(
                    "dingtalk", title_data_copy, show_source=False
                )
                new_parts.append(f"  {j}. {formatted_title}\n")

            new_parts.append("\n")
    new_titles_content = "".join(new_parts)

    # Section order is configurable: by default statistics come first, with
    # REVERSE_CONTENT_ORDER the new titles come first.
    if CONFIG.get("REVERSE_CONTENT_ORDER", False):
        ordered_sections = [new_titles_content, stats_content]
    else:
        ordered_sections = [stats_content, new_titles_content]
    present_sections = [section for section in ordered_sections if section]

    text_content = header_content + separator.join(present_sections)

    # Track real content explicitly. The previous implementation searched the
    # rendered text for a sentinel phrase, which misfired when a title
    # contained that phrase and when the incremental-mode placeholder (which
    # does not contain it) was used.
    has_content = bool(present_sections)

    if not has_content:
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        text_content += f"📭 {mode_text}\n\n"

    # Platforms whose data could not be fetched.
    if report_data["failed_ids"]:
        if has_content:
            text_content += separator

        text_content += "⚠️ **数据获取失败的平台:**\n\n"
        text_content += "".join(
            f"  • **{id_value}**\n" for id_value in report_data["failed_ids"]
        )

    # Footer: timestamp plus optional new-version notice.
    text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"

    if update_info:
        text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"

    return text_content
-
-
-def _get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
-    """根据 format_type 生成对应格式的批次头部"""
-    if format_type == "telegram":
-        return f"<b>[第 {batch_num}/{total_batches} 批次]</b>\n\n"
-    elif format_type == "slack":
-        return f"*[第 {batch_num}/{total_batches} 批次]*\n\n"
-    elif format_type in ("wework_text", "bark"):
-        # 企业微信文本模式和 Bark 使用纯文本格式
-        return f"[第 {batch_num}/{total_batches} 批次]\n\n"
-    else:
-        # 飞书、钉钉、ntfy、企业微信 markdown 模式
-        return f"**[第 {batch_num}/{total_batches} 批次]**\n\n"
-
-
def _get_max_batch_header_size(format_type: str) -> int:
    """Return the UTF-8 byte length of the longest expected batch header.

    The worst case is taken to be batch 99 of 99, so the estimate holds for
    up to 99 batches. Callers can reserve this many bytes up front instead
    of truncating content after the header is added.
    """
    return len(_get_batch_header(format_type, 99, 99).encode("utf-8"))
-
-
-def _truncate_to_bytes(text: str, max_bytes: int) -> str:
-    """安全截断字符串到指定字节数,避免截断多字节字符"""
-    text_bytes = text.encode("utf-8")
-    if len(text_bytes) <= max_bytes:
-        return text
-
-    # 截断到指定字节数
-    truncated = text_bytes[:max_bytes]
-
-    # 处理可能的不完整 UTF-8 字符
-    for i in range(min(4, len(truncated))):
-        try:
-            return truncated[: len(truncated) - i].decode("utf-8")
-        except UnicodeDecodeError:
-            continue
-
-    # 极端情况:返回空字符串
-    return ""
-
-
def add_batch_headers(
    batches: List[str], format_type: str, max_bytes: int
) -> List[str]:
    """Prefix each batch with a "[第 x/y 批次]" header, keeping it within ``max_bytes``.

    Args:
        batches: The batch bodies, in send order.
        format_type: Push channel type (e.g. bark, telegram, feishu); selects
            the header markup.
        max_bytes: Maximum UTF-8 byte size allowed for one batch on that
            channel, header included.

    Returns:
        ``batches`` itself, unchanged, when it holds zero or one batch.
        Otherwise a new list in which every batch is prefixed with its
        header and, where needed, truncated so header plus content stays
        within ``max_bytes``. A warning is printed for each truncated batch.
    """
    if len(batches) <= 1:
        return batches

    total = len(batches)
    result = []

    for i, content in enumerate(batches, 1):
        header = _get_batch_header(format_type, i, total)
        header_size = len(header.encode("utf-8"))

        # Bytes left for the content once the header is accounted for.
        # Clamped at 0: if the header alone exceeds the limit, a negative
        # budget must not reach the truncation helper, where it would act as
        # a slice-from-the-end and keep most of the content.
        max_content_size = max(0, max_bytes - header_size)
        content_size = len(content.encode("utf-8"))

        if content_size > max_content_size:
            print(
                f"警告:{format_type} 第 {i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节"
            )
            content = _truncate_to_bytes(content, max_content_size)

        result.append(header + content)

    return result
-
-
-def split_content_into_batches(
-    report_data: Dict,
-    format_type: str,
-    update_info: Optional[Dict] = None,
-    max_bytes: int = None,
-    mode: str = "daily",
-) -> List[str]:
-    """分批处理消息内容,确保词组标题+至少第一条新闻的完整性"""
-    if max_bytes is None:
-        if format_type == "dingtalk":
-            max_bytes = CONFIG.get("DINGTALK_BATCH_SIZE", 20000)
-        elif format_type == "feishu":
-            max_bytes = CONFIG.get("FEISHU_BATCH_SIZE", 29000)
-        elif format_type == "ntfy":
-            max_bytes = 3800
-        else:
-            max_bytes = CONFIG.get("MESSAGE_BATCH_SIZE", 4000)
-
-    batches = []
-
-    total_titles = sum(
-        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
-    )
-    now = get_beijing_time()
-
-    base_header = ""
-    if format_type in ("wework", "bark"):
-        base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
-    elif format_type == "telegram":
-        base_header = f"总新闻数: {total_titles}\n\n"
-    elif format_type == "ntfy":
-        base_header = f"**总新闻数:** {total_titles}\n\n"
-    elif format_type == "feishu":
-        base_header = ""
-    elif format_type == "dingtalk":
-        base_header = f"**总新闻数:** {total_titles}\n\n"
-        base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-        base_header += f"**类型:** 热点分析报告\n\n"
-        base_header += "---\n\n"
-    elif format_type == "slack":
-        base_header = f"*总新闻数:* {total_titles}\n\n"
-
-    base_footer = ""
-    if format_type in ("wework", "bark"):
-        base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
-        if update_info:
-            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
-    elif format_type == "telegram":
-        base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
-        if update_info:
-            base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
-    elif format_type == "ntfy":
-        base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
-        if update_info:
-            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
-    elif format_type == "feishu":
-        base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
-        if update_info:
-            base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
-    elif format_type == "dingtalk":
-        base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
-        if update_info:
-            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
-    elif format_type == "slack":
-        base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
-        if update_info:
-            base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
-
-    stats_header = ""
-    if report_data["stats"]:
-        if format_type in ("wework", "bark"):
-            stats_header = f"📊 **热点词汇统计**\n\n"
-        elif format_type == "telegram":
-            stats_header = f"📊 热点词汇统计\n\n"
-        elif format_type == "ntfy":
-            stats_header = f"📊 **热点词汇统计**\n\n"
-        elif format_type == "feishu":
-            stats_header = f"📊 **热点词汇统计**\n\n"
-        elif format_type == "dingtalk":
-            stats_header = f"📊 **热点词汇统计**\n\n"
-        elif format_type == "slack":
-            stats_header = f"📊 *热点词汇统计*\n\n"
-
-    current_batch = base_header
-    current_batch_has_content = False
-
-    if (
-        not report_data["stats"]
-        and not report_data["new_titles"]
-        and not report_data["failed_ids"]
-    ):
-        if mode == "incremental":
-            mode_text = "增量模式下暂无新增匹配的热点词汇"
-        elif mode == "current":
-            mode_text = "当前榜单模式下暂无匹配的热点词汇"
-        else:
-            mode_text = "暂无匹配的热点词汇"
-        simple_content = f"📭 {mode_text}\n\n"
-        final_content = base_header + simple_content + base_footer
-        batches.append(final_content)
-        return batches
-
-    # 定义处理热点词汇统计的函数
-    def process_stats_section(current_batch, current_batch_has_content, batches):
-        """处理热点词汇统计"""
-        if not report_data["stats"]:
-            return current_batch, current_batch_has_content, batches
-
-        total_count = len(report_data["stats"])
-
-        # 添加统计标题
-        test_content = current_batch + stats_header
-        if (
-            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-            < max_bytes
-        ):
-            current_batch = test_content
-            current_batch_has_content = True
-        else:
-            if current_batch_has_content:
-                batches.append(current_batch + base_footer)
-            current_batch = base_header + stats_header
-            current_batch_has_content = True
-
-        # 逐个处理词组(确保词组标题+第一条新闻的原子性)
-        for i, stat in enumerate(report_data["stats"]):
-            word = stat["word"]
-            count = stat["count"]
-            sequence_display = f"[{i + 1}/{total_count}]"
-
-            # 构建词组标题
-            word_header = ""
-            if format_type in ("wework", "bark"):
-                if count >= 10:
-                    word_header = (
-                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
-                    )
-                elif count >= 5:
-                    word_header = (
-                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
-                    )
-                else:
-                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
-            elif format_type == "telegram":
-                if count >= 10:
-                    word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
-                elif count >= 5:
-                    word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
-                else:
-                    word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
-            elif format_type == "ntfy":
-                if count >= 10:
-                    word_header = (
-                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
-                    )
-                elif count >= 5:
-                    word_header = (
-                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
-                    )
-                else:
-                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
-            elif format_type == "feishu":
-                if count >= 10:
-                    word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
-                elif count >= 5:
-                    word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
-                else:
-                    word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
-            elif format_type == "dingtalk":
-                if count >= 10:
-                    word_header = (
-                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
-                    )
-                elif count >= 5:
-                    word_header = (
-                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
-                    )
-                else:
-                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
-            elif format_type == "slack":
-                if count >= 10:
-                    word_header = (
-                        f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
-                    )
-                elif count >= 5:
-                    word_header = (
-                        f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
-                    )
-                else:
-                    word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
-
-            # 构建第一条新闻
-            first_news_line = ""
-            if stat["titles"]:
-                first_title_data = stat["titles"][0]
-                if format_type in ("wework", "bark"):
-                    formatted_title = format_title_for_platform(
-                        "wework", first_title_data, show_source=True
-                    )
-                elif format_type == "telegram":
-                    formatted_title = format_title_for_platform(
-                        "telegram", first_title_data, show_source=True
-                    )
-                elif format_type == "ntfy":
-                    formatted_title = format_title_for_platform(
-                        "ntfy", first_title_data, show_source=True
-                    )
-                elif format_type == "feishu":
-                    formatted_title = format_title_for_platform(
-                        "feishu", first_title_data, show_source=True
-                    )
-                elif format_type == "dingtalk":
-                    formatted_title = format_title_for_platform(
-                        "dingtalk", first_title_data, show_source=True
-                    )
-                elif format_type == "slack":
-                    formatted_title = format_title_for_platform(
-                        "slack", first_title_data, show_source=True
-                    )
-                else:
-                    formatted_title = f"{first_title_data['title']}"
-
-                first_news_line = f"  1. {formatted_title}\n"
-                if len(stat["titles"]) > 1:
-                    first_news_line += "\n"
-
-            # 原子性检查:词组标题+第一条新闻必须一起处理
-            word_with_first_news = word_header + first_news_line
-            test_content = current_batch + word_with_first_news
-
-            if (
-                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-                >= max_bytes
-            ):
-                # 当前批次容纳不下,开启新批次
-                if current_batch_has_content:
-                    batches.append(current_batch + base_footer)
-                current_batch = base_header + stats_header + word_with_first_news
-                current_batch_has_content = True
-                start_index = 1
-            else:
-                current_batch = test_content
-                current_batch_has_content = True
-                start_index = 1
-
-            # 处理剩余新闻条目
-            for j in range(start_index, len(stat["titles"])):
-                title_data = stat["titles"][j]
-                if format_type in ("wework", "bark"):
-                    formatted_title = format_title_for_platform(
-                        "wework", title_data, show_source=True
-                    )
-                elif format_type == "telegram":
-                    formatted_title = format_title_for_platform(
-                        "telegram", title_data, show_source=True
-                    )
-                elif format_type == "ntfy":
-                    formatted_title = format_title_for_platform(
-                        "ntfy", title_data, show_source=True
-                    )
-                elif format_type == "feishu":
-                    formatted_title = format_title_for_platform(
-                        "feishu", title_data, show_source=True
-                    )
-                elif format_type == "dingtalk":
-                    formatted_title = format_title_for_platform(
-                        "dingtalk", title_data, show_source=True
-                    )
-                elif format_type == "slack":
-                    formatted_title = format_title_for_platform(
-                        "slack", title_data, show_source=True
-                    )
-                else:
-                    formatted_title = f"{title_data['title']}"
-
-                news_line = f"  {j + 1}. {formatted_title}\n"
-                if j < len(stat["titles"]) - 1:
-                    news_line += "\n"
-
-                test_content = current_batch + news_line
-                if (
-                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-                    >= max_bytes
-                ):
-                    if current_batch_has_content:
-                        batches.append(current_batch + base_footer)
-                    current_batch = base_header + stats_header + word_header + news_line
-                    current_batch_has_content = True
-                else:
-                    current_batch = test_content
-                    current_batch_has_content = True
-
-            # 词组间分隔符
-            if i < len(report_data["stats"]) - 1:
-                separator = ""
-                if format_type in ("wework", "bark"):
-                    separator = f"\n\n\n\n"
-                elif format_type == "telegram":
-                    separator = f"\n\n"
-                elif format_type == "ntfy":
-                    separator = f"\n\n"
-                elif format_type == "feishu":
-                    separator = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
-                elif format_type == "dingtalk":
-                    separator = f"\n---\n\n"
-                elif format_type == "slack":
-                    separator = f"\n\n"
-
-                test_content = current_batch + separator
-                if (
-                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-                    < max_bytes
-                ):
-                    current_batch = test_content
-
-        return current_batch, current_batch_has_content, batches
-
-    # 定义处理新增新闻的函数
-    def process_new_titles_section(current_batch, current_batch_has_content, batches):
-        """处理新增新闻"""
-        if not report_data["new_titles"]:
-            return current_batch, current_batch_has_content, batches
-
-        new_header = ""
-        if format_type in ("wework", "bark"):
-            new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
-        elif format_type == "telegram":
-            new_header = (
-                f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
-            )
-        elif format_type == "ntfy":
-            new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
-        elif format_type == "feishu":
-            new_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
-        elif format_type == "dingtalk":
-            new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
-        elif format_type == "slack":
-            new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
-
-        test_content = current_batch + new_header
-        if (
-            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-            >= max_bytes
-        ):
-            if current_batch_has_content:
-                batches.append(current_batch + base_footer)
-            current_batch = base_header + new_header
-            current_batch_has_content = True
-        else:
-            current_batch = test_content
-            current_batch_has_content = True
-
-        # 逐个处理新增新闻来源
-        for source_data in report_data["new_titles"]:
-            source_header = ""
-            if format_type in ("wework", "bark"):
-                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
-            elif format_type == "telegram":
-                source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
-            elif format_type == "ntfy":
-                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
-            elif format_type == "feishu":
-                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
-            elif format_type == "dingtalk":
-                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
-            elif format_type == "slack":
-                source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
-
-            # 构建第一条新增新闻
-            first_news_line = ""
-            if source_data["titles"]:
-                first_title_data = source_data["titles"][0]
-                title_data_copy = first_title_data.copy()
-                title_data_copy["is_new"] = False
-
-                if format_type in ("wework", "bark"):
-                    formatted_title = format_title_for_platform(
-                        "wework", title_data_copy, show_source=False
-                    )
-                elif format_type == "telegram":
-                    formatted_title = format_title_for_platform(
-                        "telegram", title_data_copy, show_source=False
-                    )
-                elif format_type == "feishu":
-                    formatted_title = format_title_for_platform(
-                        "feishu", title_data_copy, show_source=False
-                    )
-                elif format_type == "dingtalk":
-                    formatted_title = format_title_for_platform(
-                        "dingtalk", title_data_copy, show_source=False
-                    )
-                elif format_type == "slack":
-                    formatted_title = format_title_for_platform(
-                        "slack", title_data_copy, show_source=False
-                    )
-                else:
-                    formatted_title = f"{title_data_copy['title']}"
-
-                first_news_line = f"  1. {formatted_title}\n"
-
-            # 原子性检查:来源标题+第一条新闻
-            source_with_first_news = source_header + first_news_line
-            test_content = current_batch + source_with_first_news
-
-            if (
-                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-                >= max_bytes
-            ):
-                if current_batch_has_content:
-                    batches.append(current_batch + base_footer)
-                current_batch = base_header + new_header + source_with_first_news
-                current_batch_has_content = True
-                start_index = 1
-            else:
-                current_batch = test_content
-                current_batch_has_content = True
-                start_index = 1
-
-            # 处理剩余新增新闻
-            for j in range(start_index, len(source_data["titles"])):
-                title_data = source_data["titles"][j]
-                title_data_copy = title_data.copy()
-                title_data_copy["is_new"] = False
-
-                if format_type == "wework":
-                    formatted_title = format_title_for_platform(
-                        "wework", title_data_copy, show_source=False
-                    )
-                elif format_type == "telegram":
-                    formatted_title = format_title_for_platform(
-                        "telegram", title_data_copy, show_source=False
-                    )
-                elif format_type == "feishu":
-                    formatted_title = format_title_for_platform(
-                        "feishu", title_data_copy, show_source=False
-                    )
-                elif format_type == "dingtalk":
-                    formatted_title = format_title_for_platform(
-                        "dingtalk", title_data_copy, show_source=False
-                    )
-                elif format_type == "slack":
-                    formatted_title = format_title_for_platform(
-                        "slack", title_data_copy, show_source=False
-                    )
-                else:
-                    formatted_title = f"{title_data_copy['title']}"
-
-                news_line = f"  {j + 1}. {formatted_title}\n"
-
-                test_content = current_batch + news_line
-                if (
-                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-                    >= max_bytes
-                ):
-                    if current_batch_has_content:
-                        batches.append(current_batch + base_footer)
-                    current_batch = base_header + new_header + source_header + news_line
-                    current_batch_has_content = True
-                else:
-                    current_batch = test_content
-                    current_batch_has_content = True
-
-            current_batch += "\n"
-
-        return current_batch, current_batch_has_content, batches
-
-    # 根据配置决定处理顺序
-    if CONFIG.get("REVERSE_CONTENT_ORDER", False):
-        # 新增热点在前,热点词汇统计在后
-        current_batch, current_batch_has_content, batches = process_new_titles_section(
-            current_batch, current_batch_has_content, batches
-        )
-        current_batch, current_batch_has_content, batches = process_stats_section(
-            current_batch, current_batch_has_content, batches
-        )
-    else:
-        # 默认:热点词汇统计在前,新增热点在后
-        current_batch, current_batch_has_content, batches = process_stats_section(
-            current_batch, current_batch_has_content, batches
-        )
-        current_batch, current_batch_has_content, batches = process_new_titles_section(
-            current_batch, current_batch_has_content, batches
-        )
-
-    if report_data["failed_ids"]:
-        failed_header = ""
-        if format_type == "wework":
-            failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
-        elif format_type == "telegram":
-            failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
-        elif format_type == "ntfy":
-            failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
-        elif format_type == "feishu":
-            failed_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n⚠️ **数据获取失败的平台:**\n\n"
-        elif format_type == "dingtalk":
-            failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
-
-        test_content = current_batch + failed_header
-        if (
-            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-            >= max_bytes
-        ):
-            if current_batch_has_content:
-                batches.append(current_batch + base_footer)
-            current_batch = base_header + failed_header
-            current_batch_has_content = True
-        else:
-            current_batch = test_content
-            current_batch_has_content = True
-
-        for i, id_value in enumerate(report_data["failed_ids"], 1):
-            if format_type == "feishu":
-                failed_line = f"  • <font color='red'>{id_value}</font>\n"
-            elif format_type == "dingtalk":
-                failed_line = f"  • **{id_value}**\n"
-            else:
-                failed_line = f"  • {id_value}\n"
-
-            test_content = current_batch + failed_line
-            if (
-                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
-                >= max_bytes
-            ):
-                if current_batch_has_content:
-                    batches.append(current_batch + base_footer)
-                current_batch = base_header + failed_header + failed_line
-                current_batch_has_content = True
-            else:
-                current_batch = test_content
-                current_batch_has_content = True
-
-    # 完成最后批次
-    if current_batch_has_content:
-        batches.append(current_batch + base_footer)
-
-    return batches
-
-
def send_to_notifications(
    stats: List[Dict],
    failed_ids: Optional[List] = None,
    report_type: str = "当日汇总",
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    update_info: Optional[Dict] = None,
    proxy_url: Optional[str] = None,
    mode: str = "daily",
    html_file_path: Optional[str] = None,
) -> Dict[str, bool]:
    """Fan the report out to every configured notification channel.

    Each channel may hold several accounts; a channel is reported as
    successful when at least one of its accounts accepted the message.

    Returns:
        A mapping of channel key ("feishu", "dingtalk", "wework",
        "telegram", "ntfy", "bark", "slack", "email") to its success flag.
        Channels that are not configured are absent. The mapping is empty
        when the push-window rules suppress this push.
    """
    results = {}
    max_accounts = CONFIG["MAX_ACCOUNTS_PER_CHANNEL"]

    # Push-window gate: bail out before any work when pushing is not allowed.
    if CONFIG["PUSH_WINDOW"]["ENABLED"]:
        push_manager = PushRecordManager()
        time_range_start = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["START"]
        time_range_end = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["END"]

        if not push_manager.is_in_time_range(time_range_start, time_range_end):
            now = get_beijing_time()
            print(
                f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
            )
            return results

        if CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"]:
            if push_manager.has_pushed_today():
                print(f"推送窗口控制:今天已推送过,跳过本次推送")
                return results
            print(f"推送窗口控制:今天首次推送")

    report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode)

    update_info_to_send = update_info if CONFIG["SHOW_VERSION_UPDATE"] else None

    def _dispatch_webhooks(raw_setting, channel_label, result_key, sender):
        """Send to every account of a single-URL webhook channel."""
        targets = parse_multi_account_config(raw_setting)
        if not targets:
            return
        targets = limit_accounts(targets, max_accounts, channel_label)
        several = len(targets) > 1
        outcomes = []
        for position, target in enumerate(targets, 1):
            if not target:  # skip blank entries
                continue
            label = f"账号{position}" if several else ""
            outcomes.append(
                sender(
                    target, report_data, report_type, update_info_to_send, proxy_url, mode, label
                )
            )
        results[result_key] = any(outcomes)

    # Feishu, DingTalk and WeWork share the same single-URL webhook shape.
    _dispatch_webhooks(CONFIG["FEISHU_WEBHOOK_URL"], "飞书", "feishu", send_to_feishu)
    _dispatch_webhooks(CONFIG["DINGTALK_WEBHOOK_URL"], "钉钉", "dingtalk", send_to_dingtalk)
    _dispatch_webhooks(CONFIG["WEWORK_WEBHOOK_URL"], "企业微信", "wework", send_to_wework)

    # Telegram: bot tokens and chat ids must pair up one-to-one.
    telegram_tokens = parse_multi_account_config(CONFIG["TELEGRAM_BOT_TOKEN"])
    telegram_chat_ids = parse_multi_account_config(CONFIG["TELEGRAM_CHAT_ID"])
    if telegram_tokens and telegram_chat_ids:
        valid, count = validate_paired_configs(
            {"bot_token": telegram_tokens, "chat_id": telegram_chat_ids},
            "Telegram",
            required_keys=["bot_token", "chat_id"]
        )
        if valid and count > 0:
            telegram_tokens = limit_accounts(telegram_tokens, max_accounts, "Telegram")
            # Keep both lists the same length after the account limit.
            telegram_chat_ids = telegram_chat_ids[:len(telegram_tokens)]
            several = len(telegram_tokens) > 1
            telegram_results = []
            for idx, token in enumerate(telegram_tokens):
                chat_id = telegram_chat_ids[idx]
                if not (token and chat_id):
                    continue
                account_label = f"账号{idx+1}" if several else ""
                telegram_results.append(
                    send_to_telegram(
                        token, chat_id, report_data, report_type,
                        update_info_to_send, proxy_url, mode, account_label
                    )
                )
            results["telegram"] = any(telegram_results)

    # ntfy: topics are mandatory, tokens optional but must match in count.
    ntfy_server_url = CONFIG["NTFY_SERVER_URL"]
    ntfy_topics = parse_multi_account_config(CONFIG["NTFY_TOPIC"])
    ntfy_tokens = parse_multi_account_config(CONFIG["NTFY_TOKEN"])
    if ntfy_server_url and ntfy_topics:
        if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics):
            print(f"❌ ntfy 配置错误:topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送")
        else:
            ntfy_topics = limit_accounts(ntfy_topics, max_accounts, "ntfy")
            if ntfy_tokens:
                ntfy_tokens = ntfy_tokens[:len(ntfy_topics)]
            several = len(ntfy_topics) > 1
            ntfy_results = []
            for idx, topic in enumerate(ntfy_topics):
                if not topic:
                    continue
                token = get_account_at_index(ntfy_tokens, idx, "") if ntfy_tokens else ""
                account_label = f"账号{idx+1}" if several else ""
                ntfy_results.append(
                    send_to_ntfy(
                        ntfy_server_url, topic, token, report_data, report_type,
                        update_info_to_send, proxy_url, mode, account_label
                    )
                )
            results["ntfy"] = any(ntfy_results)

    # Bark and Slack reuse the single-URL webhook path.
    _dispatch_webhooks(CONFIG["BARK_URL"], "Bark", "bark", send_to_bark)
    _dispatch_webhooks(CONFIG["SLACK_WEBHOOK_URL"], "Slack", "slack", send_to_slack)

    # Email: one message, recipients handled inside send_to_email.
    email_from = CONFIG["EMAIL_FROM"]
    email_password = CONFIG["EMAIL_PASSWORD"]
    email_to = CONFIG["EMAIL_TO"]
    email_smtp_server = CONFIG.get("EMAIL_SMTP_SERVER", "")
    email_smtp_port = CONFIG.get("EMAIL_SMTP_PORT", "")
    if email_from and email_password and email_to:
        results["email"] = send_to_email(
            email_from,
            email_password,
            email_to,
            report_type,
            html_file_path,
            email_smtp_server,
            email_smtp_port,
        )

    if not results:
        print("未配置任何通知渠道,跳过通知发送")

    # Record the push only when once-per-day is active and something got through.
    once_per_day_active = (
        CONFIG["PUSH_WINDOW"]["ENABLED"] and CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"]
    )
    if once_per_day_active and any(results.values()):
        push_manager = PushRecordManager()
        push_manager.record_push(report_type)

    return results
-
-
def send_to_feishu(
    webhook_url: str,
    report_data: Dict,
    report_type: str,
    update_info: Optional[Dict] = None,
    proxy_url: Optional[str] = None,
    mode: str = "daily",
    account_label: str = "",
) -> bool:
    """Post the report to a Feishu webhook, one request per batch.

    Returns True only when every batch was accepted; the first failing
    batch aborts the remaining ones and yields False.
    """
    headers = {"Content-Type": "application/json"}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None

    # Prefix used in every log line; includes the account label when given.
    log_prefix = f"飞书{account_label or ''}"

    # Split with the Feishu-specific size, minus room for the batch header
    # that add_batch_headers prepends afterwards.
    feishu_batch_size = CONFIG.get("FEISHU_BATCH_SIZE", 29000)
    header_reserve = _get_max_batch_header_size("feishu")
    batches = split_content_into_batches(
        report_data,
        "feishu",
        update_info,
        max_bytes=feishu_batch_size - header_reserve,
        mode=mode,
    )
    batches = add_batch_headers(batches, "feishu", feishu_batch_size)

    batch_total = len(batches)
    print(f"{log_prefix}消息分为 {batch_total} 批次发送 [{report_type}]")

    for seq, chunk in enumerate(batches, 1):
        chunk_bytes = len(chunk.encode("utf-8"))
        print(
            f"发送{log_prefix}第 {seq}/{batch_total} 批次,大小:{chunk_bytes} 字节 [{report_type}]"
        )

        total_titles = sum(
            len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
        )
        now = get_beijing_time()

        payload = {
            "msg_type": "text",
            "content": {
                "total_titles": total_titles,
                "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
                "report_type": report_type,
                "text": chunk,
            },
        }

        try:
            response = requests.post(
                webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
            )
            if response.status_code != 200:
                print(
                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],状态码:{response.status_code}"
                )
                return False

            result = response.json()
            # Success is signalled by either "StatusCode" or "code" being 0.
            if result.get("StatusCode") != 0 and result.get("code") != 0:
                error_msg = result.get("msg") or result.get("StatusMessage", "未知错误")
                print(
                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],错误:{error_msg}"
                )
                return False

            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送成功 [{report_type}]")
            # Pause between batches, but not after the last one.
            if seq < batch_total:
                time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
        except Exception as e:
            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送出错 [{report_type}]:{e}")
            return False

    print(f"{log_prefix}所有 {batch_total} 批次发送完成 [{report_type}]")
    return True
-
-
def send_to_dingtalk(
    webhook_url: str,
    report_data: Dict,
    report_type: str,
    update_info: Optional[Dict] = None,
    proxy_url: Optional[str] = None,
    mode: str = "daily",
    account_label: str = "",
) -> bool:
    """Post the report to a DingTalk webhook as markdown, batch by batch.

    Returns True only when every batch was accepted; the first failing
    batch aborts the remaining ones and yields False.
    """
    headers = {"Content-Type": "application/json"}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None

    # Prefix used in every log line; includes the account label when given.
    log_prefix = f"钉钉{account_label or ''}"

    # Split with the DingTalk-specific size, minus room for the batch header
    # that add_batch_headers prepends afterwards.
    dingtalk_batch_size = CONFIG.get("DINGTALK_BATCH_SIZE", 20000)
    header_reserve = _get_max_batch_header_size("dingtalk")
    batches = split_content_into_batches(
        report_data,
        "dingtalk",
        update_info,
        max_bytes=dingtalk_batch_size - header_reserve,
        mode=mode,
    )
    batches = add_batch_headers(batches, "dingtalk", dingtalk_batch_size)

    batch_total = len(batches)
    print(f"{log_prefix}消息分为 {batch_total} 批次发送 [{report_type}]")

    for seq, chunk in enumerate(batches, 1):
        chunk_bytes = len(chunk.encode("utf-8"))
        print(
            f"发送{log_prefix}第 {seq}/{batch_total} 批次,大小:{chunk_bytes} 字节 [{report_type}]"
        )

        payload = {
            "msgtype": "markdown",
            "markdown": {
                "title": f"TrendRadar 热点分析报告 - {report_type}",
                "text": chunk,
            },
        }

        try:
            response = requests.post(
                webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
            )
            if response.status_code != 200:
                print(
                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],状态码:{response.status_code}"
                )
                return False

            result = response.json()
            if result.get("errcode") != 0:
                print(
                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
                )
                return False

            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送成功 [{report_type}]")
            # Pause between batches, but not after the last one.
            if seq < batch_total:
                time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
        except Exception as e:
            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送出错 [{report_type}]:{e}")
            return False

    print(f"{log_prefix}所有 {batch_total} 批次发送完成 [{report_type}]")
    return True
-
-
def strip_markdown(text: str) -> str:
    """Strip markdown and simple HTML markup, returning plain text.

    Used for the plain-text (personal WeChat) push format. Link targets are
    kept: ``[title](url)`` becomes ``title url``. Images collapse to their
    alt text.

    Args:
        text: Markdown-formatted message content.

    Returns:
        The text without markdown syntax, with runs of three or more
        newlines collapsed to two and surrounding whitespace trimmed.
    """
    # Horizontal rules (--- or ***) go first; otherwise the italic rule
    # below would eat two of the asterisks and leave a stray "*".
    text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Bold: **text** or __text__.
    # Underscore markers only count when they are not glued to ASCII
    # letters/digits, so snake_case words and URLs such as a_b_c survive.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'(?<![0-9A-Za-z])__(.+?)__(?![0-9A-Za-z])', r'\1', text)

    # Italic: *text* or _text_ (same intraword guard for underscores).
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'(?<![0-9A-Za-z])_(.+?)_(?![0-9A-Za-z])', r'\1', text)

    # Strikethrough: ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # Images: ![alt](url) -> alt. Must run before the link rule, which
    # would otherwise match the "[alt](url)" part and leave "!alt url".
    text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)

    # Links: [text](url) -> "text url" (the URL is kept on purpose).
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)

    # Inline code: `code`
    text = re.sub(r'`(.+?)`', r'\1', text)

    # Block-quote markers at line start.
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)

    # Heading markers (#, ##, ...) at line start.
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # HTML: unwrap <font ...>text</font>, then drop any remaining tags.
    text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
    text = re.sub(r'<[^>]+>', '', text)

    # Collapse runs of blank lines down to a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
-
-
def send_to_wework(
    webhook_url: str,
    report_data: Dict,
    report_type: str,
    update_info: Optional[Dict] = None,
    proxy_url: Optional[str] = None,
    mode: str = "daily",
    account_label: str = "",
) -> bool:
    """Post the report to a WeWork webhook in batches.

    The message format comes from the WEWORK_MSG_TYPE setting: "text"
    sends markdown-stripped plain text, anything else sends markdown.

    Returns True only when every batch was accepted; the first failing
    batch aborts the remaining ones and yields False.
    """
    headers = {"Content-Type": "application/json"}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None

    # Prefix used in every log line; includes the account label when given.
    log_prefix = f"企业微信{account_label or ''}"

    # Decide between plain text and markdown delivery.
    is_text_mode = CONFIG.get("WEWORK_MSG_TYPE", "markdown").lower() == "text"
    if is_text_mode:
        print(f"{log_prefix}使用 text 格式(个人微信模式)[{report_type}]")
        header_format_type = "wework_text"
    else:
        print(f"{log_prefix}使用 markdown 格式(群机器人模式)[{report_type}]")
        header_format_type = "wework"

    # Split with room left for the batch header that is prepended afterwards.
    wework_batch_size = CONFIG.get("MESSAGE_BATCH_SIZE", 4000)
    header_reserve = _get_max_batch_header_size(header_format_type)
    batches = split_content_into_batches(
        report_data, "wework", update_info, max_bytes=wework_batch_size - header_reserve, mode=mode
    )
    batches = add_batch_headers(batches, header_format_type, wework_batch_size)

    batch_total = len(batches)
    print(f"{log_prefix}消息分为 {batch_total} 批次发送 [{report_type}]")

    for seq, chunk in enumerate(batches, 1):
        if is_text_mode:
            # Plain text: remove markdown syntax before sending.
            body = strip_markdown(chunk)
            payload = {"msgtype": "text", "text": {"content": body}}
        else:
            # Markdown: send the batch untouched.
            body = chunk
            payload = {"msgtype": "markdown", "markdown": {"content": body}}
        batch_size = len(body.encode("utf-8"))

        print(
            f"发送{log_prefix}第 {seq}/{batch_total} 批次,大小:{batch_size} 字节 [{report_type}]"
        )

        try:
            response = requests.post(
                webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
            )
            if response.status_code != 200:
                print(
                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],状态码:{response.status_code}"
                )
                return False

            result = response.json()
            if result.get("errcode") != 0:
                print(
                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
                )
                return False

            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送成功 [{report_type}]")
            # Pause between batches, but not after the last one.
            if seq < batch_total:
                time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
        except Exception as e:
            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送出错 [{report_type}]:{e}")
            return False

    print(f"{log_prefix}所有 {batch_total} 批次发送完成 [{report_type}]")
    return True
-
-
def send_to_telegram(
    bot_token: str,
    chat_id: str,
    report_data: Dict,
    report_type: str,
    update_info: Optional[Dict] = None,
    proxy_url: Optional[str] = None,
    mode: str = "daily",
    account_label: str = "",
) -> bool:
    """Send the report to a Telegram chat through the Bot API, in batches.

    Args:
        bot_token: Bot token; it becomes part of the request URL.
        chat_id: Target chat identifier.
        report_data: Prepared report data passed to the batch splitter.
        report_type: Report label used in log output.
        update_info: Optional version-update info forwarded to the splitter.
        proxy_url: Optional proxy used for both http and https.
        mode: Report mode forwarded to the splitter.
        account_label: Optional account tag appended to the log prefix.

    Returns:
        True only when every batch was accepted; the first failing batch
        aborts the remaining ones and yields False.
    """
    headers = {"Content-Type": "application/json"}
    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"

    proxies = None
    if proxy_url:
        proxies = {"http": proxy_url, "https": proxy_url}

    # Prefix used in every log line.
    log_prefix = f"Telegram{account_label}" if account_label else "Telegram"

    # Split with room left for the batch header that is prepended afterwards.
    telegram_batch_size = CONFIG.get("MESSAGE_BATCH_SIZE", 4000)
    header_reserve = _get_max_batch_header_size("telegram")
    batches = split_content_into_batches(
        report_data, "telegram", update_info, max_bytes=telegram_batch_size - header_reserve, mode=mode
    )
    batches = add_batch_headers(batches, "telegram", telegram_batch_size)

    print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")

    # Send batch by batch.
    for i, batch_content in enumerate(batches, 1):
        batch_size = len(batch_content.encode("utf-8"))
        print(
            f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
        )

        payload = {
            "chat_id": chat_id,
            "text": batch_content,
            "parse_mode": "HTML",
            "disable_web_page_preview": True,
        }

        try:
            response = requests.post(
                url, headers=headers, json=payload, proxies=proxies, timeout=30
            )
            if response.status_code == 200:
                result = response.json()
                if result.get("ok"):
                    print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
                    # Pause between batches, but not after the last one.
                    if i < len(batches):
                        time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
                else:
                    print(
                        f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}"
                    )
                    return False
            else:
                print(
                    f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
                )
                return False
        except Exception as e:
            # Connection errors quote the request path, which contains the
            # bot token; mask it so the secret never reaches the logs.
            error_text = str(e)
            if bot_token:
                error_text = error_text.replace(bot_token, "***")
            print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{error_text}")
            return False

    print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
    return True
-
-
def send_to_email(
    from_email: str,
    password: str,
    to_email: str,
    report_type: str,
    html_file_path: Optional[str],
    custom_smtp_server: Optional[str] = None,
    custom_smtp_port: Optional[int] = None,
) -> bool:
    """Email the generated HTML report.

    Args:
        from_email: Sender address; its domain selects a preset SMTP config
            when no custom server is given.
        password: Password or authorization code used for SMTP login.
        to_email: One or more recipient addresses separated by commas.
        report_type: Report label used in the subject and log output.
        html_file_path: Path of the HTML report to send as the message body.
        custom_smtp_server: Optional SMTP host overriding the presets.
        custom_smtp_port: Optional SMTP port; used only together with
            ``custom_smtp_server``.

    Returns:
        True when the message was handed to the SMTP server, False on any
        failure (missing file, no recipients, connection or SMTP errors).
    """
    try:
        if not html_file_path or not Path(html_file_path).exists():
            print(f"错误:HTML文件不存在或未提供: {html_file_path}")
            return False

        print(f"使用HTML文件: {html_file_path}")
        with open(html_file_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        domain = from_email.split("@")[-1].lower()

        if custom_smtp_server and custom_smtp_port:
            # Custom SMTP settings take priority over the presets.
            smtp_server = custom_smtp_server
            smtp_port = int(custom_smtp_port)
            # Port 465 means implicit SSL (SMTP_SSL); 587 and every other
            # port use STARTTLS.
            use_tls = smtp_port != 465
        elif domain in SMTP_CONFIGS:
            # Preset configuration for a known mail provider.
            config = SMTP_CONFIGS[domain]
            smtp_server = config["server"]
            smtp_port = config["port"]
            use_tls = config["encryption"] == "TLS"
        else:
            print(f"未识别的邮箱服务商: {domain},使用通用 SMTP 配置")
            smtp_server = f"smtp.{domain}"
            smtp_port = 587
            use_tls = True

        # Drop empty entries so a trailing or doubled comma does not put a
        # blank address into the To header.
        recipients = [addr.strip() for addr in to_email.split(",") if addr.strip()]
        if not recipients:
            print("错误:未提供有效的收件人地址")
            return False

        # MIMEMultipart already adds the MIME-Version header; setting it
        # again would append a duplicate, so it is not assigned here.
        msg = MIMEMultipart("alternative")

        sender_name = "TrendRadar"
        msg["From"] = formataddr((sender_name, from_email))
        msg["To"] = ", ".join(recipients)

        now = get_beijing_time()
        subject = f"TrendRadar 热点分析报告 - {report_type} - {now.strftime('%m月%d日 %H:%M')}"
        msg["Subject"] = Header(subject, "utf-8")

        msg["Date"] = formatdate(localtime=True)
        msg["Message-ID"] = make_msgid()

        # Plain-text fallback for clients that do not render HTML.
        text_content = f"""
TrendRadar 热点分析报告
========================
报告类型:{report_type}
生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')}

请使用支持HTML的邮件客户端查看完整报告内容。
        """
        text_part = MIMEText(text_content, "plain", "utf-8")
        msg.attach(text_part)

        html_part = MIMEText(html_content, "html", "utf-8")
        msg.attach(html_part)

        print(f"正在发送邮件到 {to_email}...")
        print(f"SMTP 服务器: {smtp_server}:{smtp_port}")
        print(f"发件人: {from_email}")

        server = None
        try:
            if use_tls:
                # STARTTLS: connect in clear text, then upgrade.
                server = smtplib.SMTP(smtp_server, smtp_port, timeout=30)
                server.set_debuglevel(0)  # set to 1 for a protocol trace
                server.ehlo()
                server.starttls()
                server.ehlo()
            else:
                # Implicit SSL from the first byte.
                server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=30)
                server.set_debuglevel(0)
                server.ehlo()

            server.login(from_email, password)
            server.send_message(msg)
        except smtplib.SMTPServerDisconnected:
            print(f"邮件发送失败:服务器意外断开连接,请检查网络或稍后重试")
            return False
        finally:
            # Always release the connection, also when login or sending
            # failed; a failing QUIT falls back to closing the socket.
            if server is not None:
                try:
                    server.quit()
                except (smtplib.SMTPException, OSError):
                    server.close()

        print(f"邮件发送成功 [{report_type}] -> {to_email}")
        return True

    except smtplib.SMTPAuthenticationError as e:
        print(f"邮件发送失败:认证错误,请检查邮箱和密码/授权码")
        print(f"详细错误: {str(e)}")
        return False
    except smtplib.SMTPRecipientsRefused as e:
        print(f"邮件发送失败:收件人地址被拒绝 {e}")
        return False
    except smtplib.SMTPSenderRefused as e:
        print(f"邮件发送失败:发件人地址被拒绝 {e}")
        return False
    except smtplib.SMTPDataError as e:
        print(f"邮件发送失败:邮件数据错误 {e}")
        return False
    except smtplib.SMTPConnectError as e:
        print(f"邮件发送失败:无法连接到 SMTP 服务器 {smtp_server}:{smtp_port}")
        print(f"详细错误: {str(e)}")
        return False
    except Exception as e:
        print(f"邮件发送失败 [{report_type}]:{e}")
        import traceback

        traceback.print_exc()
        return False
-
-
-def send_to_ntfy(
-    server_url: str,
-    topic: str,
-    token: Optional[str],
-    report_data: Dict,
-    report_type: str,
-    update_info: Optional[Dict] = None,
-    proxy_url: Optional[str] = None,
-    mode: str = "daily",
-    account_label: str = "",
-) -> bool:
-    """发送到ntfy(支持分批发送,严格遵守4KB限制)"""
-    # 日志前缀
-    log_prefix = f"ntfy{account_label}" if account_label else "ntfy"
-
-    # 避免 HTTP header 编码问题
-    report_type_en_map = {
-        "当日汇总": "Daily Summary",
-        "当前榜单汇总": "Current Ranking",
-        "增量更新": "Incremental Update",
-        "实时增量": "Realtime Incremental", 
-        "实时当前榜单": "Realtime Current Ranking",  
-    }
-    report_type_en = report_type_en_map.get(report_type, "News Report") 
-
-    headers = {
-        "Content-Type": "text/plain; charset=utf-8",
-        "Markdown": "yes",
-        "Title": report_type_en,
-        "Priority": "default",
-        "Tags": "news",
-    }
-
-    if token:
-        headers["Authorization"] = f"Bearer {token}"
-
-    # 构建完整URL,确保格式正确
-    base_url = server_url.rstrip("/")
-    if not base_url.startswith(("http://", "https://")):
-        base_url = f"https://{base_url}"
-    url = f"{base_url}/{topic}"
-
-    proxies = None
-    if proxy_url:
-        proxies = {"http": proxy_url, "https": proxy_url}
-
-    # 获取分批内容,使用ntfy专用的4KB限制,预留批次头部空间
-    ntfy_batch_size = 3800
-    header_reserve = _get_max_batch_header_size("ntfy")
-    batches = split_content_into_batches(
-        report_data, "ntfy", update_info, max_bytes=ntfy_batch_size - header_reserve, mode=mode
-    )
-
-    # 统一添加批次头部(已预留空间,不会超限)
-    batches = add_batch_headers(batches, "ntfy", ntfy_batch_size)
-
-    total_batches = len(batches)
-    print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]")
-
-    # 反转批次顺序,使得在ntfy客户端显示时顺序正确
-    # ntfy显示最新消息在上面,所以我们从最后一批开始推送
-    reversed_batches = list(reversed(batches))
-
-    print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确")
-
-    # 逐批发送(反向顺序)
-    success_count = 0
-    for idx, batch_content in enumerate(reversed_batches, 1):
-        # 计算正确的批次编号(用户视角的编号)
-        actual_batch_num = total_batches - idx + 1
-
-        batch_size = len(batch_content.encode("utf-8"))
-        print(
-            f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]"
-        )
-
-        # 检查消息大小,确保不超过4KB
-        if batch_size > 4096:
-            print(f"警告:{log_prefix}第 {actual_batch_num} 批次消息过大({batch_size} 字节),可能被拒绝")
-
-        # 更新 headers 的批次标识
-        current_headers = headers.copy()
-        if total_batches > 1:
-            current_headers["Title"] = (
-                f"{report_type_en} ({actual_batch_num}/{total_batches})"
-            )
-
-        try:
-            response = requests.post(
-                url,
-                headers=current_headers,
-                data=batch_content.encode("utf-8"),
-                proxies=proxies,
-                timeout=30,
-            )
-
-            if response.status_code == 200:
-                print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]")
-                success_count += 1
-                if idx < total_batches:
-                    # 公共服务器建议 2-3 秒,自托管可以更短
-                    interval = 2 if "ntfy.sh" in server_url else 1
-                    time.sleep(interval)
-            elif response.status_code == 429:
-                print(
-                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次速率限制 [{report_type}],等待后重试"
-                )
-                time.sleep(10)  # 等待10秒后重试
-                # 重试一次
-                retry_response = requests.post(
-                    url,
-                    headers=current_headers,
-                    data=batch_content.encode("utf-8"),
-                    proxies=proxies,
-                    timeout=30,
-                )
-                if retry_response.status_code == 200:
-                    print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试成功 [{report_type}]")
-                    success_count += 1
-                else:
-                    print(
-                        f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试失败,状态码:{retry_response.status_code}"
-                    )
-            elif response.status_code == 413:
-                print(
-                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大被拒绝 [{report_type}],消息大小:{batch_size} 字节"
-                )
-            else:
-                print(
-                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}"
-                )
-                try:
-                    print(f"错误详情:{response.text}")
-                except:
-                    pass
-
-        except requests.exceptions.ConnectTimeout:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]")
-        except requests.exceptions.ReadTimeout:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]")
-        except requests.exceptions.ConnectionError as e:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}")
-        except Exception as e:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}")
-
-    # 判断整体发送是否成功
-    if success_count == total_batches:
-        print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]")
-        return True
-    elif success_count > 0:
-        print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]")
-        return True  # 部分成功也视为成功
-    else:
-        print(f"{log_prefix}发送完全失败 [{report_type}]")
-        return False
-
-
-def send_to_bark(
-    bark_url: str,
-    report_data: Dict,
-    report_type: str,
-    update_info: Optional[Dict] = None,
-    proxy_url: Optional[str] = None,
-    mode: str = "daily",
-    account_label: str = "",
-) -> bool:
-    """发送到Bark(支持分批发送,使用 markdown 格式)"""
-    # 日志前缀
-    log_prefix = f"Bark{account_label}" if account_label else "Bark"
-
-    proxies = None
-    if proxy_url:
-        proxies = {"http": proxy_url, "https": proxy_url}
-
-    # 解析 Bark URL,提取 device_key 和 API 端点
-    # Bark URL 格式: https://api.day.app/device_key 或 https://bark.day.app/device_key
-    from urllib.parse import urlparse
-
-    parsed_url = urlparse(bark_url)
-    device_key = parsed_url.path.strip('/').split('/')[0] if parsed_url.path else None
-
-    if not device_key:
-        print(f"{log_prefix} URL 格式错误,无法提取 device_key: {bark_url}")
-        return False
-
-    # 构建正确的 API 端点
-    api_endpoint = f"{parsed_url.scheme}://{parsed_url.netloc}/push"
-
-    # 获取分批内容(Bark 限制为 3600 字节以避免 413 错误),预留批次头部空间
-    bark_batch_size = CONFIG["BARK_BATCH_SIZE"]
-    header_reserve = _get_max_batch_header_size("bark")
-    batches = split_content_into_batches(
-        report_data, "bark", update_info, max_bytes=bark_batch_size - header_reserve, mode=mode
-    )
-
-    # 统一添加批次头部(已预留空间,不会超限)
-    batches = add_batch_headers(batches, "bark", bark_batch_size)
-
-    total_batches = len(batches)
-    print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]")
-
-    # 反转批次顺序,使得在Bark客户端显示时顺序正确
-    # Bark显示最新消息在上面,所以我们从最后一批开始推送
-    reversed_batches = list(reversed(batches))
-
-    print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确")
-
-    # 逐批发送(反向顺序)
-    success_count = 0
-    for idx, batch_content in enumerate(reversed_batches, 1):
-        # 计算正确的批次编号(用户视角的编号)
-        actual_batch_num = total_batches - idx + 1
-
-        batch_size = len(batch_content.encode("utf-8"))
-        print(
-            f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]"
-        )
-
-        # 检查消息大小(Bark使用APNs,限制4KB)
-        if batch_size > 4096:
-            print(
-                f"警告:{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大({batch_size} 字节),可能被拒绝"
-            )
-
-        # 构建JSON payload
-        payload = {
-            "title": report_type,
-            "markdown": batch_content,
-            "device_key": device_key,
-            "sound": "default",
-            "group": "TrendRadar",
-            "action": "none",  # 点击推送跳到 APP 不弹出弹框,方便阅读
-        }
-
-        try:
-            response = requests.post(
-                api_endpoint,
-                json=payload,
-                proxies=proxies,
-                timeout=30,
-            )
-
-            if response.status_code == 200:
-                result = response.json()
-                if result.get("code") == 200:
-                    print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]")
-                    success_count += 1
-                    # 批次间间隔
-                    if idx < total_batches:
-                        time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
-                else:
-                    print(
-                        f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],错误:{result.get('message', '未知错误')}"
-                    )
-            else:
-                print(
-                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}"
-                )
-                try:
-                    print(f"错误详情:{response.text}")
-                except:
-                    pass
-
-        except requests.exceptions.ConnectTimeout:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]")
-        except requests.exceptions.ReadTimeout:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]")
-        except requests.exceptions.ConnectionError as e:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}")
-        except Exception as e:
-            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}")
-
-    # 判断整体发送是否成功
-    if success_count == total_batches:
-        print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]")
-        return True
-    elif success_count > 0:
-        print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]")
-        return True  # 部分成功也视为成功
-    else:
-        print(f"{log_prefix}发送完全失败 [{report_type}]")
-        return False
-
-
-def convert_markdown_to_mrkdwn(content: str) -> str:
-    """
-    将标准 Markdown 转换为 Slack 的 mrkdwn 格式
-
-    转换规则:
-    - **粗体** → *粗体*
-    - [文本](url) → <url|文本>
-    - 保留其他格式(代码块、列表等)
-    """
-    # 1. 转换链接格式: [文本](url) → <url|文本>
-    content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content)
-
-    # 2. 转换粗体: **文本** → *文本*
-    content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content)
-
-    return content
-
-
-def send_to_slack(
-    webhook_url: str,
-    report_data: Dict,
-    report_type: str,
-    update_info: Optional[Dict] = None,
-    proxy_url: Optional[str] = None,
-    mode: str = "daily",
-    account_label: str = "",
-) -> bool:
-    """发送到Slack(支持分批发送,使用 mrkdwn 格式)"""
-    headers = {"Content-Type": "application/json"}
-    proxies = None
-    if proxy_url:
-        proxies = {"http": proxy_url, "https": proxy_url}
-
-    # 日志前缀
-    log_prefix = f"Slack{account_label}" if account_label else "Slack"
-
-    # 获取分批内容(使用 Slack 批次大小),预留批次头部空间
-    slack_batch_size = CONFIG["SLACK_BATCH_SIZE"]
-    header_reserve = _get_max_batch_header_size("slack")
-    batches = split_content_into_batches(
-        report_data, "slack", update_info, max_bytes=slack_batch_size - header_reserve, mode=mode
-    )
-
-    # 统一添加批次头部(已预留空间,不会超限)
-    batches = add_batch_headers(batches, "slack", slack_batch_size)
-
-    print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
-
-    # 逐批发送
-    for i, batch_content in enumerate(batches, 1):
-        # 转换 Markdown 到 mrkdwn 格式
-        mrkdwn_content = convert_markdown_to_mrkdwn(batch_content)
-
-        batch_size = len(mrkdwn_content.encode("utf-8"))
-        print(
-            f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
-        )
-
-        # 构建 Slack payload(使用简单的 text 字段,支持 mrkdwn)
-        payload = {
-            "text": mrkdwn_content
-        }
-
-        try:
-            response = requests.post(
-                webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
-            )
-
-            # Slack Incoming Webhooks 成功时返回 "ok" 文本
-            if response.status_code == 200 and response.text == "ok":
-                print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
-                # 批次间间隔
-                if i < len(batches):
-                    time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
-            else:
-                error_msg = response.text if response.text else f"状态码:{response.status_code}"
-                print(
-                    f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}"
-                )
-                return False
-        except Exception as e:
-            print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
-            return False
-
-    print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
-    return True
-
-
-# === 主分析器 ===
-class NewsAnalyzer:
-    """新闻分析器"""
-
-    # 模式策略定义
-    MODE_STRATEGIES = {
-        "incremental": {
-            "mode_name": "增量模式",
-            "description": "增量模式(只关注新增新闻,无新增时不推送)",
-            "realtime_report_type": "实时增量",
-            "summary_report_type": "当日汇总",
-            "should_send_realtime": True,
-            "should_generate_summary": True,
-            "summary_mode": "daily",
-        },
-        "current": {
-            "mode_name": "当前榜单模式",
-            "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
-            "realtime_report_type": "实时当前榜单",
-            "summary_report_type": "当前榜单汇总",
-            "should_send_realtime": True,
-            "should_generate_summary": True,
-            "summary_mode": "current",
-        },
-        "daily": {
-            "mode_name": "当日汇总模式",
-            "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
-            "realtime_report_type": "",
-            "summary_report_type": "当日汇总",
-            "should_send_realtime": False,
-            "should_generate_summary": True,
-            "summary_mode": "daily",
-        },
-    }
-
-    def __init__(self):
-        self.request_interval = CONFIG["REQUEST_INTERVAL"]
-        self.report_mode = CONFIG["REPORT_MODE"]
-        self.rank_threshold = CONFIG["RANK_THRESHOLD"]
-        self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
-        self.is_docker_container = self._detect_docker_environment()
-        self.update_info = None
-        self.proxy_url = None
-        self._setup_proxy()
-        self.data_fetcher = DataFetcher(self.proxy_url)
-
-        if self.is_github_actions:
-            self._check_version_update()
-
-    def _detect_docker_environment(self) -> bool:
-        """检测是否运行在 Docker 容器中"""
-        try:
-            if os.environ.get("DOCKER_CONTAINER") == "true":
-                return True
-
-            if os.path.exists("/.dockerenv"):
-                return True
-
-            return False
-        except Exception:
-            return False
-
-    def _should_open_browser(self) -> bool:
-        """判断是否应该打开浏览器"""
-        return not self.is_github_actions and not self.is_docker_container
-
-    def _setup_proxy(self) -> None:
-        """设置代理配置"""
-        if not self.is_github_actions and CONFIG["USE_PROXY"]:
-            self.proxy_url = CONFIG["DEFAULT_PROXY"]
-            print("本地环境,使用代理")
-        elif not self.is_github_actions and not CONFIG["USE_PROXY"]:
-            print("本地环境,未启用代理")
-        else:
-            print("GitHub Actions环境,不使用代理")
-
-    def _check_version_update(self) -> None:
-        """检查版本更新"""
-        try:
-            need_update, remote_version = check_version_update(
-                VERSION, CONFIG["VERSION_CHECK_URL"], self.proxy_url
-            )
-
-            if need_update and remote_version:
-                self.update_info = {
-                    "current_version": VERSION,
-                    "remote_version": remote_version,
-                }
-                print(f"发现新版本: {remote_version} (当前: {VERSION})")
-            else:
-                print("版本检查完成,当前为最新版本")
-        except Exception as e:
-            print(f"版本检查出错: {e}")
-
-    def _get_mode_strategy(self) -> Dict:
-        """获取当前模式的策略配置"""
-        return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])
-
-    def _has_notification_configured(self) -> bool:
-        """检查是否配置了任何通知渠道"""
-        return any(
-            [
-                CONFIG["FEISHU_WEBHOOK_URL"],
-                CONFIG["DINGTALK_WEBHOOK_URL"],
-                CONFIG["WEWORK_WEBHOOK_URL"],
-                (CONFIG["TELEGRAM_BOT_TOKEN"] and CONFIG["TELEGRAM_CHAT_ID"]),
-                (
-                    CONFIG["EMAIL_FROM"]
-                    and CONFIG["EMAIL_PASSWORD"]
-                    and CONFIG["EMAIL_TO"]
-                ),
-                (CONFIG["NTFY_SERVER_URL"] and CONFIG["NTFY_TOPIC"]),
-                CONFIG["BARK_URL"],
-                CONFIG["SLACK_WEBHOOK_URL"],
-            ]
-        )
-
-    def _has_valid_content(
-        self, stats: List[Dict], new_titles: Optional[Dict] = None
-    ) -> bool:
-        """检查是否有有效的新闻内容"""
-        if self.report_mode in ["incremental", "current"]:
-            # 增量模式和current模式下,只要stats有内容就说明有匹配的新闻
-            return any(stat["count"] > 0 for stat in stats)
-        else:
-            # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻
-            has_matched_news = any(stat["count"] > 0 for stat in stats)
-            has_new_news = bool(
-                new_titles and any(len(titles) > 0 for titles in new_titles.values())
-            )
-            return has_matched_news or has_new_news
-
-    def _load_analysis_data(
-        self,
-    ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
-        """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据"""
-        try:
-            # 获取当前配置的监控平台ID列表
-            current_platform_ids = []
-            for platform in CONFIG["PLATFORMS"]:
-                current_platform_ids.append(platform["id"])
-
-            print(f"当前监控平台: {current_platform_ids}")
-
-            all_results, id_to_name, title_info = read_all_today_titles(
-                current_platform_ids
-            )
-
-            if not all_results:
-                print("没有找到当天的数据")
-                return None
-
-            total_titles = sum(len(titles) for titles in all_results.values())
-            print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
-
-            new_titles = detect_latest_new_titles(current_platform_ids)
-            word_groups, filter_words, global_filters = load_frequency_words()
-
-            return (
-                all_results,
-                id_to_name,
-                title_info,
-                new_titles,
-                word_groups,
-                filter_words,
-                global_filters,
-            )
-        except Exception as e:
-            print(f"数据加载失败: {e}")
-            return None
-
-    def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
-        """从当前抓取结果构建标题信息"""
-        title_info = {}
-        for source_id, titles_data in results.items():
-            title_info[source_id] = {}
-            for title, title_data in titles_data.items():
-                ranks = title_data.get("ranks", [])
-                url = title_data.get("url", "")
-                mobile_url = title_data.get("mobileUrl", "")
-
-                title_info[source_id][title] = {
-                    "first_time": time_info,
-                    "last_time": time_info,
-                    "count": 1,
-                    "ranks": ranks,
-                    "url": url,
-                    "mobileUrl": mobile_url,
-                }
-        return title_info
-
-    def _run_analysis_pipeline(
-        self,
-        data_source: Dict,
-        mode: str,
-        title_info: Dict,
-        new_titles: Dict,
-        word_groups: List[Dict],
-        filter_words: List[str],
-        id_to_name: Dict,
-        failed_ids: Optional[List] = None,
-        is_daily_summary: bool = False,
-        global_filters: Optional[List[str]] = None,
-    ) -> Tuple[List[Dict], str]:
-        """统一的分析流水线:数据处理 → 统计计算 → HTML生成"""
-
-        # 统计计算
-        stats, total_titles = count_word_frequency(
-            data_source,
-            word_groups,
-            filter_words,
-            id_to_name,
-            title_info,
-            self.rank_threshold,
-            new_titles,
-            mode=mode,
-            global_filters=global_filters,
-        )
-
-        # HTML生成
-        html_file = generate_html_report(
-            stats,
-            total_titles,
-            failed_ids=failed_ids,
-            new_titles=new_titles,
-            id_to_name=id_to_name,
-            mode=mode,
-            is_daily_summary=is_daily_summary,
-            update_info=self.update_info if CONFIG["SHOW_VERSION_UPDATE"] else None,
-        )
-
-        return stats, html_file
-
-    def _send_notification_if_needed(
-        self,
-        stats: List[Dict],
-        report_type: str,
-        mode: str,
-        failed_ids: Optional[List] = None,
-        new_titles: Optional[Dict] = None,
-        id_to_name: Optional[Dict] = None,
-        html_file_path: Optional[str] = None,
-    ) -> bool:
-        """统一的通知发送逻辑,包含所有判断条件"""
-        has_notification = self._has_notification_configured()
-
-        if (
-            CONFIG["ENABLE_NOTIFICATION"]
-            and has_notification
-            and self._has_valid_content(stats, new_titles)
-        ):
-            send_to_notifications(
-                stats,
-                failed_ids or [],
-                report_type,
-                new_titles,
-                id_to_name,
-                self.update_info,
-                self.proxy_url,
-                mode=mode,
-                html_file_path=html_file_path,
-            )
-            return True
-        elif CONFIG["ENABLE_NOTIFICATION"] and not has_notification:
-            print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
-        elif not CONFIG["ENABLE_NOTIFICATION"]:
-            print(f"跳过{report_type}通知:通知功能已禁用")
-        elif (
-            CONFIG["ENABLE_NOTIFICATION"]
-            and has_notification
-            and not self._has_valid_content(stats, new_titles)
-        ):
-            mode_strategy = self._get_mode_strategy()
-            if "实时" in report_type:
-                print(
-                    f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
-                )
-            else:
-                print(
-                    f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
-                )
-
-        return False
-
-    def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
-        """生成汇总报告(带通知)"""
-        summary_type = (
-            "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总"
-        )
-        print(f"生成{summary_type}报告...")
-
-        # 加载分析数据
-        analysis_data = self._load_analysis_data()
-        if not analysis_data:
-            return None
-
-        all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
-            analysis_data
-        )
-
-        # 运行分析流水线
-        stats, html_file = self._run_analysis_pipeline(
-            all_results,
-            mode_strategy["summary_mode"],
-            title_info,
-            new_titles,
-            word_groups,
-            filter_words,
-            id_to_name,
-            is_daily_summary=True,
-            global_filters=global_filters,
-        )
-
-        print(f"{summary_type}报告已生成: {html_file}")
-
-        # 发送通知
-        self._send_notification_if_needed(
-            stats,
-            mode_strategy["summary_report_type"],
-            mode_strategy["summary_mode"],
-            failed_ids=[],
-            new_titles=new_titles,
-            id_to_name=id_to_name,
-            html_file_path=html_file,
-        )
-
-        return html_file
-
-    def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
-        """生成汇总HTML"""
-        summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
-        print(f"生成{summary_type}HTML...")
-
-        # 加载分析数据
-        analysis_data = self._load_analysis_data()
-        if not analysis_data:
-            return None
-
-        all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
-            analysis_data
-        )
-
-        # 运行分析流水线
-        _, html_file = self._run_analysis_pipeline(
-            all_results,
-            mode,
-            title_info,
-            new_titles,
-            word_groups,
-            filter_words,
-            id_to_name,
-            is_daily_summary=True,
-            global_filters=global_filters,
-        )
-
-        print(f"{summary_type}HTML已生成: {html_file}")
-        return html_file
-
-    def _initialize_and_check_config(self) -> None:
-        """通用初始化和配置检查"""
-        now = get_beijing_time()
-        print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
-
-        if not CONFIG["ENABLE_CRAWLER"]:
-            print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
-            return
-
-        has_notification = self._has_notification_configured()
-        if not CONFIG["ENABLE_NOTIFICATION"]:
-            print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取")
-        elif not has_notification:
-            print("未配置任何通知渠道,将只进行数据抓取,不发送通知")
-        else:
-            print("通知功能已启用,将发送通知")
-
-        mode_strategy = self._get_mode_strategy()
-        print(f"报告模式: {self.report_mode}")
-        print(f"运行模式: {mode_strategy['description']}")
-
-    def _crawl_data(self) -> Tuple[Dict, Dict, List]:
-        """执行数据爬取"""
-        ids = []
-        for platform in CONFIG["PLATFORMS"]:
-            if "name" in platform:
-                ids.append((platform["id"], platform["name"]))
-            else:
-                ids.append(platform["id"])
-
-        print(
-            f"配置的监控平台: {[p.get('name', p['id']) for p in CONFIG['PLATFORMS']]}"
-        )
-        print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
-        ensure_directory_exists("output")
-
-        results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
-            ids, self.request_interval
-        )
-
-        title_file = save_titles_to_file(results, id_to_name, failed_ids)
-        print(f"标题已保存到: {title_file}")
-
-        return results, id_to_name, failed_ids
-
-    def _execute_mode_strategy(
-        self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
-    ) -> Optional[str]:
-        """执行模式特定逻辑"""
-        # 获取当前监控平台ID列表
-        current_platform_ids = [platform["id"] for platform in CONFIG["PLATFORMS"]]
-
-        new_titles = detect_latest_new_titles(current_platform_ids)
-        time_info = Path(save_titles_to_file(results, id_to_name, failed_ids)).stem
-        word_groups, filter_words, global_filters = load_frequency_words()
-
-        # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性
-        if self.report_mode == "current":
-            # 加载完整的历史数据(已按当前平台过滤)
-            analysis_data = self._load_analysis_data()
-            if analysis_data:
-                (
-                    all_results,
-                    historical_id_to_name,
-                    historical_title_info,
-                    historical_new_titles,
-                    _,
-                    _,
-                    _,
-                ) = analysis_data
-
-                print(
-                    f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}"
-                )
-
-                stats, html_file = self._run_analysis_pipeline(
-                    all_results,
-                    self.report_mode,
-                    historical_title_info,
-                    historical_new_titles,
-                    word_groups,
-                    filter_words,
-                    historical_id_to_name,
-                    failed_ids=failed_ids,
-                    global_filters=global_filters,
-                )
-
-                combined_id_to_name = {**historical_id_to_name, **id_to_name}
-
-                print(f"HTML报告已生成: {html_file}")
-
-                # 发送实时通知(使用完整历史数据的统计结果)
-                summary_html = None
-                if mode_strategy["should_send_realtime"]:
-                    self._send_notification_if_needed(
-                        stats,
-                        mode_strategy["realtime_report_type"],
-                        self.report_mode,
-                        failed_ids=failed_ids,
-                        new_titles=historical_new_titles,
-                        id_to_name=combined_id_to_name,
-                        html_file_path=html_file,
-                    )
-            else:
-                print("❌ 严重错误:无法读取刚保存的数据文件")
-                raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
-        else:
-            title_info = self._prepare_current_title_info(results, time_info)
-            stats, html_file = self._run_analysis_pipeline(
-                results,
-                self.report_mode,
-                title_info,
-                new_titles,
-                word_groups,
-                filter_words,
-                id_to_name,
-                failed_ids=failed_ids,
-                global_filters=global_filters,
-            )
-            print(f"HTML报告已生成: {html_file}")
-
-            # 发送实时通知(如果需要)
-            summary_html = None
-            if mode_strategy["should_send_realtime"]:
-                self._send_notification_if_needed(
-                    stats,
-                    mode_strategy["realtime_report_type"],
-                    self.report_mode,
-                    failed_ids=failed_ids,
-                    new_titles=new_titles,
-                    id_to_name=id_to_name,
-                    html_file_path=html_file,
-                )
-
-        # 生成汇总报告(如果需要)
-        summary_html = None
-        if mode_strategy["should_generate_summary"]:
-            if mode_strategy["should_send_realtime"]:
-                # 如果已经发送了实时通知,汇总只生成HTML不发送通知
-                summary_html = self._generate_summary_html(
-                    mode_strategy["summary_mode"]
-                )
-            else:
-                # daily模式:直接生成汇总报告并发送通知
-                summary_html = self._generate_summary_report(mode_strategy)
-
-        # 打开浏览器(仅在非容器环境)
-        if self._should_open_browser() and html_file:
-            if summary_html:
-                summary_url = "file://" + str(Path(summary_html).resolve())
-                print(f"正在打开汇总报告: {summary_url}")
-                webbrowser.open(summary_url)
-            else:
-                file_url = "file://" + str(Path(html_file).resolve())
-                print(f"正在打开HTML报告: {file_url}")
-                webbrowser.open(file_url)
-        elif self.is_docker_container and html_file:
-            if summary_html:
-                print(f"汇总报告已生成(Docker环境): {summary_html}")
-            else:
-                print(f"HTML报告已生成(Docker环境): {html_file}")
-
-        return summary_html
-
-    def run(self) -> None:
-        """执行分析流程"""
-        try:
-            self._initialize_and_check_config()
-
-            mode_strategy = self._get_mode_strategy()
-
-            results, id_to_name, failed_ids = self._crawl_data()
-
-            self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
-
-        except Exception as e:
-            print(f"分析流程执行出错: {e}")
-            raise
-
-
-def main():
-    try:
-        analyzer = NewsAnalyzer()
-        analyzer.run()
-    except FileNotFoundError as e:
-        print(f"❌ 配置文件错误: {e}")
-        print("\n请确保以下文件存在:")
-        print("  • config/config.yaml")
-        print("  • config/frequency_words.txt")
-        print("\n参考项目文档进行正确配置")
-    except Exception as e:
-        print(f"❌ 程序运行错误: {e}")
-        raise
-
-
-if __name__ == "__main__":
-    main()

+ 1 - 1
mcp_server/__init__.py

@@ -4,4 +4,4 @@ TrendRadar MCP Server
 提供基于MCP协议的新闻聚合数据查询和系统管理接口。
 提供基于MCP协议的新闻聚合数据查询和系统管理接口。
 """
 """
 
 
-__version__ = "1.0.0"
+__version__ = "1.1.0"

+ 128 - 0
mcp_server/server.py

@@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools
 from .tools.search_tools import SearchTools
 from .tools.search_tools import SearchTools
 from .tools.config_mgmt import ConfigManagementTools
 from .tools.config_mgmt import ConfigManagementTools
 from .tools.system import SystemManagementTools
 from .tools.system import SystemManagementTools
+from .tools.storage_sync import StorageSyncTools
 from .utils.date_parser import DateParser
 from .utils.date_parser import DateParser
 from .utils.errors import MCPError
 from .utils.errors import MCPError
 
 
@@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None):
         _tools_instances['search'] = SearchTools(project_root)
         _tools_instances['search'] = SearchTools(project_root)
         _tools_instances['config'] = ConfigManagementTools(project_root)
         _tools_instances['config'] = ConfigManagementTools(project_root)
         _tools_instances['system'] = SystemManagementTools(project_root)
         _tools_instances['system'] = SystemManagementTools(project_root)
+        _tools_instances['storage'] = StorageSyncTools(project_root)
     return _tools_instances
     return _tools_instances
 
 
 
 
@@ -657,6 +659,127 @@ async def trigger_crawl(
     return json.dumps(result, ensure_ascii=False, indent=2)
     return json.dumps(result, ensure_ascii=False, indent=2)
 
 
 
 
+# ==================== 存储同步工具 ====================
+
+@mcp.tool
+async def sync_from_remote(
+    days: int = 7
+) -> str:
+    """
+    从远程存储拉取数据到本地
+
+    用于 MCP Server 等场景:爬虫存到远程云存储(如 Cloudflare R2),
+    MCP Server 拉取到本地进行分析查询。
+
+    Args:
+        days: 拉取最近 N 天的数据,默认 7 天
+              - 0: 不拉取
+              - 7: 拉取最近一周的数据
+              - 30: 拉取最近一个月的数据
+
+    Returns:
+        JSON格式的同步结果,包含:
+        - success: 是否成功
+        - synced_files: 成功同步的文件数量
+        - synced_dates: 成功同步的日期列表
+        - skipped_dates: 跳过的日期(本地已存在)
+        - failed_dates: 失败的日期及错误信息
+        - message: 操作结果描述
+
+    Examples:
+        - sync_from_remote()  # 拉取最近7天
+        - sync_from_remote(days=30)  # 拉取最近30天
+
+    Note:
+        需要在 config/config.yaml 中配置远程存储(storage.remote)或设置环境变量:
+        - S3_ENDPOINT_URL: 服务端点
+        - S3_BUCKET_NAME: 存储桶名称
+        - S3_ACCESS_KEY_ID: 访问密钥 ID
+        - S3_SECRET_ACCESS_KEY: 访问密钥
+    """
+    tools = _get_tools()
+    result = tools['storage'].sync_from_remote(days=days)
+    return json.dumps(result, ensure_ascii=False, indent=2)
+
+
+@mcp.tool
+async def get_storage_status() -> str:
+    """
+    获取存储配置和状态
+
+    查看当前存储后端配置、本地和远程存储的状态信息。
+
+    Returns:
+        JSON格式的存储状态信息,包含:
+        - backend: 当前使用的后端类型(local/remote/auto)
+        - local: 本地存储状态
+            - data_dir: 数据目录
+            - retention_days: 保留天数
+            - total_size: 总大小
+            - date_count: 日期数量
+            - earliest_date: 最早日期
+            - latest_date: 最新日期
+        - remote: 远程存储状态
+            - configured: 是否已配置
+            - endpoint_url: 服务端点
+            - bucket_name: 存储桶名称
+            - date_count: 远程日期数量
+        - pull: 拉取配置
+            - enabled: 是否启用自动拉取
+            - days: 自动拉取天数
+
+    Examples:
+        - get_storage_status()  # 查看所有存储状态
+    """
+    tools = _get_tools()
+    result = tools['storage'].get_storage_status()
+    return json.dumps(result, ensure_ascii=False, indent=2)
+
+
+@mcp.tool
+async def list_available_dates(
+    source: str = "both"
+) -> str:
+    """
+    列出本地/远程可用的日期范围
+
+    查看本地和远程存储中有哪些日期的数据可用,
+    帮助了解数据覆盖范围和同步状态。
+
+    Args:
+        source: 数据来源,可选值:
+            - "local": 仅列出本地可用日期
+            - "remote": 仅列出远程可用日期
+            - "both": 同时列出两者并进行对比(默认)
+
+    Returns:
+        JSON格式的日期列表,包含:
+        - local: 本地日期信息(如果 source 包含 local)
+            - dates: 日期列表(按时间倒序)
+            - count: 日期数量
+            - earliest: 最早日期
+            - latest: 最新日期
+        - remote: 远程日期信息(如果 source 包含 remote)
+            - configured: 是否已配置远程存储
+            - dates: 日期列表
+            - count: 日期数量
+            - earliest: 最早日期
+            - latest: 最新日期
+        - comparison: 对比结果(仅当 source="both" 时)
+            - only_local: 仅本地存在的日期
+            - only_remote: 仅远程存在的日期
+            - both: 两边都存在的日期
+
+    Examples:
+        - list_available_dates()  # 查看本地和远程的对比
+        - list_available_dates(source="local")  # 仅查看本地
+        - list_available_dates(source="remote")  # 仅查看远程
+    """
+    tools = _get_tools()
+    result = tools['storage'].list_available_dates(source=source)
+    return json.dumps(result, ensure_ascii=False, indent=2)
+
+
 # ==================== 启动入口 ====================
 # ==================== 启动入口 ====================
 
 
 def run_server(
 def run_server(
@@ -721,6 +844,11 @@ def run_server(
     print("    11. get_current_config      - 获取当前系统配置")
     print("    11. get_current_config      - 获取当前系统配置")
     print("    12. get_system_status       - 获取系统运行状态")
     print("    12. get_system_status       - 获取系统运行状态")
     print("    13. trigger_crawl           - 手动触发爬取任务")
     print("    13. trigger_crawl           - 手动触发爬取任务")
+    print()
+    print("    === 存储同步工具 ===")
+    print("    14. sync_from_remote        - 从远程存储拉取数据到本地")
+    print("    15. get_storage_status      - 获取存储配置和状态")
+    print("    16. list_available_dates    - 列出本地/远程可用日期")
     print("=" * 60)
     print("=" * 60)
     print()
     print()
 
 

+ 51 - 32
mcp_server/services/data_service.py

@@ -517,24 +517,55 @@ class DataService:
         # 遍历日期文件夹
         # 遍历日期文件夹
         for date_folder in output_dir.iterdir():
         for date_folder in output_dir.iterdir():
             if date_folder.is_dir() and not date_folder.name.startswith('.'):
             if date_folder.is_dir() and not date_folder.name.startswith('.'):
-                # 解析日期(格式: YYYY年MM月DD日)
-                try:
-                    date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
-                    if date_match:
-                        folder_date = datetime(
-                            int(date_match.group(1)),
-                            int(date_match.group(2)),
-                            int(date_match.group(3))
-                        )
-                        available_dates.append(folder_date)
-                except Exception:
-                    pass
+                folder_date = self._parse_date_folder_name(date_folder.name)
+                if folder_date:
+                    available_dates.append(folder_date)
 
 
         if not available_dates:
         if not available_dates:
             return (None, None)
             return (None, None)
 
 
         return (min(available_dates), max(available_dates))
         return (min(available_dates), max(available_dates))
 
 
+    def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
+        """
+        Convert a date folder name into a ``datetime``.
+
+        Two naming schemes are recognised and tried in this order:
+        - Chinese: YYYY年MM月DD日
+        - ISO: YYYY-MM-DD
+
+        Each pattern is matched against the start of the name only, so any
+        text following the date is ignored.
+
+        Args:
+            folder_name: Name of the folder to interpret.
+
+        Returns:
+            A ``datetime`` at midnight of the encoded day, or None when the
+            name fits neither scheme or encodes an impossible calendar date.
+        """
+        patterns = (
+            r'(\d{4})年(\d{2})月(\d{2})日',  # Chinese scheme
+            r'(\d{4})-(\d{2})-(\d{2})',  # ISO scheme
+        )
+        for pattern in patterns:
+            found = re.match(pattern, folder_name)
+            if found is None:
+                continue
+            year, month, day = (int(part) for part in found.groups())
+            try:
+                return datetime(year, month, day)
+            except ValueError:
+                # The digits matched but are not a real date (e.g. month 13);
+                # fall through to the next scheme.
+                continue
+
+        return None
+
+
     def get_system_status(self) -> Dict:
     def get_system_status(self) -> Dict:
         """
         """
         获取系统运行状态
         获取系统运行状态
@@ -553,26 +584,14 @@ class DataService:
         if output_dir.exists():
         if output_dir.exists():
             # 遍历日期文件夹
             # 遍历日期文件夹
             for date_folder in output_dir.iterdir():
             for date_folder in output_dir.iterdir():
-                if date_folder.is_dir():
-                    # 解析日期
-                    try:
-                        date_str = date_folder.name
-                        # 格式: YYYY年MM月DD日
-                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
-                        if date_match:
-                            folder_date = datetime(
-                                int(date_match.group(1)),
-                                int(date_match.group(2)),
-                                int(date_match.group(3))
-                            )
-
-                            if oldest_record is None or folder_date < oldest_record:
-                                oldest_record = folder_date
-                            if latest_record is None or folder_date > latest_record:
-                                latest_record = folder_date
-
-                    except:
-                        pass
+                if date_folder.is_dir() and not date_folder.name.startswith('.'):
+                    # 解析日期(兼容中文和ISO格式)
+                    folder_date = self._parse_date_folder_name(date_folder.name)
+                    if folder_date:
+                        if oldest_record is None or folder_date < oldest_record:
+                            oldest_record = folder_date
+                        if latest_record is None or folder_date > latest_record:
+                            latest_record = folder_date
 
 
                     # 计算存储大小
                     # 计算存储大小
                     for item in date_folder.rglob("*"):
                     for item in date_folder.rglob("*"):

+ 317 - 69
mcp_server/services/parser_service.py

@@ -2,9 +2,12 @@
 文件解析服务
 文件解析服务
 
 
 提供txt格式新闻数据和YAML配置文件的解析功能。
 提供txt格式新闻数据和YAML配置文件的解析功能。
+支持从 SQLite 数据库和 TXT 文件两种数据源读取。
 """
 """
 
 
+import json
 import re
 import re
+import sqlite3
 from pathlib import Path
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional
 from typing import Dict, List, Tuple, Optional
 from datetime import datetime
 from datetime import datetime
@@ -145,17 +148,310 @@ class ParserService:
 
 
     def get_date_folder_name(self, date: datetime = None) -> str:
     def get_date_folder_name(self, date: datetime = None) -> str:
         """
         """
-        获取日期文件夹名称
+        获取日期文件夹名称(兼容中文和ISO格式)
 
 
         Args:
         Args:
             date: 日期对象,默认为今天
             date: 日期对象,默认为今天
 
 
         Returns:
         Returns:
-            文件夹名称,格式: YYYY年MM月DD日
+            实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
+            若不存在则返回 ISO 格式(YYYY-MM-DD)
         """
         """
         if date is None:
         if date is None:
             date = datetime.now()
             date = datetime.now()
-        return date.strftime("%Y年%m月%d日")
+        return self._find_date_folder(date)
+
+    def _get_date_folder_name(self, date: datetime = None) -> str:
+        """
+        Resolve the output folder name for a date (Chinese or ISO naming).
+
+        Args:
+            date: Target date; ``datetime.now()`` is used when omitted.
+
+        Returns:
+            The folder name chosen by ``_find_date_folder``: the Chinese form
+            (YYYY年MM月DD日) when that folder exists, otherwise the ISO form
+            (YYYY-MM-DD) when that one exists, otherwise the Chinese form.
+        """
+        target = datetime.now() if date is None else date
+        return self._find_date_folder(target)
+
+    def _find_date_folder(self, date: datetime) -> str:
+        """
+        Pick the name of the date folder that actually exists under ``output``.
+
+        Candidates, in priority order:
+        - Chinese form: YYYY年MM月DD日
+        - ISO form: YYYY-MM-DD
+
+        Args:
+            date: Date whose folder is being looked up.
+
+        Returns:
+            The first candidate that exists on disk; when neither exists the
+            Chinese form is returned as the default.
+        """
+        output_dir = self.project_root / "output"
+        chinese_name = date.strftime("%Y年%m月%d日")
+        iso_name = date.strftime("%Y-%m-%d")
+
+        for candidate in (chinese_name, iso_name):
+            if (output_dir / candidate).exists():
+                return candidate
+
+        # Nothing on disk yet: default to the Chinese form.
+        return chinese_name
+
+    def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
+        """
+        Locate the ``news.db`` SQLite file for a date.
+
+        Args:
+            date: Target date; defaults to today when omitted.
+
+        Returns:
+            Path to ``output/<date folder>/news.db`` under the project root,
+            or None when that file does not exist.
+        """
+        candidate = (
+            self.project_root / "output" / self._get_date_folder_name(date) / "news.db"
+        )
+        return candidate if candidate.exists() else None
+
+    def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
+        """
+        Locate the ``txt`` snapshot folder for a date.
+
+        Args:
+            date: Target date; defaults to today when omitted.
+
+        Returns:
+            Path to ``output/<date folder>/txt`` under the project root, or
+            None when it is missing or is not a directory.
+        """
+        folder = self.project_root / "output" / self._get_date_folder_name(date) / "txt"
+        if not (folder.exists() and folder.is_dir()):
+            return None
+        return folder
+
+    def _read_from_txt(
+        self,
+        date: datetime = None,
+        platform_ids: Optional[List[str]] = None
+    ) -> Optional[Tuple[Dict, Dict, Dict]]:
+        """
+        Build one day's news data by merging every TXT snapshot of that day.
+
+        Snapshots are processed in sorted file-name order. A title seen in
+        several snapshots is merged into one entry: new ranks are appended
+        (without duplicates), ``last_time`` and ``count`` are updated, and a
+        missing URL is filled in from a later snapshot.
+
+        Args:
+            date: Target date; defaults to today when omitted.
+            platform_ids: Platform IDs to keep; None or empty keeps all.
+
+        Returns:
+            (all_titles, id_to_name, all_timestamps), or None when the TXT
+            folder is missing, holds no ``*.txt`` file, or yields no titles.
+        """
+        txt_folder = self._get_txt_folder_path(date)
+        if txt_folder is None:
+            return None
+
+        snapshot_files = sorted(txt_folder.glob("*.txt"))
+        if not snapshot_files:
+            return None
+
+        all_titles = {}
+        id_to_name = {}
+        all_timestamps = {}
+
+        for snapshot in snapshot_files:
+            try:
+                titles_by_id, names_in_file = self.parse_txt_file(snapshot)
+
+                # File name -> modification time of the snapshot.
+                all_timestamps[snapshot.name] = snapshot.stat().st_mtime
+                id_to_name.update(names_in_file)
+
+                # The file stem doubles as the crawl-time label of the snapshot.
+                stamp = snapshot.stem
+
+                for source_id, titles in titles_by_id.items():
+                    if platform_ids and source_id not in platform_ids:
+                        continue
+
+                    bucket = all_titles.setdefault(source_id, {})
+
+                    for title, data in titles.items():
+                        merged = bucket.get(title)
+                        if merged is None:
+                            # First sighting of this title.
+                            bucket[title] = {
+                                "ranks": data.get("ranks", []),
+                                "url": data.get("url", ""),
+                                "mobileUrl": data.get("mobileUrl", ""),
+                                "first_time": stamp,
+                                "last_time": stamp,
+                                "count": 1,
+                            }
+                            continue
+
+                        # Seen before: fold this snapshot into the entry.
+                        for rank in data.get("ranks", []):
+                            if rank not in merged["ranks"]:
+                                merged["ranks"].append(rank)
+                        merged["last_time"] = stamp
+                        merged["count"] += 1
+                        if not merged["url"] and data.get("url"):
+                            merged["url"] = data["url"]
+                        if not merged["mobileUrl"] and data.get("mobileUrl"):
+                            merged["mobileUrl"] = data["mobileUrl"]
+
+            except Exception as e:
+                # A broken snapshot must not abort the whole day.
+                print(f"Warning: 解析 TXT 文件失败 {snapshot}: {e}")
+                continue
+
+        if not all_titles:
+            return None
+
+        return (all_titles, id_to_name, all_timestamps)
+
+    def _read_from_sqlite(
+        self,
+        date: datetime = None,
+        platform_ids: Optional[List[str]] = None
+    ) -> Optional[Tuple[Dict, Dict, Dict]]:
+        """
+        Load one day's news from that day's SQLite database.
+
+        Rows of ``news_items`` are used as-is (one entry per platform/title)
+        and provide ``first_crawl_time``, ``last_crawl_time`` and
+        ``crawl_count``. The rank list of each item comes from
+        ``rank_history`` ordered by crawl time; an item without history falls
+        back to its current ``rank``.
+
+        Args:
+            date: Target date; defaults to today when omitted.
+            platform_ids: Platform IDs to keep; None or empty keeps all.
+
+        Returns:
+            (all_titles, id_to_name, all_timestamps), or None when the
+            database file or the ``news_items`` table is missing, no row
+            matches, or reading fails (a warning is printed in that case).
+        """
+        db_path = self._get_sqlite_db_path(date)
+        if db_path is None:
+            return None
+
+        # SQLite rejects statements with more bound parameters than its
+        # compile-time limit (999 by default before 3.32.0), so the rank
+        # history is fetched in batches instead of one huge IN (...) list.
+        batch_size = 500
+
+        select_items = """
+            SELECT n.id, n.platform_id, p.name as platform_name, n.title,
+                   n.rank, n.url, n.mobile_url,
+                   n.first_crawl_time, n.last_crawl_time, n.crawl_count
+            FROM news_items n
+            LEFT JOIN platforms p ON n.platform_id = p.id
+        """
+
+        all_titles = {}
+        id_to_name = {}
+        all_timestamps = {}
+
+        conn = None
+        try:
+            conn = sqlite3.connect(str(db_path))
+            conn.row_factory = sqlite3.Row
+            cursor = conn.cursor()
+
+            # Bail out early on databases that lack the expected table.
+            cursor.execute("""
+                SELECT name FROM sqlite_master
+                WHERE type='table' AND name='news_items'
+            """)
+            if not cursor.fetchone():
+                return None
+
+            if platform_ids:
+                placeholders = ",".join("?" * len(platform_ids))
+                cursor.execute(
+                    f"{select_items} WHERE n.platform_id IN ({placeholders})",
+                    list(platform_ids),
+                )
+            else:
+                cursor.execute(select_items)
+
+            rows = cursor.fetchall()
+
+            # Collect the rank history of every selected item, batch by batch.
+            # IDs are de-duplicated so that no history row is read twice.
+            news_ids = list(dict.fromkeys(row['id'] for row in rows))
+            rank_history_map = {}
+
+            for start in range(0, len(news_ids), batch_size):
+                batch = news_ids[start:start + batch_size]
+                placeholders = ",".join("?" * len(batch))
+                cursor.execute(f"""
+                    SELECT news_item_id, rank FROM rank_history
+                    WHERE news_item_id IN ({placeholders})
+                    ORDER BY news_item_id, crawl_time
+                """, batch)
+
+                for rh_row in cursor.fetchall():
+                    rank_history_map.setdefault(
+                        rh_row['news_item_id'], []
+                    ).append(rh_row['rank'])
+
+            for row in rows:
+                news_id = row['id']
+                platform_id = row['platform_id']
+                platform_name = row['platform_name'] or platform_id
+                title = row['title']
+
+                if platform_id not in id_to_name:
+                    id_to_name[platform_id] = platform_name
+
+                if platform_id not in all_titles:
+                    all_titles[platform_id] = {}
+
+                # Use the recorded history, or the current rank when there is none.
+                ranks = rank_history_map.get(news_id, [row['rank']])
+
+                all_titles[platform_id][title] = {
+                    "ranks": ranks,
+                    "url": row['url'] or "",
+                    "mobileUrl": row['mobile_url'] or "",
+                    "first_time": row['first_crawl_time'] or "",
+                    "last_time": row['last_crawl_time'] or "",
+                    "count": row['crawl_count'] or 1,
+                }
+
+            # Every crawl becomes a pseudo file name with a placeholder mtime of 0.
+            cursor.execute("""
+                SELECT crawl_time FROM crawl_records
+                ORDER BY crawl_time
+            """)
+            for row in cursor.fetchall():
+                crawl_time = row['crawl_time']
+                all_timestamps[f"{crawl_time}.db"] = 0
+
+            if not all_titles:
+                return None
+
+            return (all_titles, id_to_name, all_timestamps)
+
+        except Exception as e:
+            # Best effort: the caller falls back to another source on None.
+            print(f"Warning: 从 SQLite 读取数据失败: {e}")
+            return None
+        finally:
+            # Release the connection on every path, including failures.
+            if conn is not None:
+                conn.close()
 
 
     def read_all_titles_for_date(
     def read_all_titles_for_date(
         self,
         self,
@@ -163,7 +459,7 @@ class ParserService:
         platform_ids: Optional[List[str]] = None
         platform_ids: Optional[List[str]] = None
     ) -> Tuple[Dict, Dict, Dict]:
     ) -> Tuple[Dict, Dict, Dict]:
         """
         """
-        读取指定日期的所有标题文件(带缓存)
+        读取指定日期的所有标题(带缓存)
 
 
         Args:
         Args:
             date: 日期对象,默认为今天
             date: 日期对象,默认为今天
@@ -193,71 +489,23 @@ class ParserService:
         if cached:
         if cached:
             return cached
             return cached
 
 
-        # 缓存未命中,读取文件
-        date_folder = self.get_date_folder_name(date)
-        txt_dir = self.project_root / "output" / date_folder / "txt"
-
-        if not txt_dir.exists():
-            raise DataNotFoundError(
-                f"未找到 {date_folder} 的数据目录",
-                suggestion="请先运行爬虫或检查日期是否正确"
-            )
-
-        all_titles = {}
-        id_to_name = {}
-        all_timestamps = {}
-
-        # 读取所有txt文件
-        txt_files = sorted(txt_dir.glob("*.txt"))
-
-        if not txt_files:
-            raise DataNotFoundError(
-                f"{date_folder} 没有数据文件",
-                suggestion="请等待爬虫任务完成"
-            )
-
-        for txt_file in txt_files:
-            try:
-                titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
-
-                # 更新id_to_name
-                id_to_name.update(file_id_to_name)
-
-                # 合并标题数据
-                for platform_id, titles in titles_by_id.items():
-                    # 如果指定了平台过滤
-                    if platform_ids and platform_id not in platform_ids:
-                        continue
-
-                    if platform_id not in all_titles:
-                        all_titles[platform_id] = {}
-
-                    for title, info in titles.items():
-                        if title in all_titles[platform_id]:
-                            # 合并排名
-                            all_titles[platform_id][title]["ranks"].extend(info["ranks"])
-                        else:
-                            all_titles[platform_id][title] = info.copy()
-
-                # 记录文件时间戳
-                all_timestamps[txt_file.name] = txt_file.stat().st_mtime
-
-            except Exception as e:
-                # 忽略单个文件的解析错误,继续处理其他文件
-                print(f"Warning: 解析文件 {txt_file} 失败: {e}")
-                continue
-
-        if not all_titles:
-            raise DataNotFoundError(
-                f"{date_folder} 没有有效的数据",
-                suggestion="请检查数据文件格式或重新运行爬虫"
-            )
-
-        # 缓存结果
-        result = (all_titles, id_to_name, all_timestamps)
-        self.cache.set(cache_key, result)
-
-        return result
+        # 优先从 SQLite 读取
+        sqlite_result = self._read_from_sqlite(date, platform_ids)
+        if sqlite_result:
+            self.cache.set(cache_key, sqlite_result)
+            return sqlite_result
+
+        # SQLite 不存在,尝试从 TXT 读取
+        txt_result = self._read_from_txt(date, platform_ids)
+        if txt_result:
+            self.cache.set(cache_key, txt_result)
+            return txt_result
+
+        # 两种数据源都不存在
+        raise DataNotFoundError(
+            f"未找到 {date_str} 的数据",
+            suggestion="请先运行爬虫或检查日期是否正确"
+        )
 
 
     def parse_yaml_config(self, config_path: str = None) -> dict:
     def parse_yaml_config(self, config_path: str = None) -> dict:
         """
         """

+ 0 - 1
mcp_server/tools/analytics.py

@@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
     """
     """
     计算新闻权重(用于排序)
     计算新闻权重(用于排序)
 
 
-    基于 main.py 的权重算法实现,综合考虑:
     - 排名权重 (60%):新闻在榜单中的排名
     - 排名权重 (60%):新闻在榜单中的排名
     - 频次权重 (30%):新闻出现的次数
     - 频次权重 (30%):新闻出现的次数
     - 热度权重 (10%):高排名出现的比例
     - 热度权重 (10%):高排名出现的比例

+ 468 - 0
mcp_server/tools/storage_sync.py

@@ -0,0 +1,468 @@
+# coding=utf-8
+"""
+存储同步工具
+
+实现从远程存储拉取数据到本地、获取存储状态、列出可用日期等功能。
+"""
+
+import os
+import re
+from pathlib import Path
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional
+
+import yaml
+
+from ..utils.errors import MCPError
+
+
+class StorageSyncTools:
+    """存储同步工具类"""
+
+    def __init__(self, project_root: str = None):
+        """
+        Set up the storage sync tools.
+
+        Args:
+            project_root: Project root directory. When omitted (or empty) the
+                directory three levels above this file is used.
+        """
+        self.project_root = (
+            Path(project_root) if project_root else Path(__file__).parent.parent.parent
+        )
+
+        # Both are filled in lazily on first use.
+        self._config = None
+        self._remote_backend = None
+
+    def _load_config(self) -> dict:
+        """
+        Load ``config/config.yaml`` once and cache the result.
+
+        Returns:
+            The parsed configuration mapping. An empty dict is returned (and
+            cached) when the file is missing, empty, or its top level is not
+            a mapping, so callers can always use ``.get`` on the result.
+        """
+        if self._config is None:
+            config_path = self.project_root / "config" / "config.yaml"
+            loaded = None
+            if config_path.exists():
+                with open(config_path, "r", encoding="utf-8") as f:
+                    loaded = yaml.safe_load(f)
+            # safe_load yields None for an empty document; normalise it so the
+            # file is not re-read on every call and callers never see None.
+            self._config = loaded if isinstance(loaded, dict) else {}
+        return self._config
+
+    def _get_storage_config(self) -> dict:
+        """
+        Return the ``storage`` section of the configuration.
+
+        Returns:
+            The ``storage`` mapping, or an empty dict when the section is
+            absent, null (e.g. ``storage:`` with every child commented out),
+            or not a mapping.
+        """
+        config = self._load_config() or {}
+        storage = config.get("storage")
+        return storage if isinstance(storage, dict) else {}
+
+    def _get_remote_config(self) -> dict:
+        """
+        Build the remote storage settings from the config file and environment.
+
+        For every setting, a non-empty value in ``storage.remote`` wins;
+        otherwise the matching ``S3_*`` environment variable is used, and an
+        empty string when neither is set. A null ``storage.remote`` section is
+        treated as empty so the environment fallback still applies.
+
+        Returns:
+            Dict with the keys ``endpoint_url``, ``bucket_name``,
+            ``access_key_id``, ``secret_access_key`` and ``region``.
+        """
+        storage_config = self._get_storage_config() or {}
+        remote_config = storage_config.get("remote") or {}
+
+        env_names = {
+            "endpoint_url": "S3_ENDPOINT_URL",
+            "bucket_name": "S3_BUCKET_NAME",
+            "access_key_id": "S3_ACCESS_KEY_ID",
+            "secret_access_key": "S3_SECRET_ACCESS_KEY",
+            "region": "S3_REGION",
+        }
+        return {
+            key: remote_config.get(key) or os.environ.get(env_name, "")
+            for key, env_name in env_names.items()
+        }
+
+    def _has_remote_config(self) -> bool:
+        """
+        Tell whether remote storage is fully configured.
+
+        Returns:
+            True only when bucket name, access key ID, secret access key and
+            endpoint URL are all non-empty in the merged remote settings.
+        """
+        merged = self._get_remote_config()
+        required = ("bucket_name", "access_key_id", "secret_access_key", "endpoint_url")
+        return all(merged.get(key) for key in required)
+
+    def _get_remote_backend(self):
+        """
+        Return the remote storage backend, creating and caching it on first use.
+
+        Returns:
+            The cached ``RemoteStorageBackend`` instance, or None when remote
+            storage is not configured, the backend module cannot be imported,
+            or constructing the backend fails (a message is printed for the
+            last two cases).
+        """
+        if self._remote_backend is not None:
+            return self._remote_backend
+
+        if not self._has_remote_config():
+            return None
+
+        try:
+            # Imported lazily so the tools still load when the import fails.
+            from trendradar.storage.remote import RemoteStorageBackend
+
+            remote_config = self._get_remote_config()
+            timezone = self._load_config().get("app", {}).get("timezone", "Asia/Shanghai")
+
+            backend = RemoteStorageBackend(
+                bucket_name=remote_config["bucket_name"],
+                access_key_id=remote_config["access_key_id"],
+                secret_access_key=remote_config["secret_access_key"],
+                endpoint_url=remote_config["endpoint_url"],
+                region=remote_config.get("region", ""),
+                timezone=timezone,
+            )
+        except ImportError:
+            print("[存储同步] 远程存储后端需要安装 boto3: pip install boto3")
+            return None
+        except Exception as e:
+            print(f"[存储同步] 创建远程后端失败: {e}")
+            return None
+
+        self._remote_backend = backend
+        return backend
+
+    def _get_local_data_dir(self) -> Path:
+        """
+        Return the local data directory.
+
+        Returns:
+            ``project_root / storage.local.data_dir``. The directory name
+            defaults to ``output`` when the ``local`` section or its
+            ``data_dir`` entry is absent or null.
+        """
+        storage_config = self._get_storage_config() or {}
+        local_config = storage_config.get("local") or {}
+        data_dir = local_config.get("data_dir")
+        if data_dir is None:
+            # A key that is present but null must not reach the path join.
+            data_dir = "output"
+        return self.project_root / data_dir
+
+    def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
+        """
+        Convert a date folder name into a ``datetime``.
+
+        Two naming schemes are recognised and tried in this order:
+        - ISO: YYYY-MM-DD
+        - Chinese: YYYY年MM月DD日
+
+        Each pattern is matched against the start of the name only. None is
+        returned when neither scheme yields a valid calendar date.
+        """
+        patterns = (
+            r'(\d{4})-(\d{2})-(\d{2})',  # ISO scheme
+            r'(\d{4})年(\d{2})月(\d{2})日',  # Chinese scheme
+        )
+        for pattern in patterns:
+            found = re.match(pattern, folder_name)
+            if found is None:
+                continue
+            year, month, day = (int(part) for part in found.groups())
+            try:
+                return datetime(year, month, day)
+            except ValueError:
+                # Matched digits that are not a real date; try the next scheme.
+                continue
+
+        return None
+
+    def _get_local_dates(self) -> List[str]:
+        """
+        List the dates that have a folder in the local data directory.
+
+        Hidden entries (names starting with ``.``), non-directories and
+        folders whose name is not a recognised date are ignored.
+
+        Returns:
+            Dates as ``YYYY-MM-DD`` strings, newest first; an empty list when
+            the data directory does not exist.
+        """
+        local_dir = self._get_local_data_dir()
+        if not local_dir.exists():
+            return []
+
+        parsed = (
+            self._parse_date_folder_name(entry.name)
+            for entry in local_dir.iterdir()
+            if entry.is_dir() and not entry.name.startswith('.')
+        )
+        return sorted(
+            (found.strftime("%Y-%m-%d") for found in parsed if found),
+            reverse=True,
+        )
+
+    def _calculate_dir_size(self, path: Path) -> int:
+        """
+        Sum the sizes, in bytes, of all files below ``path`` (recursively).
+
+        Returns 0 when ``path`` does not exist.
+        """
+        if not path.exists():
+            return 0
+        return sum(
+            entry.stat().st_size for entry in path.rglob("*") if entry.is_file()
+        )
+
+    def sync_from_remote(self, days: int = 7) -> Dict:
+        """
+        Pull recent daily databases from remote storage into the local data dir.
+
+        For each of the last ``days`` days (counted back from the current time
+        in the configured timezone) that exists remotely and has no local
+        date folder, ``news/<date>.db`` is downloaded to
+        ``<data_dir>/<date>/news.db``.
+
+        The download goes to a temporary ``news.db.part`` file that is renamed
+        on success. On failure the temporary file and the date folder created
+        for this attempt are removed, so the date is not mistaken for "already
+        local" on the next run.
+
+        Args:
+            days: Number of most recent days to consider; defaults to 7.
+
+        Returns:
+            On completion: dict with ``success`` True, ``synced_files``,
+            ``synced_dates``, ``skipped_dates``, ``failed_dates`` (list of
+            ``{"date", "error"}``) and a summary ``message``.
+            Otherwise: dict with ``success`` False and an ``error`` dict.
+        """
+        try:
+            if not self._has_remote_config():
+                return {
+                    "success": False,
+                    "error": {
+                        "code": "REMOTE_NOT_CONFIGURED",
+                        "message": "未配置远程存储",
+                        "suggestion": "请在 config/config.yaml 中配置 storage.remote 或设置环境变量"
+                    }
+                }
+
+            remote_backend = self._get_remote_backend()
+            if remote_backend is None:
+                return {
+                    "success": False,
+                    "error": {
+                        "code": "REMOTE_BACKEND_FAILED",
+                        "message": "无法创建远程存储后端",
+                        "suggestion": "请检查远程存储配置和 boto3 是否已安装"
+                    }
+                }
+
+            local_dir = self._get_local_data_dir()
+            local_dir.mkdir(parents=True, exist_ok=True)
+
+            remote_dates = remote_backend.list_remote_dates()
+            local_dates = set(self._get_local_dates())
+
+            # "Today" is taken in the configured timezone.
+            from trendradar.utils.time import get_configured_time
+            config = self._load_config()
+            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
+            now = get_configured_time(timezone)
+
+            # Recent days that exist remotely, newest first.
+            target_dates = []
+            for i in range(days):
+                date_str = (now - timedelta(days=i)).strftime("%Y-%m-%d")
+                if date_str in remote_dates:
+                    target_dates.append(date_str)
+
+            synced_dates = []
+            skipped_dates = []
+            failed_dates = []
+
+            for date_str in target_dates:
+                if date_str in local_dates:
+                    skipped_dates.append(date_str)
+                    continue
+
+                local_date_dir = local_dir / date_str
+                local_db_path = local_date_dir / "news.db"
+                temp_db_path = local_date_dir / "news.db.part"
+                remote_key = f"news/{date_str}.db"
+                # Remember whether this attempt creates the folder, so only a
+                # folder of our own making is removed on failure.
+                created_dir = not local_date_dir.exists()
+
+                try:
+                    local_date_dir.mkdir(parents=True, exist_ok=True)
+                    remote_backend.s3_client.download_file(
+                        remote_backend.bucket_name,
+                        remote_key,
+                        str(temp_db_path)
+                    )
+                    # Publish the file only once it is complete.
+                    os.replace(str(temp_db_path), str(local_db_path))
+                    synced_dates.append(date_str)
+                    print(f"[存储同步] 已拉取: {date_str}")
+                except Exception as e:
+                    # Leave no trace of the failed attempt; otherwise the
+                    # leftover folder would mark this date as present locally.
+                    try:
+                        if temp_db_path.exists():
+                            temp_db_path.unlink()
+                        if created_dir and local_date_dir.is_dir():
+                            local_date_dir.rmdir()
+                    except OSError:
+                        # Cleanup is best effort (e.g. folder not empty).
+                        pass
+                    failed_dates.append({"date": date_str, "error": str(e)})
+                    print(f"[存储同步] 拉取失败 ({date_str}): {e}")
+
+            return {
+                "success": True,
+                "synced_files": len(synced_dates),
+                "synced_dates": synced_dates,
+                "skipped_dates": skipped_dates,
+                "failed_dates": failed_dates,
+                "message": f"成功同步 {len(synced_dates)} 天数据" + (
+                    f",跳过 {len(skipped_dates)} 天(本地已存在)" if skipped_dates else ""
+                ) + (
+                    f",失败 {len(failed_dates)} 天" if failed_dates else ""
+                )
+            }
+
+        except MCPError as e:
+            return {
+                "success": False,
+                "error": e.to_dict()
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": {
+                    "code": "INTERNAL_ERROR",
+                    "message": str(e)
+                }
+            }
+
+    def get_storage_status(self) -> Dict:
+        """
+        Report the storage configuration and current state.
+
+        Config sections that are present but null (``local:``, ``remote:``,
+        ``pull:`` with every child commented out) are treated as empty.
+
+        Returns:
+            On success: dict with ``success`` True, ``backend``, and the
+            sub-dicts ``local`` (data dir, retention, size, date range),
+            ``remote`` (configured flag, retention and, when configured,
+            endpoint, bucket and remote date info or an ``error`` string) and
+            ``pull`` (enabled flag, days).
+            On failure: dict with ``success`` False and an ``error`` dict.
+        """
+        try:
+            storage_config = self._get_storage_config() or {}
+
+            # Local storage.
+            local_config = storage_config.get("local") or {}
+            local_dir = self._get_local_data_dir()
+            local_size = self._calculate_dir_size(local_dir)
+            local_dates = self._get_local_dates()  # newest first
+
+            local_status = {
+                "data_dir": local_config.get("data_dir", "output"),
+                "retention_days": local_config.get("retention_days", 0),
+                "total_size": f"{local_size / 1024 / 1024:.2f} MB",
+                "total_size_bytes": local_size,
+                "date_count": len(local_dates),
+                "earliest_date": local_dates[-1] if local_dates else None,
+                "latest_date": local_dates[0] if local_dates else None,
+            }
+
+            # Remote storage.
+            remote_config = storage_config.get("remote") or {}
+            has_remote = self._has_remote_config()
+
+            remote_status = {
+                "configured": has_remote,
+                "retention_days": remote_config.get("retention_days", 0),
+            }
+
+            if has_remote:
+                merged_config = self._get_remote_config()
+                # Endpoint and bucket are reported verbatim; the access key
+                # and secret are deliberately left out of the status.
+                remote_status["endpoint_url"] = merged_config.get("endpoint_url", "")
+                remote_status["bucket_name"] = merged_config.get("bucket_name", "")
+
+                remote_backend = self._get_remote_backend()
+                if remote_backend:
+                    try:
+                        # NOTE(review): earliest/latest assume the remote list
+                        # is ordered newest first - confirm against the backend.
+                        remote_dates = remote_backend.list_remote_dates()
+                        remote_status["date_count"] = len(remote_dates)
+                        remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None
+                        remote_status["latest_date"] = remote_dates[0] if remote_dates else None
+                    except Exception as e:
+                        # A listing failure is reported without failing the call.
+                        remote_status["error"] = str(e)
+
+            # Automatic pull settings.
+            pull_config = storage_config.get("pull") or {}
+            pull_status = {
+                "enabled": pull_config.get("enabled", False),
+                "days": pull_config.get("days", 7),
+            }
+
+            return {
+                "success": True,
+                "backend": storage_config.get("backend", "auto"),
+                "local": local_status,
+                "remote": remote_status,
+                "pull": pull_status,
+            }
+
+        except MCPError as e:
+            return {
+                "success": False,
+                "error": e.to_dict()
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": {
+                    "code": "INTERNAL_ERROR",
+                    "message": str(e)
+                }
+            }
+
+    def list_available_dates(self, source: str = "both") -> Dict:
+        """
+        List the dates for which data is available.
+
+        Args:
+            source: Which storage to inspect.
+                - "local": local storage only
+                - "remote": remote storage only
+                - "both": list both and compare them (default)
+
+        Returns:
+            A dict with "success" plus, depending on `source`, a "local"
+            and/or "remote" section (dates, count, earliest, latest) and,
+            for "both", a "comparison" section. On failure "success" is
+            False and "error" carries a code and a message.
+        """
+        def summarize(dates):
+            # Index 0 is reported as the latest date and index -1 as the
+            # earliest, for local and remote lists alike.
+            return {
+                "dates": dates,
+                "count": len(dates),
+                "earliest": dates[-1] if dates else None,
+                "latest": dates[0] if dates else None,
+            }
+
+        def remote_summary(configured, dates, error=None):
+            # Same shape as the local summary, prefixed with "configured"
+            # and followed by "error" only when something went wrong.
+            summary = {"configured": configured, **summarize(dates)}
+            if error is not None:
+                summary["error"] = error
+            return summary
+
+        # Reject unknown sources explicitly instead of returning an empty
+        # "success" payload that looks like a valid answer.
+        if source not in ("local", "remote", "both"):
+            return {
+                "success": False,
+                "error": {
+                    "code": "INVALID_PARAMETER",
+                    "message": f"无效的 source 参数: {source!r},可选值: local / remote / both"
+                }
+            }
+
+        try:
+            result = {
+                "success": True,
+            }
+
+            # Local dates
+            if source in ("local", "both"):
+                result["local"] = summarize(self._get_local_dates())
+
+            # Remote dates
+            if source in ("remote", "both"):
+                if not self._has_remote_config():
+                    result["remote"] = remote_summary(False, [], "未配置远程存储")
+                else:
+                    remote_backend = self._get_remote_backend()
+                    if not remote_backend:
+                        result["remote"] = remote_summary(True, [], "无法创建远程存储后端")
+                    else:
+                        try:
+                            result["remote"] = remote_summary(
+                                True, remote_backend.list_remote_dates()
+                            )
+                        except Exception as e:
+                            # A failing remote listing is reported inside the
+                            # "remote" section; the local part stays usable.
+                            result["remote"] = remote_summary(True, [], str(e))
+
+            # When both sides were queried, report how they differ
+            if source == "both" and "local" in result and "remote" in result:
+                local_set = set(result["local"]["dates"])
+                remote_set = set(result["remote"].get("dates", []))
+
+                result["comparison"] = {
+                    "only_local": sorted(local_set - remote_set, reverse=True),
+                    "only_remote": sorted(remote_set - local_set, reverse=True),
+                    "both": sorted(local_set & remote_set, reverse=True),
+                }
+
+            return result
+
+        except MCPError as e:
+            return {
+                "success": False,
+                "error": e.to_dict()
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": {
+                    "code": "INTERNAL_ERROR",
+                    "message": str(e)
+                }
+            }

+ 102 - 199
mcp_server/tools/system.py

@@ -87,13 +87,13 @@ class SystemManagementTools:
             >>> print(result['saved_files'])
             >>> print(result['saved_files'])
         """
         """
         try:
         try:
-            import json
             import time
             import time
-            import random
-            import requests
-            from datetime import datetime
-            import pytz
             import yaml
             import yaml
+            from trendradar.crawler.fetcher import DataFetcher
+            from trendradar.storage.local import LocalStorageBackend
+            from trendradar.storage.base import convert_crawl_results_to_news_data
+            from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
+            from ..services.cache_service import get_cache
 
 
             # 参数验证
             # 参数验证
             platforms = validate_platforms(platforms)
             platforms = validate_platforms(platforms)
@@ -129,9 +129,6 @@ class SystemManagementTools:
             else:
             else:
                 target_platforms = all_platforms
                 target_platforms = all_platforms
 
 
-            # 获取请求间隔
-            request_interval = config_data.get("crawler", {}).get("request_interval", 100)
-
             # 构建平台ID列表
             # 构建平台ID列表
             ids = []
             ids = []
             for platform in target_platforms:
             for platform in target_platforms:
@@ -142,87 +139,82 @@ class SystemManagementTools:
 
 
             print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
             print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
 
 
-            # 爬取数据
-            results = {}
-            id_to_name = {}
-            failed_ids = []
-
-            for i, id_info in enumerate(ids):
-                if isinstance(id_info, tuple):
-                    id_value, name = id_info
-                else:
-                    id_value = id_info
-                    name = id_value
-
-                id_to_name[id_value] = name
-
-                # 构建请求URL
-                url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
+            # 初始化数据获取器
+            crawler_config = config_data.get("crawler", {})
+            proxy_url = None
+            if crawler_config.get("use_proxy"):
+                proxy_url = crawler_config.get("proxy_url")
+            
+            fetcher = DataFetcher(proxy_url=proxy_url)
+            request_interval = crawler_config.get("request_interval", 100)
+
+            # 执行爬取
+            results, id_to_name, failed_ids = fetcher.crawl_websites(
+                ids_list=ids,
+                request_interval=request_interval
+            )
+
+            # 获取当前时间(统一使用 trendradar 的时间工具)
+            # 从配置中读取时区,默认为 Asia/Shanghai
+            timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
+            current_time = get_configured_time(timezone)
+            crawl_date = format_date_folder(None, timezone)
+            crawl_time_str = format_time_filename(timezone)
+
+            # 转换为标准数据模型
+            news_data = convert_crawl_results_to_news_data(
+                results=results,
+                id_to_name=id_to_name,
+                failed_ids=failed_ids,
+                crawl_time=crawl_time_str,
+                crawl_date=crawl_date
+            )
+
+            # 初始化存储后端
+            storage = LocalStorageBackend(
+                data_dir=str(self.project_root / "output"),
+                enable_txt=True,
+                enable_html=True,
+                timezone=timezone
+            )
+
+            # 尝试持久化数据
+            save_success = False
+            save_error_msg = ""
+            saved_files = {}
+
+            try:
+                # 1. 保存到 SQLite (核心持久化)
+                if storage.save_news_data(news_data):
+                    save_success = True
+                
+                # 2. 如果请求保存到本地,生成 TXT/HTML 快照
+                if save_to_local:
+                    # 保存 TXT
+                    txt_path = storage.save_txt_snapshot(news_data)
+                    if txt_path:
+                        saved_files["txt"] = txt_path
+
+                    # 保存 HTML (使用简化版生成器)
+                    html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
+                    html_filename = f"{crawl_time_str}.html"
+                    html_path = storage.save_html_report(html_content, html_filename)
+                    if html_path:
+                        saved_files["html"] = html_path
+
+            except Exception as e:
+                # 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError)
+                print(f"[System] 数据保存失败: {e}")
+                save_success = False
+                save_error_msg = str(e)
+
+            # 3. 清除缓存,确保下次查询获取最新数据
+            # 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的
+            get_cache().clear()
+            print("[System] 缓存已清除")
 
 
-                headers = {
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-                    "Accept": "application/json, text/plain, */*",
-                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                    "Connection": "keep-alive",
-                    "Cache-Control": "no-cache",
-                }
-
-                # 重试机制
-                max_retries = 2
-                retries = 0
-                success = False
-
-                while retries <= max_retries and not success:
-                    try:
-                        response = requests.get(url, headers=headers, timeout=10)
-                        response.raise_for_status()
-
-                        data_text = response.text
-                        data_json = json.loads(data_text)
-
-                        status = data_json.get("status", "未知")
-                        if status not in ["success", "cache"]:
-                            raise ValueError(f"响应状态异常: {status}")
-
-                        status_info = "最新数据" if status == "success" else "缓存数据"
-                        print(f"获取 {id_value} 成功({status_info})")
-
-                        # 解析数据
-                        results[id_value] = {}
-                        for index, item in enumerate(data_json.get("items", []), 1):
-                            title = item["title"]
-                            url_link = item.get("url", "")
-                            mobile_url = item.get("mobileUrl", "")
-
-                            if title in results[id_value]:
-                                results[id_value][title]["ranks"].append(index)
-                            else:
-                                results[id_value][title] = {
-                                    "ranks": [index],
-                                    "url": url_link,
-                                    "mobileUrl": mobile_url,
-                                }
-
-                        success = True
-
-                    except Exception as e:
-                        retries += 1
-                        if retries <= max_retries:
-                            wait_time = random.uniform(3, 5)
-                            print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
-                            time.sleep(wait_time)
-                        else:
-                            print(f"请求 {id_value} 失败: {e}")
-                            failed_ids.append(id_value)
-
-                # 请求间隔
-                if i < len(ids) - 1:
-                    actual_interval = request_interval + random.randint(-10, 20)
-                    actual_interval = max(50, actual_interval)
-                    time.sleep(actual_interval / 1000)
-
-            # 格式化返回数据
-            news_data = []
+            # 构建返回结果
+            news_response_data = []
             for platform_id, titles_data in results.items():
             for platform_id, titles_data in results.items():
                 platform_name = id_to_name.get(platform_id, platform_id)
                 platform_name = id_to_name.get(platform_id, platform_id)
                 for title, info in titles_data.items():
                 for title, info in titles_data.items():
@@ -230,131 +222,42 @@ class SystemManagementTools:
                         "platform_id": platform_id,
                         "platform_id": platform_id,
                         "platform_name": platform_name,
                         "platform_name": platform_name,
                         "title": title,
                         "title": title,
-                        "ranks": info["ranks"]
+                        "ranks": info.get("ranks", [])
                     }
                     }
-
-                    # 条件性添加 URL 字段
                     if include_url:
                     if include_url:
                         news_item["url"] = info.get("url", "")
                         news_item["url"] = info.get("url", "")
                         news_item["mobile_url"] = info.get("mobileUrl", "")
                         news_item["mobile_url"] = info.get("mobileUrl", "")
+                    news_response_data.append(news_item)
 
 
-                    news_data.append(news_item)
-
-            # 获取北京时间
-            beijing_tz = pytz.timezone("Asia/Shanghai")
-            now = datetime.now(beijing_tz)
-
-            # 构建返回结果
             result = {
             result = {
                 "success": True,
                 "success": True,
                 "task_id": f"crawl_{int(time.time())}",
                 "task_id": f"crawl_{int(time.time())}",
                 "status": "completed",
                 "status": "completed",
-                "crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
+                "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
                 "platforms": list(results.keys()),
                 "platforms": list(results.keys()),
-                "total_news": len(news_data),
+                "total_news": len(news_response_data),
                 "failed_platforms": failed_ids,
                 "failed_platforms": failed_ids,
-                "data": news_data,
-                "saved_to_local": save_to_local
+                "data": news_response_data,
+                "saved_to_local": save_success and save_to_local
             }
             }
 
 
-            # 如果需要持久化,调用保存逻辑
-            if save_to_local:
-                try:
-                    import re
-
-                    # 辅助函数:清理标题
-                    def clean_title(title: str) -> str:
-                        """清理标题中的特殊字符"""
-                        if not isinstance(title, str):
-                            title = str(title)
-                        cleaned_title = title.replace("\n", " ").replace("\r", " ")
-                        cleaned_title = re.sub(r"\s+", " ", cleaned_title)
-                        cleaned_title = cleaned_title.strip()
-                        return cleaned_title
-
-                    # 辅助函数:创建目录
-                    def ensure_directory_exists(directory: str):
-                        """确保目录存在"""
-                        Path(directory).mkdir(parents=True, exist_ok=True)
-
-                    # 格式化日期和时间
-                    date_folder = now.strftime("%Y年%m月%d日")
-                    time_filename = now.strftime("%H时%M分")
-
-                    # 创建 txt 文件路径
-                    txt_dir = self.project_root / "output" / date_folder / "txt"
-                    ensure_directory_exists(str(txt_dir))
-                    txt_file_path = txt_dir / f"{time_filename}.txt"
-
-                    # 创建 html 文件路径
-                    html_dir = self.project_root / "output" / date_folder / "html"
-                    ensure_directory_exists(str(html_dir))
-                    html_file_path = html_dir / f"{time_filename}.html"
-
-                    # 保存 txt 文件(按照 main.py 的格式)
-                    with open(txt_file_path, "w", encoding="utf-8") as f:
-                        for id_value, title_data in results.items():
-                            # id | name 或 id
-                            name = id_to_name.get(id_value)
-                            if name and name != id_value:
-                                f.write(f"{id_value} | {name}\n")
-                            else:
-                                f.write(f"{id_value}\n")
-
-                            # 按排名排序标题
-                            sorted_titles = []
-                            for title, info in title_data.items():
-                                cleaned = clean_title(title)
-                                if isinstance(info, dict):
-                                    ranks = info.get("ranks", [])
-                                    url = info.get("url", "")
-                                    mobile_url = info.get("mobileUrl", "")
-                                else:
-                                    ranks = info if isinstance(info, list) else []
-                                    url = ""
-                                    mobile_url = ""
-
-                                rank = ranks[0] if ranks else 1
-                                sorted_titles.append((rank, cleaned, url, mobile_url))
-
-                            sorted_titles.sort(key=lambda x: x[0])
-
-                            for rank, cleaned, url, mobile_url in sorted_titles:
-                                line = f"{rank}. {cleaned}"
-                                if url:
-                                    line += f" [URL:{url}]"
-                                if mobile_url:
-                                    line += f" [MOBILE:{mobile_url}]"
-                                f.write(line + "\n")
-
-                            f.write("\n")
-
-                        if failed_ids:
-                            f.write("==== 以下ID请求失败 ====\n")
-                            for id_value in failed_ids:
-                                f.write(f"{id_value}\n")
-
-                    # 保存 html 文件(简化版)
-                    html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
-                    with open(html_file_path, "w", encoding="utf-8") as f:
-                        f.write(html_content)
-
-                    print(f"数据已保存到:")
-                    print(f"  TXT: {txt_file_path}")
-                    print(f"  HTML: {html_file_path}")
-
-                    result["saved_files"] = {
-                        "txt": str(txt_file_path),
-                        "html": str(html_file_path)
-                    }
-                    result["note"] = "数据已持久化到 output 文件夹"
-
-                except Exception as e:
-                    print(f"保存文件失败: {e}")
-                    result["save_error"] = str(e)
-                    result["note"] = "爬取成功但保存失败,数据仅在内存中"
+            if save_success:
+                if save_to_local:
+                    result["saved_files"] = saved_files
+                    result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
+                else:
+                    result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果,未生成TXT快照)"
             else:
             else:
-                result["note"] = "临时爬取结果,未持久化到output文件夹"
+                # 明确告知用户保存失败
+                result["saved_to_local"] = False
+                result["save_error"] = save_error_msg
+                if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
+                    result["note"] = "爬取成功,但无法写入数据库(Docker只读模式)。数据仅在本次返回中有效。"
+                else:
+                    result["note"] = f"爬取成功但保存失败: {save_error_msg}"
+
+            # 清理资源
+            storage.cleanup()
 
 
             return result
             return result
 
 

+ 3 - 3
mcp_server/utils/date_parser.py

@@ -283,13 +283,13 @@ class DateParser:
             date: datetime对象
             date: datetime对象
 
 
         Returns:
         Returns:
-            文件夹名称,格式: YYYY年MM月DD日
+            文件夹名称,格式: YYYY-MM-DD
 
 
         Examples:
         Examples:
             >>> DateParser.format_date_folder(datetime(2025, 10, 11))
             >>> DateParser.format_date_folder(datetime(2025, 10, 11))
-            '2025年10月11日'
+            '2025-10-11'
         """
         """
-        return date.strftime("%Y年%m月%d日")
+        return date.strftime("%Y-%m-%d")
 
 
     @staticmethod
     @staticmethod
     def validate_date_not_future(date: datetime) -> None:
     def validate_date_not_future(date: datetime) -> None:

+ 1 - 1
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 [project]
 name = "trendradar-mcp"
 name = "trendradar-mcp"
-version = "1.0.3"
+version = "1.1.0"
 description = "TrendRadar MCP Server - 新闻热点聚合工具"
 description = "TrendRadar MCP Server - 新闻热点聚合工具"
 requires-python = ">=3.10"
 requires-python = ">=3.10"
 dependencies = [
 dependencies = [

+ 1 - 0
requirements.txt

@@ -3,3 +3,4 @@ pytz>=2025.2,<2026.0
 PyYAML>=6.0.3,<7.0.0
 PyYAML>=6.0.3,<7.0.0
 fastmcp>=2.12.0,<2.14.0
 fastmcp>=2.12.0,<2.14.0
 websockets>=13.0,<14.0
 websockets>=13.0,<14.0
+boto3>=1.35.0,<2.0.0

+ 13 - 0
trendradar/__init__.py

@@ -0,0 +1,13 @@
+# coding=utf-8
+"""
+TrendRadar - trending news aggregation and analysis tool.
+
+Usage:
+  python -m trendradar        # run as a module
+  trendradar                  # run after installation
+"""
+
+from trendradar.context import AppContext
+
+__version__ = "4.0.0"
+__all__ = ["AppContext", "__version__"]

+ 719 - 0
trendradar/__main__.py

@@ -0,0 +1,719 @@
+# coding=utf-8
+"""
+TrendRadar 主程序
+
+热点新闻聚合与分析工具
+支持: python -m trendradar
+"""
+
+import os
+import webbrowser
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+
+import requests
+
+from trendradar.context import AppContext
+
+# 版本号直接定义,避免循环导入
+VERSION = "4.0.0"
+from trendradar.core import load_config
+from trendradar.crawler import DataFetcher
+from trendradar.storage import convert_crawl_results_to_news_data
+
+
+def check_version_update(
+    current_version: str, version_url: str, proxy_url: Optional[str] = None
+) -> Tuple[bool, Optional[str]]:
+    """Check whether a newer version is published at `version_url`.
+
+    Args:
+        current_version: Version of the running program, "X.Y.Z".
+        version_url: URL whose response body is the remote version string.
+        proxy_url: Optional proxy used for both http and https requests.
+
+    Returns:
+        (need_update, remote_version). `remote_version` is only set when an
+        update is needed. Any failure (network error, bad status) is printed
+        and reported as (False, None).
+    """
+    try:
+        proxies = None
+        if proxy_url:
+            proxies = {"http": proxy_url, "https": proxy_url}
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+            "Accept": "text/plain, */*",
+            "Cache-Control": "no-cache",
+        }
+
+        response = requests.get(
+            version_url, proxies=proxies, headers=headers, timeout=10
+        )
+        response.raise_for_status()
+
+        remote_version = response.text.strip()
+        print(f"当前版本: {current_version}, 远程版本: {remote_version}")
+
+        def parse_version(version_str):
+            """Parse "X.Y.Z" into an int triple; malformed input maps to (0, 0, 0)."""
+            try:
+                parts = version_str.strip().split(".")
+                if len(parts) != 3:
+                    raise ValueError("版本号格式不正确")
+                return int(parts[0]), int(parts[1]), int(parts[2])
+            except (ValueError, AttributeError):
+                # Only parsing failures fall back to (0, 0, 0); a bare
+                # `except:` would also swallow KeyboardInterrupt/SystemExit.
+                return 0, 0, 0
+
+        current_tuple = parse_version(current_version)
+        remote_tuple = parse_version(remote_version)
+
+        # Tuple comparison orders by major, then minor, then patch.
+        need_update = current_tuple < remote_tuple
+        return need_update, remote_version if need_update else None
+
+    except Exception as e:
+        print(f"版本检查失败: {e}")
+        return False, None
+
+
+# === 主分析器 ===
+class NewsAnalyzer:
+    """新闻分析器"""
+
+    # Per-report-mode strategy table, keyed by the report mode name.
+    # _get_mode_strategy() looks entries up by self.report_mode and falls
+    # back to the "daily" entry for unknown modes.
+    MODE_STRATEGIES = {
+        "incremental": {
+            "mode_name": "增量模式",
+            "description": "增量模式(只关注新增新闻,无新增时不推送)",
+            "realtime_report_type": "实时增量",
+            "summary_report_type": "当日汇总",
+            "should_send_realtime": True,
+            "should_generate_summary": True,
+            "summary_mode": "daily",
+        },
+        "current": {
+            "mode_name": "当前榜单模式",
+            "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
+            "realtime_report_type": "实时当前榜单",
+            "summary_report_type": "当前榜单汇总",
+            "should_send_realtime": True,
+            "should_generate_summary": True,
+            "summary_mode": "current",
+        },
+        "daily": {
+            "mode_name": "当日汇总模式",
+            "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
+            "realtime_report_type": "",
+            "summary_report_type": "当日汇总",
+            "should_send_realtime": False,
+            "should_generate_summary": True,
+            "summary_mode": "daily",
+        },
+    }
+
+    def __init__(self):
+        """Load the configuration and wire up the analyzer.
+
+        Builds the AppContext, caches a few config values, detects the
+        runtime environment (GitHub Actions / Docker), configures the proxy,
+        creates the DataFetcher and the storage manager, and runs the
+        version check when running under GitHub Actions.
+        """
+        # Load configuration
+        print("正在加载配置...")
+        config = load_config()
+        print(f"TrendRadar v{VERSION} 配置加载完成")
+        print(f"监控平台数量: {len(config['PLATFORMS'])}")
+        print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
+
+        # Create the application context
+        self.ctx = AppContext(config)
+
+        self.request_interval = self.ctx.config["REQUEST_INTERVAL"]
+        self.report_mode = self.ctx.config["REPORT_MODE"]
+        self.rank_threshold = self.ctx.rank_threshold
+        self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
+        self.is_docker_container = self._detect_docker_environment()
+        self.update_info = None
+        self.proxy_url = None
+        # _setup_proxy() reads is_github_actions and may set proxy_url, so it
+        # must run before the DataFetcher is constructed.
+        self._setup_proxy()
+        self.data_fetcher = DataFetcher(self.proxy_url)
+
+        # Initialize the storage manager (via AppContext)
+        self._init_storage_manager()
+
+        if self.is_github_actions:
+            self._check_version_update()
+
+    def _init_storage_manager(self) -> None:
+        """Initialize the storage manager through the AppContext.
+
+        The STORAGE_RETENTION_DAYS environment variable, when set to an
+        integer, overrides config["STORAGE"]["RETENTION_DAYS"] before the
+        storage manager is created. A non-integer value is reported and
+        ignored so that a typo in the environment cannot abort startup.
+        """
+        env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip()
+        if env_retention:
+            try:
+                retention_override = int(env_retention)
+            except ValueError:
+                print(f"环境变量 STORAGE_RETENTION_DAYS 无效,已忽略: {env_retention}")
+            else:
+                # The environment variable takes precedence over the config file.
+                self.ctx.config["STORAGE"]["RETENTION_DAYS"] = retention_override
+
+        self.storage_manager = self.ctx.get_storage_manager()
+        print(f"存储后端: {self.storage_manager.backend_name}")
+
+        retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0)
+        if retention_days > 0:
+            print(f"数据保留天数: {retention_days} 天")
+
+    def _detect_docker_environment(self) -> bool:
+        """Report whether the process appears to run inside a Docker container.
+
+        True when DOCKER_CONTAINER is "true" in the environment or when
+        /.dockerenv exists; any exception during detection yields False.
+        """
+        try:
+            flagged_by_env = os.environ.get("DOCKER_CONTAINER") == "true"
+            return flagged_by_env or os.path.exists("/.dockerenv")
+        except Exception:
+            return False
+
+    def _should_open_browser(self) -> bool:
+        """Return True only outside both GitHub Actions and Docker."""
+        headless = self.is_github_actions or self.is_docker_container
+        return not headless
+
+    def _setup_proxy(self) -> None:
+        """Pick the proxy URL for the current environment.
+
+        Under GitHub Actions no proxy is used. Otherwise the USE_PROXY
+        setting decides whether DEFAULT_PROXY is stored in self.proxy_url.
+        """
+        if self.is_github_actions:
+            print("GitHub Actions环境,不使用代理")
+            return
+
+        if self.ctx.config["USE_PROXY"]:
+            self.proxy_url = self.ctx.config["DEFAULT_PROXY"]
+            print("本地环境,使用代理")
+        else:
+            print("本地环境,未启用代理")
+
+    def _check_version_update(self) -> None:
+        """Run the version check and record the result in self.update_info.
+
+        self.update_info is only set when a newer remote version is found;
+        any exception is printed and otherwise ignored.
+        """
+        try:
+            need_update, remote_version = check_version_update(
+                VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
+            )
+
+            if not (need_update and remote_version):
+                print("版本检查完成,当前为最新版本")
+                return
+
+            self.update_info = {
+                "current_version": VERSION,
+                "remote_version": remote_version,
+            }
+            print(f"发现新版本: {remote_version} (当前: {VERSION})")
+        except Exception as e:
+            print(f"版本检查出错: {e}")
+
+    def _get_mode_strategy(self) -> Dict:
+        """Return the strategy entry for self.report_mode, defaulting to "daily"."""
+        strategies = self.MODE_STRATEGIES
+        fallback = strategies["daily"]
+        return strategies.get(self.report_mode, fallback)
+
+    def _has_notification_configured(self) -> bool:
+        """Return True when at least one notification channel is configured.
+
+        Channels that need several settings (Telegram, e-mail, ntfy) only
+        count when all of their required values are truthy.
+        """
+        cfg = self.ctx.config
+        # All channels are evaluated up front, in this order, before any()
+        # inspects them.
+        channel_states = (
+            cfg["FEISHU_WEBHOOK_URL"],
+            cfg["DINGTALK_WEBHOOK_URL"],
+            cfg["WEWORK_WEBHOOK_URL"],
+            cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"],
+            cfg["EMAIL_FROM"] and cfg["EMAIL_PASSWORD"] and cfg["EMAIL_TO"],
+            cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"],
+            cfg["BARK_URL"],
+            cfg["SLACK_WEBHOOK_URL"],
+        )
+        return any(channel_states)
+
+    def _has_valid_content(
+        self, stats: List[Dict], new_titles: Optional[Dict] = None
+    ) -> bool:
+        """Tell whether there is anything worth reporting.
+
+        In "incremental" and "current" modes only matched stats count. In
+        every other mode, non-empty new titles count as content as well.
+        """
+        matches_only = self.report_mode in ["incremental", "current"]
+        has_matched_news = any(stat["count"] > 0 for stat in stats)
+        if matches_only:
+            return has_matched_news
+
+        # Daily summary: matched keyword news or newly appeared titles.
+        has_new_news = bool(new_titles) and any(
+            len(titles) > 0 for titles in new_titles.values()
+        )
+        return has_matched_news or has_new_news
+
+    def _load_analysis_data(
+        self,
+    ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List, List]]:
+        """Load today's data and the keyword configuration in one step.
+
+        Historical data is filtered by the currently monitored platform ids.
+
+        Returns:
+            A 7-tuple (all_results, id_to_name, title_info, new_titles,
+            word_groups, filter_words, global_filters), or None when no data
+            exists for today or loading fails (the error is printed).
+        """
+        try:
+            # Ids of the platforms currently configured for monitoring
+            current_platform_ids = self.ctx.platform_ids
+            print(f"当前监控平台: {current_platform_ids}")
+
+            all_results, id_to_name, title_info = self.ctx.read_today_titles(
+                current_platform_ids
+            )
+
+            if not all_results:
+                print("没有找到当天的数据")
+                return None
+
+            total_titles = sum(len(titles) for titles in all_results.values())
+            print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
+
+            new_titles = self.ctx.detect_new_titles(current_platform_ids)
+            word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
+
+            return (
+                all_results,
+                id_to_name,
+                title_info,
+                new_titles,
+                word_groups,
+                filter_words,
+                global_filters,
+            )
+        except Exception as e:
+            print(f"数据加载失败: {e}")
+            return None
+
+    def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
+        """Build per-source title metadata from the current crawl results.
+
+        Every title gets `time_info` as both first and last seen time and a
+        count of 1; ranks and URLs are copied from the crawl entry, with
+        empty defaults when missing.
+        """
+        return {
+            source_id: {
+                title: {
+                    "first_time": time_info,
+                    "last_time": time_info,
+                    "count": 1,
+                    "ranks": entry.get("ranks", []),
+                    "url": entry.get("url", ""),
+                    "mobileUrl": entry.get("mobileUrl", ""),
+                }
+                for title, entry in titles_data.items()
+            }
+            for source_id, titles_data in results.items()
+        }
+
+    def _run_analysis_pipeline(
+        self,
+        data_source: Dict,
+        mode: str,
+        title_info: Dict,
+        new_titles: Dict,
+        word_groups: List[Dict],
+        filter_words: List[str],
+        id_to_name: Dict,
+        failed_ids: Optional[List] = None,
+        is_daily_summary: bool = False,
+        global_filters: Optional[List[str]] = None,
+    ) -> Tuple[List[Dict], Optional[str]]:
+        """Run the shared pipeline: frequency statistics, then optional HTML.
+
+        Returns:
+            (stats, html_file). html_file is None when HTML output is
+            disabled in the storage configuration.
+        """
+        # Frequency statistics are delegated to the AppContext.
+        stats, total_titles = self.ctx.count_frequency(
+            data_source,
+            word_groups,
+            filter_words,
+            id_to_name,
+            title_info,
+            new_titles,
+            mode=mode,
+            global_filters=global_filters,
+        )
+
+        # No HTML report requested: statistics only.
+        if not self.ctx.config["STORAGE"]["FORMATS"]["HTML"]:
+            return stats, None
+
+        # Version-update info is only passed to the report when enabled.
+        shown_update = None
+        if self.ctx.config["SHOW_VERSION_UPDATE"]:
+            shown_update = self.update_info
+
+        report_path = self.ctx.generate_html(
+            stats,
+            total_titles,
+            failed_ids=failed_ids,
+            new_titles=new_titles,
+            id_to_name=id_to_name,
+            mode=mode,
+            is_daily_summary=is_daily_summary,
+            update_info=shown_update,
+        )
+        return stats, report_path
+
+    def _send_notification_if_needed(
+        self,
+        stats: List[Dict],
+        report_type: str,
+        mode: str,
+        failed_ids: Optional[List] = None,
+        new_titles: Optional[Dict] = None,
+        id_to_name: Optional[Dict] = None,
+        html_file_path: Optional[str] = None,
+    ) -> bool:
+        """Send the report to all notification channels if every gate passes.
+
+        Gates, checked in order: notifications enabled, at least one channel
+        configured, the report has valid content, and (when the push window
+        is enabled) the current time is inside the window and the
+        once-per-day limit has not been reached.
+
+        Args:
+            stats: Frequency statistics to report.
+            report_type: Human-readable report type, also used in log output.
+            mode: Report mode forwarded to report preparation and dispatch.
+            failed_ids: Ids of sources that failed to crawl.
+            new_titles: Newly detected titles.
+            id_to_name: Mapping of source id -> display name.
+            html_file_path: Path of the generated HTML report, if any.
+
+        Returns:
+            ``True`` when the dispatcher returned a non-empty result mapping,
+            ``False`` whenever sending was skipped.
+        """
+        has_notification = self._has_notification_configured()
+        cfg = self.ctx.config
+
+        if not cfg["ENABLE_NOTIFICATION"]:
+            print(f"跳过{report_type}通知:通知功能已禁用")
+            return False
+
+        if not has_notification:
+            print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
+            return False
+
+        # The content check runs exactly once; its result decides both the
+        # "send" path and the "nothing to send" message.
+        if not self._has_valid_content(stats, new_titles):
+            mode_strategy = self._get_mode_strategy()
+            if "实时" in report_type:
+                print(
+                    f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
+                )
+            else:
+                print(
+                    f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
+                )
+            return False
+
+        # Push-window control. The manager created here is reused below to
+        # record the push instead of constructing a second one.
+        push_window = cfg["PUSH_WINDOW"]
+        push_manager = None
+        if push_window["ENABLED"]:
+            push_manager = self.ctx.create_push_manager()
+            time_range_start = push_window["TIME_RANGE"]["START"]
+            time_range_end = push_window["TIME_RANGE"]["END"]
+
+            if not push_manager.is_in_time_range(time_range_start, time_range_end):
+                now = self.ctx.get_time()
+                print(
+                    f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
+                )
+                return False
+
+            if push_window["ONCE_PER_DAY"]:
+                if push_manager.has_pushed_today():
+                    print("推送窗口控制:今天已推送过,跳过本次推送")
+                    return False
+                print("推送窗口控制:今天首次推送")
+
+        # Prepare the report payload.
+        report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode)
+
+        # Version-update info is only attached when the option is enabled.
+        update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None
+
+        # Fan out to every configured channel through the dispatcher.
+        dispatcher = self.ctx.create_notification_dispatcher()
+        results = dispatcher.dispatch_all(
+            report_data=report_data,
+            report_type=report_type,
+            update_info=update_info_to_send,
+            proxy_url=self.proxy_url,
+            mode=mode,
+            html_file_path=html_file_path,
+        )
+
+        if not results:
+            print("未配置任何通知渠道,跳过通知发送")
+            return False
+
+        # Record the push only when the once-per-day limit is active and at
+        # least one channel reported success.
+        if (
+            push_manager is not None
+            and push_window["ONCE_PER_DAY"]
+            and any(results.values())
+        ):
+            push_manager.record_push(report_type)
+
+        return True
+
+    def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
+        """Generate the summary report and send its notification.
+
+        Args:
+            mode_strategy: Strategy dict; ``summary_mode`` and
+                ``summary_report_type`` are read from it.
+
+        Returns:
+            Path of the generated HTML file as returned by the analysis
+            pipeline, or ``None`` when no analysis data could be loaded.
+        """
+        summary_mode = mode_strategy["summary_mode"]
+        summary_type = "当日汇总"
+        if summary_mode == "current":
+            summary_type = "当前榜单汇总"
+        print(f"生成{summary_type}报告...")
+
+        # Load everything the analysis needs; bail out when nothing is there.
+        analysis_data = self._load_analysis_data()
+        if not analysis_data:
+            return None
+
+        (
+            all_results,
+            id_to_name,
+            title_info,
+            new_titles,
+            word_groups,
+            filter_words,
+            global_filters,
+        ) = analysis_data
+
+        # Run the shared analysis pipeline in summary mode.
+        stats, html_file = self._run_analysis_pipeline(
+            data_source=all_results,
+            mode=summary_mode,
+            title_info=title_info,
+            new_titles=new_titles,
+            word_groups=word_groups,
+            filter_words=filter_words,
+            id_to_name=id_to_name,
+            is_daily_summary=True,
+            global_filters=global_filters,
+        )
+
+        if html_file:
+            print(f"{summary_type}报告已生成: {html_file}")
+
+        # Send the notification; the outcome does not affect the return value.
+        self._send_notification_if_needed(
+            stats,
+            mode_strategy["summary_report_type"],
+            summary_mode,
+            failed_ids=[],
+            new_titles=new_titles,
+            id_to_name=id_to_name,
+            html_file_path=html_file,
+        )
+
+        return html_file
+
+    def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
+        """Generate the summary HTML without sending any notification.
+
+        Args:
+            mode: Summary mode; ``"current"`` selects the current-ranking
+                label, anything else the daily label.
+
+        Returns:
+            Path of the generated HTML file as returned by the analysis
+            pipeline, or ``None`` when no analysis data could be loaded.
+        """
+        summary_type = "当日汇总"
+        if mode == "current":
+            summary_type = "当前榜单汇总"
+        print(f"生成{summary_type}HTML...")
+
+        # Load everything the analysis needs; bail out when nothing is there.
+        analysis_data = self._load_analysis_data()
+        if not analysis_data:
+            return None
+
+        (
+            all_results,
+            id_to_name,
+            title_info,
+            new_titles,
+            word_groups,
+            filter_words,
+            global_filters,
+        ) = analysis_data
+
+        # Only the HTML path is of interest here; the stats are discarded.
+        _, html_file = self._run_analysis_pipeline(
+            data_source=all_results,
+            mode=mode,
+            title_info=title_info,
+            new_titles=new_titles,
+            word_groups=word_groups,
+            filter_words=filter_words,
+            id_to_name=id_to_name,
+            is_daily_summary=True,
+            global_filters=global_filters,
+        )
+
+        if html_file:
+            print(f"{summary_type}HTML已生成: {html_file}")
+        return html_file
+
+    def _initialize_and_check_config(self) -> None:
+        """Print the start-up banner and report the effective configuration.
+
+        Prints the current time, then either a "crawler disabled" message
+        (followed by an early return) or the notification status, the report
+        mode and the run-mode description. The method only reports; it always
+        returns ``None``.
+        """
+        current = self.ctx.get_time()
+        print(f"当前北京时间: {current.strftime('%Y-%m-%d %H:%M:%S')}")
+
+        config = self.ctx.config
+        if not config["ENABLE_CRAWLER"]:
+            print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
+            return
+
+        has_notification = self._has_notification_configured()
+        if not config["ENABLE_NOTIFICATION"]:
+            notice = "通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取"
+        elif has_notification:
+            notice = "通知功能已启用,将发送通知"
+        else:
+            notice = "未配置任何通知渠道,将只进行数据抓取,不发送通知"
+        print(notice)
+
+        strategy = self._get_mode_strategy()
+        print(f"报告模式: {self.report_mode}")
+        print(f"运行模式: {strategy['description']}")
+
+    def _crawl_data(self) -> Tuple[Dict, Dict, List]:
+        """Crawl every configured platform and persist the results.
+
+        The results are converted to the news-data format and handed to the
+        storage manager (main save plus optional TXT snapshot); when the TXT
+        format is enabled they are additionally written through
+        ``ctx.save_titles``.
+
+        Returns:
+            ``(results, id_to_name, failed_ids)`` exactly as returned by the
+            data fetcher.
+        """
+        platforms = self.ctx.platforms
+
+        # A platform with a display name is passed as an (id, name) pair,
+        # otherwise as the bare id.
+        ids = [
+            (platform["id"], platform["name"]) if "name" in platform else platform["id"]
+            for platform in platforms
+        ]
+
+        print(
+            f"配置的监控平台: {[p.get('name', p['id']) for p in self.ctx.platforms]}"
+        )
+        print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
+        Path("output").mkdir(parents=True, exist_ok=True)
+
+        results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
+            ids, self.request_interval
+        )
+
+        # Convert to the news-data format used by the storage backend.
+        crawl_time = self.ctx.format_time()
+        crawl_date = self.ctx.format_date()
+        news_data = convert_crawl_results_to_news_data(
+            results, id_to_name, failed_ids, crawl_time, crawl_date
+        )
+
+        # Persist to the storage backend.
+        saved = self.storage_manager.save_news_data(news_data)
+        if saved:
+            print(f"数据已保存到存储后端: {self.storage_manager.backend_name}")
+
+        # Optional TXT snapshot; a falsy return value means none was written.
+        txt_file = self.storage_manager.save_txt_snapshot(news_data)
+        if txt_file:
+            print(f"TXT 快照已保存: {txt_file}")
+
+        # Backward compatibility: also write the legacy TXT format.
+        if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
+            title_file = self.ctx.save_titles(results, id_to_name, failed_ids)
+            print(f"标题已保存到: {title_file}")
+
+        return results, id_to_name, failed_ids
+
+    def _execute_mode_strategy(
+        self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
+    ) -> Optional[str]:
+        """Run the mode-specific part of the workflow after crawling.
+
+        Analyses the data (full day's data in ``current`` report mode, the
+        fresh crawl results otherwise), optionally sends the realtime
+        notification, optionally generates the summary, and finally opens or
+        announces the resulting HTML report.
+
+        Args:
+            mode_strategy: Strategy dict; the keys ``should_send_realtime``,
+                ``realtime_report_type``, ``should_generate_summary`` and
+                ``summary_mode`` are read here.
+            results: Results of the crawl that just finished.
+            id_to_name: Mapping of source id -> display name from that crawl.
+            failed_ids: Ids of sources that failed to crawl.
+
+        Returns:
+            Path of the summary HTML, or ``None`` when no summary was
+            generated.
+
+        Raises:
+            RuntimeError: In ``current`` mode when the analysis data cannot
+                be loaded.
+        """
+        # Ids of the platforms that are currently monitored.
+        current_platform_ids = self.ctx.platform_ids
+
+        new_titles = self.ctx.detect_new_titles(current_platform_ids)
+        time_info = self.ctx.format_time()
+        # NOTE(review): _crawl_data also calls save_titles under the same
+        # flag — confirm whether this second write is intentional.
+        if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
+            self.ctx.save_titles(results, id_to_name, failed_ids)
+        word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
+
+        # In "current" mode the realtime push is based on the complete
+        # history so that the statistics are complete.
+        if self.report_mode == "current":
+            # Load the full history (already filtered by current platforms).
+            analysis_data = self._load_analysis_data()
+            if analysis_data:
+                (
+                    all_results,
+                    historical_id_to_name,
+                    historical_title_info,
+                    historical_new_titles,
+                    _,
+                    _,
+                    _,
+                ) = analysis_data
+
+                print(
+                    f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}"
+                )
+
+                stats, html_file = self._run_analysis_pipeline(
+                    all_results,
+                    self.report_mode,
+                    historical_title_info,
+                    historical_new_titles,
+                    word_groups,
+                    filter_words,
+                    historical_id_to_name,
+                    failed_ids=failed_ids,
+                    global_filters=global_filters,
+                )
+
+                # Names from the fresh crawl override the historical ones.
+                combined_id_to_name = {**historical_id_to_name, **id_to_name}
+
+                if html_file:
+                    print(f"HTML报告已生成: {html_file}")
+
+                # Realtime notification based on the full-history statistics.
+                summary_html = None
+                if mode_strategy["should_send_realtime"]:
+                    self._send_notification_if_needed(
+                        stats,
+                        mode_strategy["realtime_report_type"],
+                        self.report_mode,
+                        failed_ids=failed_ids,
+                        new_titles=historical_new_titles,
+                        id_to_name=combined_id_to_name,
+                        html_file_path=html_file,
+                    )
+            else:
+                # The analysis data could not be loaded; treated as fatal.
+                print("❌ 严重错误:无法读取刚保存的数据文件")
+                raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
+        else:
+            # Other modes analyse only the results of the current crawl.
+            title_info = self._prepare_current_title_info(results, time_info)
+            stats, html_file = self._run_analysis_pipeline(
+                results,
+                self.report_mode,
+                title_info,
+                new_titles,
+                word_groups,
+                filter_words,
+                id_to_name,
+                failed_ids=failed_ids,
+                global_filters=global_filters,
+            )
+            if html_file:
+                print(f"HTML报告已生成: {html_file}")
+
+            # Realtime notification, if the strategy asks for it.
+            summary_html = None
+            if mode_strategy["should_send_realtime"]:
+                self._send_notification_if_needed(
+                    stats,
+                    mode_strategy["realtime_report_type"],
+                    self.report_mode,
+                    failed_ids=failed_ids,
+                    new_titles=new_titles,
+                    id_to_name=id_to_name,
+                    html_file_path=html_file,
+                )
+
+        # Summary report, if the strategy asks for it. summary_html is reset
+        # here regardless of the assignments made in the branches above.
+        summary_html = None
+        if mode_strategy["should_generate_summary"]:
+            if mode_strategy["should_send_realtime"]:
+                # A realtime notification was already handled above, so the
+                # summary is generated as HTML only, without notification.
+                summary_html = self._generate_summary_html(
+                    mode_strategy["summary_mode"]
+                )
+            else:
+                # No realtime push: generate the summary and notify.
+                summary_html = self._generate_summary_report(mode_strategy)
+
+        # Open the report in a browser when _should_open_browser() allows it;
+        # the summary report takes precedence over the realtime report.
+        if self._should_open_browser() and html_file:
+            if summary_html:
+                summary_url = "file://" + str(Path(summary_html).resolve())
+                print(f"正在打开汇总报告: {summary_url}")
+                webbrowser.open(summary_url)
+            else:
+                file_url = "file://" + str(Path(html_file).resolve())
+                print(f"正在打开HTML报告: {file_url}")
+                webbrowser.open(file_url)
+        elif self.is_docker_container and html_file:
+            # In Docker only the report location is printed.
+            if summary_html:
+                print(f"汇总报告已生成(Docker环境): {summary_html}")
+            else:
+                print(f"HTML报告已生成(Docker环境): {html_file}")
+
+        return summary_html
+
+    def run(self) -> None:
+        """Run the complete workflow: check config, crawl, analyse, notify.
+
+        When the crawler is disabled in the configuration
+        (``ENABLE_CRAWLER`` is falsy) nothing is crawled or analysed; the
+        method returns right after the configuration check. Resources held
+        by the application context are released in every case.
+
+        Raises:
+            Exception: Any error raised by a workflow step is logged and
+                re-raised unchanged.
+        """
+        try:
+            self._initialize_and_check_config()
+
+            # The configuration check only prints that the crawler is
+            # disabled; it cannot stop the workflow itself, so the early exit
+            # it announces is enforced here. ``finally`` still runs.
+            if not self.ctx.config["ENABLE_CRAWLER"]:
+                return
+
+            mode_strategy = self._get_mode_strategy()
+
+            results, id_to_name, failed_ids = self._crawl_data()
+
+            self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
+
+        except Exception as e:
+            print(f"分析流程执行出错: {e}")
+            raise
+        finally:
+            # Release resources (expired-data cleanup, closing the database
+            # connection) whether or not the workflow succeeded.
+            self.ctx.cleanup()
+
+
+def main():
+    """Program entry point: build the analyzer and run it.
+
+    A missing configuration file is reported with a short hint and does not
+    propagate; any other error is reported and re-raised.
+    """
+    try:
+        NewsAnalyzer().run()
+    except FileNotFoundError as e:
+        hint_lines = (
+            f"❌ 配置文件错误: {e}",
+            "\n请确保以下文件存在:",
+            "  • config/config.yaml",
+            "  • config/frequency_words.txt",
+            "\n参考项目文档进行正确配置",
+        )
+        print("\n".join(hint_lines))
+    except Exception as e:
+        print(f"❌ 程序运行错误: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()

+ 388 - 0
trendradar/context.py

@@ -0,0 +1,388 @@
+# coding=utf-8
+"""
+应用上下文模块
+
+提供配置上下文类,封装所有依赖配置的操作,消除全局状态和包装函数。
+"""
+
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from trendradar.utils.time import (
+    get_configured_time,
+    format_date_folder,
+    format_time_filename,
+    get_current_time_display,
+    convert_time_for_display,
+)
+from trendradar.core import (
+    load_frequency_words,
+    matches_word_groups,
+    save_titles_to_file,
+    read_all_today_titles,
+    detect_latest_new_titles,
+    is_first_crawl_today,
+    count_word_frequency,
+)
+from trendradar.report import (
+    clean_title,
+    prepare_report_data,
+    generate_html_report,
+    render_html_content,
+)
+from trendradar.notification import (
+    render_feishu_content,
+    render_dingtalk_content,
+    split_content_into_batches,
+    NotificationDispatcher,
+    PushRecordManager,
+)
+from trendradar.storage import get_storage_manager
+
+
+class AppContext:
+    """
+    Application context.
+
+    Wraps every operation that depends on the configuration behind one
+    object, so callers do not rely on a global CONFIG and the code is easier
+    to test.
+
+    Example:
+        config = load_config()
+        ctx = AppContext(config)
+
+        # Time helpers
+        now = ctx.get_time()
+        date_folder = ctx.format_date()
+
+        # Storage
+        storage = ctx.get_storage_manager()
+
+        # Report generation
+        html = ctx.generate_html(stats, total_titles, ...)
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """
+        Initialise the application context.
+
+        Args:
+            config: The complete configuration dictionary.
+        """
+        self.config = config
+        # Created lazily by get_storage_manager().
+        self._storage_manager = None
+
+    # === Configuration access ===
+
+    @property
+    def timezone(self) -> str:
+        """Configured time zone (``TIMEZONE``, default ``Asia/Shanghai``)."""
+        return self.config.get("TIMEZONE", "Asia/Shanghai")
+
+    @property
+    def rank_threshold(self) -> int:
+        """Rank threshold (``RANK_THRESHOLD``, default 50)."""
+        return self.config.get("RANK_THRESHOLD", 50)
+
+    @property
+    def weight_config(self) -> Dict:
+        """Weight configuration (``WEIGHT_CONFIG``, default empty dict)."""
+        return self.config.get("WEIGHT_CONFIG", {})
+
+    @property
+    def platforms(self) -> List[Dict]:
+        """List of platform configurations (``PLATFORMS``, default empty)."""
+        return self.config.get("PLATFORMS", [])
+
+    @property
+    def platform_ids(self) -> List[str]:
+        """Ids of all configured platforms, in configuration order."""
+        return [p["id"] for p in self.platforms]
+
+    # === Time helpers ===
+
+    def get_time(self) -> datetime:
+        """Return the current time in the configured time zone."""
+        return get_configured_time(self.timezone)
+
+    def format_date(self) -> str:
+        """Return the date-folder name (YYYY-MM-DD)."""
+        return format_date_folder(timezone=self.timezone)
+
+    def format_time(self) -> str:
+        """Return the time file name (HH-MM)."""
+        return format_time_filename(self.timezone)
+
+    def get_time_display(self) -> str:
+        """Return the current time for display (HH:MM)."""
+        return get_current_time_display(self.timezone)
+
+    @staticmethod
+    def convert_time_display(time_str: str) -> str:
+        """Convert an HH-MM string to HH:MM."""
+        return convert_time_for_display(time_str)
+
+    # === Storage ===
+
+    def get_storage_manager(self):
+        """Return the storage manager, creating it on first use.
+
+        The manager is built from the ``STORAGE`` section of the
+        configuration and cached on the context until ``cleanup()`` runs.
+        """
+        if self._storage_manager is None:
+            storage_config = self.config.get("STORAGE", {})
+            remote_config = storage_config.get("REMOTE", {})
+            local_config = storage_config.get("LOCAL", {})
+            pull_config = storage_config.get("PULL", {})
+
+            self._storage_manager = get_storage_manager(
+                backend_type=storage_config.get("BACKEND", "auto"),
+                data_dir=local_config.get("DATA_DIR", "output"),
+                enable_txt=storage_config.get("FORMATS", {}).get("TXT", True),
+                enable_html=storage_config.get("FORMATS", {}).get("HTML", True),
+                remote_config={
+                    "bucket_name": remote_config.get("BUCKET_NAME", ""),
+                    "access_key_id": remote_config.get("ACCESS_KEY_ID", ""),
+                    "secret_access_key": remote_config.get("SECRET_ACCESS_KEY", ""),
+                    "endpoint_url": remote_config.get("ENDPOINT_URL", ""),
+                    "region": remote_config.get("REGION", ""),
+                },
+                local_retention_days=local_config.get("RETENTION_DAYS", 0),
+                remote_retention_days=remote_config.get("RETENTION_DAYS", 0),
+                pull_enabled=pull_config.get("ENABLED", False),
+                pull_days=pull_config.get("DAYS", 7),
+                timezone=self.timezone,
+            )
+        return self._storage_manager
+
+    def get_output_path(self, subfolder: str, filename: str) -> str:
+        """Return ``output/<date>/<subfolder>/<filename>``.
+
+        The directory is created if it does not exist yet.
+        """
+        output_dir = Path("output") / self.format_date() / subfolder
+        output_dir.mkdir(parents=True, exist_ok=True)
+        return str(output_dir / filename)
+
+    # === Data handling ===
+
+    def save_titles(self, results: Dict, id_to_name: Dict, failed_ids: List) -> str:
+        """Save the titles to ``output/<date>/txt/<time>.txt``."""
+        output_path = self.get_output_path("txt", f"{self.format_time()}.txt")
+        return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title)
+
+    def read_today_titles(
+        self, platform_ids: Optional[List[str]] = None
+    ) -> Tuple[Dict, Dict, Dict]:
+        """Read all of today's titles through the storage manager."""
+        return read_all_today_titles(self.get_storage_manager(), platform_ids)
+
+    def detect_new_titles(
+        self, platform_ids: Optional[List[str]] = None
+    ) -> Dict:
+        """Detect the titles that are new in the latest batch."""
+        return detect_latest_new_titles(self.get_storage_manager(), platform_ids)
+
+    def is_first_crawl(self) -> bool:
+        """Return whether this is the first crawl of the current day."""
+        return is_first_crawl_today("output", self.format_date())
+
+    # === Frequency words ===
+
+    def load_frequency_words(
+        self, frequency_file: Optional[str] = None
+    ) -> Tuple[List[Dict], List[str], List[str]]:
+        """Load the frequency-word configuration."""
+        return load_frequency_words(frequency_file)
+
+    def matches_word_groups(
+        self,
+        title: str,
+        word_groups: List[Dict],
+        filter_words: List[str],
+        global_filters: Optional[List[str]] = None,
+    ) -> bool:
+        """Check whether a title matches the word-group rules."""
+        return matches_word_groups(title, word_groups, filter_words, global_filters)
+
+    # === Statistics ===
+
+    def count_frequency(
+        self,
+        results: Dict,
+        word_groups: List[Dict],
+        filter_words: List[str],
+        id_to_name: Dict,
+        title_info: Optional[Dict] = None,
+        new_titles: Optional[Dict] = None,
+        mode: str = "daily",
+        global_filters: Optional[List[str]] = None,
+    ) -> Tuple[List[Dict], int]:
+        """Count word frequencies, filling in the configuration-driven options."""
+        return count_word_frequency(
+            results=results,
+            word_groups=word_groups,
+            filter_words=filter_words,
+            id_to_name=id_to_name,
+            title_info=title_info,
+            rank_threshold=self.rank_threshold,
+            new_titles=new_titles,
+            mode=mode,
+            global_filters=global_filters,
+            weight_config=self.weight_config,
+            max_news_per_keyword=self.config.get("MAX_NEWS_PER_KEYWORD", 0),
+            sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False),
+            is_first_crawl_func=self.is_first_crawl,
+            convert_time_func=self.convert_time_display,
+        )
+
+    # === Report generation ===
+
+    def prepare_report(
+        self,
+        stats: List[Dict],
+        failed_ids: Optional[List] = None,
+        new_titles: Optional[Dict] = None,
+        id_to_name: Optional[Dict] = None,
+        mode: str = "daily",
+    ) -> Dict:
+        """Prepare the report data."""
+        return prepare_report_data(
+            stats=stats,
+            failed_ids=failed_ids,
+            new_titles=new_titles,
+            id_to_name=id_to_name,
+            mode=mode,
+            rank_threshold=self.rank_threshold,
+            matches_word_groups_func=self.matches_word_groups,
+            load_frequency_words_func=self.load_frequency_words,
+        )
+
+    def generate_html(
+        self,
+        stats: List[Dict],
+        total_titles: int,
+        failed_ids: Optional[List] = None,
+        new_titles: Optional[Dict] = None,
+        id_to_name: Optional[Dict] = None,
+        mode: str = "daily",
+        is_daily_summary: bool = False,
+        update_info: Optional[Dict] = None,
+    ) -> str:
+        """Generate the HTML report under ``output`` for the current date/time."""
+        return generate_html_report(
+            stats=stats,
+            total_titles=total_titles,
+            failed_ids=failed_ids,
+            new_titles=new_titles,
+            id_to_name=id_to_name,
+            mode=mode,
+            is_daily_summary=is_daily_summary,
+            update_info=update_info,
+            rank_threshold=self.rank_threshold,
+            output_dir="output",
+            date_folder=self.format_date(),
+            time_filename=self.format_time(),
+            # The bound method is passed directly; no wrapper is needed.
+            render_html_func=self.render_html,
+            matches_word_groups_func=self.matches_word_groups,
+            load_frequency_words_func=self.load_frequency_words,
+            enable_index_copy=True,
+        )
+
+    def render_html(
+        self,
+        report_data: Dict,
+        total_titles: int,
+        is_daily_summary: bool = False,
+        mode: str = "daily",
+        update_info: Optional[Dict] = None,
+    ) -> str:
+        """Render the HTML content."""
+        return render_html_content(
+            report_data=report_data,
+            total_titles=total_titles,
+            is_daily_summary=is_daily_summary,
+            mode=mode,
+            update_info=update_info,
+            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
+            get_time_func=self.get_time,
+        )
+
+    # === Notification content rendering ===
+
+    def render_feishu(
+        self,
+        report_data: Dict,
+        update_info: Optional[Dict] = None,
+        mode: str = "daily",
+    ) -> str:
+        """Render the Feishu message content."""
+        return render_feishu_content(
+            report_data=report_data,
+            update_info=update_info,
+            mode=mode,
+            separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
+            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
+            get_time_func=self.get_time,
+        )
+
+    def render_dingtalk(
+        self,
+        report_data: Dict,
+        update_info: Optional[Dict] = None,
+        mode: str = "daily",
+    ) -> str:
+        """Render the DingTalk message content."""
+        return render_dingtalk_content(
+            report_data=report_data,
+            update_info=update_info,
+            mode=mode,
+            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
+            get_time_func=self.get_time,
+        )
+
+    def split_content(
+        self,
+        report_data: Dict,
+        format_type: str,
+        update_info: Optional[Dict] = None,
+        max_bytes: Optional[int] = None,
+        mode: str = "daily",
+    ) -> List[str]:
+        """Split the message content into batches."""
+        return split_content_into_batches(
+            report_data=report_data,
+            format_type=format_type,
+            update_info=update_info,
+            max_bytes=max_bytes,
+            mode=mode,
+            batch_sizes={
+                "dingtalk": self.config.get("DINGTALK_BATCH_SIZE", 20000),
+                "feishu": self.config.get("FEISHU_BATCH_SIZE", 29000),
+                "default": self.config.get("MESSAGE_BATCH_SIZE", 4000),
+            },
+            feishu_separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
+            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
+            get_time_func=self.get_time,
+        )
+
+    # === Notification sending ===
+
+    def create_notification_dispatcher(self) -> NotificationDispatcher:
+        """Create a notification dispatcher bound to this context."""
+        return NotificationDispatcher(
+            config=self.config,
+            get_time_func=self.get_time,
+            split_content_func=self.split_content,
+        )
+
+    def create_push_manager(self) -> PushRecordManager:
+        """Create a push-record manager backed by the storage manager."""
+        return PushRecordManager(
+            storage_backend=self.get_storage_manager(),
+            get_time_func=self.get_time,
+        )
+
+    # === Resource cleanup ===
+
+    def cleanup(self):
+        """Release the storage manager, if one was created.
+
+        Old data is cleaned up first, then the manager itself is cleaned up.
+        The manager's own cleanup and the reset of the cached reference
+        happen even when an earlier step raises; the error still propagates.
+        """
+        manager = self._storage_manager
+        if manager is None:
+            return
+        try:
+            manager.cleanup_old_data()
+        finally:
+            try:
+                manager.cleanup()
+            finally:
+                self._storage_manager = None

+ 47 - 0
trendradar/core/__init__.py

@@ -0,0 +1,47 @@
+# coding=utf-8
+"""
+核心模块 - 配置管理和核心工具
+"""
+
+from trendradar.core.config import (
+    parse_multi_account_config,
+    validate_paired_configs,
+    limit_accounts,
+    get_account_at_index,
+)
+from trendradar.core.loader import load_config
+from trendradar.core.frequency import load_frequency_words, matches_word_groups
+from trendradar.core.data import (
+    save_titles_to_file,
+    read_all_today_titles_from_storage,
+    read_all_today_titles,
+    detect_latest_new_titles_from_storage,
+    detect_latest_new_titles,
+    is_first_crawl_today,
+)
+from trendradar.core.analyzer import (
+    calculate_news_weight,
+    format_time_display,
+    count_word_frequency,
+)
+
+# Names re-exported by this package.
+__all__ = [
+    "parse_multi_account_config",
+    "validate_paired_configs",
+    "limit_accounts",
+    "get_account_at_index",
+    "load_config",
+    "load_frequency_words",
+    "matches_word_groups",
+    # Data handling
+    "save_titles_to_file",
+    "read_all_today_titles_from_storage",
+    "read_all_today_titles",
+    "detect_latest_new_titles_from_storage",
+    "detect_latest_new_titles",
+    "is_first_crawl_today",
+    # Statistics and analysis
+    "calculate_news_weight",
+    "format_time_display",
+    "count_word_frequency",
+]

+ 469 - 0
trendradar/core/analyzer.py

@@ -0,0 +1,469 @@
+# coding=utf-8
+"""
+统计分析模块
+
+提供新闻统计和分析功能:
+- calculate_news_weight: 计算新闻权重
+- format_time_display: 格式化时间显示
+- count_word_frequency: 统计词频
+"""
+
+from typing import Dict, List, Tuple, Optional, Callable
+
+from trendradar.core.frequency import matches_word_groups
+
+
+def calculate_news_weight(
+    title_data: Dict,
+    rank_threshold: int,
+    weight_config: Dict,
+) -> float:
+    """
+    计算新闻权重,用于排序
+
+    Args:
+        title_data: 标题数据,包含 ranks 和 count
+        rank_threshold: 排名阈值
+        weight_config: 权重配置 {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}
+
+    Returns:
+        float: 计算出的权重值
+    """
+    ranks = title_data.get("ranks", [])
+    if not ranks:
+        return 0.0
+
+    count = title_data.get("count", len(ranks))
+
+    # 排名权重:Σ(11 - min(rank, 10)) / 出现次数
+    rank_scores = []
+    for rank in ranks:
+        score = 11 - min(rank, 10)
+        rank_scores.append(score)
+
+    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
+
+    # 频次权重:min(出现次数, 10) × 10
+    frequency_weight = min(count, 10) * 10
+
+    # 热度加成:高排名次数 / 总出现次数 × 100
+    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
+    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
+    hotness_weight = hotness_ratio * 100
+
+    total_weight = (
+        rank_weight * weight_config["RANK_WEIGHT"]
+        + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
+        + hotness_weight * weight_config["HOTNESS_WEIGHT"]
+    )
+
+    return total_weight
+
+
+def format_time_display(
+    first_time: str,
+    last_time: str,
+    convert_time_func: Callable[[str], str],
+) -> str:
+    """
+    格式化时间显示(将 HH-MM 转换为 HH:MM)
+
+    Args:
+        first_time: 首次出现时间
+        last_time: 最后出现时间
+        convert_time_func: 时间格式转换函数
+
+    Returns:
+        str: 格式化后的时间显示字符串
+    """
+    if not first_time:
+        return ""
+    # 转换为显示格式
+    first_display = convert_time_func(first_time)
+    last_display = convert_time_func(last_time)
+    if first_display == last_display or not last_display:
+        return first_display
+    else:
+        return f"[{first_display} ~ {last_display}]"
+
+
+def count_word_frequency(
+    results: Dict,
+    word_groups: List[Dict],
+    filter_words: List[str],
+    id_to_name: Dict,
+    title_info: Optional[Dict] = None,
+    rank_threshold: int = 3,
+    new_titles: Optional[Dict] = None,
+    mode: str = "daily",
+    global_filters: Optional[List[str]] = None,
+    weight_config: Optional[Dict] = None,
+    max_news_per_keyword: int = 0,
+    sort_by_position_first: bool = False,
+    is_first_crawl_func: Optional[Callable[[], bool]] = None,
+    convert_time_func: Optional[Callable[[str], str]] = None,
+) -> Tuple[List[Dict], int]:
+    """
+    统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题
+
+    Args:
+        results: 抓取结果 {source_id: {title: title_data}}
+        word_groups: 词组配置列表
+        filter_words: 过滤词列表
+        id_to_name: ID 到名称的映射
+        title_info: 标题统计信息(可选)
+        rank_threshold: 排名阈值
+        new_titles: 新增标题(可选)
+        mode: 报告模式 (daily/incremental/current)
+        global_filters: 全局过滤词(可选)
+        weight_config: 权重配置
+        max_news_per_keyword: 每个关键词最大显示数量
+        sort_by_position_first: 是否优先按配置位置排序
+        is_first_crawl_func: 检测是否是当天第一次爬取的函数
+        convert_time_func: 时间格式转换函数
+
+    Returns:
+        Tuple[List[Dict], int]: (统计结果列表, 总标题数)
+    """
+    # 默认权重配置
+    if weight_config is None:
+        weight_config = {
+            "RANK_WEIGHT": 0.4,
+            "FREQUENCY_WEIGHT": 0.3,
+            "HOTNESS_WEIGHT": 0.3,
+        }
+
+    # 默认时间转换函数
+    if convert_time_func is None:
+        convert_time_func = lambda x: x
+
+    # 默认首次爬取检测函数
+    if is_first_crawl_func is None:
+        is_first_crawl_func = lambda: True
+
+    # 如果没有配置词组,创建一个包含所有新闻的虚拟词组
+    if not word_groups:
+        print("频率词配置为空,将显示所有新闻")
+        word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
+        filter_words = []  # 清空过滤词,显示所有新闻
+
+    is_first_today = is_first_crawl_func()
+
+    # 确定处理的数据源和新增标记逻辑
+    if mode == "incremental":
+        if is_first_today:
+            # 增量模式 + 当天第一次:处理所有新闻,都标记为新增
+            results_to_process = results
+            all_news_are_new = True
+        else:
+            # 增量模式 + 当天非第一次:只处理新增的新闻
+            results_to_process = new_titles if new_titles else {}
+            all_news_are_new = True
+    elif mode == "current":
+        # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
+        if title_info:
+            latest_time = None
+            for source_titles in title_info.values():
+                for title_data in source_titles.values():
+                    last_time = title_data.get("last_time", "")
+                    if last_time:
+                        if latest_time is None or last_time > latest_time:
+                            latest_time = last_time
+
+            # 只处理 last_time 等于最新时间的新闻
+            if latest_time:
+                results_to_process = {}
+                for source_id, source_titles in results.items():
+                    if source_id in title_info:
+                        filtered_titles = {}
+                        for title, title_data in source_titles.items():
+                            if title in title_info[source_id]:
+                                info = title_info[source_id][title]
+                                if info.get("last_time") == latest_time:
+                                    filtered_titles[title] = title_data
+                        if filtered_titles:
+                            results_to_process[source_id] = filtered_titles
+
+                print(
+                    f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
+                )
+            else:
+                results_to_process = results
+        else:
+            results_to_process = results
+        all_news_are_new = False
+    else:
+        # 当日汇总模式:处理所有新闻
+        results_to_process = results
+        all_news_are_new = False
+        total_input_news = sum(len(titles) for titles in results.values())
+        filter_status = (
+            "全部显示"
+            if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
+            else "频率词过滤"
+        )
+        print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
+
+    word_stats = {}
+    total_titles = 0
+    processed_titles = {}
+    matched_new_count = 0
+
+    if title_info is None:
+        title_info = {}
+    if new_titles is None:
+        new_titles = {}
+
+    for group in word_groups:
+        group_key = group["group_key"]
+        word_stats[group_key] = {"count": 0, "titles": {}}
+
+    for source_id, titles_data in results_to_process.items():
+        total_titles += len(titles_data)
+
+        if source_id not in processed_titles:
+            processed_titles[source_id] = {}
+
+        for title, title_data in titles_data.items():
+            if title in processed_titles.get(source_id, {}):
+                continue
+
+            # 使用统一的匹配逻辑
+            matches_frequency_words = matches_word_groups(
+                title, word_groups, filter_words, global_filters
+            )
+
+            if not matches_frequency_words:
+                continue
+
+            # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
+            if (mode == "incremental" and all_news_are_new) or (
+                mode == "current" and is_first_today
+            ):
+                matched_new_count += 1
+
+            source_ranks = title_data.get("ranks", [])
+            source_url = title_data.get("url", "")
+            source_mobile_url = title_data.get("mobileUrl", "")
+
+            # 找到匹配的词组(防御性转换确保类型安全)
+            title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
+            for group in word_groups:
+                required_words = group["required"]
+                normal_words = group["normal"]
+
+                # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
+                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
+                    group_key = group["group_key"]
+                    word_stats[group_key]["count"] += 1
+                    if source_id not in word_stats[group_key]["titles"]:
+                        word_stats[group_key]["titles"][source_id] = []
+                else:
+                    # 原有的匹配逻辑
+                    if required_words:
+                        all_required_present = all(
+                            req_word.lower() in title_lower
+                            for req_word in required_words
+                        )
+                        if not all_required_present:
+                            continue
+
+                    if normal_words:
+                        any_normal_present = any(
+                            normal_word.lower() in title_lower
+                            for normal_word in normal_words
+                        )
+                        if not any_normal_present:
+                            continue
+
+                    group_key = group["group_key"]
+                    word_stats[group_key]["count"] += 1
+                    if source_id not in word_stats[group_key]["titles"]:
+                        word_stats[group_key]["titles"][source_id] = []
+
+                first_time = ""
+                last_time = ""
+                count_info = 1
+                ranks = source_ranks if source_ranks else []
+                url = source_url
+                mobile_url = source_mobile_url
+
+                # 对于 current 模式,从历史统计信息中获取完整数据
+                if (
+                    mode == "current"
+                    and title_info
+                    and source_id in title_info
+                    and title in title_info[source_id]
+                ):
+                    info = title_info[source_id][title]
+                    first_time = info.get("first_time", "")
+                    last_time = info.get("last_time", "")
+                    count_info = info.get("count", 1)
+                    if "ranks" in info and info["ranks"]:
+                        ranks = info["ranks"]
+                    url = info.get("url", source_url)
+                    mobile_url = info.get("mobileUrl", source_mobile_url)
+                elif (
+                    title_info
+                    and source_id in title_info
+                    and title in title_info[source_id]
+                ):
+                    info = title_info[source_id][title]
+                    first_time = info.get("first_time", "")
+                    last_time = info.get("last_time", "")
+                    count_info = info.get("count", 1)
+                    if "ranks" in info and info["ranks"]:
+                        ranks = info["ranks"]
+                    url = info.get("url", source_url)
+                    mobile_url = info.get("mobileUrl", source_mobile_url)
+
+                if not ranks:
+                    ranks = [99]
+
+                time_display = format_time_display(first_time, last_time, convert_time_func)
+
+                source_name = id_to_name.get(source_id, source_id)
+
+                # 判断是否为新增
+                is_new = False
+                if all_news_are_new:
+                    # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
+                    is_new = True
+                elif new_titles and source_id in new_titles:
+                    # 检查是否在新增列表中
+                    new_titles_for_source = new_titles[source_id]
+                    is_new = title in new_titles_for_source
+
+                word_stats[group_key]["titles"][source_id].append(
+                    {
+                        "title": title,
+                        "source_name": source_name,
+                        "first_time": first_time,
+                        "last_time": last_time,
+                        "time_display": time_display,
+                        "count": count_info,
+                        "ranks": ranks,
+                        "rank_threshold": rank_threshold,
+                        "url": url,
+                        "mobileUrl": mobile_url,
+                        "is_new": is_new,
+                    }
+                )
+
+                if source_id not in processed_titles:
+                    processed_titles[source_id] = {}
+                processed_titles[source_id][title] = True
+
+                break
+
+    # 最后统一打印汇总信息
+    if mode == "incremental":
+        if is_first_today:
+            total_input_news = sum(len(titles) for titles in results.values())
+            filter_status = (
+                "全部显示"
+                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
+                else "频率词匹配"
+            )
+            print(
+                f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
+            )
+        else:
+            if new_titles:
+                total_new_count = sum(len(titles) for titles in new_titles.values())
+                filter_status = (
+                    "全部显示"
+                    if len(word_groups) == 1
+                    and word_groups[0]["group_key"] == "全部新闻"
+                    else "匹配频率词"
+                )
+                print(
+                    f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
+                )
+                if matched_new_count == 0 and len(word_groups) > 1:
+                    print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
+            else:
+                print("增量模式:未检测到新增新闻")
+    elif mode == "current":
+        total_input_news = sum(len(titles) for titles in results_to_process.values())
+        if is_first_today:
+            filter_status = (
+                "全部显示"
+                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
+                else "频率词匹配"
+            )
+            print(
+                f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
+            )
+        else:
+            matched_count = sum(stat["count"] for stat in word_stats.values())
+            filter_status = (
+                "全部显示"
+                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
+                else "频率词匹配"
+            )
+            print(
+                f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
+            )
+
+    stats = []
+    # 创建 group_key 到位置和最大数量的映射
+    group_key_to_position = {
+        group["group_key"]: idx for idx, group in enumerate(word_groups)
+    }
+    group_key_to_max_count = {
+        group["group_key"]: group.get("max_count", 0) for group in word_groups
+    }
+
+    for group_key, data in word_stats.items():
+        all_titles = []
+        for source_id, title_list in data["titles"].items():
+            all_titles.extend(title_list)
+
+        # 按权重排序
+        sorted_titles = sorted(
+            all_titles,
+            key=lambda x: (
+                -calculate_news_weight(x, rank_threshold, weight_config),
+                min(x["ranks"]) if x["ranks"] else 999,
+                -x["count"],
+            ),
+        )
+
+        # 应用最大显示数量限制(优先级:单独配置 > 全局配置)
+        group_max_count = group_key_to_max_count.get(group_key, 0)
+        if group_max_count == 0:
+            # 使用全局配置
+            group_max_count = max_news_per_keyword
+
+        if group_max_count > 0:
+            sorted_titles = sorted_titles[:group_max_count]
+
+        stats.append(
+            {
+                "word": group_key,
+                "count": data["count"],
+                "position": group_key_to_position.get(group_key, 999),
+                "titles": sorted_titles,
+                "percentage": (
+                    round(data["count"] / total_titles * 100, 2)
+                    if total_titles > 0
+                    else 0
+                ),
+            }
+        )
+
+    # 根据配置选择排序优先级
+    if sort_by_position_first:
+        # 先按配置位置,再按热点条数
+        stats.sort(key=lambda x: (x["position"], -x["count"]))
+    else:
+        # 先按热点条数,再按配置位置(原逻辑)
+        stats.sort(key=lambda x: (-x["count"], x["position"]))
+
+    # 打印过滤后的匹配新闻数(与推送显示一致)
+    matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
+    if mode == "daily":
+        print(f"频率词过滤后:{matched_news_count} 条新闻匹配(将显示在推送中)")
+
+    return stats, total_titles

+ 152 - 0
trendradar/core/config.py

@@ -0,0 +1,152 @@
+# coding=utf-8
+"""
+配置工具模块 - 多账号配置解析和验证
+
+提供多账号推送配置的解析、验证和限制功能
+"""
+
+from typing import Dict, List, Optional, Tuple
+
+
+def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
+    """
+    解析多账号配置,返回账号列表
+
+    Args:
+        config_value: 配置值字符串,多个账号用分隔符分隔
+        separator: 分隔符,默认为 ;
+
+    Returns:
+        账号列表,空字符串会被保留(用于占位)
+
+    Examples:
+        >>> parse_multi_account_config("url1;url2;url3")
+        ['url1', 'url2', 'url3']
+        >>> parse_multi_account_config(";token2")  # 第一个账号无token
+        ['', 'token2']
+        >>> parse_multi_account_config("")
+        []
+    """
+    if not config_value:
+        return []
+    # 保留空字符串用于占位(如 ";token2" 表示第一个账号无token)
+    accounts = [acc.strip() for acc in config_value.split(separator)]
+    # 过滤掉全部为空的情况
+    if all(not acc for acc in accounts):
+        return []
+    return accounts
+
+
+def validate_paired_configs(
+    configs: Dict[str, List[str]],
+    channel_name: str,
+    required_keys: Optional[List[str]] = None
+) -> Tuple[bool, int]:
+    """
+    验证配对配置的数量是否一致
+
+    对于需要多个配置项配对的渠道(如 Telegram 的 token 和 chat_id),
+    验证所有配置项的账号数量是否一致。
+
+    Args:
+        configs: 配置字典,key 为配置名,value 为账号列表
+        channel_name: 渠道名称,用于日志输出
+        required_keys: 必须有值的配置项列表
+
+    Returns:
+        (是否验证通过, 账号数量)
+
+    Examples:
+        >>> validate_paired_configs({
+        ...     "token": ["t1", "t2"],
+        ...     "chat_id": ["c1", "c2"]
+        ... }, "Telegram", ["token", "chat_id"])
+        (True, 2)
+
+        >>> validate_paired_configs({
+        ...     "token": ["t1", "t2"],
+        ...     "chat_id": ["c1"]  # 数量不匹配
+        ... }, "Telegram", ["token", "chat_id"])
+        (False, 0)
+    """
+    # 过滤掉空列表
+    non_empty_configs = {k: v for k, v in configs.items() if v}
+
+    if not non_empty_configs:
+        return True, 0
+
+    # 检查必须项
+    if required_keys:
+        for key in required_keys:
+            if key not in non_empty_configs or not non_empty_configs[key]:
+                return True, 0  # 必须项为空,视为未配置
+
+    # 获取所有非空配置的长度
+    lengths = {k: len(v) for k, v in non_empty_configs.items()}
+    unique_lengths = set(lengths.values())
+
+    if len(unique_lengths) > 1:
+        print(f"❌ {channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
+        for key, length in lengths.items():
+            print(f"   - {key}: {length} 个")
+        return False, 0
+
+    return True, list(unique_lengths)[0] if unique_lengths else 0
+
+
+def limit_accounts(
+    accounts: List[str],
+    max_count: int,
+    channel_name: str
+) -> List[str]:
+    """
+    限制账号数量
+
+    当配置的账号数量超过最大限制时,只使用前 N 个账号,
+    并输出警告信息。
+
+    Args:
+        accounts: 账号列表
+        max_count: 最大账号数量
+        channel_name: 渠道名称,用于日志输出
+
+    Returns:
+        限制后的账号列表
+
+    Examples:
+        >>> limit_accounts(["a1", "a2", "a3"], 2, "飞书")
+        ⚠️ 飞书 配置了 3 个账号,超过最大限制 2,只使用前 2 个
+        ['a1', 'a2']
+    """
+    if len(accounts) > max_count:
+        print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个")
+        print(f"   ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
+        return accounts[:max_count]
+    return accounts
+
+
+def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
+    """
+    安全获取指定索引的账号值
+
+    当索引超出范围或账号值为空时,返回默认值。
+
+    Args:
+        accounts: 账号列表
+        index: 索引
+        default: 默认值
+
+    Returns:
+        账号值或默认值
+
+    Examples:
+        >>> get_account_at_index(["a", "b", "c"], 1)
+        'b'
+        >>> get_account_at_index(["a", "", "c"], 1, "default")
+        'default'
+        >>> get_account_at_index(["a"], 5, "default")
+        'default'
+    """
+    if index < len(accounts):
+        return accounts[index] if accounts[index] else default
+    return default

+ 291 - 0
trendradar/core/data.py

@@ -0,0 +1,291 @@
+# coding=utf-8
+"""
+数据处理模块
+
+提供数据读取、保存和检测功能:
+- save_titles_to_file: 保存标题到 TXT 文件
+- read_all_today_titles: 从存储后端读取当天所有标题
+- detect_latest_new_titles: 检测最新批次的新增标题
+
+Author: TrendRadar Team
+"""
+
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Callable
+
+
+def save_titles_to_file(
+    results: Dict,
+    id_to_name: Dict,
+    failed_ids: List,
+    output_path: str,
+    clean_title_func: Callable[[str], str],
+) -> str:
+    """
+    保存标题到 TXT 文件
+
+    Args:
+        results: 抓取结果 {source_id: {title: title_data}}
+        id_to_name: ID 到名称的映射
+        failed_ids: 失败的 ID 列表
+        output_path: 输出文件路径
+        clean_title_func: 标题清理函数
+
+    Returns:
+        str: 保存的文件路径
+    """
+    # 确保目录存在
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        for id_value, title_data in results.items():
+            # id | name 或 id
+            name = id_to_name.get(id_value)
+            if name and name != id_value:
+                f.write(f"{id_value} | {name}\n")
+            else:
+                f.write(f"{id_value}\n")
+
+            # 按排名排序标题
+            sorted_titles = []
+            for title, info in title_data.items():
+                cleaned_title = clean_title_func(title)
+                if isinstance(info, dict):
+                    ranks = info.get("ranks", [])
+                    url = info.get("url", "")
+                    mobile_url = info.get("mobileUrl", "")
+                else:
+                    ranks = info if isinstance(info, list) else []
+                    url = ""
+                    mobile_url = ""
+
+                rank = ranks[0] if ranks else 1
+                sorted_titles.append((rank, cleaned_title, url, mobile_url))
+
+            sorted_titles.sort(key=lambda x: x[0])
+
+            for rank, cleaned_title, url, mobile_url in sorted_titles:
+                line = f"{rank}. {cleaned_title}"
+
+                if url:
+                    line += f" [URL:{url}]"
+                if mobile_url:
+                    line += f" [MOBILE:{mobile_url}]"
+                f.write(line + "\n")
+
+            f.write("\n")
+
+        if failed_ids:
+            f.write("==== 以下ID请求失败 ====\n")
+            for id_value in failed_ids:
+                f.write(f"{id_value}\n")
+
+    return output_path
+
+
+def read_all_today_titles_from_storage(
+    storage_manager,
+    current_platform_ids: Optional[List[str]] = None,
+) -> Tuple[Dict, Dict, Dict]:
+    """
+    从存储后端读取当天所有标题(SQLite 数据)
+
+    Args:
+        storage_manager: 存储管理器实例
+        current_platform_ids: 当前监控的平台 ID 列表(用于过滤)
+
+    Returns:
+        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
+    """
+    try:
+        news_data = storage_manager.get_today_all_data()
+
+        if not news_data or not news_data.items:
+            return {}, {}, {}
+
+        all_results = {}
+        final_id_to_name = {}
+        title_info = {}
+
+        for source_id, news_list in news_data.items.items():
+            # 按平台过滤
+            if current_platform_ids is not None and source_id not in current_platform_ids:
+                continue
+
+            # 获取来源名称
+            source_name = news_data.id_to_name.get(source_id, source_id)
+            final_id_to_name[source_id] = source_name
+
+            if source_id not in all_results:
+                all_results[source_id] = {}
+                title_info[source_id] = {}
+
+            for item in news_list:
+                title = item.title
+                ranks = getattr(item, 'ranks', [item.rank])
+                first_time = getattr(item, 'first_time', item.crawl_time)
+                last_time = getattr(item, 'last_time', item.crawl_time)
+                count = getattr(item, 'count', 1)
+
+                all_results[source_id][title] = {
+                    "ranks": ranks,
+                    "url": item.url or "",
+                    "mobileUrl": item.mobile_url or "",
+                }
+
+                title_info[source_id][title] = {
+                    "first_time": first_time,
+                    "last_time": last_time,
+                    "count": count,
+                    "ranks": ranks,
+                    "url": item.url or "",
+                    "mobileUrl": item.mobile_url or "",
+                }
+
+        return all_results, final_id_to_name, title_info
+
+    except Exception as e:
+        print(f"[存储] 从存储后端读取数据失败: {e}")
+        return {}, {}, {}
+
+
+def read_all_today_titles(
+    storage_manager,
+    current_platform_ids: Optional[List[str]] = None,
+) -> Tuple[Dict, Dict, Dict]:
+    """
+    读取当天所有标题(从存储后端)
+
+    Args:
+        storage_manager: 存储管理器实例
+        current_platform_ids: 当前监控的平台 ID 列表(用于过滤)
+
+    Returns:
+        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
+    """
+    all_results, final_id_to_name, title_info = read_all_today_titles_from_storage(
+        storage_manager, current_platform_ids
+    )
+
+    if all_results:
+        total_count = sum(len(titles) for titles in all_results.values())
+        print(f"[存储] 已从存储后端读取 {total_count} 条标题")
+    else:
+        print("[存储] 当天暂无数据")
+
+    return all_results, final_id_to_name, title_info
+
+
+def detect_latest_new_titles_from_storage(
+    storage_manager,
+    current_platform_ids: Optional[List[str]] = None,
+) -> Dict:
+    """
+    从存储后端检测最新批次的新增标题
+
+    Args:
+        storage_manager: 存储管理器实例
+        current_platform_ids: 当前监控的平台 ID 列表(用于过滤)
+
+    Returns:
+        Dict: 新增标题 {source_id: {title: title_data}}
+    """
+    try:
+        # 获取最新抓取数据
+        latest_data = storage_manager.get_latest_crawl_data()
+        if not latest_data or not latest_data.items:
+            return {}
+
+        # 获取所有历史数据
+        all_data = storage_manager.get_today_all_data()
+        if not all_data or not all_data.items:
+            # 没有历史数据(第一次抓取),不应该有"新增"标题
+            return {}
+
+        # 收集历史标题(不包括最新批次的时间)
+        latest_time = latest_data.crawl_time
+        historical_titles = {}
+
+        for source_id, news_list in all_data.items.items():
+            if current_platform_ids is not None and source_id not in current_platform_ids:
+                continue
+
+            historical_titles[source_id] = set()
+            for item in news_list:
+                # 只统计非最新批次的标题
+                first_time = getattr(item, 'first_time', item.crawl_time)
+                if first_time != latest_time:
+                    historical_titles[source_id].add(item.title)
+
+        # 检查是否是当天第一次抓取(没有任何历史标题)
+        # 如果所有平台的历史标题集合都为空,说明只有一个抓取批次,不应该有"新增"标题
+        has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
+        if not has_historical_data:
+            return {}
+
+        # 找出新增标题
+        new_titles = {}
+        for source_id, news_list in latest_data.items.items():
+            if current_platform_ids is not None and source_id not in current_platform_ids:
+                continue
+
+            historical_set = historical_titles.get(source_id, set())
+            source_new_titles = {}
+
+            for item in news_list:
+                if item.title not in historical_set:
+                    source_new_titles[item.title] = {
+                        "ranks": [item.rank],
+                        "url": item.url or "",
+                        "mobileUrl": item.mobile_url or "",
+                    }
+
+            if source_new_titles:
+                new_titles[source_id] = source_new_titles
+
+        return new_titles
+
+    except Exception as e:
+        print(f"[存储] 从存储后端检测新标题失败: {e}")
+        return {}
+
+
+def detect_latest_new_titles(
+    storage_manager,
+    current_platform_ids: Optional[List[str]] = None,
+) -> Dict:
+    """
+    检测当日最新批次的新增标题(从存储后端)
+
+    Args:
+        storage_manager: 存储管理器实例
+        current_platform_ids: 当前监控的平台 ID 列表(用于过滤)
+
+    Returns:
+        Dict: 新增标题 {source_id: {title: title_data}}
+    """
+    new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
+    if new_titles:
+        total_new = sum(len(titles) for titles in new_titles.values())
+        print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
+    return new_titles
+
+
+def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
+    """
+    检测是否是当天第一次爬取
+
+    Args:
+        output_dir: 输出目录
+        date_folder: 日期文件夹名称
+
+    Returns:
+        bool: 是否是当天第一次爬取
+    """
+    txt_dir = Path(output_dir) / date_folder / "txt"
+
+    if not txt_dir.exists():
+        return True
+
+    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
+    return len(files) <= 1

+ 194 - 0
trendradar/core/frequency.py

@@ -0,0 +1,194 @@
+# coding=utf-8
+"""
+频率词配置加载模块
+
+负责从配置文件加载频率词规则,支持:
+- 普通词组
+- 必须词(+前缀)
+- 过滤词(!前缀)
+- 全局过滤词([GLOBAL_FILTER] 区域)
+- 最大显示数量(@前缀)
+"""
+
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+
+
+def load_frequency_words(
+    frequency_file: Optional[str] = None,
+) -> Tuple[List[Dict], List[str], List[str]]:
+    """
+    加载频率词配置
+
+    配置文件格式说明:
+    - 每个词组由空行分隔
+    - [GLOBAL_FILTER] 区域定义全局过滤词
+    - [WORD_GROUPS] 区域定义词组(默认)
+
+    词组语法:
+    - 普通词:直接写入,任意匹配即可
+    - +词:必须词,所有必须词都要匹配
+    - !词:过滤词,匹配则排除
+    - @数字:该词组最多显示的条数
+
+    Args:
+        frequency_file: 频率词配置文件路径,默认从环境变量 FREQUENCY_WORDS_PATH 获取或使用 config/frequency_words.txt
+
+    Returns:
+        (词组列表, 词组内过滤词, 全局过滤词)
+
+    Raises:
+        FileNotFoundError: 频率词文件不存在
+    """
+    if frequency_file is None:
+        frequency_file = os.environ.get(
+            "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
+        )
+
+    frequency_path = Path(frequency_file)
+    if not frequency_path.exists():
+        raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
+
+    with open(frequency_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
+
+    processed_groups = []
+    filter_words = []
+    global_filters = []
+
+    # 默认区域(向后兼容)
+    current_section = "WORD_GROUPS"
+
+    for group in word_groups:
+        lines = [line.strip() for line in group.split("\n") if line.strip()]
+
+        if not lines:
+            continue
+
+        # 检查是否为区域标记
+        if lines[0].startswith("[") and lines[0].endswith("]"):
+            section_name = lines[0][1:-1].upper()
+            if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
+                current_section = section_name
+                lines = lines[1:]  # 移除标记行
+
+        # 处理全局过滤区域
+        if current_section == "GLOBAL_FILTER":
+            # 直接添加所有非空行到全局过滤列表
+            for line in lines:
+                # 忽略特殊语法前缀,只提取纯文本
+                if line.startswith(("!", "+", "@")):
+                    continue  # 全局过滤区不支持特殊语法
+                if line:
+                    global_filters.append(line)
+            continue
+
+        # 处理词组区域
+        words = lines
+
+        group_required_words = []
+        group_normal_words = []
+        group_filter_words = []
+        group_max_count = 0  # 默认不限制
+
+        for word in words:
+            if word.startswith("@"):
+                # 解析最大显示数量(只接受正整数)
+                try:
+                    count = int(word[1:])
+                    if count > 0:
+                        group_max_count = count
+                except (ValueError, IndexError):
+                    pass  # 忽略无效的@数字格式
+            elif word.startswith("!"):
+                filter_words.append(word[1:])
+                group_filter_words.append(word[1:])
+            elif word.startswith("+"):
+                group_required_words.append(word[1:])
+            else:
+                group_normal_words.append(word)
+
+        if group_required_words or group_normal_words:
+            if group_normal_words:
+                group_key = " ".join(group_normal_words)
+            else:
+                group_key = " ".join(group_required_words)
+
+            processed_groups.append(
+                {
+                    "required": group_required_words,
+                    "normal": group_normal_words,
+                    "group_key": group_key,
+                    "max_count": group_max_count,
+                }
+            )
+
+    return processed_groups, filter_words, global_filters
+
+
+def matches_word_groups(
+    title: str,
+    word_groups: List[Dict],
+    filter_words: List[str],
+    global_filters: Optional[List[str]] = None
+) -> bool:
+    """
+    检查标题是否匹配词组规则
+
+    Args:
+        title: 标题文本
+        word_groups: 词组列表
+        filter_words: 过滤词列表
+        global_filters: 全局过滤词列表
+
+    Returns:
+        是否匹配
+    """
+    # 防御性类型检查:确保 title 是有效字符串
+    if not isinstance(title, str):
+        title = str(title) if title is not None else ""
+    if not title.strip():
+        return False
+
+    title_lower = title.lower()
+
+    # 全局过滤检查(优先级最高)
+    if global_filters:
+        if any(global_word.lower() in title_lower for global_word in global_filters):
+            return False
+
+    # 如果没有配置词组,则匹配所有标题(支持显示全部新闻)
+    if not word_groups:
+        return True
+
+    # 过滤词检查
+    if any(filter_word.lower() in title_lower for filter_word in filter_words):
+        return False
+
+    # 词组匹配检查
+    for group in word_groups:
+        required_words = group["required"]
+        normal_words = group["normal"]
+
+        # 必须词检查
+        if required_words:
+            all_required_present = all(
+                req_word.lower() in title_lower for req_word in required_words
+            )
+            if not all_required_present:
+                continue
+
+        # 普通词检查
+        if normal_words:
+            any_normal_present = any(
+                normal_word.lower() in title_lower for normal_word in normal_words
+            )
+            if not any_normal_present:
+                continue
+
+        return True
+
+    return False

+ 332 - 0
trendradar/core/loader.py

@@ -0,0 +1,332 @@
+# coding=utf-8
+"""
+配置加载模块
+
+负责从 YAML 配置文件和环境变量加载配置。
+"""
+
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+import yaml
+
+from .config import parse_multi_account_config, validate_paired_configs
+
+
+def _get_env_bool(key: str, default: bool = False) -> Optional[bool]:
+    """Read a boolean from an environment variable.
+
+    Returns None when the variable is unset or blank; otherwise True only
+    for "true" or "1" (case-insensitive, surrounding whitespace ignored).
+    Note that ``default`` is accepted but never consulted.
+    """
+    normalized = os.environ.get(key, "").strip().lower()
+    if normalized == "":
+        return None
+    return normalized == "true" or normalized == "1"
+
+
+def _get_env_int(key: str, default: int = 0) -> int:
+    """Read an integer from an environment variable.
+
+    Falls back to ``default`` when the variable is unset, blank, or not
+    parseable as an integer.
+    """
+    raw = os.environ.get(key, "").strip()
+    try:
+        return int(raw) if raw else default
+    except ValueError:
+        return default
+
+
+def _get_env_str(key: str, default: str = "") -> str:
+    """Read a whitespace-stripped string from an environment variable.
+
+    Returns ``default`` when the variable is unset or blank.
+    """
+    stripped = os.environ.get(key, "").strip()
+    return stripped if stripped else default
+
+
+def _load_app_config(config_data: Dict) -> Dict:
+    """Build the application-level settings from the ``app`` section.
+
+    The TIMEZONE environment variable, when non-blank, takes precedence
+    over the YAML ``timezone`` value (default "Asia/Shanghai").
+    """
+    section = config_data.get("app", {})
+
+    timezone = _get_env_str("TIMEZONE")
+    if not timezone:
+        timezone = section.get("timezone", "Asia/Shanghai")
+
+    settings = {}
+    settings["VERSION_CHECK_URL"] = section.get("version_check_url", "")
+    settings["SHOW_VERSION_UPDATE"] = section.get("show_version_update", True)
+    settings["TIMEZONE"] = timezone
+    return settings
+
+
+def _load_crawler_config(config_data: Dict) -> Dict:
+    """Build the crawler settings from the ``crawler`` section.
+
+    ENABLE_CRAWLER from the environment overrides the YAML value whenever
+    it is set to a non-blank string.
+    """
+    section = config_data.get("crawler", {})
+
+    crawler_enabled = _get_env_bool("ENABLE_CRAWLER")
+    if crawler_enabled is None:
+        crawler_enabled = section.get("enable_crawler", True)
+
+    settings = {}
+    settings["REQUEST_INTERVAL"] = section.get("request_interval", 100)
+    settings["USE_PROXY"] = section.get("use_proxy", False)
+    settings["DEFAULT_PROXY"] = section.get("default_proxy", "")
+    settings["ENABLE_CRAWLER"] = crawler_enabled
+    return settings
+
+
+def _load_report_config(config_data: Dict) -> Dict:
+    """Build the report settings from the ``report`` section.
+
+    Environment variables REPORT_MODE, SORT_BY_POSITION_FIRST,
+    MAX_NEWS_PER_KEYWORD and REVERSE_CONTENT_ORDER override their YAML
+    counterparts when set.
+    """
+    section = config_data.get("report", {})
+
+    # Boolean overrides: None means "not set in the environment".
+    position_first = _get_env_bool("SORT_BY_POSITION_FIRST")
+    if position_first is None:
+        position_first = section.get("sort_by_position_first", False)
+
+    reversed_order = _get_env_bool("REVERSE_CONTENT_ORDER")
+    if reversed_order is None:
+        reversed_order = section.get("reverse_content_order", False)
+
+    # NOTE: an environment value of 0 is falsy, so it falls through to the
+    # YAML value just like an unset variable.
+    max_news = _get_env_int("MAX_NEWS_PER_KEYWORD")
+    if not max_news:
+        max_news = section.get("max_news_per_keyword", 0)
+
+    mode = _get_env_str("REPORT_MODE")
+    if not mode:
+        mode = section.get("mode", "daily")
+
+    return {
+        "REPORT_MODE": mode,
+        "RANK_THRESHOLD": section.get("rank_threshold", 10),
+        "SORT_BY_POSITION_FIRST": position_first,
+        "MAX_NEWS_PER_KEYWORD": max_news,
+        "REVERSE_CONTENT_ORDER": reversed_order,
+    }
+
+
+def _load_notification_config(config_data: Dict) -> Dict:
+    """Build the general notification settings from ``notification``.
+
+    ENABLE_NOTIFICATION and MAX_ACCOUNTS_PER_CHANNEL can be overridden via
+    environment variables; the batch-size values come from YAML only.
+    """
+    section = config_data.get("notification", {})
+
+    enabled = _get_env_bool("ENABLE_NOTIFICATION")
+    if enabled is None:
+        enabled = section.get("enable_notification", True)
+
+    settings = {"ENABLE_NOTIFICATION": enabled}
+
+    # (output key, YAML key, fallback) for the values read straight from YAML.
+    yaml_only = (
+        ("MESSAGE_BATCH_SIZE", "message_batch_size", 4000),
+        ("DINGTALK_BATCH_SIZE", "dingtalk_batch_size", 20000),
+        ("FEISHU_BATCH_SIZE", "feishu_batch_size", 29000),
+        ("BARK_BATCH_SIZE", "bark_batch_size", 3600),
+        ("SLACK_BATCH_SIZE", "slack_batch_size", 4000),
+        ("BATCH_SEND_INTERVAL", "batch_send_interval", 1.0),
+        ("FEISHU_MESSAGE_SEPARATOR", "feishu_message_separator", "---"),
+    )
+    for target, source, fallback in yaml_only:
+        settings[target] = section.get(source, fallback)
+
+    # An environment value of 0 is falsy and therefore defers to YAML.
+    account_cap = _get_env_int("MAX_ACCOUNTS_PER_CHANNEL")
+    if not account_cap:
+        account_cap = section.get("max_accounts_per_channel", 3)
+    settings["MAX_ACCOUNTS_PER_CHANNEL"] = account_cap
+
+    return settings
+
+
+def _load_push_window_config(config_data: Dict) -> Dict:
+    """Build the push-window settings from ``notification.push_window``.
+
+    PUSH_WINDOW_ENABLED, PUSH_WINDOW_START, PUSH_WINDOW_END and
+    PUSH_WINDOW_ONCE_PER_DAY override the YAML values when set.
+    """
+    notification = config_data.get("notification", {})
+    window = notification.get("push_window", {})
+    hours = window.get("time_range", {})
+
+    enabled = _get_env_bool("PUSH_WINDOW_ENABLED")
+    if enabled is None:
+        enabled = window.get("enabled", False)
+
+    once_per_day = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")
+    if once_per_day is None:
+        once_per_day = window.get("once_per_day", True)
+
+    start = _get_env_str("PUSH_WINDOW_START")
+    if not start:
+        start = hours.get("start", "08:00")
+
+    end = _get_env_str("PUSH_WINDOW_END")
+    if not end:
+        end = hours.get("end", "22:00")
+
+    return {
+        "ENABLED": enabled,
+        "TIME_RANGE": {"START": start, "END": end},
+        "ONCE_PER_DAY": once_per_day,
+    }
+
+
+def _load_weight_config(config_data: Dict) -> Dict:
+    """Build the weight settings from the ``weight`` section.
+
+    Each weight falls back to 1.0 when missing from the YAML section.
+    """
+    section = config_data.get("weight", {})
+    key_pairs = (
+        ("RANK_WEIGHT", "rank_weight"),
+        ("FREQUENCY_WEIGHT", "frequency_weight"),
+        ("HOTNESS_WEIGHT", "hotness_weight"),
+    )
+    return {target: section.get(source, 1.0) for target, source in key_pairs}
+
+
+def _load_storage_config(config_data: Dict) -> Dict:
+    """Build the storage settings from the ``storage`` section.
+
+    String and boolean environment variables override YAML when set to a
+    non-blank value. Integer environment variables override YAML only when
+    they parse to a non-zero number (0 is falsy and defers to YAML).
+    """
+    storage = config_data.get("storage", {})
+    formats = storage.get("formats", {})
+    local = storage.get("local", {})
+    remote = storage.get("remote", {})
+    pull = storage.get("pull", {})
+
+    def flag(env_key, section, yaml_key, fallback):
+        # Environment boolean wins unless it is unset (None).
+        env_value = _get_env_bool(env_key)
+        if env_value is None:
+            return section.get(yaml_key, fallback)
+        return env_value
+
+    def number(env_key, section, yaml_key, fallback):
+        return _get_env_int(env_key) or section.get(yaml_key, fallback)
+
+    def remote_text(env_key, yaml_key):
+        return _get_env_str(env_key) or remote.get(yaml_key, "")
+
+    format_settings = {
+        "SQLITE": formats.get("sqlite", True),
+        "TXT": flag("STORAGE_TXT_ENABLED", formats, "txt", True),
+        "HTML": flag("STORAGE_HTML_ENABLED", formats, "html", True),
+    }
+    local_settings = {
+        "DATA_DIR": local.get("data_dir", "output"),
+        "RETENTION_DAYS": number("LOCAL_RETENTION_DAYS", local, "retention_days", 0),
+    }
+    remote_settings = {
+        "ENDPOINT_URL": remote_text("S3_ENDPOINT_URL", "endpoint_url"),
+        "BUCKET_NAME": remote_text("S3_BUCKET_NAME", "bucket_name"),
+        "ACCESS_KEY_ID": remote_text("S3_ACCESS_KEY_ID", "access_key_id"),
+        "SECRET_ACCESS_KEY": remote_text("S3_SECRET_ACCESS_KEY", "secret_access_key"),
+        "REGION": remote_text("S3_REGION", "region"),
+        "RETENTION_DAYS": number("REMOTE_RETENTION_DAYS", remote, "retention_days", 0),
+    }
+    pull_settings = {
+        "ENABLED": flag("PULL_ENABLED", pull, "enabled", False),
+        "DAYS": number("PULL_DAYS", pull, "days", 7),
+    }
+
+    return {
+        "BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"),
+        "FORMATS": format_settings,
+        "LOCAL": local_settings,
+        "REMOTE": remote_settings,
+        "PULL": pull_settings,
+    }
+
+
+def _load_webhook_config(config_data: Dict) -> Dict:
+    """Build the per-channel credentials from ``notification.webhooks``.
+
+    For every entry the environment variable (named like the output key)
+    wins over the YAML value when it is non-blank.
+    """
+    notification = config_data.get("notification", {})
+    webhooks = notification.get("webhooks", {})
+
+    def pick(env_key, yaml_key, fallback=""):
+        return _get_env_str(env_key) or webhooks.get(yaml_key, fallback)
+
+    # ntfy falls back to the public server when neither source supplies one.
+    ntfy_server = (
+        _get_env_str("NTFY_SERVER_URL")
+        or webhooks.get("ntfy_server_url")
+        or "https://ntfy.sh"
+    )
+
+    return {
+        # Feishu
+        "FEISHU_WEBHOOK_URL": pick("FEISHU_WEBHOOK_URL", "feishu_url"),
+        # DingTalk
+        "DINGTALK_WEBHOOK_URL": pick("DINGTALK_WEBHOOK_URL", "dingtalk_url"),
+        # WeCom
+        "WEWORK_WEBHOOK_URL": pick("WEWORK_WEBHOOK_URL", "wework_url"),
+        "WEWORK_MSG_TYPE": pick("WEWORK_MSG_TYPE", "wework_msg_type", "markdown"),
+        # Telegram
+        "TELEGRAM_BOT_TOKEN": pick("TELEGRAM_BOT_TOKEN", "telegram_bot_token"),
+        "TELEGRAM_CHAT_ID": pick("TELEGRAM_CHAT_ID", "telegram_chat_id"),
+        # Email
+        "EMAIL_FROM": pick("EMAIL_FROM", "email_from"),
+        "EMAIL_PASSWORD": pick("EMAIL_PASSWORD", "email_password"),
+        "EMAIL_TO": pick("EMAIL_TO", "email_to"),
+        "EMAIL_SMTP_SERVER": pick("EMAIL_SMTP_SERVER", "email_smtp_server"),
+        "EMAIL_SMTP_PORT": pick("EMAIL_SMTP_PORT", "email_smtp_port"),
+        # ntfy
+        "NTFY_SERVER_URL": ntfy_server,
+        "NTFY_TOPIC": pick("NTFY_TOPIC", "ntfy_topic"),
+        "NTFY_TOKEN": pick("NTFY_TOKEN", "ntfy_token"),
+        # Bark
+        "BARK_URL": pick("BARK_URL", "bark_url"),
+        # Slack
+        "SLACK_WEBHOOK_URL": pick("SLACK_WEBHOOK_URL", "slack_webhook_url"),
+    }
+
+
+def _print_notification_sources(config: Dict) -> None:
+    """Print which notification channels are configured and their source.
+
+    For each configured channel the summary says whether the value came
+    from an environment variable or the config file, plus the number of
+    accounts (capped at MAX_ACCOUNTS_PER_CHANNEL) where applicable.
+    """
+    max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
+    summaries = []
+
+    def origin(env_key):
+        # A non-empty environment variable is reported as the source.
+        return "环境变量" if os.environ.get(env_key) else "配置文件"
+
+    def describe_simple(label, key):
+        # Channels configured through a single, possibly multi-account value.
+        raw = config[key]
+        if not raw:
+            return
+        count = min(len(parse_multi_account_config(raw)), max_accounts)
+        summaries.append(f"{label}({origin(key)}, {count}个账号)")
+
+    describe_simple("飞书", "FEISHU_WEBHOOK_URL")
+    describe_simple("钉钉", "DINGTALK_WEBHOOK_URL")
+    describe_simple("企业微信", "WEWORK_WEBHOOK_URL")
+
+    # Telegram needs bot tokens and chat ids to pair up.
+    if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
+        bot_tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
+        chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
+        valid, count = validate_paired_configs(
+            {"bot_token": bot_tokens, "chat_id": chat_ids},
+            "Telegram",
+            required_keys=["bot_token", "chat_id"]
+        )
+        if valid and count > 0:
+            count = min(count, max_accounts)
+            summaries.append(
+                f"Telegram({origin('TELEGRAM_BOT_TOKEN')}, {count}个账号)"
+            )
+
+    if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
+        summaries.append(f"邮件({origin('EMAIL_FROM')})")
+
+    # ntfy: tokens are optional, but when given they must pair with topics.
+    if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
+        topics = parse_multi_account_config(config["NTFY_TOPIC"])
+        ntfy_tokens = parse_multi_account_config(config["NTFY_TOKEN"])
+        ntfy_count = None
+        if ntfy_tokens:
+            valid, count = validate_paired_configs(
+                {"topic": topics, "token": ntfy_tokens},
+                "ntfy"
+            )
+            if valid and count > 0:
+                ntfy_count = min(count, max_accounts)
+        else:
+            ntfy_count = min(len(topics), max_accounts)
+        if ntfy_count is not None:
+            summaries.append(
+                f"ntfy({origin('NTFY_SERVER_URL')}, {ntfy_count}个账号)"
+            )
+
+    describe_simple("Bark", "BARK_URL")
+    describe_simple("Slack", "SLACK_WEBHOOK_URL")
+
+    if not summaries:
+        print("未配置任何通知渠道")
+        return
+
+    print(f"通知渠道配置来源: {', '.join(summaries)}")
+    print(f"每个渠道最大账号数: {max_accounts}")
+
+
+def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Load the YAML configuration file and merge it with environment overrides.
+
+    Args:
+        config_path: Path to the configuration file. When omitted, the
+            CONFIG_PATH environment variable is used, falling back to
+            ``config/config.yaml``.
+
+    Returns:
+        A dictionary containing every configuration section.
+
+    Raises:
+        FileNotFoundError: The configuration file does not exist.
+    """
+    if config_path is None:
+        config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
+
+    if not Path(config_path).exists():
+        raise FileNotFoundError(f"配置文件 {config_path} 不存在")
+
+    with open(config_path, "r", encoding="utf-8") as f:
+        # An empty or comment-only YAML document parses to None; treat it
+        # as an empty mapping so the section loaders apply their defaults
+        # instead of failing with an opaque AttributeError.
+        config_data = yaml.safe_load(f) or {}
+
+    print(f"配置文件加载成功: {config_path}")
+
+    # Flat sections are merged into the top level of the result.
+    config = {}
+    config.update(_load_app_config(config_data))
+    config.update(_load_crawler_config(config_data))
+    config.update(_load_report_config(config_data))
+    config.update(_load_notification_config(config_data))
+
+    # Nested sections keep their own namespace.
+    config["PUSH_WINDOW"] = _load_push_window_config(config_data)
+    config["WEIGHT_CONFIG"] = _load_weight_config(config_data)
+    config["PLATFORMS"] = config_data.get("platforms", [])
+    config["STORAGE"] = _load_storage_config(config_data)
+
+    # Webhook credentials are merged flat; this must happen before the
+    # summary below, which reads those keys.
+    config.update(_load_webhook_config(config_data))
+
+    # Report where each notification channel's configuration came from.
+    _print_notification_sources(config)
+
+    return config

+ 8 - 0
trendradar/crawler/__init__.py

@@ -0,0 +1,8 @@
+# coding=utf-8
+"""
+爬虫模块 - 数据抓取功能
+"""
+
+from trendradar.crawler.fetcher import DataFetcher
+
+__all__ = ["DataFetcher"]

+ 184 - 0
trendradar/crawler/fetcher.py

@@ -0,0 +1,184 @@
+# coding=utf-8
+"""
+数据获取器模块
+
+负责从 NewsNow API 抓取新闻数据,支持:
+- 单个平台数据获取
+- 批量平台数据爬取
+- 自动重试机制
+- 代理支持
+"""
+
+import json
+import random
+import time
+from typing import Dict, List, Tuple, Optional, Union
+
+import requests
+
+
+class DataFetcher:
+    """Fetch trending-news data from the NewsNow API, one platform at a time."""
+
+    # Default API endpoint
+    DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s"
+
+    # Default request headers sent with every request
+    DEFAULT_HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Connection": "keep-alive",
+        "Cache-Control": "no-cache",
+    }
+
+    def __init__(
+        self,
+        proxy_url: Optional[str] = None,
+        api_url: Optional[str] = None,
+    ):
+        """
+        Initialise the fetcher.
+
+        Args:
+            proxy_url: Proxy server URL (optional).
+            api_url: API base URL (optional, defaults to DEFAULT_API_URL).
+        """
+        self.proxy_url = proxy_url
+        self.api_url = api_url or self.DEFAULT_API_URL
+
+    def fetch_data(
+        self,
+        id_info: Union[str, Tuple[str, str]],
+        max_retries: int = 2,
+        min_retry_wait: int = 3,
+        max_retry_wait: int = 5,
+    ) -> Tuple[Optional[str], str, str]:
+        """
+        Fetch the raw response for one platform, retrying on failure.
+
+        Args:
+            id_info: Platform id, or a (platform id, alias) tuple.
+            max_retries: Maximum number of retries after the first attempt.
+            min_retry_wait: Lower bound of the base retry wait, in seconds.
+            max_retry_wait: Upper bound of the base retry wait, in seconds.
+
+        Returns:
+            (response text, platform id, alias); the response text is None
+            when every attempt failed.
+        """
+        if isinstance(id_info, tuple):
+            id_value, alias = id_info
+        else:
+            id_value = id_info
+            alias = id_value
+
+        url = f"{self.api_url}?id={id_value}&latest"
+
+        proxies = None
+        if self.proxy_url:
+            proxies = {"http": self.proxy_url, "https": self.proxy_url}
+
+        retries = 0
+        while retries <= max_retries:
+            try:
+                response = requests.get(
+                    url,
+                    proxies=proxies,
+                    headers=self.DEFAULT_HEADERS,
+                    timeout=10,
+                )
+                response.raise_for_status()
+
+                data_text = response.text
+                data_json = json.loads(data_text)
+
+                # Only "success" (fresh) and "cache" (cached) count as usable.
+                status = data_json.get("status", "未知")
+                if status not in ["success", "cache"]:
+                    raise ValueError(f"响应状态异常: {status}")
+
+                status_info = "最新数据" if status == "success" else "缓存数据"
+                print(f"获取 {id_value} 成功({status_info})")
+                return data_text, id_value, alias
+
+            except Exception as e:
+                # Best-effort fetch: any failure (network, HTTP status, JSON,
+                # unexpected payload) triggers a retry rather than a crash.
+                retries += 1
+                if retries <= max_retries:
+                    # Randomised wait that grows with each further retry.
+                    base_wait = random.uniform(min_retry_wait, max_retry_wait)
+                    additional_wait = (retries - 1) * random.uniform(1, 2)
+                    wait_time = base_wait + additional_wait
+                    print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
+                    time.sleep(wait_time)
+                else:
+                    print(f"请求 {id_value} 失败: {e}")
+                    return None, id_value, alias
+
+        # Only reached when max_retries is negative and the loop never runs.
+        return None, id_value, alias
+
+    def crawl_websites(
+        self,
+        ids_list: List[Union[str, Tuple[str, str]]],
+        request_interval: int = 100,
+    ) -> Tuple[Dict, Dict, List]:
+        """
+        Fetch and parse the data of several platforms in sequence.
+
+        Args:
+            ids_list: Platform ids; each element is a string or a
+                (platform id, display name) tuple.
+            request_interval: Pause between requests, in milliseconds.
+
+        Returns:
+            (results, id-to-name mapping, failed ids). ``results`` maps a
+            platform id to ``{title: {"ranks", "url", "mobileUrl"}}``. A
+            platform appears either in ``results`` or in the failed list,
+            never in both.
+        """
+        results = {}
+        id_to_name = {}
+        failed_ids = []
+
+        for i, id_info in enumerate(ids_list):
+            if isinstance(id_info, tuple):
+                id_value, name = id_info
+            else:
+                id_value = id_info
+                name = id_value
+
+            id_to_name[id_value] = name
+            response, _, _ = self.fetch_data(id_info)
+
+            if response:
+                try:
+                    data = json.loads(response)
+                    # Collect into a local dict and publish it only after the
+                    # whole payload parsed, so a mid-way failure cannot leave
+                    # a partial entry in results for a platform that is also
+                    # reported as failed.
+                    platform_items = {}
+
+                    for index, item in enumerate(data.get("items", []), 1):
+                        title = item.get("title")
+                        # Skip invalid titles (None, float, blank string).
+                        if title is None or isinstance(title, float) or not str(title).strip():
+                            continue
+                        title = str(title).strip()
+                        url = item.get("url", "")
+                        mobile_url = item.get("mobileUrl", "")
+
+                        if title in platform_items:
+                            # Duplicate title: record the extra rank only.
+                            platform_items[title]["ranks"].append(index)
+                        else:
+                            platform_items[title] = {
+                                "ranks": [index],
+                                "url": url,
+                                "mobileUrl": mobile_url,
+                            }
+
+                    results[id_value] = platform_items
+                except json.JSONDecodeError:
+                    print(f"解析 {id_value} 响应失败")
+                    failed_ids.append(id_value)
+                except Exception as e:
+                    print(f"处理 {id_value} 数据出错: {e}")
+                    failed_ids.append(id_value)
+            else:
+                failed_ids.append(id_value)
+
+            # Pause between requests (not after the last one), with jitter
+            # and a 50 ms floor.
+            if i < len(ids_list) - 1:
+                actual_interval = request_interval + random.randint(-10, 20)
+                actual_interval = max(50, actual_interval)
+                time.sleep(actual_interval / 1000)
+
+        print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
+        return results, id_to_name, failed_ids

+ 81 - 0
trendradar/notification/__init__.py

@@ -0,0 +1,81 @@
+# coding=utf-8
+"""
+通知推送模块
+
+提供多渠道通知推送功能,包括:
+- 飞书、钉钉、企业微信
+- Telegram、Slack
+- Email、ntfy、Bark
+
+模块结构:
+- push_manager: 推送记录管理
+- formatters: 内容格式转换
+- batch: 批次处理工具
+- renderer: 通知内容渲染
+- splitter: 消息分批拆分
+- senders: 消息发送器(各渠道发送函数)
+- dispatcher: 多账号通知调度器
+"""
+
+from trendradar.notification.push_manager import PushRecordManager
+from trendradar.notification.formatters import (
+    strip_markdown,
+    convert_markdown_to_mrkdwn,
+)
+from trendradar.notification.batch import (
+    get_batch_header,
+    get_max_batch_header_size,
+    truncate_to_bytes,
+    add_batch_headers,
+)
+from trendradar.notification.renderer import (
+    render_feishu_content,
+    render_dingtalk_content,
+)
+from trendradar.notification.splitter import (
+    split_content_into_batches,
+    DEFAULT_BATCH_SIZES,
+)
+from trendradar.notification.senders import (
+    send_to_feishu,
+    send_to_dingtalk,
+    send_to_wework,
+    send_to_telegram,
+    send_to_email,
+    send_to_ntfy,
+    send_to_bark,
+    send_to_slack,
+    SMTP_CONFIGS,
+)
+from trendradar.notification.dispatcher import NotificationDispatcher
+
+__all__ = [
+    # 推送记录管理
+    "PushRecordManager",
+    # 格式转换
+    "strip_markdown",
+    "convert_markdown_to_mrkdwn",
+    # 批次处理
+    "get_batch_header",
+    "get_max_batch_header_size",
+    "truncate_to_bytes",
+    "add_batch_headers",
+    # 内容渲染
+    "render_feishu_content",
+    "render_dingtalk_content",
+    # 消息分批
+    "split_content_into_batches",
+    "DEFAULT_BATCH_SIZES",
+    # 消息发送器
+    "send_to_feishu",
+    "send_to_dingtalk",
+    "send_to_wework",
+    "send_to_telegram",
+    "send_to_email",
+    "send_to_ntfy",
+    "send_to_bark",
+    "send_to_slack",
+    "SMTP_CONFIGS",
+    # 通知调度器
+    "NotificationDispatcher",
+]

+ 115 - 0
trendradar/notification/batch.py

@@ -0,0 +1,115 @@
+# coding=utf-8
+"""
+批次处理模块
+
+提供消息分批发送的辅助函数
+"""
+
+from typing import List
+
+
+def get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
+    """Build the batch header in the markup of the given channel.
+
+    Args:
+        format_type: Push type (telegram, slack, wework_text, bark, feishu,
+            dingtalk, ntfy, wework).
+        batch_num: Number of the current batch.
+        total_batches: Total number of batches.
+
+    Returns:
+        The formatted header, terminated by a blank line.
+    """
+    counter = f"[第 {batch_num}/{total_batches} 批次]"
+
+    if format_type == "telegram":
+        # Telegram uses HTML bold.
+        opening, closing = "<b>", "</b>"
+    elif format_type == "slack":
+        # Slack mrkdwn bold.
+        opening, closing = "*", "*"
+    elif format_type == "wework_text" or format_type == "bark":
+        # WeCom text mode and Bark take plain text.
+        opening, closing = "", ""
+    else:
+        # Everything else gets Markdown bold.
+        opening, closing = "**", "**"
+
+    return opening + counter + closing + "\n\n"
+
+
+def get_max_batch_header_size(format_type: str) -> int:
+    """Estimate the largest batch header size in bytes (up to 99 batches).
+
+    Used to reserve room while splitting so content does not need to be
+    truncated afterwards.
+
+    Args:
+        format_type: Push type.
+
+    Returns:
+        UTF-8 byte length of the "99/99" header for this push type.
+    """
+    # Worst case considered: two-digit batch number and total.
+    worst_case_header = get_batch_header(format_type, 99, 99)
+    encoded = worst_case_header.encode("utf-8")
+    return len(encoded)
+
+
+def truncate_to_bytes(text: str, max_bytes: int) -> str:
+    """Truncate a string to at most ``max_bytes`` UTF-8 bytes.
+
+    A multi-byte character is never split: when the cut falls inside one,
+    that character is dropped entirely.
+
+    Args:
+        text: Text to truncate.
+        max_bytes: Maximum size of the result in UTF-8 bytes. Zero or a
+            negative value yields an empty string.
+
+    Returns:
+        The original text when it already fits, otherwise the longest
+        prefix whose UTF-8 encoding fits into ``max_bytes``.
+    """
+    # A negative limit must not be used as a slice bound: b[:-n] would keep
+    # everything except the last n bytes instead of nothing.
+    if max_bytes <= 0:
+        return ""
+
+    encoded = text.encode("utf-8")
+    if len(encoded) <= max_bytes:
+        return text
+
+    # The only invalid bytes after slicing valid UTF-8 are an incomplete
+    # trailing sequence, which errors="ignore" discards.
+    return encoded[:max_bytes].decode("utf-8", errors="ignore")
+
+
+def add_batch_headers(
+    batches: List[str], format_type: str, max_bytes: int
+) -> List[str]:
+    """Prefix each batch with its header, keeping it within the byte limit.
+
+    Args:
+        batches: Original batch contents.
+        format_type: Push type (bark, telegram, feishu, ...).
+        max_bytes: Maximum size in bytes allowed for this push type.
+
+    Returns:
+        The batches with headers prepended. A list with zero or one batch
+        is returned unchanged, since no header is needed.
+    """
+    total = len(batches)
+    if total <= 1:
+        return batches
+
+    with_headers = []
+    for position, body in enumerate(batches, 1):
+        header = get_batch_header(format_type, position, total)
+        header_size = len(header.encode("utf-8"))
+        body_size = len(body.encode("utf-8"))
+
+        # Whatever the header does not use is the budget for the content.
+        budget = max_bytes - header_size
+
+        if body_size > budget:
+            # Over budget: warn and cut the content down to a safe size.
+            print(
+                f"警告:{format_type} 第 {position}/{total} 批次内容({body_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {budget} 字节"
+            )
+            body = truncate_to_bytes(body, budget)
+
+        with_headers.append(header + body)
+
+    return with_headers

+ 420 - 0
trendradar/notification/dispatcher.py

@@ -0,0 +1,420 @@
+# coding=utf-8
+"""
+通知调度器模块
+
+提供统一的通知分发接口。
+支持所有通知渠道的多账号配置,使用 `;` 分隔多个账号。
+
+使用示例:
+    dispatcher = NotificationDispatcher(config, get_time_func, split_content_func)
+    results = dispatcher.dispatch_all(report_data, report_type, ...)
+"""
+
+from typing import Any, Callable, Dict, List, Optional
+
+from trendradar.core.config import (
+    get_account_at_index,
+    limit_accounts,
+    parse_multi_account_config,
+    validate_paired_configs,
+)
+
+from .senders import (
+    send_to_bark,
+    send_to_dingtalk,
+    send_to_email,
+    send_to_feishu,
+    send_to_ntfy,
+    send_to_slack,
+    send_to_telegram,
+    send_to_wework,
+)
+
+
+class NotificationDispatcher:
+    """
+    统一的多账号通知调度器
+
+    将多账号发送逻辑封装,提供简洁的 dispatch_all 接口。
+    内部处理账号解析、数量限制、配对验证等逻辑。
+    """
+
+    def __init__(
+        self,
+        config: Dict[str, Any],
+        get_time_func: Callable,
+        split_content_func: Callable,
+    ):
+        """
+        初始化通知调度器
+
+        Args:
+            config: 完整的配置字典,包含所有通知渠道的配置
+            get_time_func: 获取当前时间的函数
+            split_content_func: 内容分批函数
+        """
+        self.config = config
+        self.get_time_func = get_time_func
+        self.split_content_func = split_content_func
+        self.max_accounts = config.get("MAX_ACCOUNTS_PER_CHANNEL", 3)
+
+    def dispatch_all(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict] = None,
+        proxy_url: Optional[str] = None,
+        mode: str = "daily",
+        html_file_path: Optional[str] = None,
+    ) -> Dict[str, bool]:
+        """
+        分发通知到所有已配置的渠道
+
+        Args:
+            report_data: 报告数据(由 prepare_report_data 生成)
+            report_type: 报告类型(如 "当日汇总"、"实时增量")
+            update_info: 版本更新信息(可选)
+            proxy_url: 代理 URL(可选)
+            mode: 报告模式 (daily/current/incremental)
+            html_file_path: HTML 报告文件路径(邮件使用)
+
+        Returns:
+            Dict[str, bool]: 每个渠道的发送结果,key 为渠道名,value 为是否成功
+        """
+        results = {}
+
+        # 飞书
+        if self.config.get("FEISHU_WEBHOOK_URL"):
+            results["feishu"] = self._send_feishu(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # 钉钉
+        if self.config.get("DINGTALK_WEBHOOK_URL"):
+            results["dingtalk"] = self._send_dingtalk(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # 企业微信
+        if self.config.get("WEWORK_WEBHOOK_URL"):
+            results["wework"] = self._send_wework(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # Telegram(需要配对验证)
+        if self.config.get("TELEGRAM_BOT_TOKEN") and self.config.get("TELEGRAM_CHAT_ID"):
+            results["telegram"] = self._send_telegram(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # ntfy(需要配对验证)
+        if self.config.get("NTFY_SERVER_URL") and self.config.get("NTFY_TOPIC"):
+            results["ntfy"] = self._send_ntfy(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # Bark
+        if self.config.get("BARK_URL"):
+            results["bark"] = self._send_bark(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # Slack
+        if self.config.get("SLACK_WEBHOOK_URL"):
+            results["slack"] = self._send_slack(
+                report_data, report_type, update_info, proxy_url, mode
+            )
+
+        # 邮件(保持原有逻辑,已支持多收件人)
+        if (
+            self.config.get("EMAIL_FROM")
+            and self.config.get("EMAIL_PASSWORD")
+            and self.config.get("EMAIL_TO")
+        ):
+            results["email"] = self._send_email(report_type, html_file_path)
+
+        return results
+
+    def _send_to_multi_accounts(
+        self,
+        channel_name: str,
+        config_value: str,
+        send_func: Callable[..., bool],
+        **kwargs,
+    ) -> bool:
+        """
+        通用多账号发送逻辑
+
+        Args:
+            channel_name: 渠道名称(用于日志和账号数量限制提示)
+            config_value: 配置值(可能包含多个账号,用 ; 分隔)
+            send_func: 发送函数,签名为 (account, account_label=..., **kwargs) -> bool
+            **kwargs: 传递给发送函数的其他参数
+
+        Returns:
+            bool: 任一账号发送成功则返回 True
+        """
+        accounts = parse_multi_account_config(config_value)
+        if not accounts:
+            return False
+
+        accounts = limit_accounts(accounts, self.max_accounts, channel_name)
+        results = []
+
+        for i, account in enumerate(accounts):
+            if account:
+                account_label = f"账号{i+1}" if len(accounts) > 1 else ""
+                result = send_func(account, account_label=account_label, **kwargs)
+                results.append(result)
+
+        return any(results) if results else False
+
+    def _send_feishu(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """发送到飞书(多账号)"""
+        return self._send_to_multi_accounts(
+            channel_name="飞书",
+            config_value=self.config["FEISHU_WEBHOOK_URL"],
+            send_func=lambda url, account_label: send_to_feishu(
+                webhook_url=url,
+                report_data=report_data,
+                report_type=report_type,
+                update_info=update_info,
+                proxy_url=proxy_url,
+                mode=mode,
+                account_label=account_label,
+                batch_size=self.config.get("FEISHU_BATCH_SIZE", 29000),
+                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
+                split_content_func=self.split_content_func,
+                get_time_func=self.get_time_func,
+            ),
+        )
+
+    def _send_dingtalk(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """发送到钉钉(多账号)"""
+        return self._send_to_multi_accounts(
+            channel_name="钉钉",
+            config_value=self.config["DINGTALK_WEBHOOK_URL"],
+            send_func=lambda url, account_label: send_to_dingtalk(
+                webhook_url=url,
+                report_data=report_data,
+                report_type=report_type,
+                update_info=update_info,
+                proxy_url=proxy_url,
+                mode=mode,
+                account_label=account_label,
+                batch_size=self.config.get("DINGTALK_BATCH_SIZE", 20000),
+                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
+                split_content_func=self.split_content_func,
+            ),
+        )
+
+    def _send_wework(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """Send the report to WeCom / WeWork (multi-account).
+
+        Wraps ``send_to_wework`` in a per-account callback and hands it,
+        together with the ``WEWORK_WEBHOOK_URL`` config value, to
+        ``_send_to_multi_accounts``.
+
+        Args:
+            report_data: Report payload forwarded unchanged to the sender.
+            report_type: Report type label forwarded to the sender.
+            update_info: Optional version-update info forwarded to the sender.
+            proxy_url: Optional proxy URL forwarded to the sender.
+            mode: Report mode forwarded to the sender.
+
+        Returns:
+            The value returned by ``_send_to_multi_accounts``.
+        """
+
+        def deliver(url, account_label):
+            # WEWORK_MSG_TYPE selects the message format ("markdown" unless
+            # configured otherwise).
+            return send_to_wework(
+                webhook_url=url,
+                report_data=report_data,
+                report_type=report_type,
+                update_info=update_info,
+                proxy_url=proxy_url,
+                mode=mode,
+                account_label=account_label,
+                batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
+                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
+                msg_type=self.config.get("WEWORK_MSG_TYPE", "markdown"),
+                split_content_func=self.split_content_func,
+            )
+
+        webhook_config = self.config["WEWORK_WEBHOOK_URL"]
+        return self._send_to_multi_accounts(
+            channel_name="企业微信",
+            config_value=webhook_config,
+            send_func=deliver,
+        )
+
+    def _send_telegram(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """Send the report to Telegram (multi-account, token/chat_id paired).
+
+        Bot tokens and chat ids are parsed from config, checked with
+        ``validate_paired_configs``, capped with ``limit_accounts`` and then
+        sent pair by pair (pair ``i`` = token ``i`` with chat id ``i``).
+
+        Args:
+            report_data: Report payload forwarded unchanged to the sender.
+            report_type: Report type label forwarded to the sender.
+            update_info: Optional version-update info forwarded to the sender.
+            proxy_url: Optional proxy URL forwarded to the sender.
+            mode: Report mode forwarded to the sender.
+
+        Returns:
+            True if at least one account reported success; False when nothing
+            is configured, the pairing check fails, or every send fails.
+        """
+        tokens = parse_multi_account_config(self.config["TELEGRAM_BOT_TOKEN"])
+        chat_ids = parse_multi_account_config(self.config["TELEGRAM_CHAT_ID"])
+
+        if not (tokens and chat_ids):
+            return False
+
+        # Pairing check between tokens and chat ids.
+        valid, count = validate_paired_configs(
+            {"bot_token": tokens, "chat_id": chat_ids},
+            "Telegram",
+            required_keys=["bot_token", "chat_id"],
+        )
+        if not valid or count == 0:
+            return False
+
+        # Cap the number of accounts; keep the chat id list the same length.
+        tokens = limit_accounts(tokens, self.max_accounts, "Telegram")
+        chat_ids = chat_ids[: len(tokens)]
+        has_many_accounts = len(tokens) > 1
+
+        outcomes = []
+        for idx, token in enumerate(tokens):
+            chat_id = chat_ids[idx]
+            # Skip pairs where either half is empty.
+            if not (token and chat_id):
+                continue
+
+            # Accounts are only labelled when more than one is configured.
+            label = f"账号{idx + 1}" if has_many_accounts else ""
+            outcomes.append(
+                send_to_telegram(
+                    bot_token=token,
+                    chat_id=chat_id,
+                    report_data=report_data,
+                    report_type=report_type,
+                    update_info=update_info,
+                    proxy_url=proxy_url,
+                    mode=mode,
+                    account_label=label,
+                    batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
+                    batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
+                    split_content_func=self.split_content_func,
+                )
+            )
+
+        # any() of an empty list is already False.
+        return any(outcomes)
+
+    def _send_ntfy(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """Send the report to ntfy (multi-account, topic/token paired).
+
+        Topics are required; tokens are optional, but when any are configured
+        their number must equal the number of topics.
+
+        Args:
+            report_data: Report payload forwarded unchanged to the sender.
+            report_type: Report type label forwarded to the sender.
+            update_info: Optional version-update info forwarded to the sender.
+            proxy_url: Optional proxy URL forwarded to the sender.
+            mode: Report mode forwarded to the sender.
+
+        Returns:
+            True if at least one topic reported success; False when the server
+            URL or topics are missing, the counts mismatch, or every send fails.
+        """
+        server_url = self.config["NTFY_SERVER_URL"]
+        topics = parse_multi_account_config(self.config["NTFY_TOPIC"])
+        tokens = parse_multi_account_config(self.config.get("NTFY_TOKEN", ""))
+
+        if not (server_url and topics):
+            return False
+
+        # When tokens are configured, there must be exactly one per topic.
+        if tokens and len(tokens) != len(topics):
+            print(
+                f"❌ ntfy 配置错误:topic 数量({len(topics)})与 token 数量({len(tokens)})不一致,跳过 ntfy 推送"
+            )
+            return False
+
+        # Cap the number of accounts; trim tokens to the same length.
+        topics = limit_accounts(topics, self.max_accounts, "ntfy")
+        if tokens:
+            tokens = tokens[: len(topics)]
+        has_many_accounts = len(topics) > 1
+
+        outcomes = []
+        for idx, topic in enumerate(topics):
+            if not topic:
+                continue
+
+            if tokens:
+                token = get_account_at_index(tokens, idx, "")
+            else:
+                token = ""
+            label = f"账号{idx + 1}" if has_many_accounts else ""
+
+            outcomes.append(
+                send_to_ntfy(
+                    server_url=server_url,
+                    topic=topic,
+                    token=token,
+                    report_data=report_data,
+                    report_type=report_type,
+                    update_info=update_info,
+                    proxy_url=proxy_url,
+                    mode=mode,
+                    account_label=label,
+                    batch_size=3800,
+                    split_content_func=self.split_content_func,
+                )
+            )
+
+        # any() of an empty list is already False.
+        return any(outcomes)
+
+    def _send_bark(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """Send the report to Bark (multi-account).
+
+        Wraps ``send_to_bark`` in a per-account callback and hands it,
+        together with the ``BARK_URL`` config value, to
+        ``_send_to_multi_accounts``.
+
+        Args:
+            report_data: Report payload forwarded unchanged to the sender.
+            report_type: Report type label forwarded to the sender.
+            update_info: Optional version-update info forwarded to the sender.
+            proxy_url: Optional proxy URL forwarded to the sender.
+            mode: Report mode forwarded to the sender.
+
+        Returns:
+            The value returned by ``_send_to_multi_accounts``.
+        """
+
+        def deliver(url, account_label):
+            return send_to_bark(
+                bark_url=url,
+                report_data=report_data,
+                report_type=report_type,
+                update_info=update_info,
+                proxy_url=proxy_url,
+                mode=mode,
+                account_label=account_label,
+                batch_size=self.config.get("BARK_BATCH_SIZE", 3600),
+                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
+                split_content_func=self.split_content_func,
+            )
+
+        bark_config = self.config["BARK_URL"]
+        return self._send_to_multi_accounts(
+            channel_name="Bark",
+            config_value=bark_config,
+            send_func=deliver,
+        )
+
+    def _send_slack(
+        self,
+        report_data: Dict,
+        report_type: str,
+        update_info: Optional[Dict],
+        proxy_url: Optional[str],
+        mode: str,
+    ) -> bool:
+        """Send the report to Slack (multi-account).
+
+        Wraps ``send_to_slack`` in a per-account callback and hands it,
+        together with the ``SLACK_WEBHOOK_URL`` config value, to
+        ``_send_to_multi_accounts``.
+
+        Args:
+            report_data: Report payload forwarded unchanged to the sender.
+            report_type: Report type label forwarded to the sender.
+            update_info: Optional version-update info forwarded to the sender.
+            proxy_url: Optional proxy URL forwarded to the sender.
+            mode: Report mode forwarded to the sender.
+
+        Returns:
+            The value returned by ``_send_to_multi_accounts``.
+        """
+
+        def deliver(url, account_label):
+            return send_to_slack(
+                webhook_url=url,
+                report_data=report_data,
+                report_type=report_type,
+                update_info=update_info,
+                proxy_url=proxy_url,
+                mode=mode,
+                account_label=account_label,
+                batch_size=self.config.get("SLACK_BATCH_SIZE", 4000),
+                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
+                split_content_func=self.split_content_func,
+            )
+
+        webhook_config = self.config["SLACK_WEBHOOK_URL"]
+        return self._send_to_multi_accounts(
+            channel_name="Slack",
+            config_value=webhook_config,
+            send_func=deliver,
+        )
+
+    def _send_email(
+        self,
+        report_type: str,
+        html_file_path: Optional[str],
+    ) -> bool:
+        """Send the report by e-mail.
+
+        Reads the sender address, password and recipient(s) from config and
+        delegates to ``send_to_email``; the custom SMTP server and port
+        default to empty strings when not configured.
+
+        Args:
+            report_type: Report type label forwarded to the sender.
+            html_file_path: Path of the HTML report forwarded to the sender.
+
+        Returns:
+            The value returned by ``send_to_email``.
+        """
+        cfg = self.config
+        mail_kwargs = {
+            "from_email": cfg["EMAIL_FROM"],
+            "password": cfg["EMAIL_PASSWORD"],
+            "to_email": cfg["EMAIL_TO"],
+            "report_type": report_type,
+            "html_file_path": html_file_path,
+            "custom_smtp_server": cfg.get("EMAIL_SMTP_SERVER", ""),
+            "custom_smtp_port": cfg.get("EMAIL_SMTP_PORT", ""),
+            "get_time_func": self.get_time_func,
+        }
+        return send_to_email(**mail_kwargs)

+ 80 - 0
trendradar/notification/formatters.py

@@ -0,0 +1,80 @@
+# coding=utf-8
+"""
+通知内容格式转换模块
+
+提供不同推送平台间的格式转换功能
+"""
+
+import re
+
+
+def strip_markdown(text: str) -> str:
+    """Strip Markdown syntax from *text*, leaving plain text.
+
+    Used for the personal-WeChat push, which is sent as plain text.
+
+    Rule order matters:
+    - Horizontal rules are removed before emphasis, otherwise a ``***`` line
+      is eaten by the ``*italic*`` rule and survives as ``*``.
+    - Images are handled before links, otherwise ``![alt](url)`` is matched
+      by the link rule and becomes ``!alt url``.
+    - Underscore emphasis only matches when the delimiters are not glued to
+      word characters, so URLs and snake_case text keep their underscores.
+
+    Args:
+        text: Text containing Markdown (and simple HTML) formatting.
+
+    Returns:
+        The plain-text content with surrounding whitespace stripped.
+    """
+    # Horizontal rules: --- or ***
+    text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)
+
+    # Bold: **text** or __text__
+    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
+    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)
+
+    # Italic: *text* or _text_
+    text = re.sub(r'\*(.+?)\*', r'\1', text)
+    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
+
+    # Strikethrough: ~~text~~
+    text = re.sub(r'~~(.+?)~~', r'\1', text)
+
+    # Images: ![alt](url) -> alt (must run before the link rule)
+    text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)
+
+    # Links: [text](url) -> text url (the URL is kept)
+    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)
+
+    # Inline code: `code`
+    text = re.sub(r'`(.+?)`', r'\1', text)
+
+    # Blockquote markers: >
+    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
+
+    # Heading markers: # ## ### ...
+    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
+
+    # HTML tags: <font color='xxx'>text</font> -> text, then any other tag
+    text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
+    text = re.sub(r'<[^>]+>', '', text)
+
+    # Collapse runs of three or more newlines down to two
+    text = re.sub(r'\n{3,}', '\n\n', text)
+
+    return text.strip()
+
+
+def convert_markdown_to_mrkdwn(content: str) -> str:
+    """
+    Convert standard Markdown into Slack's mrkdwn dialect.
+
+    Conversions applied:
+    - [text](url) becomes <url|text>
+    - **bold** becomes *bold*
+    - everything else is passed through untouched
+
+    Args:
+        content: Markdown-formatted text.
+
+    Returns:
+        The same text in Slack mrkdwn form.
+    """
+    link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+    bold_pattern = r'\*\*([^*]+)\*\*'
+
+    # Links are rewritten first, then double-asterisk bold.
+    with_slack_links = re.sub(link_pattern, r'<\2|\1>', content)
+    return re.sub(bold_pattern, r'*\1*', with_slack_links)

+ 109 - 0
trendradar/notification/push_manager.py

@@ -0,0 +1,109 @@
+# coding=utf-8
+"""
+推送记录管理模块
+
+管理推送记录,支持每日只推送一次和时间窗口控制
+通过 storage_backend 统一存储,支持本地 SQLite 和远程云存储
+"""
+
+from datetime import datetime
+from typing import Callable, Optional, Any
+
+import pytz
+
+
+class PushRecordManager:
+    """
+    Push-record manager.
+
+    Push records are read and written through the supplied storage backend;
+    per the original notes this is a LocalStorageBackend (local SQLite) when
+    running locally and a RemoteStorageBackend (cloud storage) on GitHub
+    Actions, so the once-per-day feature also works there.
+    """
+
+    def __init__(
+        self,
+        storage_backend: Any,
+        get_time_func: Optional[Callable[[], datetime]] = None,
+    ):
+        """
+        Initialise the manager.
+
+        Args:
+            storage_backend: Backend object exposing ``backend_name``,
+                ``has_pushed_today()`` and ``record_push(report_type)``.
+            get_time_func: Callable returning the current time (should use
+                the configured timezone); defaults to Asia/Shanghai time.
+        """
+        self.storage_backend = storage_backend
+        self.get_time = get_time_func or self._default_get_time
+
+        print(f"[推送记录] 使用 {storage_backend.backend_name} 存储后端")
+
+    def _default_get_time(self) -> datetime:
+        """Default clock: current time in Asia/Shanghai (UTC+8)."""
+        return datetime.now(pytz.timezone("Asia/Shanghai"))
+
+    def has_pushed_today(self) -> bool:
+        """
+        Check whether a push has already been recorded today.
+
+        Returns:
+            The backend's ``has_pushed_today()`` result.
+        """
+        return self.storage_backend.has_pushed_today()
+
+    def record_push(self, report_type: str) -> bool:
+        """
+        Record a push.
+
+        Args:
+            report_type: Report type to record.
+
+        Returns:
+            The backend's ``record_push()`` result.
+        """
+        return self.storage_backend.record_push(report_type)
+
+    @staticmethod
+    def _normalize_time(time_str: str) -> str:
+        """Normalise ``H:M`` / ``HH:MM`` to zero-padded ``HH:MM``.
+
+        Best effort: on a malformed value the problem is printed and the
+        input is returned unchanged.
+        """
+        try:
+            parts = time_str.strip().split(":")
+            if len(parts) != 2:
+                raise ValueError(f"时间格式错误: {time_str}")
+
+            hour = int(parts[0])
+            minute = int(parts[1])
+
+            if not (0 <= hour <= 23 and 0 <= minute <= 59):
+                raise ValueError(f"时间范围错误: {time_str}")
+
+            return f"{hour:02d}:{minute:02d}"
+        except (ValueError, AttributeError) as e:
+            print(f"时间格式化错误 '{time_str}': {e}")
+            return time_str
+
+    def is_in_time_range(self, start_time: str, end_time: str) -> bool:
+        """
+        Check whether the current time lies inside a daily time window.
+
+        Both bounds are inclusive. A window whose start is later than its
+        end (e.g. 22:00-06:00) is treated as spanning midnight.
+
+        Args:
+            start_time: Window start, ``HH:MM``.
+            end_time: Window end, ``HH:MM``.
+
+        Returns:
+            True when the current time is inside the window.
+        """
+        # strftime already yields zero-padded HH:MM, so plain string
+        # comparison against the normalised bounds is chronological.
+        normalized_current = self.get_time().strftime("%H:%M")
+        normalized_start = self._normalize_time(start_time)
+        normalized_end = self._normalize_time(end_time)
+
+        if normalized_start <= normalized_end:
+            result = normalized_start <= normalized_current <= normalized_end
+        else:
+            # Cross-midnight window: late evening OR early morning.
+            result = (
+                normalized_current >= normalized_start
+                or normalized_current <= normalized_end
+            )
+
+        if not result:
+            print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}")
+
+        return result

+ 260 - 0
trendradar/notification/renderer.py

@@ -0,0 +1,260 @@
+# coding=utf-8
+"""
+通知内容渲染模块
+
+提供多平台通知内容渲染功能,生成格式化的推送消息
+"""
+
+from datetime import datetime
+from typing import Dict, List, Optional, Callable
+
+from trendradar.report.formatter import format_title_for_platform
+
+
+def render_feishu_content(
+    report_data: Dict,
+    update_info: Optional[Dict] = None,
+    mode: str = "daily",
+    separator: str = "---",
+    reverse_content_order: bool = False,
+    get_time_func: Optional[Callable[[], datetime]] = None,
+) -> str:
+    """Render the Feishu notification body.
+
+    Args:
+        report_data: Report dict with the keys ``stats``, ``new_titles``,
+            ``failed_ids`` and ``total_new_count``.
+        update_info: Optional version info; when given, ``remote_version``
+            and ``current_version`` are appended as a footer line.
+        mode: Report mode ("daily", "incremental", "current"); only affects
+            the placeholder shown when there is nothing to report.
+        separator: Line placed between sections.
+        reverse_content_order: When True, new titles come before the keyword
+            statistics.
+        get_time_func: Callable returning the current time; defaults to
+            ``datetime.now()``.
+
+    Returns:
+        The formatted Feishu message text.
+    """
+    # --- Keyword statistics section ---
+    stats_content = ""
+    if report_data["stats"]:
+        stats_content += "📊 **热点词汇统计**\n\n"
+
+        total_count = len(report_data["stats"])
+
+        for i, stat in enumerate(report_data["stats"]):
+            word = stat["word"]
+            count = stat["count"]
+
+            sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
+
+            # Icon and colour depend on how many titles matched the keyword.
+            if count >= 10:
+                stats_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
+            elif count >= 5:
+                stats_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
+            else:
+                stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
+
+            for j, title_data in enumerate(stat["titles"], 1):
+                formatted_title = format_title_for_platform(
+                    "feishu", title_data, show_source=True
+                )
+                stats_content += f"  {j}. {formatted_title}\n"
+
+                # Blank line between titles, but not after the last one.
+                if j < len(stat["titles"]):
+                    stats_content += "\n"
+
+            if i < total_count - 1:
+                stats_content += f"\n{separator}\n\n"
+
+    # --- Newly added titles section ---
+    new_titles_content = ""
+    if report_data["new_titles"]:
+        new_titles_content += (
+            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
+        )
+
+        for source_data in report_data["new_titles"]:
+            new_titles_content += (
+                f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
+            )
+
+            for j, title_data in enumerate(source_data["titles"], 1):
+                # Format from a copy with is_new cleared; the input dict is
+                # left untouched.
+                title_data_copy = title_data.copy()
+                title_data_copy["is_new"] = False
+                formatted_title = format_title_for_platform(
+                    "feishu", title_data_copy, show_source=False
+                )
+                new_titles_content += f"  {j}. {formatted_title}\n"
+
+            new_titles_content += "\n"
+
+    # --- Assemble the sections in the configured order ---
+    if reverse_content_order:
+        sections = [new_titles_content, stats_content]
+    else:
+        sections = [stats_content, new_titles_content]
+    text_content = f"\n{separator}\n\n".join(part for part in sections if part)
+
+    # Remember whether real content exists. The failed-platform separator
+    # below is keyed on this flag instead of searching the rendered text for
+    # the placeholder wording, which misfired on the incremental placeholder
+    # and on any title that happened to contain that wording.
+    has_report_content = bool(text_content)
+
+    if not has_report_content:
+        if mode == "incremental":
+            mode_text = "增量模式下暂无新增匹配的热点词汇"
+        elif mode == "current":
+            mode_text = "当前榜单模式下暂无匹配的热点词汇"
+        else:
+            mode_text = "暂无匹配的热点词汇"
+        text_content = f"📭 {mode_text}\n\n"
+
+    if report_data["failed_ids"]:
+        if has_report_content:
+            text_content += f"\n{separator}\n\n"
+
+        text_content += "⚠️ **数据获取失败的平台:**\n\n"
+        for id_value in report_data["failed_ids"]:
+            text_content += f"  • <font color='red'>{id_value}</font>\n"
+
+    # --- Footer: timestamp and optional update notice ---
+    now = get_time_func() if get_time_func else datetime.now()
+    text_content += (
+        f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
+    )
+
+    if update_info:
+        text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
+
+    return text_content
+
+
+def render_dingtalk_content(
+    report_data: Dict,
+    update_info: Optional[Dict] = None,
+    mode: str = "daily",
+    reverse_content_order: bool = False,
+    get_time_func: Optional[Callable[[], datetime]] = None,
+) -> str:
+    """Render the DingTalk notification body.
+
+    Args:
+        report_data: Report dict with the keys ``stats``, ``new_titles``,
+            ``failed_ids`` and ``total_new_count``.
+        update_info: Optional version info; when given, ``remote_version``
+            and ``current_version`` are appended as a footer line.
+        mode: Report mode ("daily", "incremental", "current"); only affects
+            the placeholder shown when there is nothing to report.
+        reverse_content_order: When True, new titles come before the keyword
+            statistics.
+        get_time_func: Callable returning the current time; defaults to
+            ``datetime.now()``.
+
+    Returns:
+        The formatted DingTalk message text.
+    """
+    # Only keywords with a positive count contribute to the headline total.
+    total_titles = sum(
+        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
+    )
+    now = get_time_func() if get_time_func else datetime.now()
+
+    # --- Header ---
+    header_content = f"**总新闻数:** {total_titles}\n\n"
+    header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+    header_content += "**类型:** 热点分析报告\n\n"
+    header_content += "---\n\n"
+
+    # --- Keyword statistics section ---
+    stats_content = ""
+    if report_data["stats"]:
+        stats_content += "📊 **热点词汇统计**\n\n"
+
+        total_count = len(report_data["stats"])
+
+        for i, stat in enumerate(report_data["stats"]):
+            word = stat["word"]
+            count = stat["count"]
+
+            sequence_display = f"[{i + 1}/{total_count}]"
+
+            # Icon and emphasis depend on how many titles matched the keyword.
+            if count >= 10:
+                stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
+            elif count >= 5:
+                stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
+            else:
+                stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
+
+            for j, title_data in enumerate(stat["titles"], 1):
+                formatted_title = format_title_for_platform(
+                    "dingtalk", title_data, show_source=True
+                )
+                stats_content += f"  {j}. {formatted_title}\n"
+
+                # Blank line between titles, but not after the last one.
+                if j < len(stat["titles"]):
+                    stats_content += "\n"
+
+            if i < total_count - 1:
+                stats_content += "\n---\n\n"
+
+    # --- Newly added titles section ---
+    new_titles_content = ""
+    if report_data["new_titles"]:
+        new_titles_content += (
+            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
+        )
+
+        for source_data in report_data["new_titles"]:
+            new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
+
+            for j, title_data in enumerate(source_data["titles"], 1):
+                # Format from a copy with is_new cleared; the input dict is
+                # left untouched.
+                title_data_copy = title_data.copy()
+                title_data_copy["is_new"] = False
+                formatted_title = format_title_for_platform(
+                    "dingtalk", title_data_copy, show_source=False
+                )
+                new_titles_content += f"  {j}. {formatted_title}\n"
+
+            new_titles_content += "\n"
+
+    # --- Assemble the sections in the configured order ---
+    if reverse_content_order:
+        sections = [new_titles_content, stats_content]
+    else:
+        sections = [stats_content, new_titles_content]
+    text_content = header_content + "\n---\n\n".join(
+        part for part in sections if part
+    )
+
+    # Remember whether real content exists. The failed-platform separator
+    # below is keyed on this flag instead of searching the rendered text for
+    # the placeholder wording, which misfired on the incremental placeholder
+    # and on any title that happened to contain that wording.
+    has_report_content = bool(stats_content or new_titles_content)
+
+    if not has_report_content:
+        if mode == "incremental":
+            mode_text = "增量模式下暂无新增匹配的热点词汇"
+        elif mode == "current":
+            mode_text = "当前榜单模式下暂无匹配的热点词汇"
+        else:
+            mode_text = "暂无匹配的热点词汇"
+        text_content += f"📭 {mode_text}\n\n"
+
+    if report_data["failed_ids"]:
+        if has_report_content:
+            text_content += "\n---\n\n"
+
+        text_content += "⚠️ **数据获取失败的平台:**\n\n"
+        for id_value in report_data["failed_ids"]:
+            text_content += f"  • **{id_value}**\n"
+
+    # --- Footer: timestamp and optional update notice ---
+    text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
+
+    if update_info:
+        text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
+
+    return text_content

+ 1033 - 0
trendradar/notification/senders.py

@@ -0,0 +1,1033 @@
+# coding=utf-8
+"""
+消息发送器模块
+
+将报告数据发送到各种通知渠道:
+- 飞书 (Feishu/Lark)
+- 钉钉 (DingTalk)
+- 企业微信 (WeCom/WeWork)
+- Telegram
+- 邮件 (Email)
+- ntfy
+- Bark
+- Slack
+
+每个发送函数都支持分批发送,并通过参数化配置实现与 CONFIG 的解耦。
+"""
+
+import smtplib
+import time
+from datetime import datetime
+from email.header import Header
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.utils import formataddr, formatdate, make_msgid
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+
+from .batch import add_batch_headers, get_max_batch_header_size
+from .formatters import convert_markdown_to_mrkdwn, strip_markdown
+
+
+# === SMTP mail configuration ===
+# Maps a mail domain to its SMTP server, port and encryption mode.
+# Throughout this table "TLS" is paired with port 587 (STARTTLS) and "SSL"
+# with port 465 (implicit TLS).
+SMTP_CONFIGS = {
+    # Gmail (STARTTLS)
+    "gmail.com": {"server": "smtp.gmail.com", "port": 587, "encryption": "TLS"},
+    # QQ Mail (SSL, more stable)
+    "qq.com": {"server": "smtp.qq.com", "port": 465, "encryption": "SSL"},
+    # Outlook (STARTTLS)
+    "outlook.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"},
+    "hotmail.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"},
+    "live.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"},
+    # NetEase Mail (SSL, more stable)
+    "163.com": {"server": "smtp.163.com", "port": 465, "encryption": "SSL"},
+    "126.com": {"server": "smtp.126.com", "port": 465, "encryption": "SSL"},
+    # Sina Mail (SSL)
+    "sina.com": {"server": "smtp.sina.com", "port": 465, "encryption": "SSL"},
+    # Sohu Mail (SSL)
+    "sohu.com": {"server": "smtp.sohu.com", "port": 465, "encryption": "SSL"},
+    # 189 Mail (SSL)
+    "189.cn": {"server": "smtp.189.cn", "port": 465, "encryption": "SSL"},
+    # Aliyun Mail (SSL). Port 465 was previously labelled "TLS", the only
+    # entry breaking the 465/SSL pairing used everywhere else in this table.
+    # NOTE(review): confirm against the code that consumes "encryption".
+    "aliyun.com": {"server": "smtp.aliyun.com", "port": 465, "encryption": "SSL"},
+}
+
+
+def send_to_feishu(
+    webhook_url: str,
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 29000,
+    batch_interval: float = 1.0,
+    split_content_func: Optional[Callable] = None,
+    get_time_func: Optional[Callable] = None,
+) -> bool:
+    """
+    Send a report to Feishu, split into batches.
+
+    Sending stops at the first batch that fails; batches already delivered
+    are not rolled back.
+
+    Args:
+        webhook_url: Feishu webhook URL.
+        report_data: Report data; ``report_data["stats"]`` is read here to
+            compute the title total included in every payload.
+        report_type: Report type label (used in logs and the payload).
+        update_info: Optional version-update info passed to the splitter.
+        proxy_url: Optional proxy URL used for both http and https.
+        mode: Report mode, passed to the splitter.
+        account_label: Account label appended to the log prefix.
+        batch_size: Maximum batch size in bytes, batch header included.
+        batch_interval: Seconds to sleep between successful batches.
+        split_content_func: Callable that splits the report into batches;
+            effectively required, since it is called unconditionally.
+        get_time_func: Callable returning the current time; defaults to
+            ``datetime.now()``.
+
+    Returns:
+        bool: True when every batch was accepted, False otherwise.
+    """
+    headers = {"Content-Type": "application/json"}
+    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
+
+    # Log prefix
+    log_prefix = f"飞书{account_label}" if account_label else "飞书"
+
+    # Reserve room for the batch header so adding it cannot exceed the limit.
+    header_reserve = get_max_batch_header_size("feishu")
+    batches = split_content_func(
+        report_data,
+        "feishu",
+        update_info,
+        max_bytes=batch_size - header_reserve,
+        mode=mode,
+    )
+
+    # Add the batch headers (space was reserved above).
+    batches = add_batch_headers(batches, "feishu", batch_size)
+    batch_total = len(batches)
+
+    print(f"{log_prefix}消息分为 {batch_total} 批次发送 [{report_type}]")
+
+    # The title total is the same for every batch, so compute it once
+    # instead of once per loop iteration.
+    total_titles = sum(
+        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
+    )
+
+    # Send batch by batch.
+    for i, batch_content in enumerate(batches, 1):
+        content_size = len(batch_content.encode("utf-8"))
+        print(
+            f"发送{log_prefix}第 {i}/{batch_total} 批次,大小:{content_size} 字节 [{report_type}]"
+        )
+
+        # The timestamp is taken per batch, as before.
+        now = get_time_func() if get_time_func else datetime.now()
+
+        payload = {
+            "msg_type": "text",
+            "content": {
+                "total_titles": total_titles,
+                "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
+                "report_type": report_type,
+                "text": batch_content,
+            },
+        }
+
+        try:
+            response = requests.post(
+                webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
+            )
+            if response.status_code != 200:
+                print(
+                    f"{log_prefix}第 {i}/{batch_total} 批次发送失败 [{report_type}],状态码:{response.status_code}"
+                )
+                return False
+
+            result = response.json()
+            # Success is reported as StatusCode == 0 or code == 0.
+            if not (result.get("StatusCode") == 0 or result.get("code") == 0):
+                error_msg = result.get("msg") or result.get("StatusMessage", "未知错误")
+                print(
+                    f"{log_prefix}第 {i}/{batch_total} 批次发送失败 [{report_type}],错误:{error_msg}"
+                )
+                return False
+
+            print(f"{log_prefix}第 {i}/{batch_total} 批次发送成功 [{report_type}]")
+            # Pause between batches, but not after the last one.
+            if i < batch_total:
+                time.sleep(batch_interval)
+        except Exception as e:
+            # Best effort: any request/parse error fails this channel only.
+            print(f"{log_prefix}第 {i}/{batch_total} 批次发送出错 [{report_type}]:{e}")
+            return False
+
+    print(f"{log_prefix}所有 {batch_total} 批次发送完成 [{report_type}]")
+    return True
+
+
+def send_to_dingtalk(
+    webhook_url: str,
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 20000,
+    batch_interval: float = 1.0,
+    split_content_func: Callable = None,
+) -> bool:
+    """
+    Send a report to DingTalk as markdown messages, split into batches.
+
+    Sending stops at the first batch that fails.
+
+    Args:
+        webhook_url: DingTalk webhook URL.
+        report_data: Report data, passed to the splitter.
+        report_type: Report type label (used in logs and the message title).
+        update_info: Optional version-update info passed to the splitter.
+        proxy_url: Optional proxy URL used for both http and https.
+        mode: Report mode, passed to the splitter.
+        account_label: Account label appended to the log prefix.
+        batch_size: Maximum batch size in bytes, batch header included.
+        batch_interval: Seconds to sleep between successful batches.
+        split_content_func: Callable that splits the report into batches.
+
+    Returns:
+        bool: True when every batch was accepted, False otherwise.
+    """
+    request_headers = {"Content-Type": "application/json"}
+    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
+
+    # Log prefix
+    log_prefix = f"钉钉{account_label}" if account_label else "钉钉"
+
+    # Split with room reserved for the batch header, then add the headers.
+    reserved = get_max_batch_header_size("dingtalk")
+    batches = add_batch_headers(
+        split_content_func(
+            report_data,
+            "dingtalk",
+            update_info,
+            max_bytes=batch_size - reserved,
+            mode=mode,
+        ),
+        "dingtalk",
+        batch_size,
+    )
+    batch_total = len(batches)
+
+    print(f"{log_prefix}消息分为 {batch_total} 批次发送 [{report_type}]")
+
+    for seq, batch_content in enumerate(batches, 1):
+        content_size = len(batch_content.encode("utf-8"))
+        print(
+            f"发送{log_prefix}第 {seq}/{batch_total} 批次,大小:{content_size} 字节 [{report_type}]"
+        )
+
+        payload = {
+            "msgtype": "markdown",
+            "markdown": {
+                "title": f"TrendRadar 热点分析报告 - {report_type}",
+                "text": batch_content,
+            },
+        }
+
+        try:
+            response = requests.post(
+                webhook_url,
+                headers=request_headers,
+                json=payload,
+                proxies=proxies,
+                timeout=30,
+            )
+            if response.status_code != 200:
+                print(
+                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],状态码:{response.status_code}"
+                )
+                return False
+
+            result = response.json()
+            if result.get("errcode") != 0:
+                print(
+                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
+                )
+                return False
+
+            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送成功 [{report_type}]")
+            # Pause between batches, but not after the last one.
+            if seq < batch_total:
+                time.sleep(batch_interval)
+        except Exception as e:
+            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送出错 [{report_type}]:{e}")
+            return False
+
+    print(f"{log_prefix}所有 {batch_total} 批次发送完成 [{report_type}]")
+    return True
+
+
+def send_to_wework(
+    webhook_url: str,
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 4000,
+    batch_interval: float = 1.0,
+    msg_type: str = "markdown",
+    split_content_func: Callable = None,
+) -> bool:
+    """
+    Send a report to WeCom / WeWork in batches, as markdown or plain text.
+
+    Sending stops at the first batch that fails.
+
+    Args:
+        webhook_url: WeCom webhook URL.
+        report_data: Report data, passed to the splitter.
+        report_type: Report type label (used in logs).
+        update_info: Optional version-update info passed to the splitter.
+        proxy_url: Optional proxy URL used for both http and https.
+        mode: Report mode, passed to the splitter.
+        account_label: Account label appended to the log prefix.
+        batch_size: Maximum batch size in bytes, batch header included.
+        batch_interval: Seconds to sleep between successful batches.
+        msg_type: "text" (case-insensitive) strips markdown and sends plain
+            text; any other value sends markdown.
+        split_content_func: Callable that splits the report into batches.
+
+    Returns:
+        bool: True when every batch was accepted, False otherwise.
+    """
+    request_headers = {"Content-Type": "application/json"}
+    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
+
+    # Log prefix
+    log_prefix = f"企业微信{account_label}" if account_label else "企业微信"
+
+    # Message format: plain text or markdown.
+    is_text_mode = msg_type.lower() == "text"
+    if is_text_mode:
+        format_notice = "使用 text 格式(个人微信模式)"
+    else:
+        format_notice = "使用 markdown 格式(群机器人模式)"
+    print(f"{log_prefix}{format_notice}[{report_type}]")
+
+    # Batch headers use "wework_text" in text mode and "wework" otherwise;
+    # the splitter itself is always called with "wework".
+    header_format_type = "wework_text" if is_text_mode else "wework"
+
+    # Split with room reserved for the batch header, then add the headers.
+    reserved = get_max_batch_header_size(header_format_type)
+    batches = add_batch_headers(
+        split_content_func(
+            report_data,
+            "wework",
+            update_info,
+            max_bytes=batch_size - reserved,
+            mode=mode,
+        ),
+        header_format_type,
+        batch_size,
+    )
+    batch_total = len(batches)
+
+    print(f"{log_prefix}消息分为 {batch_total} 批次发送 [{report_type}]")
+
+    for seq, batch_content in enumerate(batches, 1):
+        if is_text_mode:
+            # Text mode: markdown syntax is stripped before sending.
+            outgoing = strip_markdown(batch_content)
+            payload = {"msgtype": "text", "text": {"content": outgoing}}
+        else:
+            # Markdown mode: the batch is sent as-is.
+            outgoing = batch_content
+            payload = {"msgtype": "markdown", "markdown": {"content": outgoing}}
+        content_size = len(outgoing.encode("utf-8"))
+
+        print(
+            f"发送{log_prefix}第 {seq}/{batch_total} 批次,大小:{content_size} 字节 [{report_type}]"
+        )
+
+        try:
+            response = requests.post(
+                webhook_url,
+                headers=request_headers,
+                json=payload,
+                proxies=proxies,
+                timeout=30,
+            )
+            if response.status_code != 200:
+                print(
+                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],状态码:{response.status_code}"
+                )
+                return False
+
+            result = response.json()
+            if result.get("errcode") != 0:
+                print(
+                    f"{log_prefix}第 {seq}/{batch_total} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
+                )
+                return False
+
+            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送成功 [{report_type}]")
+            # Pause between batches, but not after the last one.
+            if seq < batch_total:
+                time.sleep(batch_interval)
+        except Exception as e:
+            print(f"{log_prefix}第 {seq}/{batch_total} 批次发送出错 [{report_type}]:{e}")
+            return False
+
+    print(f"{log_prefix}所有 {batch_total} 批次发送完成 [{report_type}]")
+    return True
+
+
+def send_to_telegram(
+    bot_token: str,
+    chat_id: str,
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 4000,
+    batch_interval: float = 1.0,
+    split_content_func: Callable = None,
+) -> bool:
+    """
+    发送到 Telegram(支持分批发送)
+
+    Args:
+        bot_token: Telegram Bot Token
+        chat_id: Telegram Chat ID
+        report_data: 报告数据
+        report_type: 报告类型
+        update_info: 更新信息(可选)
+        proxy_url: 代理 URL(可选)
+        mode: 报告模式 (daily/current)
+        account_label: 账号标签(多账号时显示)
+        batch_size: 批次大小(字节)
+        batch_interval: 批次发送间隔(秒)
+        split_content_func: 内容分批函数
+
+    Returns:
+        bool: 发送是否成功
+    """
+    headers = {"Content-Type": "application/json"}
+    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+
+    proxies = None
+    if proxy_url:
+        proxies = {"http": proxy_url, "https": proxy_url}
+
+    # 日志前缀
+    log_prefix = f"Telegram{account_label}" if account_label else "Telegram"
+
+    # 获取分批内容,预留批次头部空间
+    header_reserve = get_max_batch_header_size("telegram")
+    batches = split_content_func(
+        report_data, "telegram", update_info, max_bytes=batch_size - header_reserve, mode=mode
+    )
+
+    # 统一添加批次头部(已预留空间,不会超限)
+    batches = add_batch_headers(batches, "telegram", batch_size)
+
+    print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
+
+    # 逐批发送
+    for i, batch_content in enumerate(batches, 1):
+        content_size = len(batch_content.encode("utf-8"))
+        print(
+            f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]"
+        )
+
+        payload = {
+            "chat_id": chat_id,
+            "text": batch_content,
+            "parse_mode": "HTML",
+            "disable_web_page_preview": True,
+        }
+
+        try:
+            response = requests.post(
+                url, headers=headers, json=payload, proxies=proxies, timeout=30
+            )
+            if response.status_code == 200:
+                result = response.json()
+                if result.get("ok"):
+                    print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
+                    # 批次间间隔
+                    if i < len(batches):
+                        time.sleep(batch_interval)
+                else:
+                    print(
+                        f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}"
+                    )
+                    return False
+            else:
+                print(
+                    f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
+                )
+                return False
+        except Exception as e:
+            print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
+            return False
+
+    print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
+    return True
+
+
+def send_to_email(
+    from_email: str,
+    password: str,
+    to_email: str,
+    report_type: str,
+    html_file_path: str,
+    custom_smtp_server: Optional[str] = None,
+    custom_smtp_port: Optional[int] = None,
+    *,
+    get_time_func: Callable = None,
+) -> bool:
+    """
+    发送邮件通知
+
+    Args:
+        from_email: 发件人邮箱
+        password: 邮箱密码/授权码
+        to_email: 收件人邮箱(多个用逗号分隔)
+        report_type: 报告类型
+        html_file_path: HTML 报告文件路径
+        custom_smtp_server: 自定义 SMTP 服务器(可选)
+        custom_smtp_port: 自定义 SMTP 端口(可选)
+        get_time_func: 获取当前时间的函数
+
+    Returns:
+        bool: 发送是否成功
+    """
+    try:
+        if not html_file_path or not Path(html_file_path).exists():
+            print(f"错误:HTML文件不存在或未提供: {html_file_path}")
+            return False
+
+        print(f"使用HTML文件: {html_file_path}")
+        with open(html_file_path, "r", encoding="utf-8") as f:
+            html_content = f.read()
+
+        domain = from_email.split("@")[-1].lower()
+
+        if custom_smtp_server and custom_smtp_port:
+            # 使用自定义 SMTP 配置
+            smtp_server = custom_smtp_server
+            smtp_port = int(custom_smtp_port)
+            # 根据端口判断加密方式:465=SSL, 587=TLS
+            if smtp_port == 465:
+                use_tls = False  # SSL 模式(SMTP_SSL)
+            elif smtp_port == 587:
+                use_tls = True  # TLS 模式(STARTTLS)
+            else:
+                # 其他端口优先尝试 TLS(更安全,更广泛支持)
+                use_tls = True
+        elif domain in SMTP_CONFIGS:
+            # 使用预设配置
+            config = SMTP_CONFIGS[domain]
+            smtp_server = config["server"]
+            smtp_port = config["port"]
+            use_tls = config["encryption"] == "TLS"
+        else:
+            print(f"未识别的邮箱服务商: {domain},使用通用 SMTP 配置")
+            smtp_server = f"smtp.{domain}"
+            smtp_port = 587
+            use_tls = True
+
+        msg = MIMEMultipart("alternative")
+
+        # 严格按照 RFC 标准设置 From header
+        sender_name = "TrendRadar"
+        msg["From"] = formataddr((sender_name, from_email))
+
+        # 设置收件人
+        recipients = [addr.strip() for addr in to_email.split(",")]
+        if len(recipients) == 1:
+            msg["To"] = recipients[0]
+        else:
+            msg["To"] = ", ".join(recipients)
+
+        # 设置邮件主题
+        now = get_time_func() if get_time_func else datetime.now()
+        subject = f"TrendRadar 热点分析报告 - {report_type} - {now.strftime('%m月%d日 %H:%M')}"
+        msg["Subject"] = Header(subject, "utf-8")
+
+        # 设置其他标准 header
+        msg["MIME-Version"] = "1.0"
+        msg["Date"] = formatdate(localtime=True)
+        msg["Message-ID"] = make_msgid()
+
+        # 添加纯文本部分(作为备选)
+        text_content = f"""
+TrendRadar 热点分析报告
+========================
+报告类型:{report_type}
+生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')}
+
+请使用支持HTML的邮件客户端查看完整报告内容。
+        """
+        text_part = MIMEText(text_content, "plain", "utf-8")
+        msg.attach(text_part)
+
+        html_part = MIMEText(html_content, "html", "utf-8")
+        msg.attach(html_part)
+
+        print(f"正在发送邮件到 {to_email}...")
+        print(f"SMTP 服务器: {smtp_server}:{smtp_port}")
+        print(f"发件人: {from_email}")
+
+        try:
+            if use_tls:
+                # TLS 模式
+                server = smtplib.SMTP(smtp_server, smtp_port, timeout=30)
+                server.set_debuglevel(0)  # 设为1可以查看详细调试信息
+                server.ehlo()
+                server.starttls()
+                server.ehlo()
+            else:
+                # SSL 模式
+                server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=30)
+                server.set_debuglevel(0)
+                server.ehlo()
+
+            # 登录
+            server.login(from_email, password)
+
+            # 发送邮件
+            server.send_message(msg)
+            server.quit()
+
+            print(f"邮件发送成功 [{report_type}] -> {to_email}")
+            return True
+
+        except smtplib.SMTPServerDisconnected:
+            print("邮件发送失败:服务器意外断开连接,请检查网络或稍后重试")
+            return False
+
+    except smtplib.SMTPAuthenticationError as e:
+        print("邮件发送失败:认证错误,请检查邮箱和密码/授权码")
+        print(f"详细错误: {str(e)}")
+        return False
+    except smtplib.SMTPRecipientsRefused as e:
+        print(f"邮件发送失败:收件人地址被拒绝 {e}")
+        return False
+    except smtplib.SMTPSenderRefused as e:
+        print(f"邮件发送失败:发件人地址被拒绝 {e}")
+        return False
+    except smtplib.SMTPDataError as e:
+        print(f"邮件发送失败:邮件数据错误 {e}")
+        return False
+    except smtplib.SMTPConnectError as e:
+        print(f"邮件发送失败:无法连接到 SMTP 服务器 {smtp_server}:{smtp_port}")
+        print(f"详细错误: {str(e)}")
+        return False
+    except Exception as e:
+        print(f"邮件发送失败 [{report_type}]:{e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def send_to_ntfy(
+    server_url: str,
+    topic: str,
+    token: Optional[str],
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 3800,
+    split_content_func: Callable = None,
+) -> bool:
+    """
+    发送到 ntfy(支持分批发送,严格遵守4KB限制)
+
+    Args:
+        server_url: ntfy 服务器 URL
+        topic: ntfy 主题
+        token: ntfy 访问令牌(可选)
+        report_data: 报告数据
+        report_type: 报告类型
+        update_info: 更新信息(可选)
+        proxy_url: 代理 URL(可选)
+        mode: 报告模式 (daily/current)
+        account_label: 账号标签(多账号时显示)
+        batch_size: 批次大小(字节)
+        split_content_func: 内容分批函数
+
+    Returns:
+        bool: 发送是否成功
+    """
+    # 日志前缀
+    log_prefix = f"ntfy{account_label}" if account_label else "ntfy"
+
+    # 避免 HTTP header 编码问题
+    report_type_en_map = {
+        "当日汇总": "Daily Summary",
+        "当前榜单汇总": "Current Ranking",
+        "增量更新": "Incremental Update",
+        "实时增量": "Realtime Incremental",
+        "实时当前榜单": "Realtime Current Ranking",
+    }
+    report_type_en = report_type_en_map.get(report_type, "News Report")
+
+    headers = {
+        "Content-Type": "text/plain; charset=utf-8",
+        "Markdown": "yes",
+        "Title": report_type_en,
+        "Priority": "default",
+        "Tags": "news",
+    }
+
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+
+    # 构建完整URL,确保格式正确
+    base_url = server_url.rstrip("/")
+    if not base_url.startswith(("http://", "https://")):
+        base_url = f"https://{base_url}"
+    url = f"{base_url}/{topic}"
+
+    proxies = None
+    if proxy_url:
+        proxies = {"http": proxy_url, "https": proxy_url}
+
+    # 获取分批内容,预留批次头部空间
+    header_reserve = get_max_batch_header_size("ntfy")
+    batches = split_content_func(
+        report_data, "ntfy", update_info, max_bytes=batch_size - header_reserve, mode=mode
+    )
+
+    # 统一添加批次头部(已预留空间,不会超限)
+    batches = add_batch_headers(batches, "ntfy", batch_size)
+
+    total_batches = len(batches)
+    print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]")
+
+    # 反转批次顺序,使得在ntfy客户端显示时顺序正确
+    # ntfy显示最新消息在上面,所以我们从最后一批开始推送
+    reversed_batches = list(reversed(batches))
+
+    print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确")
+
+    # 逐批发送(反向顺序)
+    success_count = 0
+    for idx, batch_content in enumerate(reversed_batches, 1):
+        # 计算正确的批次编号(用户视角的编号)
+        actual_batch_num = total_batches - idx + 1
+
+        content_size = len(batch_content.encode("utf-8"))
+        print(
+            f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{content_size} 字节 [{report_type}]"
+        )
+
+        # 检查消息大小,确保不超过4KB
+        if content_size > 4096:
+            print(f"警告:{log_prefix}第 {actual_batch_num} 批次消息过大({content_size} 字节),可能被拒绝")
+
+        # 更新 headers 的批次标识
+        current_headers = headers.copy()
+        if total_batches > 1:
+            current_headers["Title"] = f"{report_type_en} ({actual_batch_num}/{total_batches})"
+
+        try:
+            response = requests.post(
+                url,
+                headers=current_headers,
+                data=batch_content.encode("utf-8"),
+                proxies=proxies,
+                timeout=30,
+            )
+
+            if response.status_code == 200:
+                print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]")
+                success_count += 1
+                if idx < total_batches:
+                    # 公共服务器建议 2-3 秒,自托管可以更短
+                    interval = 2 if "ntfy.sh" in server_url else 1
+                    time.sleep(interval)
+            elif response.status_code == 429:
+                print(
+                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次速率限制 [{report_type}],等待后重试"
+                )
+                time.sleep(10)  # 等待10秒后重试
+                # 重试一次
+                retry_response = requests.post(
+                    url,
+                    headers=current_headers,
+                    data=batch_content.encode("utf-8"),
+                    proxies=proxies,
+                    timeout=30,
+                )
+                if retry_response.status_code == 200:
+                    print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试成功 [{report_type}]")
+                    success_count += 1
+                else:
+                    print(
+                        f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试失败,状态码:{retry_response.status_code}"
+                    )
+            elif response.status_code == 413:
+                print(
+                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大被拒绝 [{report_type}],消息大小:{content_size} 字节"
+                )
+            else:
+                print(
+                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}"
+                )
+                try:
+                    print(f"错误详情:{response.text}")
+                except:
+                    pass
+
+        except requests.exceptions.ConnectTimeout:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]")
+        except requests.exceptions.ReadTimeout:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]")
+        except requests.exceptions.ConnectionError as e:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}")
+        except Exception as e:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}")
+
+    # 判断整体发送是否成功
+    if success_count == total_batches:
+        print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]")
+        return True
+    elif success_count > 0:
+        print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]")
+        return True  # 部分成功也视为成功
+    else:
+        print(f"{log_prefix}发送完全失败 [{report_type}]")
+        return False
+
+
+def send_to_bark(
+    bark_url: str,
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 3600,
+    batch_interval: float = 1.0,
+    split_content_func: Callable = None,
+) -> bool:
+    """
+    发送到 Bark(支持分批发送,使用 markdown 格式)
+
+    Args:
+        bark_url: Bark URL(包含 device_key)
+        report_data: 报告数据
+        report_type: 报告类型
+        update_info: 更新信息(可选)
+        proxy_url: 代理 URL(可选)
+        mode: 报告模式 (daily/current)
+        account_label: 账号标签(多账号时显示)
+        batch_size: 批次大小(字节)
+        batch_interval: 批次发送间隔(秒)
+        split_content_func: 内容分批函数
+
+    Returns:
+        bool: 发送是否成功
+    """
+    # 日志前缀
+    log_prefix = f"Bark{account_label}" if account_label else "Bark"
+
+    proxies = None
+    if proxy_url:
+        proxies = {"http": proxy_url, "https": proxy_url}
+
+    # 解析 Bark URL,提取 device_key 和 API 端点
+    # Bark URL 格式: https://api.day.app/device_key 或 https://bark.day.app/device_key
+    parsed_url = urlparse(bark_url)
+    device_key = parsed_url.path.strip('/').split('/')[0] if parsed_url.path else None
+
+    if not device_key:
+        print(f"{log_prefix} URL 格式错误,无法提取 device_key: {bark_url}")
+        return False
+
+    # 构建正确的 API 端点
+    api_endpoint = f"{parsed_url.scheme}://{parsed_url.netloc}/push"
+
+    # 获取分批内容,预留批次头部空间
+    header_reserve = get_max_batch_header_size("bark")
+    batches = split_content_func(
+        report_data, "bark", update_info, max_bytes=batch_size - header_reserve, mode=mode
+    )
+
+    # 统一添加批次头部(已预留空间,不会超限)
+    batches = add_batch_headers(batches, "bark", batch_size)
+
+    total_batches = len(batches)
+    print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]")
+
+    # 反转批次顺序,使得在Bark客户端显示时顺序正确
+    # Bark显示最新消息在上面,所以我们从最后一批开始推送
+    reversed_batches = list(reversed(batches))
+
+    print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确")
+
+    # 逐批发送(反向顺序)
+    success_count = 0
+    for idx, batch_content in enumerate(reversed_batches, 1):
+        # 计算正确的批次编号(用户视角的编号)
+        actual_batch_num = total_batches - idx + 1
+
+        content_size = len(batch_content.encode("utf-8"))
+        print(
+            f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{content_size} 字节 [{report_type}]"
+        )
+
+        # 检查消息大小(Bark使用APNs,限制4KB)
+        if content_size > 4096:
+            print(
+                f"警告:{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大({content_size} 字节),可能被拒绝"
+            )
+
+        # 构建JSON payload
+        payload = {
+            "title": report_type,
+            "markdown": batch_content,
+            "device_key": device_key,
+            "sound": "default",
+            "group": "TrendRadar",
+            "action": "none",  # 点击推送跳到 APP 不弹出弹框,方便阅读
+        }
+
+        try:
+            response = requests.post(
+                api_endpoint,
+                json=payload,
+                proxies=proxies,
+                timeout=30,
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                if result.get("code") == 200:
+                    print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]")
+                    success_count += 1
+                    # 批次间间隔
+                    if idx < total_batches:
+                        time.sleep(batch_interval)
+                else:
+                    print(
+                        f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],错误:{result.get('message', '未知错误')}"
+                    )
+            else:
+                print(
+                    f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}"
+                )
+                try:
+                    print(f"错误详情:{response.text}")
+                except:
+                    pass
+
+        except requests.exceptions.ConnectTimeout:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]")
+        except requests.exceptions.ReadTimeout:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]")
+        except requests.exceptions.ConnectionError as e:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}")
+        except Exception as e:
+            print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}")
+
+    # 判断整体发送是否成功
+    if success_count == total_batches:
+        print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]")
+        return True
+    elif success_count > 0:
+        print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]")
+        return True  # 部分成功也视为成功
+    else:
+        print(f"{log_prefix}发送完全失败 [{report_type}]")
+        return False
+
+
+def send_to_slack(
+    webhook_url: str,
+    report_data: Dict,
+    report_type: str,
+    update_info: Optional[Dict] = None,
+    proxy_url: Optional[str] = None,
+    mode: str = "daily",
+    account_label: str = "",
+    *,
+    batch_size: int = 4000,
+    batch_interval: float = 1.0,
+    split_content_func: Callable = None,
+) -> bool:
+    """
+    发送到 Slack(支持分批发送,使用 mrkdwn 格式)
+
+    Args:
+        webhook_url: Slack Webhook URL
+        report_data: 报告数据
+        report_type: 报告类型
+        update_info: 更新信息(可选)
+        proxy_url: 代理 URL(可选)
+        mode: 报告模式 (daily/current)
+        account_label: 账号标签(多账号时显示)
+        batch_size: 批次大小(字节)
+        batch_interval: 批次发送间隔(秒)
+        split_content_func: 内容分批函数
+
+    Returns:
+        bool: 发送是否成功
+    """
+    headers = {"Content-Type": "application/json"}
+    proxies = None
+    if proxy_url:
+        proxies = {"http": proxy_url, "https": proxy_url}
+
+    # 日志前缀
+    log_prefix = f"Slack{account_label}" if account_label else "Slack"
+
+    # 获取分批内容,预留批次头部空间
+    header_reserve = get_max_batch_header_size("slack")
+    batches = split_content_func(
+        report_data, "slack", update_info, max_bytes=batch_size - header_reserve, mode=mode
+    )
+
+    # 统一添加批次头部(已预留空间,不会超限)
+    batches = add_batch_headers(batches, "slack", batch_size)
+
+    print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
+
+    # 逐批发送
+    for i, batch_content in enumerate(batches, 1):
+        # 转换 Markdown 到 mrkdwn 格式
+        mrkdwn_content = convert_markdown_to_mrkdwn(batch_content)
+
+        content_size = len(mrkdwn_content.encode("utf-8"))
+        print(
+            f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]"
+        )
+
+        # 构建 Slack payload(使用简单的 text 字段,支持 mrkdwn)
+        payload = {"text": mrkdwn_content}
+
+        try:
+            response = requests.post(
+                webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
+            )
+
+            # Slack Incoming Webhooks 成功时返回 "ok" 文本
+            if response.status_code == 200 and response.text == "ok":
+                print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
+                # 批次间间隔
+                if i < len(batches):
+                    time.sleep(batch_interval)
+            else:
+                error_msg = response.text if response.text else f"状态码:{response.status_code}"
+                print(
+                    f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}"
+                )
+                return False
+        except Exception as e:
+            print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
+            return False
+
+    print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
+    return True

+ 580 - 0
trendradar/notification/splitter.py

@@ -0,0 +1,580 @@
+# coding=utf-8
+"""
+消息分批处理模块
+
+提供消息内容分批拆分功能,确保消息大小不超过各平台限制
+"""
+
+from datetime import datetime
+from typing import Dict, List, Optional, Callable
+
+from trendradar.report.formatter import format_title_for_platform
+
+
+# Default batch sizes, keyed by format type. The values serve as the
+# max_bytes limit in split_content_into_batches when the caller passes none;
+# "default" applies to every format type without its own entry.
+DEFAULT_BATCH_SIZES = {
+    "dingtalk": 20000,
+    "feishu": 29000,
+    "ntfy": 3800,
+    "default": 4000,
+}
+
+
+def split_content_into_batches(
+    report_data: Dict,
+    format_type: str,
+    update_info: Optional[Dict] = None,
+    max_bytes: Optional[int] = None,
+    mode: str = "daily",
+    batch_sizes: Optional[Dict[str, int]] = None,
+    feishu_separator: str = "---",
+    reverse_content_order: bool = False,
+    get_time_func: Optional[Callable[[], datetime]] = None,
+) -> List[str]:
+    """分批处理消息内容,确保词组标题+至少第一条新闻的完整性
+
+    Args:
+        report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count
+        format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
+        update_info: 版本更新信息(可选)
+        max_bytes: 最大字节数(可选,如果不指定则使用默认配置)
+        mode: 报告模式 (daily, incremental, current)
+        batch_sizes: 批次大小配置字典(可选)
+        feishu_separator: 飞书消息分隔符
+        reverse_content_order: 是否反转内容顺序(新增在前)
+        get_time_func: 获取当前时间的函数(可选)
+
+    Returns:
+        分批后的消息内容列表
+    """
+    # 合并批次大小配置
+    sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}
+
+    if max_bytes is None:
+        if format_type == "dingtalk":
+            max_bytes = sizes.get("dingtalk", 20000)
+        elif format_type == "feishu":
+            max_bytes = sizes.get("feishu", 29000)
+        elif format_type == "ntfy":
+            max_bytes = sizes.get("ntfy", 3800)
+        else:
+            max_bytes = sizes.get("default", 4000)
+
+    batches = []
+
+    total_titles = sum(
+        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
+    )
+    now = get_time_func() if get_time_func else datetime.now()
+
+    base_header = ""
+    if format_type in ("wework", "bark"):
+        base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
+    elif format_type == "telegram":
+        base_header = f"总新闻数: {total_titles}\n\n"
+    elif format_type == "ntfy":
+        base_header = f"**总新闻数:** {total_titles}\n\n"
+    elif format_type == "feishu":
+        base_header = ""
+    elif format_type == "dingtalk":
+        base_header = f"**总新闻数:** {total_titles}\n\n"
+        base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+        base_header += f"**类型:** 热点分析报告\n\n"
+        base_header += "---\n\n"
+    elif format_type == "slack":
+        base_header = f"*总新闻数:* {total_titles}\n\n"
+
+    base_footer = ""
+    if format_type in ("wework", "bark"):
+        base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
+        if update_info:
+            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
+    elif format_type == "telegram":
+        base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
+        if update_info:
+            base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
+    elif format_type == "ntfy":
+        base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
+        if update_info:
+            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
+    elif format_type == "feishu":
+        base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
+        if update_info:
+            base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
+    elif format_type == "dingtalk":
+        base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
+        if update_info:
+            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
+    elif format_type == "slack":
+        base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
+        if update_info:
+            base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
+
+    stats_header = ""
+    if report_data["stats"]:
+        if format_type in ("wework", "bark"):
+            stats_header = f"📊 **热点词汇统计**\n\n"
+        elif format_type == "telegram":
+            stats_header = f"📊 热点词汇统计\n\n"
+        elif format_type == "ntfy":
+            stats_header = f"📊 **热点词汇统计**\n\n"
+        elif format_type == "feishu":
+            stats_header = f"📊 **热点词汇统计**\n\n"
+        elif format_type == "dingtalk":
+            stats_header = f"📊 **热点词汇统计**\n\n"
+        elif format_type == "slack":
+            stats_header = f"📊 *热点词汇统计*\n\n"
+
+    current_batch = base_header
+    current_batch_has_content = False
+
+    if (
+        not report_data["stats"]
+        and not report_data["new_titles"]
+        and not report_data["failed_ids"]
+    ):
+        if mode == "incremental":
+            mode_text = "增量模式下暂无新增匹配的热点词汇"
+        elif mode == "current":
+            mode_text = "当前榜单模式下暂无匹配的热点词汇"
+        else:
+            mode_text = "暂无匹配的热点词汇"
+        simple_content = f"📭 {mode_text}\n\n"
+        final_content = base_header + simple_content + base_footer
+        batches.append(final_content)
+        return batches
+
+    # 定义处理热点词汇统计的函数
+    def process_stats_section(current_batch, current_batch_has_content, batches):
+        """处理热点词汇统计"""
+        if not report_data["stats"]:
+            return current_batch, current_batch_has_content, batches
+
+        total_count = len(report_data["stats"])
+
+        # 添加统计标题
+        test_content = current_batch + stats_header
+        if (
+            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+            < max_bytes
+        ):
+            current_batch = test_content
+            current_batch_has_content = True
+        else:
+            if current_batch_has_content:
+                batches.append(current_batch + base_footer)
+            current_batch = base_header + stats_header
+            current_batch_has_content = True
+
+        # 逐个处理词组(确保词组标题+第一条新闻的原子性)
+        for i, stat in enumerate(report_data["stats"]):
+            word = stat["word"]
+            count = stat["count"]
+            sequence_display = f"[{i + 1}/{total_count}]"
+
+            # 构建词组标题
+            word_header = ""
+            if format_type in ("wework", "bark"):
+                if count >= 10:
+                    word_header = (
+                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
+                    )
+                elif count >= 5:
+                    word_header = (
+                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
+                    )
+                else:
+                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
+            elif format_type == "telegram":
+                if count >= 10:
+                    word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
+                elif count >= 5:
+                    word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
+                else:
+                    word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
+            elif format_type == "ntfy":
+                if count >= 10:
+                    word_header = (
+                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
+                    )
+                elif count >= 5:
+                    word_header = (
+                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
+                    )
+                else:
+                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
+            elif format_type == "feishu":
+                if count >= 10:
+                    word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
+                elif count >= 5:
+                    word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
+                else:
+                    word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
+            elif format_type == "dingtalk":
+                if count >= 10:
+                    word_header = (
+                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
+                    )
+                elif count >= 5:
+                    word_header = (
+                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
+                    )
+                else:
+                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
+            elif format_type == "slack":
+                if count >= 10:
+                    word_header = (
+                        f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
+                    )
+                elif count >= 5:
+                    word_header = (
+                        f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
+                    )
+                else:
+                    word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
+
+            # 构建第一条新闻
+            first_news_line = ""
+            if stat["titles"]:
+                first_title_data = stat["titles"][0]
+                if format_type in ("wework", "bark"):
+                    formatted_title = format_title_for_platform(
+                        "wework", first_title_data, show_source=True
+                    )
+                elif format_type == "telegram":
+                    formatted_title = format_title_for_platform(
+                        "telegram", first_title_data, show_source=True
+                    )
+                elif format_type == "ntfy":
+                    formatted_title = format_title_for_platform(
+                        "ntfy", first_title_data, show_source=True
+                    )
+                elif format_type == "feishu":
+                    formatted_title = format_title_for_platform(
+                        "feishu", first_title_data, show_source=True
+                    )
+                elif format_type == "dingtalk":
+                    formatted_title = format_title_for_platform(
+                        "dingtalk", first_title_data, show_source=True
+                    )
+                elif format_type == "slack":
+                    formatted_title = format_title_for_platform(
+                        "slack", first_title_data, show_source=True
+                    )
+                else:
+                    formatted_title = f"{first_title_data['title']}"
+
+                first_news_line = f"  1. {formatted_title}\n"
+                if len(stat["titles"]) > 1:
+                    first_news_line += "\n"
+
+            # 原子性检查:词组标题+第一条新闻必须一起处理
+            word_with_first_news = word_header + first_news_line
+            test_content = current_batch + word_with_first_news
+
+            if (
+                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+                >= max_bytes
+            ):
+                # 当前批次容纳不下,开启新批次
+                if current_batch_has_content:
+                    batches.append(current_batch + base_footer)
+                current_batch = base_header + stats_header + word_with_first_news
+                current_batch_has_content = True
+                start_index = 1
+            else:
+                current_batch = test_content
+                current_batch_has_content = True
+                start_index = 1
+
+            # 处理剩余新闻条目
+            for j in range(start_index, len(stat["titles"])):
+                title_data = stat["titles"][j]
+                if format_type in ("wework", "bark"):
+                    formatted_title = format_title_for_platform(
+                        "wework", title_data, show_source=True
+                    )
+                elif format_type == "telegram":
+                    formatted_title = format_title_for_platform(
+                        "telegram", title_data, show_source=True
+                    )
+                elif format_type == "ntfy":
+                    formatted_title = format_title_for_platform(
+                        "ntfy", title_data, show_source=True
+                    )
+                elif format_type == "feishu":
+                    formatted_title = format_title_for_platform(
+                        "feishu", title_data, show_source=True
+                    )
+                elif format_type == "dingtalk":
+                    formatted_title = format_title_for_platform(
+                        "dingtalk", title_data, show_source=True
+                    )
+                elif format_type == "slack":
+                    formatted_title = format_title_for_platform(
+                        "slack", title_data, show_source=True
+                    )
+                else:
+                    formatted_title = f"{title_data['title']}"
+
+                news_line = f"  {j + 1}. {formatted_title}\n"
+                if j < len(stat["titles"]) - 1:
+                    news_line += "\n"
+
+                test_content = current_batch + news_line
+                if (
+                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+                    >= max_bytes
+                ):
+                    if current_batch_has_content:
+                        batches.append(current_batch + base_footer)
+                    current_batch = base_header + stats_header + word_header + news_line
+                    current_batch_has_content = True
+                else:
+                    current_batch = test_content
+                    current_batch_has_content = True
+
+            # 词组间分隔符
+            if i < len(report_data["stats"]) - 1:
+                separator = ""
+                if format_type in ("wework", "bark"):
+                    separator = f"\n\n\n\n"
+                elif format_type == "telegram":
+                    separator = f"\n\n"
+                elif format_type == "ntfy":
+                    separator = f"\n\n"
+                elif format_type == "feishu":
+                    separator = f"\n{feishu_separator}\n\n"
+                elif format_type == "dingtalk":
+                    separator = f"\n---\n\n"
+                elif format_type == "slack":
+                    separator = f"\n\n"
+
+                test_content = current_batch + separator
+                if (
+                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+                    < max_bytes
+                ):
+                    current_batch = test_content
+
+        return current_batch, current_batch_has_content, batches
+
+    # 定义处理新增新闻的函数
+    def process_new_titles_section(current_batch, current_batch_has_content, batches):
+        """处理新增新闻"""
+        if not report_data["new_titles"]:
+            return current_batch, current_batch_has_content, batches
+
+        new_header = ""
+        if format_type in ("wework", "bark"):
+            new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
+        elif format_type == "telegram":
+            new_header = (
+                f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
+            )
+        elif format_type == "ntfy":
+            new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
+        elif format_type == "feishu":
+            new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
+        elif format_type == "dingtalk":
+            new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
+        elif format_type == "slack":
+            new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
+
+        test_content = current_batch + new_header
+        if (
+            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+            >= max_bytes
+        ):
+            if current_batch_has_content:
+                batches.append(current_batch + base_footer)
+            current_batch = base_header + new_header
+            current_batch_has_content = True
+        else:
+            current_batch = test_content
+            current_batch_has_content = True
+
+        # 逐个处理新增新闻来源
+        for source_data in report_data["new_titles"]:
+            source_header = ""
+            if format_type in ("wework", "bark"):
+                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
+            elif format_type == "telegram":
+                source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
+            elif format_type == "ntfy":
+                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
+            elif format_type == "feishu":
+                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
+            elif format_type == "dingtalk":
+                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
+            elif format_type == "slack":
+                source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
+
+            # 构建第一条新增新闻
+            first_news_line = ""
+            if source_data["titles"]:
+                first_title_data = source_data["titles"][0]
+                title_data_copy = first_title_data.copy()
+                title_data_copy["is_new"] = False
+
+                if format_type in ("wework", "bark"):
+                    formatted_title = format_title_for_platform(
+                        "wework", title_data_copy, show_source=False
+                    )
+                elif format_type == "telegram":
+                    formatted_title = format_title_for_platform(
+                        "telegram", title_data_copy, show_source=False
+                    )
+                elif format_type == "feishu":
+                    formatted_title = format_title_for_platform(
+                        "feishu", title_data_copy, show_source=False
+                    )
+                elif format_type == "dingtalk":
+                    formatted_title = format_title_for_platform(
+                        "dingtalk", title_data_copy, show_source=False
+                    )
+                elif format_type == "slack":
+                    formatted_title = format_title_for_platform(
+                        "slack", title_data_copy, show_source=False
+                    )
+                else:
+                    formatted_title = f"{title_data_copy['title']}"
+
+                first_news_line = f"  1. {formatted_title}\n"
+
+            # 原子性检查:来源标题+第一条新闻
+            source_with_first_news = source_header + first_news_line
+            test_content = current_batch + source_with_first_news
+
+            if (
+                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+                >= max_bytes
+            ):
+                if current_batch_has_content:
+                    batches.append(current_batch + base_footer)
+                current_batch = base_header + new_header + source_with_first_news
+                current_batch_has_content = True
+                start_index = 1
+            else:
+                current_batch = test_content
+                current_batch_has_content = True
+                start_index = 1
+
+            # 处理剩余新增新闻
+            for j in range(start_index, len(source_data["titles"])):
+                title_data = source_data["titles"][j]
+                title_data_copy = title_data.copy()
+                title_data_copy["is_new"] = False
+
+                if format_type == "wework":
+                    formatted_title = format_title_for_platform(
+                        "wework", title_data_copy, show_source=False
+                    )
+                elif format_type == "telegram":
+                    formatted_title = format_title_for_platform(
+                        "telegram", title_data_copy, show_source=False
+                    )
+                elif format_type == "feishu":
+                    formatted_title = format_title_for_platform(
+                        "feishu", title_data_copy, show_source=False
+                    )
+                elif format_type == "dingtalk":
+                    formatted_title = format_title_for_platform(
+                        "dingtalk", title_data_copy, show_source=False
+                    )
+                elif format_type == "slack":
+                    formatted_title = format_title_for_platform(
+                        "slack", title_data_copy, show_source=False
+                    )
+                else:
+                    formatted_title = f"{title_data_copy['title']}"
+
+                news_line = f"  {j + 1}. {formatted_title}\n"
+
+                test_content = current_batch + news_line
+                if (
+                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+                    >= max_bytes
+                ):
+                    if current_batch_has_content:
+                        batches.append(current_batch + base_footer)
+                    current_batch = base_header + new_header + source_header + news_line
+                    current_batch_has_content = True
+                else:
+                    current_batch = test_content
+                    current_batch_has_content = True
+
+            current_batch += "\n"
+
+        return current_batch, current_batch_has_content, batches
+
+    # 根据配置决定处理顺序
+    if reverse_content_order:
+        # 新增热点在前,热点词汇统计在后
+        current_batch, current_batch_has_content, batches = process_new_titles_section(
+            current_batch, current_batch_has_content, batches
+        )
+        current_batch, current_batch_has_content, batches = process_stats_section(
+            current_batch, current_batch_has_content, batches
+        )
+    else:
+        # 默认:热点词汇统计在前,新增热点在后
+        current_batch, current_batch_has_content, batches = process_stats_section(
+            current_batch, current_batch_has_content, batches
+        )
+        current_batch, current_batch_has_content, batches = process_new_titles_section(
+            current_batch, current_batch_has_content, batches
+        )
+
+    if report_data["failed_ids"]:
+        failed_header = ""
+        if format_type == "wework":
+            failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
+        elif format_type == "telegram":
+            failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
+        elif format_type == "ntfy":
+            failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
+        elif format_type == "feishu":
+            failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
+        elif format_type == "dingtalk":
+            failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
+
+        test_content = current_batch + failed_header
+        if (
+            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+            >= max_bytes
+        ):
+            if current_batch_has_content:
+                batches.append(current_batch + base_footer)
+            current_batch = base_header + failed_header
+            current_batch_has_content = True
+        else:
+            current_batch = test_content
+            current_batch_has_content = True
+
+        for i, id_value in enumerate(report_data["failed_ids"], 1):
+            if format_type == "feishu":
+                failed_line = f"  • <font color='red'>{id_value}</font>\n"
+            elif format_type == "dingtalk":
+                failed_line = f"  • **{id_value}**\n"
+            else:
+                failed_line = f"  • {id_value}\n"
+
+            test_content = current_batch + failed_line
+            if (
+                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
+                >= max_bytes
+            ):
+                if current_batch_has_content:
+                    batches.append(current_batch + base_footer)
+                current_batch = base_header + failed_header + failed_line
+                current_batch_has_content = True
+            else:
+                current_batch = test_content
+                current_batch_has_content = True
+
+    # 完成最后批次
+    if current_batch_has_content:
+        batches.append(current_batch + base_footer)
+
+    return batches

+ 40 - 0
trendradar/report/__init__.py

@@ -0,0 +1,40 @@
+# coding=utf-8
+"""
+Report generation package.
+
+Provides report generation and formatting helpers, including:
+- HTML report generation
+- Title formatting utilities
+
+Module layout:
+- helpers: report helper functions (cleaning, escaping, formatting)
+- formatter: per-platform title formatting
+- html: HTML report rendering
+- generator: report generator
+"""
+
+from trendradar.report.helpers import (
+    clean_title,
+    html_escape,
+    format_rank_display,
+)
+from trendradar.report.formatter import format_title_for_platform
+from trendradar.report.html import render_html_content
+from trendradar.report.generator import (
+    prepare_report_data,
+    generate_html_report,
+)
+
+__all__ = [
+    # Helper functions
+    "clean_title",
+    "html_escape",
+    "format_rank_display",
+    # Formatting functions
+    "format_title_for_platform",
+    # HTML rendering
+    "render_html_content",
+    # Report generators
+    "prepare_report_data",
+    "generate_html_report",
+]

+ 223 - 0
trendradar/report/formatter.py

@@ -0,0 +1,223 @@
+# coding=utf-8
+"""
+平台标题格式化模块
+
+提供多平台标题格式化功能
+"""
+
+from typing import Dict
+
+from trendradar.report.helpers import clean_title, html_escape, format_rank_display
+
+
+def format_title_for_platform(
+    platform: str, title_data: Dict, show_source: bool = True
+) -> str:
+    """统一的标题格式化方法
+
+    为不同平台生成对应格式的标题字符串。
+
+    Args:
+        platform: 目标平台,支持:
+            - "feishu": 飞书
+            - "dingtalk": 钉钉
+            - "wework": 企业微信
+            - "bark": Bark
+            - "telegram": Telegram
+            - "ntfy": ntfy
+            - "slack": Slack
+            - "html": HTML 报告
+        title_data: 标题数据字典,包含以下字段:
+            - title: 标题文本
+            - source_name: 来源名称
+            - time_display: 时间显示
+            - count: 出现次数
+            - ranks: 排名列表
+            - rank_threshold: 高亮阈值
+            - url: PC端链接
+            - mobile_url: 移动端链接(优先使用)
+            - is_new: 是否为新增标题(可选)
+        show_source: 是否显示来源名称
+
+    Returns:
+        格式化后的标题字符串
+    """
+    rank_display = format_rank_display(
+        title_data["ranks"], title_data["rank_threshold"], platform
+    )
+
+    link_url = title_data["mobile_url"] or title_data["url"]
+    cleaned_title = clean_title(title_data["title"])
+
+    if platform == "feishu":
+        if link_url:
+            formatted_title = f"[{cleaned_title}]({link_url})"
+        else:
+            formatted_title = cleaned_title
+
+        title_prefix = "🆕 " if title_data.get("is_new") else ""
+
+        if show_source:
+            result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
+        else:
+            result = f"{title_prefix}{formatted_title}"
+
+        if rank_display:
+            result += f" {rank_display}"
+        if title_data["time_display"]:
+            result += f" <font color='grey'>- {title_data['time_display']}</font>"
+        if title_data["count"] > 1:
+            result += f" <font color='green'>({title_data['count']}次)</font>"
+
+        return result
+
+    elif platform == "dingtalk":
+        if link_url:
+            formatted_title = f"[{cleaned_title}]({link_url})"
+        else:
+            formatted_title = cleaned_title
+
+        title_prefix = "🆕 " if title_data.get("is_new") else ""
+
+        if show_source:
+            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
+        else:
+            result = f"{title_prefix}{formatted_title}"
+
+        if rank_display:
+            result += f" {rank_display}"
+        if title_data["time_display"]:
+            result += f" - {title_data['time_display']}"
+        if title_data["count"] > 1:
+            result += f" ({title_data['count']}次)"
+
+        return result
+
+    elif platform in ("wework", "bark"):
+        # WeWork 和 Bark 使用 markdown 格式
+        if link_url:
+            formatted_title = f"[{cleaned_title}]({link_url})"
+        else:
+            formatted_title = cleaned_title
+
+        title_prefix = "🆕 " if title_data.get("is_new") else ""
+
+        if show_source:
+            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
+        else:
+            result = f"{title_prefix}{formatted_title}"
+
+        if rank_display:
+            result += f" {rank_display}"
+        if title_data["time_display"]:
+            result += f" - {title_data['time_display']}"
+        if title_data["count"] > 1:
+            result += f" ({title_data['count']}次)"
+
+        return result
+
+    elif platform == "telegram":
+        if link_url:
+            formatted_title = f'<a href="{link_url}">{html_escape(cleaned_title)}</a>'
+        else:
+            formatted_title = cleaned_title
+
+        title_prefix = "🆕 " if title_data.get("is_new") else ""
+
+        if show_source:
+            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
+        else:
+            result = f"{title_prefix}{formatted_title}"
+
+        if rank_display:
+            result += f" {rank_display}"
+        if title_data["time_display"]:
+            result += f" <code>- {title_data['time_display']}</code>"
+        if title_data["count"] > 1:
+            result += f" <code>({title_data['count']}次)</code>"
+
+        return result
+
+    elif platform == "ntfy":
+        if link_url:
+            formatted_title = f"[{cleaned_title}]({link_url})"
+        else:
+            formatted_title = cleaned_title
+
+        title_prefix = "🆕 " if title_data.get("is_new") else ""
+
+        if show_source:
+            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
+        else:
+            result = f"{title_prefix}{formatted_title}"
+
+        if rank_display:
+            result += f" {rank_display}"
+        if title_data["time_display"]:
+            result += f" `- {title_data['time_display']}`"
+        if title_data["count"] > 1:
+            result += f" `({title_data['count']}次)`"
+
+        return result
+
+    elif platform == "slack":
+        # Slack 使用 mrkdwn 格式
+        if link_url:
+            # Slack 链接格式: <url|text>
+            formatted_title = f"<{link_url}|{cleaned_title}>"
+        else:
+            formatted_title = cleaned_title
+
+        title_prefix = "🆕 " if title_data.get("is_new") else ""
+
+        if show_source:
+            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
+        else:
+            result = f"{title_prefix}{formatted_title}"
+
+        # 排名(使用 * 加粗)
+        rank_display = format_rank_display(
+            title_data["ranks"], title_data["rank_threshold"], "slack"
+        )
+        if rank_display:
+            result += f" {rank_display}"
+        if title_data["time_display"]:
+            result += f" `- {title_data['time_display']}`"
+        if title_data["count"] > 1:
+            result += f" `({title_data['count']}次)`"
+
+        return result
+
+    elif platform == "html":
+        rank_display = format_rank_display(
+            title_data["ranks"], title_data["rank_threshold"], "html"
+        )
+
+        link_url = title_data["mobile_url"] or title_data["url"]
+
+        escaped_title = html_escape(cleaned_title)
+        escaped_source_name = html_escape(title_data["source_name"])
+
+        if link_url:
+            escaped_url = html_escape(link_url)
+            formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
+        else:
+            formatted_title = (
+                f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
+            )
+
+        if rank_display:
+            formatted_title += f" {rank_display}"
+        if title_data["time_display"]:
+            escaped_time = html_escape(title_data["time_display"])
+            formatted_title += f" <font color='grey'>- {escaped_time}</font>"
+        if title_data["count"] > 1:
+            formatted_title += f" <font color='green'>({title_data['count']}次)</font>"
+
+        if title_data.get("is_new"):
+            formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"
+
+        return formatted_title
+
+    else:
+        return cleaned_title

+ 235 - 0
trendradar/report/generator.py

@@ -0,0 +1,235 @@
+# coding=utf-8
+"""
+报告生成模块
+
+提供报告数据准备和 HTML 生成功能:
+- prepare_report_data: 准备报告数据
+- generate_html_report: 生成 HTML 报告
+"""
+
+from pathlib import Path
+from typing import Dict, List, Optional, Callable
+
+
+def prepare_report_data(
+    stats: List[Dict],
+    failed_ids: Optional[List] = None,
+    new_titles: Optional[Dict] = None,
+    id_to_name: Optional[Dict] = None,
+    mode: str = "daily",
+    rank_threshold: int = 3,
+    matches_word_groups_func: Optional[Callable] = None,
+    load_frequency_words_func: Optional[Callable] = None,
+) -> Dict:
+    """
+    准备报告数据
+
+    Args:
+        stats: 统计结果列表
+        failed_ids: 失败的 ID 列表
+        new_titles: 新增标题
+        id_to_name: ID 到名称的映射
+        mode: 报告模式 (daily/incremental/current)
+        rank_threshold: 排名阈值
+        matches_word_groups_func: 词组匹配函数
+        load_frequency_words_func: 加载频率词函数
+
+    Returns:
+        Dict: 准备好的报告数据
+    """
+    processed_new_titles = []
+
+    # 在增量模式下隐藏新增新闻区域
+    hide_new_section = mode == "incremental"
+
+    # 只有在非隐藏模式下才处理新增新闻部分
+    if not hide_new_section:
+        filtered_new_titles = {}
+        if new_titles and id_to_name:
+            # 如果提供了匹配函数,使用它过滤
+            if matches_word_groups_func and load_frequency_words_func:
+                word_groups, filter_words, global_filters = load_frequency_words_func()
+                for source_id, titles_data in new_titles.items():
+                    filtered_titles = {}
+                    for title, title_data in titles_data.items():
+                        if matches_word_groups_func(title, word_groups, filter_words, global_filters):
+                            filtered_titles[title] = title_data
+                    if filtered_titles:
+                        filtered_new_titles[source_id] = filtered_titles
+            else:
+                # 没有匹配函数时,使用全部
+                filtered_new_titles = new_titles
+
+            # 打印过滤后的新增热点数(与推送显示一致)
+            original_new_count = sum(len(titles) for titles in new_titles.values()) if new_titles else 0
+            filtered_new_count = sum(len(titles) for titles in filtered_new_titles.values()) if filtered_new_titles else 0
+            if original_new_count > 0:
+                print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)")
+
+        if filtered_new_titles and id_to_name:
+            for source_id, titles_data in filtered_new_titles.items():
+                source_name = id_to_name.get(source_id, source_id)
+                source_titles = []
+
+                for title, title_data in titles_data.items():
+                    url = title_data.get("url", "")
+                    mobile_url = title_data.get("mobileUrl", "")
+                    ranks = title_data.get("ranks", [])
+
+                    processed_title = {
+                        "title": title,
+                        "source_name": source_name,
+                        "time_display": "",
+                        "count": 1,
+                        "ranks": ranks,
+                        "rank_threshold": rank_threshold,
+                        "url": url,
+                        "mobile_url": mobile_url,
+                        "is_new": True,
+                    }
+                    source_titles.append(processed_title)
+
+                if source_titles:
+                    processed_new_titles.append(
+                        {
+                            "source_id": source_id,
+                            "source_name": source_name,
+                            "titles": source_titles,
+                        }
+                    )
+
+    processed_stats = []
+    for stat in stats:
+        if stat["count"] <= 0:
+            continue
+
+        processed_titles = []
+        for title_data in stat["titles"]:
+            processed_title = {
+                "title": title_data["title"],
+                "source_name": title_data["source_name"],
+                "time_display": title_data["time_display"],
+                "count": title_data["count"],
+                "ranks": title_data["ranks"],
+                "rank_threshold": title_data["rank_threshold"],
+                "url": title_data.get("url", ""),
+                "mobile_url": title_data.get("mobileUrl", ""),
+                "is_new": title_data.get("is_new", False),
+            }
+            processed_titles.append(processed_title)
+
+        processed_stats.append(
+            {
+                "word": stat["word"],
+                "count": stat["count"],
+                "percentage": stat.get("percentage", 0),
+                "titles": processed_titles,
+            }
+        )
+
+    return {
+        "stats": processed_stats,
+        "new_titles": processed_new_titles,
+        "failed_ids": failed_ids or [],
+        "total_new_count": sum(
+            len(source["titles"]) for source in processed_new_titles
+        ),
+    }
+
+
+def generate_html_report(
+    stats: List[Dict],
+    total_titles: int,
+    failed_ids: Optional[List] = None,
+    new_titles: Optional[Dict] = None,
+    id_to_name: Optional[Dict] = None,
+    mode: str = "daily",
+    is_daily_summary: bool = False,
+    update_info: Optional[Dict] = None,
+    rank_threshold: int = 3,
+    output_dir: str = "output",
+    date_folder: str = "",
+    time_filename: str = "",
+    render_html_func: Optional[Callable] = None,
+    matches_word_groups_func: Optional[Callable] = None,
+    load_frequency_words_func: Optional[Callable] = None,
+    enable_index_copy: bool = True,
+) -> str:
+    """
+    生成 HTML 报告
+
+    Args:
+        stats: 统计结果列表
+        total_titles: 总标题数
+        failed_ids: 失败的 ID 列表
+        new_titles: 新增标题
+        id_to_name: ID 到名称的映射
+        mode: 报告模式 (daily/incremental/current)
+        is_daily_summary: 是否是每日汇总
+        update_info: 更新信息
+        rank_threshold: 排名阈值
+        output_dir: 输出目录
+        date_folder: 日期文件夹名称
+        time_filename: 时间文件名
+        render_html_func: HTML 渲染函数
+        matches_word_groups_func: 词组匹配函数
+        load_frequency_words_func: 加载频率词函数
+        enable_index_copy: 是否复制到 index.html
+
+    Returns:
+        str: 生成的 HTML 文件路径
+    """
+    if is_daily_summary:
+        if mode == "current":
+            filename = "当前榜单汇总.html"
+        elif mode == "incremental":
+            filename = "当日增量.html"
+        else:
+            filename = "当日汇总.html"
+    else:
+        filename = f"{time_filename}.html"
+
+    # 构建输出路径
+    output_path = Path(output_dir) / date_folder / "html"
+    output_path.mkdir(parents=True, exist_ok=True)
+    file_path = str(output_path / filename)
+
+    # 准备报告数据
+    report_data = prepare_report_data(
+        stats,
+        failed_ids,
+        new_titles,
+        id_to_name,
+        mode,
+        rank_threshold,
+        matches_word_groups_func,
+        load_frequency_words_func,
+    )
+
+    # 渲染 HTML 内容
+    if render_html_func:
+        html_content = render_html_func(
+            report_data, total_titles, is_daily_summary, mode, update_info
+        )
+    else:
+        # 默认简单 HTML
+        html_content = f"<html><body><h1>Report</h1><pre>{report_data}</pre></body></html>"
+
+    # 写入文件
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(html_content)
+
+    # 如果是每日汇总且启用 index 复制
+    if is_daily_summary and enable_index_copy:
+        # 生成到根目录(供 GitHub Pages 访问)
+        root_index_path = Path("index.html")
+        with open(root_index_path, "w", encoding="utf-8") as f:
+            f.write(html_content)
+
+        # 同时生成到 output 目录(供 Docker Volume 挂载访问)
+        output_index_path = Path(output_dir) / "index.html"
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+        with open(output_index_path, "w", encoding="utf-8") as f:
+            f.write(html_content)
+
+    return file_path

+ 125 - 0
trendradar/report/helpers.py

@@ -0,0 +1,125 @@
+# coding=utf-8
+"""
+报告辅助函数模块
+
+提供报告生成相关的通用辅助函数
+"""
+
+import re
+from typing import List
+
+
+def clean_title(title: str) -> str:
+    """Normalize the whitespace of a news title.
+
+    Line breaks and every other run of whitespace are collapsed into a
+    single space, and leading/trailing whitespace is removed.
+
+    Args:
+        title: Raw title. Non-string values are converted with ``str()``
+            before cleaning.
+
+    Returns:
+        The title with whitespace normalized.
+    """
+    text = title if isinstance(title, str) else str(title)
+    # Splitting on whitespace drops newlines, carriage returns and edge
+    # padding in one pass; joining restores single-space separators.
+    return " ".join(text.split())
+
+
+def html_escape(text: str) -> str:
+    """Escape the characters that are special in HTML.
+
+    Replacements performed:
+    - ``&`` becomes ``&amp;``
+    - ``<`` becomes ``&lt;``
+    - ``>`` becomes ``&gt;``
+    - ``"`` becomes ``&quot;``
+    - ``'`` becomes ``&#x27;``
+
+    Args:
+        text: Raw text. Non-string values are converted with ``str()``
+            before escaping.
+
+    Returns:
+        The escaped text.
+    """
+    raw = text if isinstance(text, str) else str(text)
+
+    # A single translate pass never re-scans its own output, so the "&"
+    # inside the generated entities is not escaped a second time.
+    entity_table = str.maketrans(
+        {
+            "&": "&amp;",
+            "<": "&lt;",
+            ">": "&gt;",
+            '"': "&quot;",
+            "'": "&#x27;",
+        }
+    )
+    return raw.translate(entity_table)
+
+
+def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
+    """Build the rank label shown next to a title.
+
+    The label is ``[min]`` when all ranks are equal and ``[min - max]``
+    otherwise. When the best (smallest) rank is less than or equal to
+    ``rank_threshold`` the label is wrapped in the highlight markup of the
+    target platform.
+
+    Args:
+        ranks: Observed ranks; duplicates are allowed.
+        rank_threshold: Highlight cut-off; a best rank at or below this
+            value is highlighted.
+        format_type: Target platform. Recognized values are ``"html"``,
+            ``"feishu"``, ``"dingtalk"``, ``"wework"``, ``"telegram"`` and
+            ``"slack"``; any other value uses the markdown ``**`` markers.
+
+    Returns:
+        The formatted rank label, or an empty string when ``ranks`` is
+        empty.
+    """
+    if not ranks:
+        return ""
+
+    best = min(ranks)
+    worst = max(ranks)
+
+    # (opening, closing) highlight markers per platform
+    highlight_markers = {
+        "html": ("<font color='red'><strong>", "</strong></font>"),
+        "feishu": ("<font color='red'>**", "**</font>"),
+        "dingtalk": ("**", "**"),
+        "wework": ("**", "**"),
+        "telegram": ("<b>", "</b>"),
+        "slack": ("*", "*"),
+    }
+    # Unknown platforms fall back to markdown bold
+    opening, closing = highlight_markers.get(format_type, ("**", "**"))
+
+    label = f"[{best}]" if best == worst else f"[{best} - {worst}]"
+
+    if best <= rank_threshold:
+        return f"{opening}{label}{closing}"
+    return label

+ 1050 - 0
trendradar/report/html.py

@@ -0,0 +1,1050 @@
+# coding=utf-8
+"""
+HTML 报告渲染模块
+
+提供 HTML 格式的热点新闻报告生成功能
+"""
+
+from datetime import datetime
+from typing import Dict, Optional, Callable
+
+from trendradar.report.helpers import html_escape
+
+
+def render_html_content(
+    report_data: Dict,
+    total_titles: int,
+    is_daily_summary: bool = False,
+    mode: str = "daily",
+    update_info: Optional[Dict] = None,
+    *,
+    reverse_content_order: bool = False,
+    get_time_func: Optional[Callable[[], datetime]] = None,
+) -> str:
+    """渲染HTML内容
+
+    Args:
+        report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count
+        total_titles: 新闻总数
+        is_daily_summary: 是否为当日汇总
+        mode: 报告模式 ("daily", "current", "incremental")
+        update_info: 更新信息(可选)
+        reverse_content_order: 是否反转内容顺序(新增热点在前)
+        get_time_func: 获取当前时间的函数(可选,默认使用 datetime.now)
+
+    Returns:
+        渲染后的 HTML 字符串
+    """
+    html = """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>热点新闻分析</title>
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.4.1/html2canvas.min.js" integrity="sha512-BNaRQnYJYiPSqHHDb58B0yaPfCu+Wgds8Gp/gU33kqBtgNS4tSPHuGibyoeqMV/TJlSKda6FXzoEyYGjTe+vXA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+        <style>
+            * { box-sizing: border-box; }
+            body {
+                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif;
+                margin: 0;
+                padding: 16px;
+                background: #fafafa;
+                color: #333;
+                line-height: 1.5;
+            }
+
+            .container {
+                max-width: 600px;
+                margin: 0 auto;
+                background: white;
+                border-radius: 12px;
+                overflow: hidden;
+                box-shadow: 0 2px 16px rgba(0,0,0,0.06);
+            }
+
+            .header {
+                background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
+                color: white;
+                padding: 32px 24px;
+                text-align: center;
+                position: relative;
+            }
+
+            .save-buttons {
+                position: absolute;
+                top: 16px;
+                right: 16px;
+                display: flex;
+                gap: 8px;
+            }
+
+            .save-btn {
+                background: rgba(255, 255, 255, 0.2);
+                border: 1px solid rgba(255, 255, 255, 0.3);
+                color: white;
+                padding: 8px 16px;
+                border-radius: 6px;
+                cursor: pointer;
+                font-size: 13px;
+                font-weight: 500;
+                transition: all 0.2s ease;
+                backdrop-filter: blur(10px);
+                white-space: nowrap;
+            }
+
+            .save-btn:hover {
+                background: rgba(255, 255, 255, 0.3);
+                border-color: rgba(255, 255, 255, 0.5);
+                transform: translateY(-1px);
+            }
+
+            .save-btn:active {
+                transform: translateY(0);
+            }
+
+            .save-btn:disabled {
+                opacity: 0.6;
+                cursor: not-allowed;
+            }
+
+            .header-title {
+                font-size: 22px;
+                font-weight: 700;
+                margin: 0 0 20px 0;
+            }
+
+            .header-info {
+                display: grid;
+                grid-template-columns: 1fr 1fr;
+                gap: 16px;
+                font-size: 14px;
+                opacity: 0.95;
+            }
+
+            .info-item {
+                text-align: center;
+            }
+
+            .info-label {
+                display: block;
+                font-size: 12px;
+                opacity: 0.8;
+                margin-bottom: 4px;
+            }
+
+            .info-value {
+                font-weight: 600;
+                font-size: 16px;
+            }
+
+            .content {
+                padding: 24px;
+            }
+
+            .word-group {
+                margin-bottom: 40px;
+            }
+
+            .word-group:first-child {
+                margin-top: 0;
+            }
+
+            .word-header {
+                display: flex;
+                align-items: center;
+                justify-content: space-between;
+                margin-bottom: 20px;
+                padding-bottom: 8px;
+                border-bottom: 1px solid #f0f0f0;
+            }
+
+            .word-info {
+                display: flex;
+                align-items: center;
+                gap: 12px;
+            }
+
+            .word-name {
+                font-size: 17px;
+                font-weight: 600;
+                color: #1a1a1a;
+            }
+
+            .word-count {
+                color: #666;
+                font-size: 13px;
+                font-weight: 500;
+            }
+
+            .word-count.hot { color: #dc2626; font-weight: 600; }
+            .word-count.warm { color: #ea580c; font-weight: 600; }
+
+            .word-index {
+                color: #999;
+                font-size: 12px;
+            }
+
+            .news-item {
+                margin-bottom: 20px;
+                padding: 16px 0;
+                border-bottom: 1px solid #f5f5f5;
+                position: relative;
+                display: flex;
+                gap: 12px;
+                align-items: center;
+            }
+
+            .news-item:last-child {
+                border-bottom: none;
+            }
+
+            .news-item.new::after {
+                content: "NEW";
+                position: absolute;
+                top: 12px;
+                right: 0;
+                background: #fbbf24;
+                color: #92400e;
+                font-size: 9px;
+                font-weight: 700;
+                padding: 3px 6px;
+                border-radius: 4px;
+                letter-spacing: 0.5px;
+            }
+
+            .news-number {
+                color: #999;
+                font-size: 13px;
+                font-weight: 600;
+                min-width: 20px;
+                text-align: center;
+                flex-shrink: 0;
+                background: #f8f9fa;
+                border-radius: 50%;
+                width: 24px;
+                height: 24px;
+                display: flex;
+                align-items: center;
+                justify-content: center;
+                align-self: flex-start;
+                margin-top: 8px;
+            }
+
+            .news-content {
+                flex: 1;
+                min-width: 0;
+                padding-right: 40px;
+            }
+
+            .news-item.new .news-content {
+                padding-right: 50px;
+            }
+
+            .news-header {
+                display: flex;
+                align-items: center;
+                gap: 8px;
+                margin-bottom: 8px;
+                flex-wrap: wrap;
+            }
+
+            .source-name {
+                color: #666;
+                font-size: 12px;
+                font-weight: 500;
+            }
+
+            .rank-num {
+                color: #fff;
+                background: #6b7280;
+                font-size: 10px;
+                font-weight: 700;
+                padding: 2px 6px;
+                border-radius: 10px;
+                min-width: 18px;
+                text-align: center;
+            }
+
+            .rank-num.top { background: #dc2626; }
+            .rank-num.high { background: #ea580c; }
+
+            .time-info {
+                color: #999;
+                font-size: 11px;
+            }
+
+            .count-info {
+                color: #059669;
+                font-size: 11px;
+                font-weight: 500;
+            }
+
+            .news-title {
+                font-size: 15px;
+                line-height: 1.4;
+                color: #1a1a1a;
+                margin: 0;
+            }
+
+            .news-link {
+                color: #2563eb;
+                text-decoration: none;
+            }
+
+            .news-link:hover {
+                text-decoration: underline;
+            }
+
+            .news-link:visited {
+                color: #7c3aed;
+            }
+
+            .new-section {
+                margin-top: 40px;
+                padding-top: 24px;
+                border-top: 2px solid #f0f0f0;
+            }
+
+            .new-section-title {
+                color: #1a1a1a;
+                font-size: 16px;
+                font-weight: 600;
+                margin: 0 0 20px 0;
+            }
+
+            .new-source-group {
+                margin-bottom: 24px;
+            }
+
+            .new-source-title {
+                color: #666;
+                font-size: 13px;
+                font-weight: 500;
+                margin: 0 0 12px 0;
+                padding-bottom: 6px;
+                border-bottom: 1px solid #f5f5f5;
+            }
+
+            .new-item {
+                display: flex;
+                align-items: center;
+                gap: 12px;
+                padding: 8px 0;
+                border-bottom: 1px solid #f9f9f9;
+            }
+
+            .new-item:last-child {
+                border-bottom: none;
+            }
+
+            .new-item-number {
+                color: #999;
+                font-size: 12px;
+                font-weight: 600;
+                min-width: 18px;
+                text-align: center;
+                flex-shrink: 0;
+                background: #f8f9fa;
+                border-radius: 50%;
+                width: 20px;
+                height: 20px;
+                display: flex;
+                align-items: center;
+                justify-content: center;
+            }
+
+            .new-item-rank {
+                color: #fff;
+                background: #6b7280;
+                font-size: 10px;
+                font-weight: 700;
+                padding: 3px 6px;
+                border-radius: 8px;
+                min-width: 20px;
+                text-align: center;
+                flex-shrink: 0;
+            }
+
+            .new-item-rank.top { background: #dc2626; }
+            .new-item-rank.high { background: #ea580c; }
+
+            .new-item-content {
+                flex: 1;
+                min-width: 0;
+            }
+
+            .new-item-title {
+                font-size: 14px;
+                line-height: 1.4;
+                color: #1a1a1a;
+                margin: 0;
+            }
+
+            .error-section {
+                background: #fef2f2;
+                border: 1px solid #fecaca;
+                border-radius: 8px;
+                padding: 16px;
+                margin-bottom: 24px;
+            }
+
+            .error-title {
+                color: #dc2626;
+                font-size: 14px;
+                font-weight: 600;
+                margin: 0 0 8px 0;
+            }
+
+            .error-list {
+                list-style: none;
+                padding: 0;
+                margin: 0;
+            }
+
+            .error-item {
+                color: #991b1b;
+                font-size: 13px;
+                padding: 2px 0;
+                font-family: 'SF Mono', Consolas, monospace;
+            }
+
+            .footer {
+                margin-top: 32px;
+                padding: 20px 24px;
+                background: #f8f9fa;
+                border-top: 1px solid #e5e7eb;
+                text-align: center;
+            }
+
+            .footer-content {
+                font-size: 13px;
+                color: #6b7280;
+                line-height: 1.6;
+            }
+
+            .footer-link {
+                color: #4f46e5;
+                text-decoration: none;
+                font-weight: 500;
+                transition: color 0.2s ease;
+            }
+
+            .footer-link:hover {
+                color: #7c3aed;
+                text-decoration: underline;
+            }
+
+            .project-name {
+                font-weight: 600;
+                color: #374151;
+            }
+
+            @media (max-width: 480px) {
+                body { padding: 12px; }
+                .header { padding: 24px 20px; }
+                .content { padding: 20px; }
+                .footer { padding: 16px 20px; }
+                .header-info { grid-template-columns: 1fr; gap: 12px; }
+                .news-header { gap: 6px; }
+                .news-content { padding-right: 45px; }
+                .news-item { gap: 8px; }
+                .new-item { gap: 8px; }
+                .news-number { width: 20px; height: 20px; font-size: 12px; }
+                .save-buttons {
+                    position: static;
+                    margin-bottom: 16px;
+                    display: flex;
+                    gap: 8px;
+                    justify-content: center;
+                    flex-direction: column;
+                    width: 100%;
+                }
+                .save-btn {
+                    width: 100%;
+                }
+            }
+        </style>
+    </head>
+    <body>
+        <div class="container">
+            <div class="header">
+                <div class="save-buttons">
+                    <button class="save-btn" onclick="saveAsImage()">保存为图片</button>
+                    <button class="save-btn" onclick="saveAsMultipleImages()">分段保存</button>
+                </div>
+                <div class="header-title">热点新闻分析</div>
+                <div class="header-info">
+                    <div class="info-item">
+                        <span class="info-label">报告类型</span>
+                        <span class="info-value">"""
+
+    # 处理报告类型显示
+    if is_daily_summary:
+        if mode == "current":
+            html += "当前榜单"
+        elif mode == "incremental":
+            html += "增量模式"
+        else:
+            html += "当日汇总"
+    else:
+        html += "实时分析"
+
+    html += """</span>
+                    </div>
+                    <div class="info-item">
+                        <span class="info-label">新闻总数</span>
+                        <span class="info-value">"""
+
+    html += f"{total_titles} 条"
+
+    # 计算筛选后的热点新闻数量
+    hot_news_count = sum(len(stat["titles"]) for stat in report_data["stats"])
+
+    html += """</span>
+                    </div>
+                    <div class="info-item">
+                        <span class="info-label">热点新闻</span>
+                        <span class="info-value">"""
+
+    html += f"{hot_news_count} 条"
+
+    html += """</span>
+                    </div>
+                    <div class="info-item">
+                        <span class="info-label">生成时间</span>
+                        <span class="info-value">"""
+
+    # 使用提供的时间函数或默认 datetime.now
+    if get_time_func:
+        now = get_time_func()
+    else:
+        now = datetime.now()
+    html += now.strftime("%m-%d %H:%M")
+
+    html += """</span>
+                    </div>
+                </div>
+            </div>
+
+            <div class="content">"""
+
+    # 处理失败ID错误信息
+    if report_data["failed_ids"]:
+        html += """
+                <div class="error-section">
+                    <div class="error-title">⚠️ 请求失败的平台</div>
+                    <ul class="error-list">"""
+        for id_value in report_data["failed_ids"]:
+            html += f'<li class="error-item">{html_escape(id_value)}</li>'
+        html += """
+                    </ul>
+                </div>"""
+
+    # 生成热点词汇统计部分的HTML
+    stats_html = ""
+    if report_data["stats"]:
+        total_count = len(report_data["stats"])
+
+        for i, stat in enumerate(report_data["stats"], 1):
+            count = stat["count"]
+
+            # 确定热度等级
+            if count >= 10:
+                count_class = "hot"
+            elif count >= 5:
+                count_class = "warm"
+            else:
+                count_class = ""
+
+            escaped_word = html_escape(stat["word"])
+
+            stats_html += f"""
+                <div class="word-group">
+                    <div class="word-header">
+                        <div class="word-info">
+                            <div class="word-name">{escaped_word}</div>
+                            <div class="word-count {count_class}">{count} 条</div>
+                        </div>
+                        <div class="word-index">{i}/{total_count}</div>
+                    </div>"""
+
+            # 处理每个词组下的新闻标题,给每条新闻标上序号
+            for j, title_data in enumerate(stat["titles"], 1):
+                is_new = title_data.get("is_new", False)
+                new_class = "new" if is_new else ""
+
+                stats_html += f"""
+                    <div class="news-item {new_class}">
+                        <div class="news-number">{j}</div>
+                        <div class="news-content">
+                            <div class="news-header">
+                                <span class="source-name">{html_escape(title_data["source_name"])}</span>"""
+
+                # 处理排名显示
+                ranks = title_data.get("ranks", [])
+                if ranks:
+                    min_rank = min(ranks)
+                    max_rank = max(ranks)
+                    rank_threshold = title_data.get("rank_threshold", 10)
+
+                    # 确定排名等级
+                    if min_rank <= 3:
+                        rank_class = "top"
+                    elif min_rank <= rank_threshold:
+                        rank_class = "high"
+                    else:
+                        rank_class = ""
+
+                    if min_rank == max_rank:
+                        rank_text = str(min_rank)
+                    else:
+                        rank_text = f"{min_rank}-{max_rank}"
+
+                    stats_html += f'<span class="rank-num {rank_class}">{rank_text}</span>'
+
+                # 处理时间显示
+                time_display = title_data.get("time_display", "")
+                if time_display:
+                    # 简化时间显示格式,将波浪线替换为~
+                    simplified_time = (
+                        time_display.replace(" ~ ", "~")
+                        .replace("[", "")
+                        .replace("]", "")
+                    )
+                    stats_html += (
+                        f'<span class="time-info">{html_escape(simplified_time)}</span>'
+                    )
+
+                # 处理出现次数
+                count_info = title_data.get("count", 1)
+                if count_info > 1:
+                    stats_html += f'<span class="count-info">{count_info}次</span>'
+
+                stats_html += """
+                            </div>
+                            <div class="news-title">"""
+
+                # 处理标题和链接
+                escaped_title = html_escape(title_data["title"])
+                link_url = title_data.get("mobile_url") or title_data.get("url", "")
+
+                if link_url:
+                    escaped_url = html_escape(link_url)
+                    stats_html += f'<a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
+                else:
+                    stats_html += escaped_title
+
+                stats_html += """
+                            </div>
+                        </div>
+                    </div>"""
+
+            stats_html += """
+                </div>"""
+
+    # 生成新增新闻区域的HTML
+    new_titles_html = ""
+    if report_data["new_titles"]:
+        new_titles_html += f"""
+                <div class="new-section">
+                    <div class="new-section-title">本次新增热点 (共 {report_data['total_new_count']} 条)</div>"""
+
+        for source_data in report_data["new_titles"]:
+            escaped_source = html_escape(source_data["source_name"])
+            titles_count = len(source_data["titles"])
+
+            new_titles_html += f"""
+                    <div class="new-source-group">
+                        <div class="new-source-title">{escaped_source} · {titles_count}条</div>"""
+
+            # 为新增新闻也添加序号
+            for idx, title_data in enumerate(source_data["titles"], 1):
+                ranks = title_data.get("ranks", [])
+
+                # 处理新增新闻的排名显示
+                rank_class = ""
+                if ranks:
+                    min_rank = min(ranks)
+                    if min_rank <= 3:
+                        rank_class = "top"
+                    elif min_rank <= title_data.get("rank_threshold", 10):
+                        rank_class = "high"
+
+                    if len(ranks) == 1:
+                        rank_text = str(ranks[0])
+                    else:
+                        rank_text = f"{min(ranks)}-{max(ranks)}"
+                else:
+                    rank_text = "?"
+
+                new_titles_html += f"""
+                        <div class="new-item">
+                            <div class="new-item-number">{idx}</div>
+                            <div class="new-item-rank {rank_class}">{rank_text}</div>
+                            <div class="new-item-content">
+                                <div class="new-item-title">"""
+
+                # 处理新增新闻的链接
+                escaped_title = html_escape(title_data["title"])
+                link_url = title_data.get("mobile_url") or title_data.get("url", "")
+
+                if link_url:
+                    escaped_url = html_escape(link_url)
+                    new_titles_html += f'<a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
+                else:
+                    new_titles_html += escaped_title
+
+                new_titles_html += """
+                                </div>
+                            </div>
+                        </div>"""
+
+            new_titles_html += """
+                    </div>"""
+
+        new_titles_html += """
+                </div>"""
+
+    # 根据配置决定内容顺序
+    if reverse_content_order:
+        # 新增热点在前,热点词汇统计在后
+        html += new_titles_html + stats_html
+    else:
+        # 默认:热点词汇统计在前,新增热点在后
+        html += stats_html + new_titles_html
+
+    html += """
+            </div>
+
+            <div class="footer">
+                <div class="footer-content">
+                    由 <span class="project-name">TrendRadar</span> 生成 ·
+                    <a href="https://github.com/sansan0/TrendRadar" target="_blank" class="footer-link">
+                        GitHub 开源项目
+                    </a>"""
+
+    if update_info:
+        html += f"""
+                    <br>
+                    <span style="color: #ea580c; font-weight: 500;">
+                        发现新版本 {update_info['remote_version']},当前版本 {update_info['current_version']}
+                    </span>"""
+
+    html += """
+                </div>
+            </div>
+        </div>
+
+        <script>
+            async function saveAsImage() {
+                const button = event.target;
+                const originalText = button.textContent;
+
+                try {
+                    button.textContent = '生成中...';
+                    button.disabled = true;
+                    window.scrollTo(0, 0);
+
+                    // 等待页面稳定
+                    await new Promise(resolve => setTimeout(resolve, 200));
+
+                    // 截图前隐藏按钮
+                    const buttons = document.querySelector('.save-buttons');
+                    buttons.style.visibility = 'hidden';
+
+                    // 再次等待确保按钮完全隐藏
+                    await new Promise(resolve => setTimeout(resolve, 100));
+
+                    const container = document.querySelector('.container');
+
+                    const canvas = await html2canvas(container, {
+                        backgroundColor: '#ffffff',
+                        scale: 1.5,
+                        useCORS: true,
+                        allowTaint: false,
+                        imageTimeout: 10000,
+                        removeContainer: false,
+                        foreignObjectRendering: false,
+                        logging: false,
+                        width: container.offsetWidth,
+                        height: container.offsetHeight,
+                        x: 0,
+                        y: 0,
+                        scrollX: 0,
+                        scrollY: 0,
+                        windowWidth: window.innerWidth,
+                        windowHeight: window.innerHeight
+                    });
+
+                    buttons.style.visibility = 'visible';
+
+                    const link = document.createElement('a');
+                    const now = new Date();
+                    const filename = `TrendRadar_热点新闻分析_${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}.png`;
+
+                    link.download = filename;
+                    link.href = canvas.toDataURL('image/png', 1.0);
+
+                    // 触发下载
+                    document.body.appendChild(link);
+                    link.click();
+                    document.body.removeChild(link);
+
+                    button.textContent = '保存成功!';
+                    setTimeout(() => {
+                        button.textContent = originalText;
+                        button.disabled = false;
+                    }, 2000);
+
+                } catch (error) {
+                    const buttons = document.querySelector('.save-buttons');
+                    buttons.style.visibility = 'visible';
+                    button.textContent = '保存失败';
+                    setTimeout(() => {
+                        button.textContent = originalText;
+                        button.disabled = false;
+                    }, 2000);
+                }
+            }
+
+            async function saveAsMultipleImages() {
+                const button = event.target;
+                const originalText = button.textContent;
+                const container = document.querySelector('.container');
+                const scale = 1.5;
+                const maxHeight = 5000 / scale;
+
+                try {
+                    button.textContent = '分析中...';
+                    button.disabled = true;
+
+                    // 获取所有可能的分割元素
+                    const newsItems = Array.from(container.querySelectorAll('.news-item'));
+                    const wordGroups = Array.from(container.querySelectorAll('.word-group'));
+                    const newSection = container.querySelector('.new-section');
+                    const errorSection = container.querySelector('.error-section');
+                    const header = container.querySelector('.header');
+                    const footer = container.querySelector('.footer');
+
+                    // 计算元素位置和高度
+                    const containerRect = container.getBoundingClientRect();
+                    const elements = [];
+
+                    // 添加header作为必须包含的元素
+                    elements.push({
+                        type: 'header',
+                        element: header,
+                        top: 0,
+                        bottom: header.offsetHeight,
+                        height: header.offsetHeight
+                    });
+
+                    // 添加错误信息(如果存在)
+                    if (errorSection) {
+                        const rect = errorSection.getBoundingClientRect();
+                        elements.push({
+                            type: 'error',
+                            element: errorSection,
+                            top: rect.top - containerRect.top,
+                            bottom: rect.bottom - containerRect.top,
+                            height: rect.height
+                        });
+                    }
+
+                    // 按word-group分组处理news-item
+                    wordGroups.forEach(group => {
+                        const groupRect = group.getBoundingClientRect();
+                        const groupNewsItems = group.querySelectorAll('.news-item');
+
+                        // 添加word-group的header部分
+                        const wordHeader = group.querySelector('.word-header');
+                        if (wordHeader) {
+                            const headerRect = wordHeader.getBoundingClientRect();
+                            elements.push({
+                                type: 'word-header',
+                                element: wordHeader,
+                                parent: group,
+                                top: groupRect.top - containerRect.top,
+                                bottom: headerRect.bottom - containerRect.top,
+                                height: headerRect.height
+                            });
+                        }
+
+                        // 添加每个news-item
+                        groupNewsItems.forEach(item => {
+                            const rect = item.getBoundingClientRect();
+                            elements.push({
+                                type: 'news-item',
+                                element: item,
+                                parent: group,
+                                top: rect.top - containerRect.top,
+                                bottom: rect.bottom - containerRect.top,
+                                height: rect.height
+                            });
+                        });
+                    });
+
+                    // 添加新增新闻部分
+                    if (newSection) {
+                        const rect = newSection.getBoundingClientRect();
+                        elements.push({
+                            type: 'new-section',
+                            element: newSection,
+                            top: rect.top - containerRect.top,
+                            bottom: rect.bottom - containerRect.top,
+                            height: rect.height
+                        });
+                    }
+
+                    // 添加footer
+                    const footerRect = footer.getBoundingClientRect();
+                    elements.push({
+                        type: 'footer',
+                        element: footer,
+                        top: footerRect.top - containerRect.top,
+                        bottom: footerRect.bottom - containerRect.top,
+                        height: footer.offsetHeight
+                    });
+
+                    // 计算分割点
+                    const segments = [];
+                    let currentSegment = { start: 0, end: 0, height: 0, includeHeader: true };
+                    let headerHeight = header.offsetHeight;
+                    currentSegment.height = headerHeight;
+
+                    for (let i = 1; i < elements.length; i++) {
+                        const element = elements[i];
+                        const potentialHeight = element.bottom - currentSegment.start;
+
+                        // 检查是否需要创建新分段
+                        if (potentialHeight > maxHeight && currentSegment.height > headerHeight) {
+                            // 在前一个元素结束处分割
+                            currentSegment.end = elements[i - 1].bottom;
+                            segments.push(currentSegment);
+
+                            // 开始新分段
+                            currentSegment = {
+                                start: currentSegment.end,
+                                end: 0,
+                                height: element.bottom - currentSegment.end,
+                                includeHeader: false
+                            };
+                        } else {
+                            currentSegment.height = potentialHeight;
+                            currentSegment.end = element.bottom;
+                        }
+                    }
+
+                    // 添加最后一个分段
+                    if (currentSegment.height > 0) {
+                        currentSegment.end = container.offsetHeight;
+                        segments.push(currentSegment);
+                    }
+
+                    button.textContent = `生成中 (0/${segments.length})...`;
+
+                    // 隐藏保存按钮
+                    const buttons = document.querySelector('.save-buttons');
+                    buttons.style.visibility = 'hidden';
+
+                    // 为每个分段生成图片
+                    const images = [];
+                    for (let i = 0; i < segments.length; i++) {
+                        const segment = segments[i];
+                        button.textContent = `生成中 (${i + 1}/${segments.length})...`;
+
+                        // 创建临时容器用于截图
+                        const tempContainer = document.createElement('div');
+                        tempContainer.style.cssText = `
+                            position: absolute;
+                            left: -9999px;
+                            top: 0;
+                            width: ${container.offsetWidth}px;
+                            background: white;
+                        `;
+                        tempContainer.className = 'container';
+
+                        // 克隆容器内容
+                        const clonedContainer = container.cloneNode(true);
+
+                        // 移除克隆内容中的保存按钮
+                        const clonedButtons = clonedContainer.querySelector('.save-buttons');
+                        if (clonedButtons) {
+                            clonedButtons.style.display = 'none';
+                        }
+
+                        tempContainer.appendChild(clonedContainer);
+                        document.body.appendChild(tempContainer);
+
+                        // 等待DOM更新
+                        await new Promise(resolve => setTimeout(resolve, 100));
+
+                        // 使用html2canvas截取特定区域
+                        const canvas = await html2canvas(clonedContainer, {
+                            backgroundColor: '#ffffff',
+                            scale: scale,
+                            useCORS: true,
+                            allowTaint: false,
+                            imageTimeout: 10000,
+                            logging: false,
+                            width: container.offsetWidth,
+                            height: segment.end - segment.start,
+                            x: 0,
+                            y: segment.start,
+                            windowWidth: window.innerWidth,
+                            windowHeight: window.innerHeight
+                        });
+
+                        images.push(canvas.toDataURL('image/png', 1.0));
+
+                        // 清理临时容器
+                        document.body.removeChild(tempContainer);
+                    }
+
+                    // 恢复按钮显示
+                    buttons.style.visibility = 'visible';
+
+                    // 下载所有图片
+                    const now = new Date();
+                    const baseFilename = `TrendRadar_热点新闻分析_${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}`;
+
+                    for (let i = 0; i < images.length; i++) {
+                        const link = document.createElement('a');
+                        link.download = `${baseFilename}_part${i + 1}.png`;
+                        link.href = images[i];
+                        document.body.appendChild(link);
+                        link.click();
+                        document.body.removeChild(link);
+
+                        // 延迟一下避免浏览器阻止多个下载
+                        await new Promise(resolve => setTimeout(resolve, 100));
+                    }
+
+                    button.textContent = `已保存 ${segments.length} 张图片!`;
+                    setTimeout(() => {
+                        button.textContent = originalText;
+                        button.disabled = false;
+                    }, 2000);
+
+                } catch (error) {
+                    console.error('分段保存失败:', error);
+                    const buttons = document.querySelector('.save-buttons');
+                    buttons.style.visibility = 'visible';
+                    button.textContent = '保存失败';
+                    setTimeout(() => {
+                        button.textContent = originalText;
+                        button.disabled = false;
+                    }, 2000);
+                }
+            }
+
+            document.addEventListener('DOMContentLoaded', function() {
+                window.scrollTo(0, 0);
+            });
+        </script>
+    </body>
+    </html>
+    """
+
+    return html

+ 44 - 0
trendradar/storage/__init__.py

@@ -0,0 +1,44 @@
+# coding=utf-8
+"""
+存储模块 - 支持多种存储后端
+
+支持的存储后端:
+- local: 本地 SQLite + TXT/HTML 文件
+- remote: 远程云存储(S3 兼容协议:R2/OSS/COS/S3 等)
+- auto: 根据环境自动选择(GitHub Actions 用 remote,其他用 local)
+"""
+
+from trendradar.storage.base import (
+    StorageBackend,
+    NewsItem,
+    NewsData,
+    convert_crawl_results_to_news_data,
+    convert_news_data_to_results,
+)
+from trendradar.storage.local import LocalStorageBackend
+from trendradar.storage.manager import StorageManager, get_storage_manager
+
+# Optional import of the remote backend (requires boto3).
+# When the import fails, RemoteStorageBackend is set to None and HAS_REMOTE
+# is False so callers can test for availability before using it.
+try:
+    from trendradar.storage.remote import RemoteStorageBackend
+    HAS_REMOTE = True
+except ImportError:
+    RemoteStorageBackend = None
+    HAS_REMOTE = False
+
+__all__ = [
+    # Base classes / data models
+    "StorageBackend",
+    "NewsItem",
+    "NewsData",
+    # Conversion helpers
+    "convert_crawl_results_to_news_data",
+    "convert_news_data_to_results",
+    # Backend implementations
+    "LocalStorageBackend",
+    "RemoteStorageBackend",
+    "HAS_REMOTE",
+    # Manager
+    "StorageManager",
+    "get_storage_manager",
+]

+ 457 - 0
trendradar/storage/base.py

@@ -0,0 +1,457 @@
+# coding=utf-8
+"""
+存储后端抽象基类和数据模型
+
+定义统一的存储接口,所有存储后端都需要实现这些方法
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+import json
+
+
+@dataclass
+class NewsItem:
+    """新闻条目数据模型"""
+
+    title: str                          # 新闻标题
+    source_id: str                      # 来源平台ID(如 toutiao, baidu)
+    source_name: str = ""               # 来源平台名称(运行时使用,数据库不存储)
+    rank: int = 0                       # 排名
+    url: str = ""                       # 链接 URL
+    mobile_url: str = ""                # 移动端 URL
+    crawl_time: str = ""                # 抓取时间(HH:MM 格式)
+
+    # 统计信息(用于分析)
+    ranks: List[int] = field(default_factory=list)  # 历史排名列表
+    first_time: str = ""                # 首次出现时间
+    last_time: str = ""                 # 最后出现时间
+    count: int = 1                      # 出现次数
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            "title": self.title,
+            "source_id": self.source_id,
+            "source_name": self.source_name,
+            "rank": self.rank,
+            "url": self.url,
+            "mobile_url": self.mobile_url,
+            "crawl_time": self.crawl_time,
+            "ranks": self.ranks,
+            "first_time": self.first_time,
+            "last_time": self.last_time,
+            "count": self.count,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
+        """从字典创建"""
+        return cls(
+            title=data.get("title", ""),
+            source_id=data.get("source_id", ""),
+            source_name=data.get("source_name", ""),
+            rank=data.get("rank", 0),
+            url=data.get("url", ""),
+            mobile_url=data.get("mobile_url", ""),
+            crawl_time=data.get("crawl_time", ""),
+            ranks=data.get("ranks", []),
+            first_time=data.get("first_time", ""),
+            last_time=data.get("last_time", ""),
+            count=data.get("count", 1),
+        )
+
+
+@dataclass
+class NewsData:
+    """
+    新闻数据集合
+
+    结构:
+    - date: 日期(YYYY-MM-DD)
+    - crawl_time: 抓取时间(HH时MM分)
+    - items: 按来源ID分组的新闻条目
+    - id_to_name: 来源ID到名称的映射
+    - failed_ids: 失败的来源ID列表
+    """
+
+    date: str                                   # 日期
+    crawl_time: str                             # 抓取时间
+    items: Dict[str, List[NewsItem]]            # 按来源分组的新闻
+    id_to_name: Dict[str, str] = field(default_factory=dict)   # ID到名称映射
+    failed_ids: List[str] = field(default_factory=list)        # 失败的ID
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        items_dict = {}
+        for source_id, news_list in self.items.items():
+            items_dict[source_id] = [item.to_dict() for item in news_list]
+
+        return {
+            "date": self.date,
+            "crawl_time": self.crawl_time,
+            "items": items_dict,
+            "id_to_name": self.id_to_name,
+            "failed_ids": self.failed_ids,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
+        """从字典创建"""
+        items = {}
+        items_data = data.get("items", {})
+        for source_id, news_list in items_data.items():
+            items[source_id] = [NewsItem.from_dict(item) for item in news_list]
+
+        return cls(
+            date=data.get("date", ""),
+            crawl_time=data.get("crawl_time", ""),
+            items=items,
+            id_to_name=data.get("id_to_name", {}),
+            failed_ids=data.get("failed_ids", []),
+        )
+
+    def get_total_count(self) -> int:
+        """获取新闻总数"""
+        return sum(len(news_list) for news_list in self.items.values())
+
+    def merge_with(self, other: "NewsData") -> "NewsData":
+        """
+        合并另一个 NewsData 到当前数据
+
+        合并规则:
+        - 相同 source_id + title 的新闻合并排名历史
+        - 更新 last_time 和 count
+        - 保留较早的 first_time
+        """
+        merged_items = {}
+
+        # 复制当前数据
+        for source_id, news_list in self.items.items():
+            merged_items[source_id] = {item.title: item for item in news_list}
+
+        # 合并其他数据
+        for source_id, news_list in other.items.items():
+            if source_id not in merged_items:
+                merged_items[source_id] = {}
+
+            for item in news_list:
+                if item.title in merged_items[source_id]:
+                    # 合并已存在的新闻
+                    existing = merged_items[source_id][item.title]
+
+                    # 合并排名
+                    existing_ranks = set(existing.ranks) if existing.ranks else set()
+                    new_ranks = set(item.ranks) if item.ranks else set()
+                    merged_ranks = sorted(existing_ranks | new_ranks)
+                    existing.ranks = merged_ranks
+
+                    # 更新时间
+                    if item.first_time and (not existing.first_time or item.first_time < existing.first_time):
+                        existing.first_time = item.first_time
+                    if item.last_time and (not existing.last_time or item.last_time > existing.last_time):
+                        existing.last_time = item.last_time
+
+                    # 更新计数
+                    existing.count += 1
+
+                    # 保留URL(如果原来没有)
+                    if not existing.url and item.url:
+                        existing.url = item.url
+                    if not existing.mobile_url and item.mobile_url:
+                        existing.mobile_url = item.mobile_url
+                else:
+                    # 添加新新闻
+                    merged_items[source_id][item.title] = item
+
+        # 转换回列表格式
+        final_items = {}
+        for source_id, items_dict in merged_items.items():
+            final_items[source_id] = list(items_dict.values())
+
+        # 合并 id_to_name
+        merged_id_to_name = {**self.id_to_name, **other.id_to_name}
+
+        # 合并 failed_ids(去重)
+        merged_failed_ids = list(set(self.failed_ids + other.failed_ids))
+
+        return NewsData(
+            date=self.date or other.date,
+            crawl_time=other.crawl_time,  # 使用较新的抓取时间
+            items=final_items,
+            id_to_name=merged_id_to_name,
+            failed_ids=merged_failed_ids,
+        )
+
+
+class StorageBackend(ABC):
+    """
+    Abstract base class for storage backends.
+
+    Every storage backend must implement these methods in order to support:
+    - saving news data
+    - reading all data of a given day
+    - detecting newly appeared news
+    - producing report files (TXT/HTML)
+    - recording whether a push notification was already sent
+    """
+
+    @abstractmethod
+    def save_news_data(self, data: NewsData) -> bool:
+        """
+        Save news data.
+
+        Args:
+            data: The news data to persist.
+
+        Returns:
+            True when the data was saved successfully.
+        """
+        pass
+
+    @abstractmethod
+    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """
+        Return all news data for the given date.
+
+        Args:
+            date: Date string (YYYY-MM-DD); defaults to today.
+
+        Returns:
+            The merged news data, or None when there is no data.
+        """
+        pass
+
+    @abstractmethod
+    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """
+        Return the data of the most recent crawl.
+
+        Args:
+            date: Date string; defaults to today.
+
+        Returns:
+            The news data of the latest crawl.
+        """
+        pass
+
+    @abstractmethod
+    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
+        """
+        Detect newly appeared titles.
+
+        Args:
+            current_data: The data of the current crawl.
+
+        Returns:
+            The new titles, shaped as {source_id: {title: title_data}}.
+        """
+        pass
+
+    @abstractmethod
+    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
+        """
+        Save a TXT snapshot (optional feature, available in local environments).
+
+        Args:
+            data: The news data.
+
+        Returns:
+            Path of the saved file, or None when snapshots are not supported.
+        """
+        pass
+
+    @abstractmethod
+    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
+        """
+        Save an HTML report.
+
+        Args:
+            html_content: The HTML content.
+            filename: The file name.
+            is_summary: Whether this is a summary report.
+
+        Returns:
+            Path of the saved file.
+        """
+        pass
+
+    @abstractmethod
+    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
+        """
+        Check whether this is the first crawl of the day.
+
+        Args:
+            date: Date string; defaults to today.
+
+        Returns:
+            True when this is the first crawl.
+        """
+        pass
+
+    @abstractmethod
+    def cleanup(self) -> None:
+        """
+        Release resources (temporary files, database connections, etc.).
+        """
+        pass
+
+    @abstractmethod
+    def cleanup_old_data(self, retention_days: int) -> int:
+        """
+        Remove expired data.
+
+        Args:
+            retention_days: Number of days to keep (0 means never clean up).
+
+        Returns:
+            Number of date directories that were deleted.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def backend_name(self) -> str:
+        """
+        Name of the storage backend.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def supports_txt(self) -> bool:
+        """
+        Whether the backend supports generating TXT snapshots.
+        """
+        pass
+
+    # === Push-record related methods ===
+
+    @abstractmethod
+    def has_pushed_today(self, date: Optional[str] = None) -> bool:
+        """
+        Check whether a push was already sent on the given date.
+
+        Args:
+            date: Date string (YYYY-MM-DD); defaults to today.
+
+        Returns:
+            True when a push was already sent.
+        """
+        pass
+
+    @abstractmethod
+    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
+        """
+        Record that a push was sent.
+
+        Args:
+            report_type: The report type.
+            date: Date string (YYYY-MM-DD); defaults to today.
+
+        Returns:
+            True when the record was stored successfully.
+        """
+        pass
+
+
+def convert_crawl_results_to_news_data(
+    results: Dict[str, Dict],
+    id_to_name: Dict[str, str],
+    failed_ids: List[str],
+    crawl_time: str,
+    crawl_date: str,
+) -> NewsData:
+    """
+    将爬虫结果转换为 NewsData 格式
+
+    Args:
+        results: 爬虫返回的结果 {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
+        id_to_name: 来源ID到名称的映射
+        failed_ids: 失败的来源ID
+        crawl_time: 抓取时间(HH:MM)
+        crawl_date: 抓取日期(YYYY-MM-DD)
+
+    Returns:
+        NewsData 对象
+    """
+    items = {}
+
+    for source_id, titles_data in results.items():
+        source_name = id_to_name.get(source_id, source_id)
+        news_list = []
+
+        for title, data in titles_data.items():
+            if isinstance(data, dict):
+                ranks = data.get("ranks", [])
+                url = data.get("url", "")
+                mobile_url = data.get("mobileUrl", "")
+            else:
+                # 兼容旧格式
+                ranks = data if isinstance(data, list) else []
+                url = ""
+                mobile_url = ""
+
+            rank = ranks[0] if ranks else 99
+
+            news_item = NewsItem(
+                title=title,
+                source_id=source_id,
+                source_name=source_name,
+                rank=rank,
+                url=url,
+                mobile_url=mobile_url,
+                crawl_time=crawl_time,
+                ranks=ranks,
+                first_time=crawl_time,
+                last_time=crawl_time,
+                count=1,
+            )
+            news_list.append(news_item)
+
+        items[source_id] = news_list
+
+    return NewsData(
+        date=crawl_date,
+        crawl_time=crawl_time,
+        items=items,
+        id_to_name=id_to_name,
+        failed_ids=failed_ids,
+    )
+
+
+def convert_news_data_to_results(data: NewsData) -> tuple:
+    """
+    将 NewsData 转换回原有的 results 格式(用于兼容现有代码)
+
+    Args:
+        data: NewsData 对象
+
+    Returns:
+        (results, id_to_name, title_info) 元组
+    """
+    results = {}
+    title_info = {}
+
+    for source_id, news_list in data.items.items():
+        results[source_id] = {}
+        title_info[source_id] = {}
+
+        for item in news_list:
+            results[source_id][item.title] = {
+                "ranks": item.ranks,
+                "url": item.url,
+                "mobileUrl": item.mobile_url,
+            }
+
+            title_info[source_id][item.title] = {
+                "first_time": item.first_time,
+                "last_time": item.last_time,
+                "count": item.count,
+                "ranks": item.ranks,
+                "url": item.url,
+                "mobileUrl": item.mobile_url,
+            }
+
+    return results, data.id_to_name, title_info

+ 869 - 0
trendradar/storage/local.py

@@ -0,0 +1,869 @@
+# coding=utf-8
+"""
+本地存储后端 - SQLite + TXT/HTML
+
+使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
+"""
+
+import sqlite3
+import os
+import shutil
+import pytz
+import re
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+from trendradar.storage.base import StorageBackend, NewsItem, NewsData
+from trendradar.utils.time import (
+    get_configured_time,
+    format_date_folder,
+    format_time_filename,
+)
+
+
+class LocalStorageBackend(StorageBackend):
+    """
+    本地存储后端
+
+    使用 SQLite 数据库存储新闻数据,支持:
+    - 按日期组织的 SQLite 数据库文件
+    - 可选的 TXT 快照(用于调试)
+    - HTML 报告生成
+    """
+
+    def __init__(
+        self,
+        data_dir: str = "output",
+        enable_txt: bool = True,
+        enable_html: bool = True,
+        timezone: str = "Asia/Shanghai",
+    ) -> None:
+        """
+        Initialise the local storage backend.
+
+        Args:
+            data_dir: Path of the data directory.
+            enable_txt: Whether TXT snapshots are enabled.
+            enable_html: Whether HTML reports are enabled.
+            timezone: Timezone name (defaults to Asia/Shanghai).
+        """
+        self.data_dir = Path(data_dir)
+        self.enable_txt = enable_txt
+        self.enable_html = enable_html
+        self.timezone = timezone
+        # Cache of open connections, keyed by database file path (see _get_connection).
+        self._db_connections: Dict[str, sqlite3.Connection] = {}
+
+    @property
+    def backend_name(self) -> str:
+        """Name of this backend; always the string "local"."""
+        return "local"
+
+    @property
+    def supports_txt(self) -> bool:
+        """Whether TXT snapshots are enabled (the ``enable_txt`` constructor flag)."""
+        return self.enable_txt
+
+    def _get_configured_time(self) -> datetime:
+        """Return the current time in the configured timezone (delegates to ``get_configured_time``)."""
+        return get_configured_time(self.timezone)
+
+    def _format_date_folder(self, date: Optional[str] = None) -> str:
+        """Format the date folder name (ISO format: YYYY-MM-DD); delegates to ``format_date_folder``."""
+        return format_date_folder(date, self.timezone)
+
+    def _format_time_filename(self) -> str:
+        """Format the time-based file name (format: HH-MM); delegates to ``format_time_filename``."""
+        return format_time_filename(self.timezone)
+
+    def _get_db_path(self, date: Optional[str] = None) -> Path:
+        """Return the path of the SQLite database file for ``date``.
+
+        The file lives at ``<data_dir>/<date folder>/news.db``. The date folder
+        is created if it does not exist yet; the database file itself is not.
+        """
+        date_folder = self._format_date_folder(date)
+        db_dir = self.data_dir / date_folder
+        # NOTE(review): this mkdir also runs for pure read paths, so merely
+        # asking for the path of a date without data leaves an empty folder.
+        db_dir.mkdir(parents=True, exist_ok=True)
+        return db_dir / "news.db"
+
+    def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
+        """获取数据库连接(带缓存)"""
+        db_path = str(self._get_db_path(date))
+
+        if db_path not in self._db_connections:
+            conn = sqlite3.connect(db_path)
+            conn.row_factory = sqlite3.Row
+            self._init_tables(conn)
+            self._db_connections[db_path] = conn
+
+        return self._db_connections[db_path]
+
+    def _get_schema_path(self) -> Path:
+        """Return the path of ``schema.sql``, located next to this module."""
+        return Path(__file__).parent / "schema.sql"
+
+    def _init_tables(self, conn: sqlite3.Connection) -> None:
+        """从 schema.sql 初始化数据库表结构"""
+        schema_path = self._get_schema_path()
+        
+        if schema_path.exists():
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema_sql = f.read()
+            conn.executescript(schema_sql)
+        else:
+            raise FileNotFoundError(f"Schema file not found: {schema_path}")
+        
+        conn.commit()
+
+    def save_news_data(self, data: NewsData) -> bool:
+        """
+        保存新闻数据到 SQLite(以 URL 为唯一标识,支持标题更新检测)
+
+        Args:
+            data: 新闻数据
+
+        Returns:
+            是否保存成功
+        """
+        try:
+            conn = self._get_connection(data.date)
+            cursor = conn.cursor()
+
+            # 获取配置时区的当前时间
+            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
+
+            # 首先同步平台信息到 platforms 表
+            for source_id, source_name in data.id_to_name.items():
+                cursor.execute("""
+                    INSERT INTO platforms (id, name, updated_at)
+                    VALUES (?, ?, ?)
+                    ON CONFLICT(id) DO UPDATE SET
+                        name = excluded.name,
+                        updated_at = excluded.updated_at
+                """, (source_id, source_name, now_str))
+
+            # 统计计数器
+            new_count = 0
+            updated_count = 0
+            title_changed_count = 0
+            success_sources = []
+
+            for source_id, news_list in data.items.items():
+                success_sources.append(source_id)
+
+                for item in news_list:
+                    try:
+                        # 检查是否已存在(通过 URL + platform_id)
+                        if item.url:
+                            cursor.execute("""
+                                SELECT id, title FROM news_items
+                                WHERE url = ? AND platform_id = ?
+                            """, (item.url, source_id))
+                            existing = cursor.fetchone()
+
+                            if existing:
+                                # 已存在,更新记录
+                                existing_id, existing_title = existing
+
+                                # 检查标题是否变化
+                                if existing_title != item.title:
+                                    # 记录标题变更
+                                    cursor.execute("""
+                                        INSERT INTO title_changes
+                                        (news_item_id, old_title, new_title, changed_at)
+                                        VALUES (?, ?, ?, ?)
+                                    """, (existing_id, existing_title, item.title, now_str))
+                                    title_changed_count += 1
+
+                                # 记录排名历史
+                                cursor.execute("""
+                                    INSERT INTO rank_history
+                                    (news_item_id, rank, crawl_time, created_at)
+                                    VALUES (?, ?, ?, ?)
+                                """, (existing_id, item.rank, data.crawl_time, now_str))
+
+                                # 更新现有记录
+                                cursor.execute("""
+                                    UPDATE news_items SET
+                                        title = ?,
+                                        rank = ?,
+                                        mobile_url = ?,
+                                        last_crawl_time = ?,
+                                        crawl_count = crawl_count + 1,
+                                        updated_at = ?
+                                    WHERE id = ?
+                                """, (item.title, item.rank, item.mobile_url,
+                                      data.crawl_time, now_str, existing_id))
+                                updated_count += 1
+                            else:
+                                # 不存在,插入新记录
+                                cursor.execute("""
+                                    INSERT INTO news_items
+                                    (title, platform_id, rank, url, mobile_url,
+                                     first_crawl_time, last_crawl_time, crawl_count,
+                                     created_at, updated_at)
+                                    VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
+                                """, (item.title, source_id, item.rank, item.url,
+                                      item.mobile_url, data.crawl_time, data.crawl_time,
+                                      now_str, now_str))
+                                new_id = cursor.lastrowid
+                                # 记录初始排名
+                                cursor.execute("""
+                                    INSERT INTO rank_history
+                                    (news_item_id, rank, crawl_time, created_at)
+                                    VALUES (?, ?, ?, ?)
+                                """, (new_id, item.rank, data.crawl_time, now_str))
+                                new_count += 1
+                        else:
+                            # URL 为空的情况,直接插入(不做去重)
+                            cursor.execute("""
+                                INSERT INTO news_items
+                                (title, platform_id, rank, url, mobile_url,
+                                 first_crawl_time, last_crawl_time, crawl_count,
+                                 created_at, updated_at)
+                                VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
+                            """, (item.title, source_id, item.rank, item.url,
+                                  item.mobile_url, data.crawl_time, data.crawl_time,
+                                  now_str, now_str))
+                            new_id = cursor.lastrowid
+                            # 记录初始排名
+                            cursor.execute("""
+                                INSERT INTO rank_history
+                                (news_item_id, rank, crawl_time, created_at)
+                                VALUES (?, ?, ?, ?)
+                            """, (new_id, item.rank, data.crawl_time, now_str))
+                            new_count += 1
+
+                    except sqlite3.Error as e:
+                        print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")
+
+            total_items = new_count + updated_count
+
+            # 记录抓取信息
+            cursor.execute("""
+                INSERT OR REPLACE INTO crawl_records
+                (crawl_time, total_items, created_at)
+                VALUES (?, ?, ?)
+            """, (data.crawl_time, total_items, now_str))
+
+            # 获取刚插入的 crawl_record 的 ID
+            cursor.execute("""
+                SELECT id FROM crawl_records WHERE crawl_time = ?
+            """, (data.crawl_time,))
+            record_row = cursor.fetchone()
+            if record_row:
+                crawl_record_id = record_row[0]
+
+                # 记录成功的来源
+                for source_id in success_sources:
+                    cursor.execute("""
+                        INSERT OR REPLACE INTO crawl_source_status
+                        (crawl_record_id, platform_id, status)
+                        VALUES (?, ?, 'success')
+                    """, (crawl_record_id, source_id))
+
+                # 记录失败的来源
+                for failed_id in data.failed_ids:
+                    # 确保失败的平台也在 platforms 表中
+                    cursor.execute("""
+                        INSERT OR IGNORE INTO platforms (id, name, updated_at)
+                        VALUES (?, ?, ?)
+                    """, (failed_id, failed_id, now_str))
+
+                    cursor.execute("""
+                        INSERT OR REPLACE INTO crawl_source_status
+                        (crawl_record_id, platform_id, status)
+                        VALUES (?, ?, 'failed')
+                    """, (crawl_record_id, failed_id))
+
+            conn.commit()
+
+            # 输出详细的存储统计日志
+            log_parts = [f"[本地存储] 处理完成:新增 {new_count} 条"]
+            if updated_count > 0:
+                log_parts.append(f"更新 {updated_count} 条")
+            if title_changed_count > 0:
+                log_parts.append(f"标题变更 {title_changed_count} 条")
+            print(",".join(log_parts))
+
+            return True
+
+        except Exception as e:
+            print(f"[本地存储] 保存失败: {e}")
+            return False
+
+    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """
+        获取指定日期的所有新闻数据(合并后)
+
+        Args:
+            date: 日期字符串,默认为今天
+
+        Returns:
+            合并后的新闻数据
+        """
+        try:
+            db_path = self._get_db_path(date)
+            if not db_path.exists():
+                return None
+
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            # 获取所有新闻数据(包含 id 用于查询排名历史)
+            cursor.execute("""
+                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
+                       n.rank, n.url, n.mobile_url,
+                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
+                FROM news_items n
+                LEFT JOIN platforms p ON n.platform_id = p.id
+                ORDER BY n.platform_id, n.last_crawl_time
+            """)
+
+            rows = cursor.fetchall()
+            if not rows:
+                return None
+
+            # 收集所有 news_item_id
+            news_ids = [row[0] for row in rows]
+
+            # 批量查询排名历史
+            rank_history_map: Dict[int, List[int]] = {}
+            if news_ids:
+                placeholders = ",".join("?" * len(news_ids))
+                cursor.execute(f"""
+                    SELECT news_item_id, rank FROM rank_history
+                    WHERE news_item_id IN ({placeholders})
+                    ORDER BY news_item_id, crawl_time
+                """, news_ids)
+                for rh_row in cursor.fetchall():
+                    news_id, rank = rh_row[0], rh_row[1]
+                    if news_id not in rank_history_map:
+                        rank_history_map[news_id] = []
+                    if rank not in rank_history_map[news_id]:
+                        rank_history_map[news_id].append(rank)
+
+            # 按 platform_id 分组
+            items: Dict[str, List[NewsItem]] = {}
+            id_to_name: Dict[str, str] = {}
+            crawl_date = self._format_date_folder(date)
+
+            for row in rows:
+                news_id = row[0]
+                platform_id = row[2]
+                title = row[1]
+                platform_name = row[3] or platform_id
+
+                id_to_name[platform_id] = platform_name
+
+                if platform_id not in items:
+                    items[platform_id] = []
+
+                # 获取排名历史,如果没有则使用当前排名
+                ranks = rank_history_map.get(news_id, [row[4]])
+
+                items[platform_id].append(NewsItem(
+                    title=title,
+                    source_id=platform_id,
+                    source_name=platform_name,
+                    rank=row[4],
+                    url=row[5] or "",
+                    mobile_url=row[6] or "",
+                    crawl_time=row[8],  # last_crawl_time
+                    ranks=ranks,
+                    first_time=row[7],  # first_crawl_time
+                    last_time=row[8],   # last_crawl_time
+                    count=row[9],       # crawl_count
+                ))
+
+            final_items = items
+
+            # 获取失败的来源
+            cursor.execute("""
+                SELECT DISTINCT css.platform_id
+                FROM crawl_source_status css
+                JOIN crawl_records cr ON css.crawl_record_id = cr.id
+                WHERE css.status = 'failed'
+            """)
+            failed_ids = [row[0] for row in cursor.fetchall()]
+
+            # 获取最新的抓取时间
+            cursor.execute("""
+                SELECT crawl_time FROM crawl_records
+                ORDER BY crawl_time DESC
+                LIMIT 1
+            """)
+
+            time_row = cursor.fetchone()
+            crawl_time = time_row[0] if time_row else self._format_time_filename()
+
+            return NewsData(
+                date=crawl_date,
+                crawl_time=crawl_time,
+                items=final_items,
+                id_to_name=id_to_name,
+                failed_ids=failed_ids,
+            )
+
+        except Exception as e:
+            print(f"[本地存储] 读取数据失败: {e}")
+            return None
+
+    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """
+        获取最新一次抓取的数据
+
+        Args:
+            date: 日期字符串,默认为今天
+
+        Returns:
+            最新抓取的新闻数据
+        """
+        try:
+            db_path = self._get_db_path(date)
+            if not db_path.exists():
+                return None
+
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            # 获取最新的抓取时间
+            cursor.execute("""
+                SELECT crawl_time FROM crawl_records
+                ORDER BY crawl_time DESC
+                LIMIT 1
+            """)
+
+            time_row = cursor.fetchone()
+            if not time_row:
+                return None
+
+            latest_time = time_row[0]
+
+            # 获取该时间的新闻数据(包含 id 用于查询排名历史)
+            cursor.execute("""
+                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
+                       n.rank, n.url, n.mobile_url,
+                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
+                FROM news_items n
+                LEFT JOIN platforms p ON n.platform_id = p.id
+                WHERE n.last_crawl_time = ?
+            """, (latest_time,))
+
+            rows = cursor.fetchall()
+            if not rows:
+                return None
+
+            # 收集所有 news_item_id
+            news_ids = [row[0] for row in rows]
+
+            # 批量查询排名历史
+            rank_history_map: Dict[int, List[int]] = {}
+            if news_ids:
+                placeholders = ",".join("?" * len(news_ids))
+                cursor.execute(f"""
+                    SELECT news_item_id, rank FROM rank_history
+                    WHERE news_item_id IN ({placeholders})
+                    ORDER BY news_item_id, crawl_time
+                """, news_ids)
+                for rh_row in cursor.fetchall():
+                    news_id, rank = rh_row[0], rh_row[1]
+                    if news_id not in rank_history_map:
+                        rank_history_map[news_id] = []
+                    if rank not in rank_history_map[news_id]:
+                        rank_history_map[news_id].append(rank)
+
+            items: Dict[str, List[NewsItem]] = {}
+            id_to_name: Dict[str, str] = {}
+            crawl_date = self._format_date_folder(date)
+
+            for row in rows:
+                news_id = row[0]
+                platform_id = row[2]
+                platform_name = row[3] or platform_id
+                id_to_name[platform_id] = platform_name
+
+                if platform_id not in items:
+                    items[platform_id] = []
+
+                # 获取排名历史,如果没有则使用当前排名
+                ranks = rank_history_map.get(news_id, [row[4]])
+
+                items[platform_id].append(NewsItem(
+                    title=row[1],
+                    source_id=platform_id,
+                    source_name=platform_name,
+                    rank=row[4],
+                    url=row[5] or "",
+                    mobile_url=row[6] or "",
+                    crawl_time=row[8],  # last_crawl_time
+                    ranks=ranks,
+                    first_time=row[7],  # first_crawl_time
+                    last_time=row[8],   # last_crawl_time
+                    count=row[9],       # crawl_count
+                ))
+
+            # 获取失败的来源(针对最新一次抓取)
+            cursor.execute("""
+                SELECT css.platform_id
+                FROM crawl_source_status css
+                JOIN crawl_records cr ON css.crawl_record_id = cr.id
+                WHERE cr.crawl_time = ? AND css.status = 'failed'
+            """, (latest_time,))
+
+            failed_ids = [row[0] for row in cursor.fetchall()]
+
+            return NewsData(
+                date=crawl_date,
+                crawl_time=latest_time,
+                items=items,
+                id_to_name=id_to_name,
+                failed_ids=failed_ids,
+            )
+
+        except Exception as e:
+            print(f"[本地存储] 获取最新数据失败: {e}")
+            return None
+
+    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
+        """
+        检测新增的标题
+
+        Args:
+            current_data: 当前抓取的数据
+
+        Returns:
+            新增的标题数据 {source_id: {title: NewsItem}}
+        """
+        try:
+            # 获取历史数据
+            historical_data = self.get_today_all_data(current_data.date)
+
+            if not historical_data:
+                # 没有历史数据,所有都是新的
+                new_titles = {}
+                for source_id, news_list in current_data.items.items():
+                    new_titles[source_id] = {item.title: item for item in news_list}
+                return new_titles
+
+            # 收集历史标题
+            historical_titles: Dict[str, set] = {}
+            for source_id, news_list in historical_data.items.items():
+                historical_titles[source_id] = {item.title for item in news_list}
+
+            # 检测新增
+            new_titles = {}
+            for source_id, news_list in current_data.items.items():
+                hist_set = historical_titles.get(source_id, set())
+                for item in news_list:
+                    if item.title not in hist_set:
+                        if source_id not in new_titles:
+                            new_titles[source_id] = {}
+                        new_titles[source_id][item.title] = item
+
+            return new_titles
+
+        except Exception as e:
+            print(f"[本地存储] 检测新标题失败: {e}")
+            return {}
+
+    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
+        """
+        保存 TXT 快照
+
+        Args:
+            data: 新闻数据
+
+        Returns:
+            保存的文件路径
+        """
+        if not self.enable_txt:
+            return None
+
+        try:
+            date_folder = self._format_date_folder(data.date)
+            txt_dir = self.data_dir / date_folder / "txt"
+            txt_dir.mkdir(parents=True, exist_ok=True)
+
+            file_path = txt_dir / f"{data.crawl_time}.txt"
+
+            with open(file_path, "w", encoding="utf-8") as f:
+                for source_id, news_list in data.items.items():
+                    source_name = data.id_to_name.get(source_id, source_id)
+
+                    # 写入来源标题
+                    if source_name and source_name != source_id:
+                        f.write(f"{source_id} | {source_name}\n")
+                    else:
+                        f.write(f"{source_id}\n")
+
+                    # 按排名排序
+                    sorted_news = sorted(news_list, key=lambda x: x.rank)
+
+                    for item in sorted_news:
+                        line = f"{item.rank}. {item.title}"
+                        if item.url:
+                            line += f" [URL:{item.url}]"
+                        if item.mobile_url:
+                            line += f" [MOBILE:{item.mobile_url}]"
+                        f.write(line + "\n")
+
+                    f.write("\n")
+
+                # 写入失败的来源
+                if data.failed_ids:
+                    f.write("==== 以下ID请求失败 ====\n")
+                    for failed_id in data.failed_ids:
+                        f.write(f"{failed_id}\n")
+
+            print(f"[本地存储] TXT 快照已保存: {file_path}")
+            return str(file_path)
+
+        except Exception as e:
+            print(f"[本地存储] 保存 TXT 快照失败: {e}")
+            return None
+
+    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
+        """
+        保存 HTML 报告
+
+        Args:
+            html_content: HTML 内容
+            filename: 文件名
+            is_summary: 是否为汇总报告
+
+        Returns:
+            保存的文件路径
+        """
+        if not self.enable_html:
+            return None
+
+        try:
+            date_folder = self._format_date_folder()
+            html_dir = self.data_dir / date_folder / "html"
+            html_dir.mkdir(parents=True, exist_ok=True)
+
+            file_path = html_dir / filename
+
+            with open(file_path, "w", encoding="utf-8") as f:
+                f.write(html_content)
+
+            print(f"[本地存储] HTML 报告已保存: {file_path}")
+            return str(file_path)
+
+        except Exception as e:
+            print(f"[本地存储] 保存 HTML 报告失败: {e}")
+            return None
+
+    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
+        """
+        检查是否是当天第一次抓取
+
+        Args:
+            date: 日期字符串,默认为今天
+
+        Returns:
+            是否是第一次抓取
+        """
+        try:
+            db_path = self._get_db_path(date)
+            if not db_path.exists():
+                return True
+
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            cursor.execute("""
+                SELECT COUNT(*) as count FROM crawl_records
+            """)
+
+            row = cursor.fetchone()
+            count = row[0] if row else 0
+
+            # 如果只有一条或没有记录,视为第一次抓取
+            return count <= 1
+
+        except Exception as e:
+            print(f"[本地存储] 检查首次抓取失败: {e}")
+            return True
+
+    def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
+        """
+        获取指定日期的所有抓取时间列表
+
+        Args:
+            date: 日期字符串,默认为今天
+
+        Returns:
+            抓取时间列表(按时间排序)
+        """
+        try:
+            db_path = self._get_db_path(date)
+            if not db_path.exists():
+                return []
+
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            cursor.execute("""
+                SELECT crawl_time FROM crawl_records
+                ORDER BY crawl_time
+            """)
+
+            rows = cursor.fetchall()
+            return [row[0] for row in rows]
+
+        except Exception as e:
+            print(f"[本地存储] 获取抓取时间列表失败: {e}")
+            return []
+
+    def cleanup(self) -> None:
+        """清理资源(关闭数据库连接)"""
+        for db_path, conn in self._db_connections.items():
+            try:
+                conn.close()
+                print(f"[本地存储] 关闭数据库连接: {db_path}")
+            except Exception as e:
+                print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
+
+        self._db_connections.clear()
+
+    def cleanup_old_data(self, retention_days: int) -> int:
+        """
+        清理过期数据
+
+        Args:
+            retention_days: 保留天数(0 表示不清理)
+
+        Returns:
+            删除的日期目录数量
+        """
+        if retention_days <= 0:
+            return 0
+
+        deleted_count = 0
+        cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
+
+        try:
+            if not self.data_dir.exists():
+                return 0
+
+            for date_folder in self.data_dir.iterdir():
+                if not date_folder.is_dir() or date_folder.name.startswith('.'):
+                    continue
+
+                # 解析日期文件夹名(支持两种格式)
+                folder_date = None
+                try:
+                    # ISO 格式: YYYY-MM-DD
+                    date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
+                    if date_match:
+                        folder_date = datetime(
+                            int(date_match.group(1)),
+                            int(date_match.group(2)),
+                            int(date_match.group(3)),
+                            tzinfo=pytz.timezone("Asia/Shanghai")
+                        )
+                    else:
+                        # 旧中文格式: YYYY年MM月DD日
+                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
+                        if date_match:
+                            folder_date = datetime(
+                                int(date_match.group(1)),
+                                int(date_match.group(2)),
+                                int(date_match.group(3)),
+                                tzinfo=pytz.timezone("Asia/Shanghai")
+                            )
+                except Exception:
+                    continue
+
+                if folder_date and folder_date < cutoff_date:
+                    # 先关闭该日期的数据库连接
+                    db_path = str(self._get_db_path(date_folder.name))
+                    if db_path in self._db_connections:
+                        try:
+                            self._db_connections[db_path].close()
+                            del self._db_connections[db_path]
+                        except Exception:
+                            pass
+
+                    # 删除整个日期目录
+                    try:
+                        shutil.rmtree(date_folder)
+                        deleted_count += 1
+                        print(f"[本地存储] 清理过期数据: {date_folder.name}")
+                    except Exception as e:
+                        print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
+
+            if deleted_count > 0:
+                print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
+
+            return deleted_count
+
+        except Exception as e:
+            print(f"[本地存储] 清理过期数据失败: {e}")
+            return deleted_count
+
+    def has_pushed_today(self, date: Optional[str] = None) -> bool:
+        """
+        检查指定日期是否已推送过
+
+        Args:
+            date: 日期字符串(YYYY-MM-DD),默认为今天
+
+        Returns:
+            是否已推送
+        """
+        try:
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            target_date = self._format_date_folder(date)
+
+            cursor.execute("""
+                SELECT pushed FROM push_records WHERE date = ?
+            """, (target_date,))
+
+            row = cursor.fetchone()
+            if row:
+                return bool(row[0])
+            return False
+
+        except Exception as e:
+            print(f"[本地存储] 检查推送记录失败: {e}")
+            return False
+
+    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
+        """
+        记录推送
+
+        Args:
+            report_type: 报告类型
+            date: 日期字符串(YYYY-MM-DD),默认为今天
+
+        Returns:
+            是否记录成功
+        """
+        try:
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            target_date = self._format_date_folder(date)
+            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
+
+            cursor.execute("""
+                INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
+                VALUES (?, 1, ?, ?, ?)
+                ON CONFLICT(date) DO UPDATE SET
+                    pushed = 1,
+                    push_time = excluded.push_time,
+                    report_type = excluded.report_type
+            """, (target_date, now_str, report_type, now_str))
+
+            conn.commit()
+
+            print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
+            return True
+
+        except Exception as e:
+            print(f"[本地存储] 记录推送失败: {e}")
+            return False
+
+    def __del__(self):
+        """析构函数,确保关闭连接"""
+        self.cleanup()

+ 316 - 0
trendradar/storage/manager.py

@@ -0,0 +1,316 @@
+# coding=utf-8
+"""
+存储管理器 - 统一管理存储后端
+
+根据环境和配置自动选择合适的存储后端
+"""
+
+import os
+from typing import Optional
+
+from trendradar.storage.base import StorageBackend, NewsData
+
+
+# Module-level StorageManager singleton; assigned by get_storage_manager()
+_storage_manager: Optional["StorageManager"] = None
+
+
+class StorageManager:
+    """
+    存储管理器
+
+    功能:
+    - 自动检测运行环境(GitHub Actions / Docker / 本地)
+    - 根据配置选择存储后端(local / remote / auto)
+    - 提供统一的存储接口
+    - 支持从远程拉取数据到本地
+    """
+
+    def __init__(
+        self,
+        backend_type: str = "auto",
+        data_dir: str = "output",
+        enable_txt: bool = True,
+        enable_html: bool = True,
+        remote_config: Optional[dict] = None,
+        local_retention_days: int = 0,
+        remote_retention_days: int = 0,
+        pull_enabled: bool = False,
+        pull_days: int = 0,
+        timezone: str = "Asia/Shanghai",
+    ):
+        """
+        初始化存储管理器
+
+        Args:
+            backend_type: 存储后端类型 (local / remote / auto)
+            data_dir: 本地数据目录
+            enable_txt: 是否启用 TXT 快照
+            enable_html: 是否启用 HTML 报告
+            remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id 等)
+            local_retention_days: 本地数据保留天数(0 = 无限制)
+            remote_retention_days: 远程数据保留天数(0 = 无限制)
+            pull_enabled: 是否启用启动时自动拉取
+            pull_days: 拉取最近 N 天的数据
+            timezone: 时区配置(默认 Asia/Shanghai)
+        """
+        self.backend_type = backend_type
+        self.data_dir = data_dir
+        self.enable_txt = enable_txt
+        self.enable_html = enable_html
+        self.remote_config = remote_config or {}
+        self.local_retention_days = local_retention_days
+        self.remote_retention_days = remote_retention_days
+        self.pull_enabled = pull_enabled
+        self.pull_days = pull_days
+        self.timezone = timezone
+
+        self._backend: Optional[StorageBackend] = None
+        self._remote_backend: Optional[StorageBackend] = None
+
+    @staticmethod
+    def is_github_actions() -> bool:
+        """检测是否在 GitHub Actions 环境中运行"""
+        return os.environ.get("GITHUB_ACTIONS") == "true"
+
+    @staticmethod
+    def is_docker() -> bool:
+        """检测是否在 Docker 容器中运行"""
+        # 方法1: 检查 /.dockerenv 文件
+        if os.path.exists("/.dockerenv"):
+            return True
+
+        # 方法2: 检查 cgroup(Linux)
+        try:
+            with open("/proc/1/cgroup", "r") as f:
+                return "docker" in f.read()
+        except (FileNotFoundError, PermissionError):
+            pass
+
+        # 方法3: 检查环境变量
+        return os.environ.get("DOCKER_CONTAINER") == "true"
+
+    def _resolve_backend_type(self) -> str:
+        """解析实际使用的后端类型"""
+        if self.backend_type == "auto":
+            if self.is_github_actions():
+                # GitHub Actions 环境,检查是否配置了远程存储
+                if self._has_remote_config():
+                    return "remote"
+                else:
+                    print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
+                    return "local"
+            else:
+                return "local"
+        return self.backend_type
+
+    def _has_remote_config(self) -> bool:
+        """检查是否有有效的远程存储配置"""
+        # 检查配置或环境变量
+        bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
+        access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
+        secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
+        endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
+
+        # 调试日志
+        has_config = bool(bucket_name and access_key and secret_key and endpoint)
+        if not has_config:
+            print(f"[存储管理器] 远程存储配置检查失败:")
+            print(f"  - bucket_name: {'已配置' if bucket_name else '未配置'}")
+            print(f"  - access_key_id: {'已配置' if access_key else '未配置'}")
+            print(f"  - secret_access_key: {'已配置' if secret_key else '未配置'}")
+            print(f"  - endpoint_url: {'已配置' if endpoint else '未配置'}")
+
+        return has_config
+
+    def _create_remote_backend(self) -> Optional[StorageBackend]:
+        """创建远程存储后端"""
+        try:
+            from trendradar.storage.remote import RemoteStorageBackend
+
+            return RemoteStorageBackend(
+                bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
+                access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
+                secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
+                endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
+                region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
+                enable_txt=self.enable_txt,
+                enable_html=self.enable_html,
+                timezone=self.timezone,
+            )
+        except ImportError as e:
+            print(f"[存储管理器] 远程后端导入失败: {e}")
+            print("[存储管理器] 请确保已安装 boto3: pip install boto3")
+            return None
+        except Exception as e:
+            print(f"[存储管理器] 远程后端初始化失败: {e}")
+            return None
+
+    def get_backend(self) -> StorageBackend:
+        """获取存储后端实例"""
+        if self._backend is None:
+            resolved_type = self._resolve_backend_type()
+
+            if resolved_type == "remote":
+                self._backend = self._create_remote_backend()
+                if self._backend:
+                    print(f"[存储管理器] 使用远程存储后端")
+                else:
+                    print("[存储管理器] 回退到本地存储")
+                    resolved_type = "local"
+
+            if resolved_type == "local" or self._backend is None:
+                from trendradar.storage.local import LocalStorageBackend
+
+                self._backend = LocalStorageBackend(
+                    data_dir=self.data_dir,
+                    enable_txt=self.enable_txt,
+                    enable_html=self.enable_html,
+                    timezone=self.timezone,
+                )
+                print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
+
+        return self._backend
+
+    def pull_from_remote(self) -> int:
+        """
+        从远程拉取数据到本地
+
+        Returns:
+            成功拉取的文件数量
+        """
+        if not self.pull_enabled or self.pull_days <= 0:
+            return 0
+
+        if not self._has_remote_config():
+            print("[存储管理器] 未配置远程存储,无法拉取")
+            return 0
+
+        # 创建远程后端(如果还没有)
+        if self._remote_backend is None:
+            self._remote_backend = self._create_remote_backend()
+
+        if self._remote_backend is None:
+            print("[存储管理器] 无法创建远程后端,拉取失败")
+            return 0
+
+        # 调用拉取方法
+        return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
+
+    def save_news_data(self, data: NewsData) -> bool:
+        """保存新闻数据"""
+        return self.get_backend().save_news_data(data)
+
+    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """获取当天所有数据"""
+        return self.get_backend().get_today_all_data(date)
+
+    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """获取最新抓取数据"""
+        return self.get_backend().get_latest_crawl_data(date)
+
+    def detect_new_titles(self, current_data: NewsData) -> dict:
+        """检测新增标题"""
+        return self.get_backend().detect_new_titles(current_data)
+
+    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
+        """保存 TXT 快照"""
+        return self.get_backend().save_txt_snapshot(data)
+
+    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
+        """保存 HTML 报告"""
+        return self.get_backend().save_html_report(html_content, filename, is_summary)
+
+    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
+        """检查是否是当天第一次抓取"""
+        return self.get_backend().is_first_crawl_today(date)
+
+    def cleanup(self) -> None:
+        """清理资源"""
+        if self._backend:
+            self._backend.cleanup()
+        if self._remote_backend:
+            self._remote_backend.cleanup()
+
+    def cleanup_old_data(self) -> int:
+        """
+        清理过期数据
+
+        Returns:
+            删除的日期目录数量
+        """
+        total_deleted = 0
+
+        # 清理本地数据
+        if self.local_retention_days > 0:
+            total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)
+
+        # 清理远程数据(如果配置了)
+        if self.remote_retention_days > 0 and self._has_remote_config():
+            if self._remote_backend is None:
+                self._remote_backend = self._create_remote_backend()
+            if self._remote_backend:
+                total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)
+
+        return total_deleted
+
+    @property
+    def backend_name(self) -> str:
+        """获取当前后端名称"""
+        return self.get_backend().backend_name
+
+    @property
+    def supports_txt(self) -> bool:
+        """是否支持 TXT 快照"""
+        return self.get_backend().supports_txt
+
+
+def get_storage_manager(
+    backend_type: str = "auto",
+    data_dir: str = "output",
+    enable_txt: bool = True,
+    enable_html: bool = True,
+    remote_config: Optional[dict] = None,
+    local_retention_days: int = 0,
+    remote_retention_days: int = 0,
+    pull_enabled: bool = False,
+    pull_days: int = 0,
+    timezone: str = "Asia/Shanghai",
+    force_new: bool = False,
+) -> StorageManager:
+    """
+    获取存储管理器单例
+
+    Args:
+        backend_type: 存储后端类型
+        data_dir: 本地数据目录
+        enable_txt: 是否启用 TXT 快照
+        enable_html: 是否启用 HTML 报告
+        remote_config: 远程存储配置
+        local_retention_days: 本地数据保留天数(0 = 无限制)
+        remote_retention_days: 远程数据保留天数(0 = 无限制)
+        pull_enabled: 是否启用启动时自动拉取
+        pull_days: 拉取最近 N 天的数据
+        timezone: 时区配置(默认 Asia/Shanghai)
+        force_new: 是否强制创建新实例
+
+    Returns:
+        StorageManager 实例
+    """
+    global _storage_manager
+
+    if _storage_manager is None or force_new:
+        _storage_manager = StorageManager(
+            backend_type=backend_type,
+            data_dir=data_dir,
+            enable_txt=enable_txt,
+            enable_html=enable_html,
+            remote_config=remote_config,
+            local_retention_days=local_retention_days,
+            remote_retention_days=remote_retention_days,
+            pull_enabled=pull_enabled,
+            pull_days=pull_days,
+            timezone=timezone,
+        )
+
+    return _storage_manager

+ 1071 - 0
trendradar/storage/remote.py

@@ -0,0 +1,1071 @@
+# coding=utf-8
+"""
+远程存储后端(S3 兼容协议)
+
+支持 Cloudflare R2、阿里云 OSS、腾讯云 COS、AWS S3、MinIO 等
+使用 S3 兼容 API (boto3) 访问对象存储
+数据流程:下载当天 SQLite → 合并新数据 → 上传回远程
+"""
+
+import atexit
+import os
+import pytz
+import re
+import shutil
+import sys
+import tempfile
+import sqlite3
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+try:
+    import boto3
+    from botocore.exceptions import ClientError
+    HAS_BOTO3 = True
+except ImportError:
+    HAS_BOTO3 = False
+    boto3 = None
+    ClientError = Exception
+
+from trendradar.storage.base import StorageBackend, NewsItem, NewsData
+from trendradar.utils.time import (
+    get_configured_time,
+    format_date_folder,
+    format_time_filename,
+)
+
+
+class RemoteStorageBackend(StorageBackend):
+    """
+    远程云存储后端(S3 兼容协议)
+
+    特点:
+    - 使用 S3 兼容 API 访问远程存储
+    - 支持 Cloudflare R2、阿里云 OSS、腾讯云 COS、AWS S3、MinIO 等
+    - 下载 SQLite 到临时目录进行操作
+    - 支持数据合并和上传
+    - 支持从远程拉取历史数据到本地
+    - 运行结束后自动清理临时文件
+    """
+
+    def __init__(
+        self,
+        bucket_name: str,
+        access_key_id: str,
+        secret_access_key: str,
+        endpoint_url: str,
+        region: str = "",
+        enable_txt: bool = False,  # remote mode does not write TXT by default
+        enable_html: bool = True,
+        temp_dir: Optional[str] = None,
+        timezone: str = "Asia/Shanghai",
+    ):
+        """
+        Set up the remote storage backend.
+
+        Args:
+            bucket_name: Bucket name
+            access_key_id: Access key ID
+            secret_access_key: Secret access key
+            endpoint_url: Service endpoint URL
+            region: Region (optional; only forwarded to boto3 when non-empty)
+            enable_txt: Whether TXT snapshots are enabled (off by default)
+            enable_html: Whether HTML reports are enabled
+            temp_dir: Working directory (a fresh ``trendradar_*`` temp
+                directory is created when omitted)
+            timezone: Timezone name (default Asia/Shanghai)
+
+        Raises:
+            ImportError: when boto3 is not available.
+        """
+        if not HAS_BOTO3:
+            raise ImportError("远程存储后端需要安装 boto3: pip install boto3")
+
+        self.bucket_name = bucket_name
+        self.endpoint_url = endpoint_url
+        self.region = region
+        self.enable_txt = enable_txt
+        self.enable_html = enable_html
+        self.timezone = timezone
+
+        # Working directory for downloaded databases and generated files
+        if temp_dir:
+            self.temp_dir = Path(temp_dir)
+        else:
+            self.temp_dir = Path(tempfile.mkdtemp(prefix="trendradar_"))
+        self.temp_dir.mkdir(parents=True, exist_ok=True)
+
+        # S3 client; region_name is passed only when a region was given
+        region_kwargs = {"region_name": region} if region else {}
+        self.s3_client = boto3.client(
+            "s3",
+            endpoint_url=endpoint_url,
+            aws_access_key_id=access_key_id,
+            aws_secret_access_key=secret_access_key,
+            **region_kwargs,
+        )
+
+        # Bookkeeping used by cleanup(): downloaded files and open connections
+        self._downloaded_files: List[Path] = []
+        self._db_connections: Dict[str, sqlite3.Connection] = {}
+
+        print(f"[远程存储] 初始化完成,存储桶: {bucket_name}")
+
+    @property
+    def backend_name(self) -> str:
+        """Identifier of this backend; always ``"remote"``."""
+        return "remote"
+
+    @property
+    def supports_txt(self) -> bool:
+        """Whether TXT snapshots are enabled (mirrors ``enable_txt``)."""
+        txt_enabled = self.enable_txt
+        return txt_enabled
+
+    def _get_configured_time(self) -> datetime:
+        """Return the current time from ``get_configured_time`` for ``self.timezone``."""
+        tz_name = self.timezone
+        return get_configured_time(tz_name)
+
+    def _format_date_folder(self, date: Optional[str] = None) -> str:
+        """Format the date folder name (ISO style: YYYY-MM-DD) using this backend's timezone."""
+        folder_name = format_date_folder(date, self.timezone)
+        return folder_name
+
+    def _format_time_filename(self) -> str:
+        """Format the time-based file name (HH-MM) using this backend's timezone."""
+        time_name = format_time_filename(self.timezone)
+        return time_name
+
+    def _get_remote_db_key(self, date: Optional[str] = None) -> str:
+        """Build the remote object key of the SQLite file: ``news/<date folder>.db``."""
+        return f"news/{self._format_date_folder(date)}.db"
+
+    def _get_local_db_path(self, date: Optional[str] = None) -> Path:
+        """Build the local working path of the SQLite file: ``<temp_dir>/<date folder>/news.db``."""
+        folder_name = self._format_date_folder(date)
+        return self.temp_dir.joinpath(folder_name, "news.db")
+
+    def _check_object_exists(self, r2_key: str) -> bool:
+        """
+        Check whether an object exists in the bucket (via ``head_object``).
+
+        Args:
+            r2_key: Object key
+
+        Returns:
+            True when ``head_object`` succeeds; False for "not found" and for
+            every other failure (the latter are printed as warnings).
+        """
+        # Providers report a missing key with different codes
+        not_found_codes = ("404", "NoSuchKey", "Not Found")
+        exists = False
+        try:
+            self.s3_client.head_object(Bucket=self.bucket_name, Key=r2_key)
+            exists = True
+        except ClientError as e:
+            error_code = e.response.get("Error", {}).get("Code", "")
+            # Anything but "not found" (e.g. permissions) is still treated as
+            # missing, but is worth a warning
+            if error_code not in not_found_codes:
+                print(f"[远程存储] 检查对象存在性失败 ({r2_key}): {e}")
+        except Exception as e:
+            print(f"[远程存储] 检查对象存在性异常 ({r2_key}): {e}")
+        return exists
+
+    def _download_sqlite(self, date: Optional[str] = None) -> Optional[Path]:
+        """
+        Download the SQLite file for ``date`` into the local working directory.
+
+        Args:
+            date: Date string
+
+        Returns:
+            Local file path, or None when the remote object does not exist
+
+        Raises:
+            Exception: any download error other than "not found" is printed
+                and re-raised.
+        """
+        remote_key = self._get_remote_db_key(date)
+        target = self._get_local_db_path(date)
+
+        # The destination directory has to exist before downloading
+        target.parent.mkdir(parents=True, exist_ok=True)
+
+        # Probe first so a missing object is reported without a failed download
+        if not self._check_object_exists(remote_key):
+            print(f"[远程存储] 文件不存在,将创建新数据库: {remote_key}")
+            return None
+
+        try:
+            self.s3_client.download_file(self.bucket_name, remote_key, str(target))
+            self._downloaded_files.append(target)
+            print(f"[远程存储] 已下载: {remote_key} -> {target}")
+            return target
+        except ClientError as e:
+            error_code = e.response.get("Error", {}).get("Code", "")
+            # The object may be reported missing under several error codes
+            if error_code in ("404", "NoSuchKey", "Not Found"):
+                print(f"[远程存储] 文件不存在,将创建新数据库: {remote_key}")
+                return None
+            print(f"[远程存储] 下载失败 (错误码: {error_code}): {e}")
+            raise
+        except Exception as e:
+            print(f"[远程存储] 下载异常: {e}")
+            raise
+
+    def _upload_sqlite(self, date: Optional[str] = None) -> bool:
+        """
+        Upload the local SQLite file for ``date`` to remote storage.
+
+        Args:
+            date: Date string
+
+        Returns:
+            True when the upload ran and the object was found afterwards;
+            False when the local file is missing, the upload raised, or the
+            verification failed.
+        """
+        source = self._get_local_db_path(date)
+        remote_key = self._get_remote_db_key(date)
+
+        if not source.exists():
+            print(f"[远程存储] 本地文件不存在,无法上传: {source}")
+            return False
+
+        try:
+            # Report the size of what is about to be sent
+            size_bytes = source.stat().st_size
+            print(f"[远程存储] 准备上传: {source} ({size_bytes} bytes) -> {remote_key}")
+
+            self.s3_client.upload_file(str(source), self.bucket_name, remote_key)
+            print(f"[远程存储] 已上传: {source} -> {remote_key}")
+
+            # Verify the object is actually present after the upload
+            verified = bool(self._check_object_exists(remote_key))
+            if verified:
+                print(f"[远程存储] 上传验证成功: {remote_key}")
+            else:
+                print(f"[远程存储] 上传验证失败: 文件未在 R2 中找到")
+            return verified
+
+        except Exception as e:
+            print(f"[远程存储] 上传失败: {e}")
+            return False
+
+    def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
+        """
+        Return the SQLite connection for ``date``, opening it on first use.
+
+        Connections are cached per local database path. On first use the
+        database file is downloaded from remote storage when it is not
+        present locally, and the schema is initialised.
+
+        Args:
+            date: Date string
+
+        Returns:
+            An open connection whose ``row_factory`` is ``sqlite3.Row``
+
+        Raises:
+            Exception: errors from the download or from schema initialisation
+                propagate; a connection opened in this call is closed first.
+        """
+        local_path = self._get_local_db_path(date)
+        db_path = str(local_path)
+
+        cached = self._db_connections.get(db_path)
+        if cached is not None:
+            return cached
+
+        # Make sure the directory exists
+        local_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Not available locally yet: try to fetch it from remote storage
+        if not local_path.exists():
+            self._download_sqlite(date)
+
+        conn = sqlite3.connect(db_path)
+        try:
+            conn.row_factory = sqlite3.Row
+            self._init_tables(conn)
+        except Exception:
+            # The connection is not cached yet, so nothing else would ever
+            # close it; release the handle before propagating the error
+            conn.close()
+            raise
+
+        self._db_connections[db_path] = conn
+        return conn
+
+    def _get_schema_path(self) -> Path:
+        """Return the path of ``schema.sql`` located next to this module."""
+        module_dir = Path(__file__).parent
+        return module_dir / "schema.sql"
+
+    def _init_tables(self, conn: sqlite3.Connection) -> None:
+        """
+        Create the database tables by executing ``schema.sql`` on ``conn``.
+
+        Raises:
+            FileNotFoundError: when the schema file does not exist.
+        """
+        schema_file = self._get_schema_path()
+
+        if not schema_file.exists():
+            raise FileNotFoundError(f"Schema file not found: {schema_file}")
+
+        with open(schema_file, "r", encoding="utf-8") as handle:
+            script = handle.read()
+        conn.executescript(script)
+
+        conn.commit()
+
+    def save_news_data(self, data: NewsData) -> bool:
+        """
+        Store one crawl in the day's SQLite database and upload the file.
+
+        Flow: open the database for ``data.date`` (downloading it if needed)
+        -> insert new items / update existing ones -> record crawl metadata
+        -> commit -> upload the database file.
+
+        Items with a URL are matched against existing rows by
+        (url, platform_id); a differing title is logged in ``title_changes``.
+        Items without a URL are always inserted as new rows (no de-duplication).
+
+        Args:
+            data: News data of the crawl
+
+        Returns:
+            True when the data was committed and uploaded. False when the
+            upload fails, or when an error occurs while saving (in which case
+            the pending, uncommitted changes are rolled back).
+        """
+        conn = None
+        try:
+            conn = self._get_connection(data.date)
+            cursor = conn.cursor()
+
+            # Number of rows already stored, reported when merging
+            cursor.execute("SELECT COUNT(*) as count FROM news_items")
+            row = cursor.fetchone()
+            existing_count = row[0] if row else 0
+            if existing_count > 0:
+                print(f"[远程存储] 已有 {existing_count} 条历史记录,将合并新数据")
+
+            # Current time in the configured timezone
+            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
+
+            # Sync platform info into the platforms table first
+            for source_id, source_name in data.id_to_name.items():
+                cursor.execute("""
+                    INSERT INTO platforms (id, name, updated_at)
+                    VALUES (?, ?, ?)
+                    ON CONFLICT(id) DO UPDATE SET
+                        name = excluded.name,
+                        updated_at = excluded.updated_at
+                """, (source_id, source_name, now_str))
+
+            # Counters for the summary log
+            new_count = 0
+            updated_count = 0
+            title_changed_count = 0
+            success_sources = []
+
+            for source_id, news_list in data.items.items():
+                success_sources.append(source_id)
+
+                for item in news_list:
+                    try:
+                        # Only items with a URL can be matched to a stored row
+                        existing = None
+                        if item.url:
+                            cursor.execute("""
+                                SELECT id, title FROM news_items
+                                WHERE url = ? AND platform_id = ?
+                            """, (item.url, source_id))
+                            existing = cursor.fetchone()
+
+                        if existing:
+                            existing_id, existing_title = existing
+
+                            # Keep a log entry when the title changed
+                            if existing_title != item.title:
+                                cursor.execute("""
+                                    INSERT INTO title_changes
+                                    (news_item_id, old_title, new_title, changed_at)
+                                    VALUES (?, ?, ?, ?)
+                                """, (existing_id, existing_title, item.title, now_str))
+                                title_changed_count += 1
+
+                            # Append to the rank history
+                            cursor.execute("""
+                                INSERT INTO rank_history
+                                (news_item_id, rank, crawl_time, created_at)
+                                VALUES (?, ?, ?, ?)
+                            """, (existing_id, item.rank, data.crawl_time, now_str))
+
+                            # Refresh the stored row
+                            cursor.execute("""
+                                UPDATE news_items SET
+                                    title = ?,
+                                    rank = ?,
+                                    mobile_url = ?,
+                                    last_crawl_time = ?,
+                                    crawl_count = crawl_count + 1,
+                                    updated_at = ?
+                                WHERE id = ?
+                            """, (item.title, item.rank, item.mobile_url,
+                                  data.crawl_time, now_str, existing_id))
+                            updated_count += 1
+                        else:
+                            # Unknown URL, or no URL at all: insert a new row
+                            cursor.execute("""
+                                INSERT INTO news_items
+                                (title, platform_id, rank, url, mobile_url,
+                                 first_crawl_time, last_crawl_time, crawl_count,
+                                 created_at, updated_at)
+                                VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
+                            """, (item.title, source_id, item.rank, item.url,
+                                  item.mobile_url, data.crawl_time, data.crawl_time,
+                                  now_str, now_str))
+                            new_id = cursor.lastrowid
+                            # Record the initial rank
+                            cursor.execute("""
+                                INSERT INTO rank_history
+                                (news_item_id, rank, crawl_time, created_at)
+                                VALUES (?, ?, ?, ?)
+                            """, (new_id, item.rank, data.crawl_time, now_str))
+                            new_count += 1
+
+                    except sqlite3.Error as e:
+                        # A single bad item must not abort the whole crawl
+                        print(f"[远程存储] 保存新闻条目失败 [{item.title[:30]}...]: {e}")
+
+            total_items = new_count + updated_count
+
+            # Record the crawl itself
+            cursor.execute("""
+                INSERT OR REPLACE INTO crawl_records
+                (crawl_time, total_items, created_at)
+                VALUES (?, ?, ?)
+            """, (data.crawl_time, total_items, now_str))
+
+            # Look up the ID of the crawl record just written
+            cursor.execute("""
+                SELECT id FROM crawl_records WHERE crawl_time = ?
+            """, (data.crawl_time,))
+            record_row = cursor.fetchone()
+            if record_row:
+                crawl_record_id = record_row[0]
+
+                # Sources that were crawled successfully
+                for source_id in success_sources:
+                    cursor.execute("""
+                        INSERT OR REPLACE INTO crawl_source_status
+                        (crawl_record_id, platform_id, status)
+                        VALUES (?, ?, 'success')
+                    """, (crawl_record_id, source_id))
+
+                # Sources that failed
+                for failed_id in data.failed_ids:
+                    # Make sure failed platforms exist in the platforms table too
+                    cursor.execute("""
+                        INSERT OR IGNORE INTO platforms (id, name, updated_at)
+                        VALUES (?, ?, ?)
+                    """, (failed_id, failed_id, now_str))
+
+                    cursor.execute("""
+                        INSERT OR REPLACE INTO crawl_source_status
+                        (crawl_record_id, platform_id, status)
+                        VALUES (?, ?, 'failed')
+                    """, (crawl_record_id, failed_id))
+
+            conn.commit()
+
+            # Total number of rows after merging
+            cursor.execute("SELECT COUNT(*) as count FROM news_items")
+            row = cursor.fetchone()
+            final_count = row[0] if row else 0
+
+            # Detailed storage statistics
+            log_parts = [f"[远程存储] 处理完成:新增 {new_count} 条"]
+            if updated_count > 0:
+                log_parts.append(f"更新 {updated_count} 条")
+            if title_changed_count > 0:
+                log_parts.append(f"标题变更 {title_changed_count} 条")
+            log_parts.append(f"(去重后总计: {final_count} 条)")
+            print(",".join(log_parts))
+
+            # Upload to remote storage
+            if self._upload_sqlite(data.date):
+                print(f"[远程存储] 数据已同步到 R2")
+                return True
+            else:
+                print(f"[远程存储] 上传 R2 失败")
+                return False
+
+        except Exception as e:
+            print(f"[远程存储] 保存失败: {e}")
+            # The connection is cached and reused: discard the half-written
+            # transaction so a later commit cannot persist partial data
+            if conn is not None:
+                try:
+                    conn.rollback()
+                except sqlite3.Error as rollback_error:
+                    print(f"[远程存储] 回滚失败: {rollback_error}")
+            return False
+
+    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """
+        Load all stored news items for ``date`` (merged across crawls).
+
+        Each item carries its de-duplicated rank history ordered by crawl
+        time, falling back to the current rank when no history is stored.
+
+        Args:
+            date: Date string
+
+        Returns:
+            NewsData grouped by platform, or None when the database holds no
+            news items or reading fails.
+        """
+        try:
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            # All news items (id included so the rank history can be attached)
+            cursor.execute("""
+                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
+                       n.rank, n.url, n.mobile_url,
+                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
+                FROM news_items n
+                LEFT JOIN platforms p ON n.platform_id = p.id
+                ORDER BY n.platform_id, n.last_crawl_time
+            """)
+
+            rows = cursor.fetchall()
+            if not rows:
+                return None
+
+            # Rank history for every item in one query. No IN (...) filter is
+            # used: the item list above is the whole news_items table, and
+            # binding one parameter per item can exceed SQLite's limit on the
+            # number of bound variables on days with many items.
+            rank_history_map: Dict[int, List[int]] = {}
+            cursor.execute("""
+                SELECT news_item_id, rank FROM rank_history
+                ORDER BY news_item_id, crawl_time
+            """)
+            for rh_row in cursor.fetchall():
+                news_id, rank = rh_row[0], rh_row[1]
+                seen_ranks = rank_history_map.setdefault(news_id, [])
+                # Keep each rank once, in order of first appearance
+                if rank not in seen_ranks:
+                    seen_ranks.append(rank)
+
+            # Group by platform_id
+            items: Dict[str, List[NewsItem]] = {}
+            id_to_name: Dict[str, str] = {}
+            crawl_date = self._format_date_folder(date)
+
+            for row in rows:
+                news_id = row[0]
+                platform_id = row[2]
+                title = row[1]
+                # The LEFT JOIN yields no name for unknown platforms
+                platform_name = row[3] or platform_id
+
+                id_to_name[platform_id] = platform_name
+
+                # Rank history, or the current rank when none is stored
+                ranks = rank_history_map.get(news_id, [row[4]])
+
+                items.setdefault(platform_id, []).append(NewsItem(
+                    title=title,
+                    source_id=platform_id,
+                    source_name=platform_name,
+                    rank=row[4],
+                    url=row[5] or "",
+                    mobile_url=row[6] or "",
+                    crawl_time=row[8],  # last_crawl_time
+                    ranks=ranks,
+                    first_time=row[7],  # first_crawl_time
+                    last_time=row[8],   # last_crawl_time
+                    count=row[9],       # crawl_count
+                ))
+
+            # Sources that failed in any crawl stored in this database
+            cursor.execute("""
+                SELECT DISTINCT css.platform_id
+                FROM crawl_source_status css
+                JOIN crawl_records cr ON css.crawl_record_id = cr.id
+                WHERE css.status = 'failed'
+            """)
+            failed_ids = [row[0] for row in cursor.fetchall()]
+
+            # Most recent crawl time
+            cursor.execute("""
+                SELECT crawl_time FROM crawl_records
+                ORDER BY crawl_time DESC
+                LIMIT 1
+            """)
+
+            time_row = cursor.fetchone()
+            crawl_time = time_row[0] if time_row else self._format_time_filename()
+
+            return NewsData(
+                date=crawl_date,
+                crawl_time=crawl_time,
+                items=items,
+                id_to_name=id_to_name,
+                failed_ids=failed_ids,
+            )
+
+        except Exception as e:
+            print(f"[远程存储] 读取数据失败: {e}")
+            return None
+
+    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
+        """
+        Load the items whose last crawl time equals the most recent crawl.
+
+        Args:
+            date: Date string
+
+        Returns:
+            NewsData for the latest crawl, or None when there is no crawl
+            record, no matching item, or reading fails.
+        """
+        try:
+            cursor = self._get_connection(date).cursor()
+
+            # Most recent crawl time
+            cursor.execute("""
+                SELECT crawl_time FROM crawl_records
+                ORDER BY crawl_time DESC
+                LIMIT 1
+            """)
+
+            newest = cursor.fetchone()
+            if not newest:
+                return None
+            latest_time = newest[0]
+
+            # Items of that crawl; the platform name comes from the JOIN
+            cursor.execute("""
+                SELECT n.title, n.platform_id, p.name as platform_name,
+                       n.rank, n.url, n.mobile_url,
+                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
+                FROM news_items n
+                LEFT JOIN platforms p ON n.platform_id = p.id
+                WHERE n.last_crawl_time = ?
+            """, (latest_time,))
+
+            rows = cursor.fetchall()
+            if not rows:
+                return None
+
+            items: Dict[str, List[NewsItem]] = {}
+            id_to_name: Dict[str, str] = {}
+            crawl_date = self._format_date_folder(date)
+
+            for (title, platform_id, joined_name, rank, url, mobile_url,
+                 first_seen, last_seen, crawl_count) in rows:
+                # Fall back to the id when the platform has no stored name
+                platform_name = joined_name or platform_id
+                id_to_name[platform_id] = platform_name
+
+                items.setdefault(platform_id, []).append(NewsItem(
+                    title=title,
+                    source_id=platform_id,
+                    source_name=platform_name,
+                    rank=rank,
+                    url=url or "",
+                    mobile_url=mobile_url or "",
+                    crawl_time=last_seen,
+                    ranks=[rank],
+                    first_time=first_seen,
+                    last_time=last_seen,
+                    count=crawl_count,
+                ))
+
+            # Sources that failed in this latest crawl
+            cursor.execute("""
+                SELECT css.platform_id
+                FROM crawl_source_status css
+                JOIN crawl_records cr ON css.crawl_record_id = cr.id
+                WHERE cr.crawl_time = ? AND css.status = 'failed'
+            """, (latest_time,))
+
+            failed_ids = [failed_row[0] for failed_row in cursor.fetchall()]
+
+            return NewsData(
+                date=crawl_date,
+                crawl_time=latest_time,
+                items=items,
+                id_to_name=id_to_name,
+                failed_ids=failed_ids,
+            )
+
+        except Exception as e:
+            print(f"[远程存储] 获取最新数据失败: {e}")
+            return None
+
+    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
+        """
+        Find titles in ``current_data`` that are not stored yet for its date.
+
+        Returns:
+            Mapping ``source_id -> {title: item}``. Without stored data every
+            source of ``current_data`` is included with all of its items;
+            otherwise only sources that have at least one new title appear.
+            An empty dict is returned when detection fails.
+        """
+        try:
+            history = self.get_today_all_data(current_data.date)
+
+            # Nothing stored for the day: every current title counts as new
+            if not history:
+                return {
+                    source_id: {item.title: item for item in news_list}
+                    for source_id, news_list in current_data.items.items()
+                }
+
+            seen_by_source: Dict[str, set] = {
+                source_id: {item.title for item in news_list}
+                for source_id, news_list in history.items.items()
+            }
+
+            fresh: Dict[str, Dict] = {}
+            for source_id, news_list in current_data.items.items():
+                seen = seen_by_source.get(source_id, set())
+                for item in news_list:
+                    if item.title in seen:
+                        continue
+                    fresh.setdefault(source_id, {})[item.title] = item
+
+            return fresh
+
+        except Exception as e:
+            print(f"[远程存储] 检测新标题失败: {e}")
+            return {}
+
+    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
+        """
+        Write a TXT snapshot of ``data`` into the working directory.
+
+        Returns:
+            Path of the written file, or None when TXT snapshots are disabled
+            (the default in remote mode) or writing fails.
+        """
+        if not self.enable_txt:
+            return None
+
+        # Enabled: the snapshot goes to the local working directory
+        try:
+            txt_dir = self.temp_dir / self._format_date_folder(data.date) / "txt"
+            txt_dir.mkdir(parents=True, exist_ok=True)
+
+            file_path = txt_dir / f"{data.crawl_time}.txt"
+
+            with open(file_path, "w", encoding="utf-8") as out:
+                for source_id, news_list in data.items.items():
+                    source_name = data.id_to_name.get(source_id, source_id)
+
+                    # Section header: the id, plus the name when it differs
+                    has_distinct_name = bool(source_name) and source_name != source_id
+                    if has_distinct_name:
+                        out.write(f"{source_id} | {source_name}\n")
+                    else:
+                        out.write(f"{source_id}\n")
+
+                    # One line per item, ordered by rank
+                    for item in sorted(news_list, key=lambda entry: entry.rank):
+                        segments = [f"{item.rank}. {item.title}"]
+                        if item.url:
+                            segments.append(f" [URL:{item.url}]")
+                        if item.mobile_url:
+                            segments.append(f" [MOBILE:{item.mobile_url}]")
+                        segments.append("\n")
+                        out.write("".join(segments))
+
+                    out.write("\n")
+
+                if data.failed_ids:
+                    out.write("==== 以下ID请求失败 ====\n")
+                    for failed_id in data.failed_ids:
+                        out.write(f"{failed_id}\n")
+
+            print(f"[远程存储] TXT 快照已保存: {file_path}")
+            return str(file_path)
+
+        except Exception as e:
+            print(f"[远程存储] 保存 TXT 快照失败: {e}")
+            return None
+
+    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
+        """
+        Write an HTML report into ``<temp_dir>/<today's date folder>/html``.
+
+        ``is_summary`` is accepted but not used by this backend.
+
+        Returns:
+            Path of the written file, or None when HTML reports are disabled
+            or writing fails.
+        """
+        if not self.enable_html:
+            return None
+
+        try:
+            html_dir = self.temp_dir / self._format_date_folder() / "html"
+            html_dir.mkdir(parents=True, exist_ok=True)
+
+            target = html_dir / filename
+            with open(target, "w", encoding="utf-8") as out:
+                out.write(html_content)
+
+            print(f"[远程存储] HTML 报告已保存: {target}")
+            return str(target)
+
+        except Exception as e:
+            print(f"[远程存储] 保存 HTML 报告失败: {e}")
+            return None
+
+    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
+        """
+        Report whether the database for ``date`` holds at most one crawl record.
+
+        Returns:
+            True when there are zero or one crawl records, and also when the
+            check itself fails.
+        """
+        try:
+            cursor = self._get_connection(date).cursor()
+
+            cursor.execute("""
+                SELECT COUNT(*) as count FROM crawl_records
+            """)
+
+            row = cursor.fetchone()
+            if not row:
+                # No row is treated like a count of zero
+                return True
+            return row[0] <= 1
+
+        except Exception as e:
+            print(f"[远程存储] 检查首次抓取失败: {e}")
+            return True
+
+    def cleanup(self) -> None:
+        """Release resources: close database connections and delete the working directory."""
+        # Nothing can be done safely while the interpreter is shutting down
+        if sys.meta_path is None:
+            return
+
+        # Close every cached database connection; attributes are read
+        # defensively because the instance may be only partially initialised
+        connections = getattr(self, "_db_connections", {})
+        for db_path, conn in list(connections.items()):
+            try:
+                conn.close()
+                print(f"[远程存储] 关闭数据库连接: {db_path}")
+            except Exception as e:
+                print(f"[远程存储] 关闭连接失败 {db_path}: {e}")
+
+        if connections:
+            connections.clear()
+
+        # Remove the working directory
+        workdir = getattr(self, "temp_dir", None)
+        if workdir:
+            try:
+                if workdir.exists():
+                    shutil.rmtree(workdir)
+                    print(f"[远程存储] 临时目录已清理: {workdir}")
+            except Exception as e:
+                # Errors raised during interpreter shutdown are ignored
+                if sys.meta_path is not None:
+                    print(f"[远程存储] 清理临时目录失败: {e}")
+
+        downloaded = getattr(self, "_downloaded_files", None)
+        if downloaded:
+            downloaded.clear()
+
+    def cleanup_old_data(self, retention_days: int) -> int:
+        """
+        Delete expired daily database files from remote storage.
+
+        Objects under the ``news/`` prefix named ``YYYY-MM-DD.db`` (or the
+        legacy ``YYYY年MM月DD日.db``) are expired when midnight of that date lies
+        before ``now - retention_days``, where "now" is the wall-clock time in
+        the configured timezone.
+
+        Args:
+            retention_days: Days to keep (0 or less disables the cleanup)
+
+        Returns:
+            Number of distinct dates whose database objects were actually
+            deleted; dates whose deletion failed are not counted.
+        """
+        if retention_days <= 0:
+            return 0
+
+        deleted_count = 0
+        # Compare naive wall-clock values in the configured timezone. Passing
+        # a pytz zone as ``tzinfo=`` to ``datetime(...)`` would apply the
+        # zone's LMT offset, and a hard-coded zone would ignore self.timezone.
+        cutoff_date = (
+            self._get_configured_time() - timedelta(days=retention_days)
+        ).replace(tzinfo=None)
+
+        # (key pattern, label format): ISO names first, legacy Chinese names second
+        key_patterns = (
+            (re.compile(r'news/(\d{4})-(\d{2})-(\d{2})\.db$'), "{}-{}-{}"),
+            (re.compile(r'news/(\d{4})年(\d{2})月(\d{2})日\.db$'), "{}年{}月{}日"),
+        )
+
+        try:
+            # List every object under the news/ prefix
+            paginator = self.s3_client.get_paginator('list_objects_v2')
+            pages = paginator.paginate(Bucket=self.bucket_name, Prefix="news/")
+
+            # (object key, date label) of every expired database
+            expired = []
+
+            for page in pages:
+                for obj in page.get('Contents', []):
+                    key = obj['Key']
+
+                    for pattern, label_format in key_patterns:
+                        date_match = pattern.match(key)
+                        if not date_match:
+                            continue
+                        year, month, day = date_match.groups()
+                        try:
+                            folder_date = datetime(int(year), int(month), int(day))
+                        except ValueError:
+                            # Digits that do not form a valid calendar date
+                            break
+                        if folder_date < cutoff_date:
+                            expired.append((key, label_format.format(year, month, day)))
+                        break
+
+            # Delete in batches (at most 1000 objects per request)
+            if expired:
+                deleted_dates = set()
+                batch_size = 1000
+                for start in range(0, len(expired), batch_size):
+                    batch = expired[start:start + batch_size]
+                    try:
+                        response = self.s3_client.delete_objects(
+                            Bucket=self.bucket_name,
+                            Delete={'Objects': [{'Key': key} for key, _ in batch]}
+                        )
+                    except Exception as e:
+                        print(f"[远程存储] 批量删除失败: {e}")
+                        continue
+
+                    # A successful request can still report per-key failures
+                    errors = response.get('Errors', []) if isinstance(response, dict) else []
+                    failed_keys = {error.get('Key') for error in errors}
+                    for error in errors:
+                        print(f"[远程存储] 删除失败 ({error.get('Key')}): {error.get('Message', '')}")
+
+                    removed = [(key, label) for key, label in batch if key not in failed_keys]
+                    print(f"[远程存储] 删除 {len(removed)} 个对象")
+                    deleted_dates.update(label for _, label in removed)
+
+                deleted_count = len(deleted_dates)
+                for date_str in sorted(deleted_dates):
+                    print(f"[远程存储] 清理过期数据: news/{date_str}.db")
+
+                print(f"[远程存储] 共清理 {deleted_count} 个过期日期数据库文件")
+
+            return deleted_count
+
+        except Exception as e:
+            print(f"[远程存储] 清理过期数据失败: {e}")
+            return deleted_count
+
+    def has_pushed_today(self, date: Optional[str] = None) -> bool:
+        """
+        检查指定日期是否已推送过
+
+        Args:
+            date: 日期字符串(YYYY-MM-DD),默认为今天
+
+        Returns:
+            是否已推送
+        """
+        try:
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            target_date = self._format_date_folder(date)
+
+            cursor.execute("""
+                SELECT pushed FROM push_records WHERE date = ?
+            """, (target_date,))
+
+            row = cursor.fetchone()
+            if row:
+                return bool(row[0])
+            return False
+
+        except Exception as e:
+            print(f"[远程存储] 检查推送记录失败: {e}")
+            return False
+
+    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
+        """
+        记录推送
+
+        Args:
+            report_type: 报告类型
+            date: 日期字符串(YYYY-MM-DD),默认为今天
+
+        Returns:
+            是否记录成功
+        """
+        try:
+            conn = self._get_connection(date)
+            cursor = conn.cursor()
+
+            target_date = self._format_date_folder(date)
+            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
+
+            cursor.execute("""
+                INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
+                VALUES (?, 1, ?, ?, ?)
+                ON CONFLICT(date) DO UPDATE SET
+                    pushed = 1,
+                    push_time = excluded.push_time,
+                    report_type = excluded.report_type
+            """, (target_date, now_str, report_type, now_str))
+
+            conn.commit()
+
+            print(f"[远程存储] 推送记录已保存: {report_type} at {now_str}")
+
+            # 上传到 R2 确保记录持久化
+            if self._upload_sqlite(date):
+                print(f"[远程存储] 推送记录已同步到 R2")
+                return True
+            else:
+                print(f"[远程存储] 推送记录同步到 R2 失败")
+                return False
+
+        except Exception as e:
+            print(f"[远程存储] 记录推送失败: {e}")
+            return False
+
+    def __del__(self):
+        """析构函数"""
+        # 检查 Python 是否正在关闭
+        if sys.meta_path is None:
+            return
+        try:
+            self.cleanup()
+        except Exception:
+            # Python 关闭时可能会出错,忽略即可
+            pass
+
+    def pull_recent_days(self, days: int, local_data_dir: str = "output") -> int:
+        """
+        从远程拉取最近 N 天的数据到本地
+
+        Args:
+            days: 拉取天数
+            local_data_dir: 本地数据目录
+
+        Returns:
+            成功拉取的数据库文件数量
+        """
+        if days <= 0:
+            return 0
+
+        local_dir = Path(local_data_dir)
+        local_dir.mkdir(parents=True, exist_ok=True)
+
+        pulled_count = 0
+        now = self._get_configured_time()
+
+        print(f"[远程存储] 开始拉取最近 {days} 天的数据...")
+
+        for i in range(days):
+            date = now - timedelta(days=i)
+            date_str = date.strftime("%Y-%m-%d")
+
+            # 本地目标路径
+            local_date_dir = local_dir / date_str
+            local_db_path = local_date_dir / "news.db"
+
+            # 如果本地已存在,跳过
+            if local_db_path.exists():
+                print(f"[远程存储] 跳过(本地已存在): {date_str}")
+                continue
+
+            # 远程对象键
+            remote_key = f"news/{date_str}.db"
+
+            # 检查远程是否存在
+            if not self._check_object_exists(remote_key):
+                print(f"[远程存储] 跳过(远程不存在): {date_str}")
+                continue
+
+            # 下载
+            try:
+                local_date_dir.mkdir(parents=True, exist_ok=True)
+                self.s3_client.download_file(
+                    self.bucket_name,
+                    remote_key,
+                    str(local_db_path)
+                )
+                print(f"[远程存储] 已拉取: {remote_key} -> {local_db_path}")
+                pulled_count += 1
+            except Exception as e:
+                print(f"[远程存储] 拉取失败 ({date_str}): {e}")
+
+        print(f"[远程存储] 拉取完成,共下载 {pulled_count} 个数据库文件")
+        return pulled_count
+
+    def list_remote_dates(self) -> List[str]:
+        """
+        列出远程存储中所有可用的日期
+
+        Returns:
+            日期字符串列表(YYYY-MM-DD 格式)
+        """
+        dates = []
+
+        try:
+            paginator = self.s3_client.get_paginator('list_objects_v2')
+            pages = paginator.paginate(Bucket=self.bucket_name, Prefix="news/")
+
+            for page in pages:
+                if 'Contents' not in page:
+                    continue
+
+                for obj in page['Contents']:
+                    key = obj['Key']
+                    # 解析日期
+                    date_match = re.match(r'news/(\d{4}-\d{2}-\d{2})\.db$', key)
+                    if date_match:
+                        dates.append(date_match.group(1))
+
+            return sorted(dates, reverse=True)
+
+        except Exception as e:
+            print(f"[远程存储] 列出远程日期失败: {e}")
+            return []

+ 117 - 0
trendradar/storage/schema.sql

@@ -0,0 +1,117 @@
+-- TrendRadar database schema
+
+-- ============================================
+-- Platform table
+-- Key point: id is stable, name may change
+-- ============================================
+CREATE TABLE IF NOT EXISTS platforms (
+    id TEXT PRIMARY KEY,
+    name TEXT NOT NULL,
+    is_active INTEGER DEFAULT 1,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- ============================================
+-- News items table
+-- Uniquely identified by URL + platform_id to support de-duplicated storage
+-- ============================================
+CREATE TABLE IF NOT EXISTS news_items (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    title TEXT NOT NULL,
+    platform_id TEXT NOT NULL,
+    rank INTEGER NOT NULL,
+    url TEXT DEFAULT '',
+    mobile_url TEXT DEFAULT '',
+    first_crawl_time TEXT NOT NULL,      -- time of the first crawl
+    last_crawl_time TEXT NOT NULL,       -- time of the most recent crawl
+    crawl_count INTEGER DEFAULT 1,       -- number of times crawled
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (platform_id) REFERENCES platforms(id)
+);
+
+-- ============================================
+-- Title change history table
+-- Records title changes observed for the same URL
+-- ============================================
+CREATE TABLE IF NOT EXISTS title_changes (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    news_item_id INTEGER NOT NULL,
+    old_title TEXT NOT NULL,
+    new_title TEXT NOT NULL,
+    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
+);
+
+-- ============================================
+-- Rank history table
+-- Records the rank seen at each crawl
+-- ============================================
+CREATE TABLE IF NOT EXISTS rank_history (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    news_item_id INTEGER NOT NULL,
+    rank INTEGER NOT NULL,
+    crawl_time TEXT NOT NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
+);
+
+-- ============================================
+-- Crawl records table
+-- Records the time and item count of each crawl
+-- ============================================
+CREATE TABLE IF NOT EXISTS crawl_records (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    crawl_time TEXT NOT NULL UNIQUE,
+    total_items INTEGER DEFAULT 0,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- ============================================
+-- Crawl source status table
+-- Records per-platform success/failure for each crawl
+-- ============================================
+CREATE TABLE IF NOT EXISTS crawl_source_status (
+    crawl_record_id INTEGER NOT NULL,
+    platform_id TEXT NOT NULL,
+    status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
+    PRIMARY KEY (crawl_record_id, platform_id),
+    FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
+    FOREIGN KEY (platform_id) REFERENCES platforms(id)
+);
+
+-- ============================================
+-- Push records table
+-- Used by the push_window once_per_day feature
+-- ============================================
+CREATE TABLE IF NOT EXISTS push_records (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    date TEXT NOT NULL UNIQUE,
+    pushed INTEGER DEFAULT 0,
+    push_time TEXT,
+    report_type TEXT,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- ============================================
+-- Index definitions
+-- ============================================
+
+-- Platform index
+CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);
+
+-- Time index (for querying the latest data)
+CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);
+
+-- Title index (for title search)
+CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);
+
+-- Unique index on URL + platform_id (non-empty URLs only; enforces de-duplication)
+CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
+    ON news_items(url, platform_id) WHERE url != '';
+
+-- Crawl status index
+CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);
+
+-- Rank history index
+CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);

+ 20 - 0
trendradar/utils/__init__.py

@@ -0,0 +1,20 @@
+# coding=utf-8
+"""
+Utilities package - shared helper functions
+"""
+
+from trendradar.utils.time import (
+    get_configured_time,
+    format_date_folder,
+    format_time_filename,
+    get_current_time_display,
+    convert_time_for_display,
+)
+
+__all__ = [
+    "get_configured_time",
+    "format_date_folder",
+    "format_time_filename",
+    "get_current_time_display",
+    "convert_time_for_display",
+]

+ 91 - 0
trendradar/utils/time.py

@@ -0,0 +1,91 @@
+# coding=utf-8
+"""
+Time utilities - unified time-handling functions
+"""
+
+from datetime import datetime
+from typing import Optional
+
+import pytz
+
+# Default timezone, used when a caller does not pass one and as the
+# fallback when an unknown timezone name is supplied.
+DEFAULT_TIMEZONE = "Asia/Shanghai"
+
+
+def get_configured_time(timezone: str = DEFAULT_TIMEZONE) -> datetime:
+    """
+    获取配置时区的当前时间
+
+    Args:
+        timezone: 时区名称,如 'Asia/Shanghai', 'America/Los_Angeles'
+
+    Returns:
+        带时区信息的当前时间
+    """
+    try:
+        tz = pytz.timezone(timezone)
+    except pytz.UnknownTimeZoneError:
+        print(f"[警告] 未知时区 '{timezone}',使用默认时区 {DEFAULT_TIMEZONE}")
+        tz = pytz.timezone(DEFAULT_TIMEZONE)
+    return datetime.now(tz)
+
+
+def format_date_folder(
+    date: Optional[str] = None, timezone: str = DEFAULT_TIMEZONE
+) -> str:
+    """
+    格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)
+
+    Args:
+        date: 指定日期字符串,为 None 则使用当前日期
+        timezone: 时区名称
+
+    Returns:
+        格式化后的日期字符串,如 '2025-12-09'
+    """
+    if date:
+        return date
+    return get_configured_time(timezone).strftime("%Y-%m-%d")
+
+
+def format_time_filename(timezone: str = DEFAULT_TIMEZONE) -> str:
+    """
+    格式化时间文件名 (格式: HH-MM,用于文件名)
+
+    Windows 系统不支持冒号作为文件名,因此使用连字符
+
+    Args:
+        timezone: 时区名称
+
+    Returns:
+        格式化后的时间字符串,如 '15-30'
+    """
+    return get_configured_time(timezone).strftime("%H-%M")
+
+
+def get_current_time_display(timezone: str = DEFAULT_TIMEZONE) -> str:
+    """
+    获取当前时间显示 (格式: HH:MM,用于显示)
+
+    Args:
+        timezone: 时区名称
+
+    Returns:
+        格式化后的时间字符串,如 '15:30'
+    """
+    return get_configured_time(timezone).strftime("%H:%M")
+
+
+def convert_time_for_display(time_str: str) -> str:
+    """
+    将 HH-MM 格式转换为 HH:MM 格式用于显示
+
+    Args:
+        time_str: 输入时间字符串,如 '15-30'
+
+    Returns:
+        转换后的时间字符串,如 '15:30'
+    """
+    if time_str and "-" in time_str and len(time_str) == 5:
+        return time_str.replace("-", ":")
+    return time_str

+ 1 - 1
version

@@ -1 +1 @@
-3.5.0
+4.0.0