|
|
@@ -13,7 +13,7 @@ import requests
|
|
|
import pytz
|
|
|
|
|
|
CONFIG = {
|
|
|
- "VERSION": "1.2.1",
|
|
|
+ "VERSION": "1.3.0",
|
|
|
"VERSION_CHECK_URL": "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version",
|
|
|
"SHOW_VERSION_UPDATE": True, # 控制显示版本更新提示,改成 False 将不接受新版本提示
|
|
|
"FEISHU_MESSAGE_SEPARATOR": "━━━━━━━━━━━━━━━━━━━", # feishu消息分割线
|
|
|
@@ -22,7 +22,10 @@ CONFIG = {
|
|
|
"RANK_THRESHOLD": 5, # 排名高亮阈值
|
|
|
"USE_PROXY": True, # 是否启用代理
|
|
|
"DEFAULT_PROXY": "http://127.0.0.1:10086",
|
|
|
- "CONTINUE_WITHOUT_WEBHOOK": True, # 控制在没有webhook URL时是否继续执行爬虫
|
|
|
+ "ENABLE_CRAWLER": True, # 是否启用爬取新闻功能,False时直接停止程序
|
|
|
+ "ENABLE_NOTIFICATION": True, # 是否启用通知功能,False时不发送手机通知
|
|
|
+ "MESSAGE_BATCH_SIZE": 4000, # 消息分批大小(字节)
|
|
|
+ "BATCH_SEND_INTERVAL": 1, # 批次发送间隔(秒)
|
|
|
# 飞书机器人的 webhook URL
|
|
|
"FEISHU_WEBHOOK_URL": "",
|
|
|
# 钉钉机器人的 webhook URL
|
|
|
@@ -274,8 +277,7 @@ class DataProcessor:
|
|
|
|
|
|
files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
|
|
|
if len(files) < 2:
|
|
|
- if len(files) == 1:
|
|
|
- return DataProcessor._parse_file_titles(files[0])
|
|
|
+ # 如果只有一个文件(第一次爬取),没有"新增"的概念,返回空字典
|
|
|
return {}
|
|
|
|
|
|
latest_file = files[-1]
|
|
|
@@ -307,6 +309,7 @@ class DataProcessor:
|
|
|
if alias == source_name:
|
|
|
source_id = id_val
|
|
|
break
|
|
|
+
|
|
|
if source_id:
|
|
|
new_titles[source_id] = source_new_titles
|
|
|
|
|
|
@@ -849,7 +852,20 @@ class StatisticsCalculator:
|
|
|
)
|
|
|
|
|
|
source_alias = id_to_alias.get(source_id, source_id)
|
|
|
- is_new = source_id in new_titles and title in new_titles[source_id]
|
|
|
+
|
|
|
+ # 修复is_new判断逻辑,添加容错处理
|
|
|
+ is_new = False
|
|
|
+ if new_titles and source_id in new_titles:
|
|
|
+ new_titles_for_source = new_titles[source_id]
|
|
|
+ if title in new_titles_for_source:
|
|
|
+ is_new = True
|
|
|
+ else:
|
|
|
+ # 如果直接匹配失败,尝试去除首尾空格后匹配
|
|
|
+ title_stripped = title.strip()
|
|
|
+ for new_title in new_titles_for_source.keys():
|
|
|
+ if title_stripped == new_title.strip():
|
|
|
+ is_new = True
|
|
|
+ break
|
|
|
|
|
|
word_stats[group_key]["titles"][source_id].append(
|
|
|
{
|
|
|
@@ -1630,191 +1646,338 @@ class ReportGenerator:
|
|
|
return text_content
|
|
|
|
|
|
@staticmethod
|
|
|
- def _render_wework_content(
|
|
|
- report_data: Dict, update_info: Optional[Dict] = None
|
|
|
- ) -> str:
|
|
|
- """渲染企业微信内容"""
|
|
|
- text_content = ""
|
|
|
+ def _split_content_into_batches(
|
|
|
+ report_data: Dict,
|
|
|
+ format_type: str,
|
|
|
+ update_info: Optional[Dict] = None,
|
|
|
+ max_bytes: int = CONFIG["MESSAGE_BATCH_SIZE"],
|
|
|
+ ) -> List[str]:
|
|
|
+ """分批处理消息内容,确保词组标题+至少第一条新闻的完整性"""
|
|
|
+ batches = []
|
|
|
|
|
|
- # 计算总标题数
|
|
|
+ # 基础信息构建
|
|
|
total_titles = sum(
|
|
|
len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
|
|
|
)
|
|
|
now = TimeHelper.get_beijing_time()
|
|
|
|
|
|
- # 顶部统计信息
|
|
|
- text_content += f"**总新闻数:** {total_titles}\n\n"
|
|
|
- text_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
|
|
|
- text_content += f"**类型:** 热点分析报告\n\n\n\n"
|
|
|
+ base_header = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ base_header = f"总新闻数: {total_titles}\n\n"
|
|
|
|
|
|
- # 渲染热点词汇统计
|
|
|
- if report_data["stats"]:
|
|
|
- text_content += "📊 **热点词汇统计**\n\n"
|
|
|
+ base_footer = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
|
|
|
+ if update_info:
|
|
|
+ base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
|
|
|
+ if update_info:
|
|
|
+ base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
|
|
|
|
|
|
+ stats_header = ""
|
|
|
+ if report_data["stats"]:
|
|
|
+ if format_type == "wework":
|
|
|
+ stats_header = "📊 **热点词汇统计**\n\n"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ stats_header = "📊 热点词汇统计\n\n"
|
|
|
+
|
|
|
+ current_batch = base_header
|
|
|
+ current_batch_has_content = False
|
|
|
+
|
|
|
+ # 空内容处理
|
|
|
+ if (
|
|
|
+ not report_data["stats"]
|
|
|
+ and not report_data["new_titles"]
|
|
|
+ and not report_data["failed_ids"]
|
|
|
+ ):
|
|
|
+ simple_content = "📭 暂无匹配的热点词汇\n\n"
|
|
|
+ final_content = base_header + simple_content + base_footer
|
|
|
+ batches.append(final_content)
|
|
|
+ return batches
|
|
|
+
|
|
|
+ # 处理热点词汇统计
|
|
|
+ if report_data["stats"]:
|
|
|
total_count = len(report_data["stats"])
|
|
|
|
|
|
+ # 添加统计标题
|
|
|
+ test_content = current_batch + stats_header
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
|
|
|
+ < max_bytes
|
|
|
+ ):
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
+ else:
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = base_header + stats_header
|
|
|
+ current_batch_has_content = True
|
|
|
+
|
|
|
+ # 逐个处理词组(确保词组标题+第一条新闻的原子性)
|
|
|
for i, stat in enumerate(report_data["stats"]):
|
|
|
word = stat["word"]
|
|
|
count = stat["count"]
|
|
|
-
|
|
|
sequence_display = f"[{i + 1}/{total_count}]"
|
|
|
|
|
|
- if count >= 10:
|
|
|
- text_content += (
|
|
|
- f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
|
|
|
- )
|
|
|
- elif count >= 5:
|
|
|
- text_content += (
|
|
|
- f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
|
|
|
- )
|
|
|
+ # 构建词组标题
|
|
|
+ word_header = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ if count >= 10:
|
|
|
+ word_header = (
|
|
|
+ f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
|
|
|
+ )
|
|
|
+ elif count >= 5:
|
|
|
+ word_header = (
|
|
|
+ f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ word_header = (
|
|
|
+ f"📌 {sequence_display} **{word}** : {count} 条\n\n"
|
|
|
+ )
|
|
|
+ elif format_type == "telegram":
|
|
|
+ if count >= 10:
|
|
|
+ word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
|
|
|
+ elif count >= 5:
|
|
|
+ word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
|
|
|
+ else:
|
|
|
+ word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
|
|
|
+
|
|
|
+ # 构建第一条新闻
|
|
|
+ first_news_line = ""
|
|
|
+ if stat["titles"]:
|
|
|
+ first_title_data = stat["titles"][0]
|
|
|
+ if format_type == "wework":
|
|
|
+ formatted_title = ReportGenerator._format_title_wework(
|
|
|
+ first_title_data, show_source=True
|
|
|
+ )
|
|
|
+ elif format_type == "telegram":
|
|
|
+ formatted_title = ReportGenerator._format_title_telegram(
|
|
|
+ first_title_data, show_source=True
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ formatted_title = f"{first_title_data['title']}"
|
|
|
+
|
|
|
+ first_news_line = f" 1. {formatted_title}\n"
|
|
|
+ if len(stat["titles"]) > 1:
|
|
|
+ first_news_line += "\n"
|
|
|
+
|
|
|
+ # 原子性检查:词组标题+第一条新闻必须一起处理
|
|
|
+ word_with_first_news = word_header + first_news_line
|
|
|
+ test_content = current_batch + word_with_first_news
|
|
|
+
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ # 当前批次容纳不下,开启新批次
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = base_header + stats_header + word_with_first_news
|
|
|
+ current_batch_has_content = True
|
|
|
+ start_index = 1
|
|
|
else:
|
|
|
- text_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
+ start_index = 1
|
|
|
+
|
|
|
+ # 处理剩余新闻条目
|
|
|
+ for j in range(start_index, len(stat["titles"])):
|
|
|
+ title_data = stat["titles"][j]
|
|
|
+ if format_type == "wework":
|
|
|
+ formatted_title = ReportGenerator._format_title_wework(
|
|
|
+ title_data, show_source=True
|
|
|
+ )
|
|
|
+ elif format_type == "telegram":
|
|
|
+ formatted_title = ReportGenerator._format_title_telegram(
|
|
|
+ title_data, show_source=True
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ formatted_title = f"{title_data['title']}"
|
|
|
|
|
|
- for j, title_data in enumerate(stat["titles"], 1):
|
|
|
- formatted_title = ReportGenerator._format_title_wework(
|
|
|
- title_data, show_source=True
|
|
|
- )
|
|
|
- text_content += f" {j}. {formatted_title}\n"
|
|
|
+ news_line = f" {j + 1}. {formatted_title}\n"
|
|
|
+ if j < len(stat["titles"]) - 1:
|
|
|
+ news_line += "\n"
|
|
|
|
|
|
- if j < len(stat["titles"]):
|
|
|
- text_content += "\n"
|
|
|
+ test_content = current_batch + news_line
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8"))
|
|
|
+ + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = (
|
|
|
+ base_header + stats_header + word_header + news_line
|
|
|
+ )
|
|
|
+ current_batch_has_content = True
|
|
|
+ else:
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
|
|
|
+ # 词组间分隔符
|
|
|
if i < len(report_data["stats"]) - 1:
|
|
|
- text_content += f"\n\n\n\n"
|
|
|
+ separator = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ separator = f"\n\n\n\n"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ separator = f"\n\n"
|
|
|
|
|
|
- if not report_data["stats"]:
|
|
|
- text_content += "📭 暂无匹配的热点词汇\n\n"
|
|
|
+ test_content = current_batch + separator
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8"))
|
|
|
+ + len(base_footer.encode("utf-8"))
|
|
|
+ < max_bytes
|
|
|
+ ):
|
|
|
+ current_batch = test_content
|
|
|
|
|
|
- # 渲染新增新闻部分
|
|
|
+ # 处理新增新闻(同样确保来源标题+第一条新闻的原子性)
|
|
|
if report_data["new_titles"]:
|
|
|
- if text_content and "暂无匹配" not in text_content:
|
|
|
- text_content += f"\n\n\n\n"
|
|
|
-
|
|
|
- text_content += (
|
|
|
- f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
|
|
|
- )
|
|
|
+ new_header = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ new_header = f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
|
|
|
+
|
|
|
+ test_content = current_batch + new_header
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = base_header + new_header
|
|
|
+ current_batch_has_content = True
|
|
|
+ else:
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
|
|
|
+ # 逐个处理新增新闻来源
|
|
|
for source_data in report_data["new_titles"]:
|
|
|
- text_content += f"**{source_data['source_alias']}** ({len(source_data['titles'])} 条):\n\n"
|
|
|
-
|
|
|
- for j, title_data in enumerate(source_data["titles"], 1):
|
|
|
- title_data_copy = title_data.copy()
|
|
|
+ source_header = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ source_header = f"**{source_data['source_alias']}** ({len(source_data['titles'])} 条):\n\n"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ source_header = f"{source_data['source_alias']} ({len(source_data['titles'])} 条):\n\n"
|
|
|
+
|
|
|
+ # 构建第一条新增新闻
|
|
|
+ first_news_line = ""
|
|
|
+ if source_data["titles"]:
|
|
|
+ first_title_data = source_data["titles"][0]
|
|
|
+ title_data_copy = first_title_data.copy()
|
|
|
title_data_copy["is_new"] = False
|
|
|
- formatted_title = ReportGenerator._format_title_wework(
|
|
|
- title_data_copy, show_source=False
|
|
|
- )
|
|
|
- text_content += f" {j}. {formatted_title}\n"
|
|
|
-
|
|
|
- text_content += "\n"
|
|
|
-
|
|
|
- # 渲染失败平台
|
|
|
- if report_data["failed_ids"]:
|
|
|
- if text_content and "暂无匹配" not in text_content:
|
|
|
- text_content += f"\n\n\n\n"
|
|
|
-
|
|
|
- text_content += "⚠️ **数据获取失败的平台:**\n\n"
|
|
|
- for i, id_value in enumerate(report_data["failed_ids"], 1):
|
|
|
- text_content += f" • {id_value}\n"
|
|
|
-
|
|
|
- # 添加时间戳
|
|
|
- text_content += f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
|
|
|
-
|
|
|
- # 版本更新提示
|
|
|
- if update_info:
|
|
|
- text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
|
|
|
|
|
|
- return text_content
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def _render_telegram_content(
|
|
|
- report_data: Dict, update_info: Optional[Dict] = None
|
|
|
- ) -> str:
|
|
|
- """渲染Telegram内容"""
|
|
|
- text_content = ""
|
|
|
-
|
|
|
- # 计算总标题数
|
|
|
- total_titles = sum(
|
|
|
- len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
|
|
|
- )
|
|
|
- now = TimeHelper.get_beijing_time()
|
|
|
-
|
|
|
- # 顶部统计信息
|
|
|
- text_content += f"总新闻数: {total_titles}\n"
|
|
|
- text_content += f"时间: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
|
|
|
- text_content += f"类型: 热点分析报告\n\n"
|
|
|
-
|
|
|
- # 渲染热点词汇统计
|
|
|
- if report_data["stats"]:
|
|
|
- text_content += "📊 热点词汇统计\n\n"
|
|
|
+ if format_type == "wework":
|
|
|
+ formatted_title = ReportGenerator._format_title_wework(
|
|
|
+ title_data_copy, show_source=False
|
|
|
+ )
|
|
|
+ elif format_type == "telegram":
|
|
|
+ formatted_title = ReportGenerator._format_title_telegram(
|
|
|
+ title_data_copy, show_source=False
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ formatted_title = f"{title_data_copy['title']}"
|
|
|
|
|
|
- total_count = len(report_data["stats"])
|
|
|
+ first_news_line = f" 1. {formatted_title}\n"
|
|
|
|
|
|
- for i, stat in enumerate(report_data["stats"]):
|
|
|
- word = stat["word"]
|
|
|
- count = stat["count"]
|
|
|
-
|
|
|
- sequence_display = f"[{i + 1}/{total_count}]"
|
|
|
+ # 原子性检查:来源标题+第一条新闻
|
|
|
+ source_with_first_news = source_header + first_news_line
|
|
|
+ test_content = current_batch + source_with_first_news
|
|
|
|
|
|
- if count >= 10:
|
|
|
- text_content += f"🔥 {sequence_display} {word} : {count} 条\n\n"
|
|
|
- elif count >= 5:
|
|
|
- text_content += f"📈 {sequence_display} {word} : {count} 条\n\n"
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = base_header + new_header + source_with_first_news
|
|
|
+ current_batch_has_content = True
|
|
|
+ start_index = 1
|
|
|
else:
|
|
|
- text_content += f"📌 {sequence_display} {word} : {count} 条\n\n"
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
+ start_index = 1
|
|
|
|
|
|
- for j, title_data in enumerate(stat["titles"], 1):
|
|
|
- formatted_title = ReportGenerator._format_title_telegram(
|
|
|
- title_data, show_source=True
|
|
|
- )
|
|
|
- text_content += f" {j}. {formatted_title}\n"
|
|
|
-
|
|
|
- if j < len(stat["titles"]):
|
|
|
- text_content += "\n"
|
|
|
-
|
|
|
- if i < len(report_data["stats"]) - 1:
|
|
|
- text_content += f"\n\n"
|
|
|
-
|
|
|
- if not report_data["stats"]:
|
|
|
- text_content += "📭 暂无匹配的热点词汇\n\n"
|
|
|
-
|
|
|
- # 渲染新增新闻部分
|
|
|
- if report_data["new_titles"]:
|
|
|
- if text_content and "暂无匹配" not in text_content:
|
|
|
- text_content += f"\n\n"
|
|
|
+ # 处理剩余新增新闻
|
|
|
+ for j in range(start_index, len(source_data["titles"])):
|
|
|
+ title_data = source_data["titles"][j]
|
|
|
+ title_data_copy = title_data.copy()
|
|
|
+ title_data_copy["is_new"] = False
|
|
|
|
|
|
- text_content += (
|
|
|
- f"🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
|
|
|
- )
|
|
|
+ if format_type == "wework":
|
|
|
+ formatted_title = ReportGenerator._format_title_wework(
|
|
|
+ title_data_copy, show_source=False
|
|
|
+ )
|
|
|
+ elif format_type == "telegram":
|
|
|
+ formatted_title = ReportGenerator._format_title_telegram(
|
|
|
+ title_data_copy, show_source=False
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ formatted_title = f"{title_data_copy['title']}"
|
|
|
|
|
|
- for source_data in report_data["new_titles"]:
|
|
|
- text_content += f"{source_data['source_alias']} ({len(source_data['titles'])} 条):\n\n"
|
|
|
+ news_line = f" {j + 1}. {formatted_title}\n"
|
|
|
|
|
|
- for j, title_data in enumerate(source_data["titles"], 1):
|
|
|
- title_data_copy = title_data.copy()
|
|
|
- title_data_copy["is_new"] = False
|
|
|
- formatted_title = ReportGenerator._format_title_telegram(
|
|
|
- title_data_copy, show_source=False
|
|
|
- )
|
|
|
- text_content += f" {j}. {formatted_title}\n"
|
|
|
+ test_content = current_batch + news_line
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8"))
|
|
|
+ + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = (
|
|
|
+ base_header + new_header + source_header + news_line
|
|
|
+ )
|
|
|
+ current_batch_has_content = True
|
|
|
+ else:
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
|
|
|
- text_content += "\n"
|
|
|
+ current_batch += "\n"
|
|
|
|
|
|
- # 渲染失败平台
|
|
|
+ # 处理失败平台
|
|
|
if report_data["failed_ids"]:
|
|
|
- if text_content and "暂无匹配" not in text_content:
|
|
|
- text_content += f"\n\n"
|
|
|
+ failed_header = ""
|
|
|
+ if format_type == "wework":
|
|
|
+ failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
|
|
|
+ elif format_type == "telegram":
|
|
|
+ failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
|
|
|
+
|
|
|
+ test_content = current_batch + failed_header
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = base_header + failed_header
|
|
|
+ current_batch_has_content = True
|
|
|
+ else:
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
|
|
|
- text_content += "⚠️ 数据获取失败的平台:\n\n"
|
|
|
for i, id_value in enumerate(report_data["failed_ids"], 1):
|
|
|
- text_content += f" • {id_value}\n"
|
|
|
+ failed_line = f" • {id_value}\n"
|
|
|
+ test_content = current_batch + failed_line
|
|
|
+ if (
|
|
|
+ len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
|
|
|
+ >= max_bytes
|
|
|
+ ):
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
+ current_batch = base_header + failed_header + failed_line
|
|
|
+ current_batch_has_content = True
|
|
|
+ else:
|
|
|
+ current_batch = test_content
|
|
|
+ current_batch_has_content = True
|
|
|
|
|
|
- text_content += f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
|
|
|
+ # 完成最后批次
|
|
|
+ if current_batch_has_content:
|
|
|
+ batches.append(current_batch + base_footer)
|
|
|
|
|
|
- # 版本更新提示
|
|
|
- if update_info:
|
|
|
- text_content += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
|
|
|
-
|
|
|
- return text_content
|
|
|
+ return batches
|
|
|
|
|
|
@staticmethod
|
|
|
def send_to_webhooks(
|
|
|
@@ -1988,39 +2151,68 @@ class ReportGenerator:
|
|
|
update_info: Optional[Dict] = None,
|
|
|
proxy_url: Optional[str] = None,
|
|
|
) -> bool:
|
|
|
- """发送到企业微信"""
|
|
|
+ """发送到企业微信(支持分批发送)"""
|
|
|
headers = {"Content-Type": "application/json"}
|
|
|
-
|
|
|
- text_content = ReportGenerator._render_wework_content(report_data, update_info)
|
|
|
-
|
|
|
- payload = {"msgtype": "markdown", "markdown": {"content": text_content}}
|
|
|
-
|
|
|
proxies = None
|
|
|
if proxy_url:
|
|
|
proxies = {"http": proxy_url, "https": proxy_url}
|
|
|
|
|
|
- try:
|
|
|
- response = requests.post(
|
|
|
- webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
|
|
|
+ # 获取分批内容
|
|
|
+ batches = ReportGenerator._split_content_into_batches(
|
|
|
+ report_data, "wework", update_info
|
|
|
+ )
|
|
|
+
|
|
|
+ print(f"企业微信消息分为 {len(batches)} 批次发送 [{report_type}]")
|
|
|
+
|
|
|
+ # 逐批发送
|
|
|
+ for i, batch_content in enumerate(batches, 1):
|
|
|
+ batch_size = len(batch_content.encode("utf-8"))
|
|
|
+ print(
|
|
|
+ f"发送企业微信第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
|
|
|
)
|
|
|
- if response.status_code == 200:
|
|
|
- result = response.json()
|
|
|
- if result.get("errcode") == 0:
|
|
|
- print(f"企业微信通知发送成功 [{report_type}]")
|
|
|
- return True
|
|
|
+
|
|
|
+ # 添加批次标识
|
|
|
+ if len(batches) > 1:
|
|
|
+ batch_header = f"**[第 {i}/{len(batches)} 批次]**\n\n"
|
|
|
+ batch_content = batch_header + batch_content
|
|
|
+
|
|
|
+ payload = {"msgtype": "markdown", "markdown": {"content": batch_content}}
|
|
|
+
|
|
|
+ try:
|
|
|
+ response = requests.post(
|
|
|
+ webhook_url,
|
|
|
+ headers=headers,
|
|
|
+ json=payload,
|
|
|
+ proxies=proxies,
|
|
|
+ timeout=30,
|
|
|
+ )
|
|
|
+ if response.status_code == 200:
|
|
|
+ result = response.json()
|
|
|
+ if result.get("errcode") == 0:
|
|
|
+ print(
|
|
|
+ f"企业微信第 {i}/{len(batches)} 批次发送成功 [{report_type}]"
|
|
|
+ )
|
|
|
+ # 批次间间隔
|
|
|
+ if i < len(batches):
|
|
|
+ time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
|
|
|
+ else:
|
|
|
+ print(
|
|
|
+ f"企业微信第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
|
|
|
+ )
|
|
|
+ return False
|
|
|
else:
|
|
|
print(
|
|
|
- f"企业微信通知发送失败 [{report_type}],错误:{result.get('errmsg')}"
|
|
|
+ f"企业微信第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
|
|
|
)
|
|
|
return False
|
|
|
- else:
|
|
|
+ except Exception as e:
|
|
|
print(
|
|
|
- f"企业微信通知发送失败 [{report_type}],状态码:{response.status_code}"
|
|
|
+ f"企业微信第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}"
|
|
|
)
|
|
|
return False
|
|
|
- except Exception as e:
|
|
|
- print(f"企业微信通知发送出错 [{report_type}]:{e}")
|
|
|
- return False
|
|
|
+
|
|
|
+ print(f"企业微信所有 {len(batches)} 批次发送完成 [{report_type}]")
|
|
|
+ return True
|
|
|
|
|
|
@staticmethod
|
|
|
def _send_to_telegram(
|
|
|
@@ -2031,48 +2223,71 @@ class ReportGenerator:
|
|
|
update_info: Optional[Dict] = None,
|
|
|
proxy_url: Optional[str] = None,
|
|
|
) -> bool:
|
|
|
- """发送到Telegram"""
|
|
|
+ """发送到Telegram(支持分批发送)"""
|
|
|
headers = {"Content-Type": "application/json"}
|
|
|
-
|
|
|
- text_content = ReportGenerator._render_telegram_content(
|
|
|
- report_data, update_info
|
|
|
- )
|
|
|
-
|
|
|
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
|
|
|
|
|
|
- payload = {
|
|
|
- "chat_id": chat_id,
|
|
|
- "text": text_content,
|
|
|
- "parse_mode": "HTML",
|
|
|
- "disable_web_page_preview": True,
|
|
|
- }
|
|
|
-
|
|
|
proxies = None
|
|
|
if proxy_url:
|
|
|
proxies = {"http": proxy_url, "https": proxy_url}
|
|
|
|
|
|
- try:
|
|
|
- response = requests.post(
|
|
|
- url, headers=headers, json=payload, proxies=proxies, timeout=30
|
|
|
+ # 获取分批内容
|
|
|
+ batches = ReportGenerator._split_content_into_batches(
|
|
|
+ report_data, "telegram", update_info
|
|
|
+ )
|
|
|
+
|
|
|
+ print(f"Telegram消息分为 {len(batches)} 批次发送 [{report_type}]")
|
|
|
+
|
|
|
+ # 逐批发送
|
|
|
+ for i, batch_content in enumerate(batches, 1):
|
|
|
+ batch_size = len(batch_content.encode("utf-8"))
|
|
|
+ print(
|
|
|
+ f"发送Telegram第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
|
|
|
)
|
|
|
- if response.status_code == 200:
|
|
|
- result = response.json()
|
|
|
- if result.get("ok"):
|
|
|
- print(f"Telegram通知发送成功 [{report_type}]")
|
|
|
- return True
|
|
|
+
|
|
|
+ # 添加批次标识
|
|
|
+ if len(batches) > 1:
|
|
|
+ batch_header = f"<b>[第 {i}/{len(batches)} 批次]</b>\n\n"
|
|
|
+ batch_content = batch_header + batch_content
|
|
|
+
|
|
|
+ payload = {
|
|
|
+ "chat_id": chat_id,
|
|
|
+ "text": batch_content,
|
|
|
+ "parse_mode": "HTML",
|
|
|
+ "disable_web_page_preview": True,
|
|
|
+ }
|
|
|
+
|
|
|
+ try:
|
|
|
+ response = requests.post(
|
|
|
+ url, headers=headers, json=payload, proxies=proxies, timeout=30
|
|
|
+ )
|
|
|
+ if response.status_code == 200:
|
|
|
+ result = response.json()
|
|
|
+ if result.get("ok"):
|
|
|
+ print(
|
|
|
+ f"Telegram第 {i}/{len(batches)} 批次发送成功 [{report_type}]"
|
|
|
+ )
|
|
|
+ # 批次间间隔
|
|
|
+ if i < len(batches):
|
|
|
+ time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
|
|
|
+ else:
|
|
|
+ print(
|
|
|
+ f"Telegram第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}"
|
|
|
+ )
|
|
|
+ return False
|
|
|
else:
|
|
|
print(
|
|
|
- f"Telegram通知发送失败 [{report_type}],错误:{result.get('description')}"
|
|
|
+ f"Telegram第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
|
|
|
)
|
|
|
return False
|
|
|
- else:
|
|
|
+ except Exception as e:
|
|
|
print(
|
|
|
- f"Telegram通知发送失败 [{report_type}],状态码:{response.status_code}"
|
|
|
+ f"Telegram第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}"
|
|
|
)
|
|
|
return False
|
|
|
- except Exception as e:
|
|
|
- print(f"Telegram通知发送出错 [{report_type}]:{e}")
|
|
|
- return False
|
|
|
+
|
|
|
+ print(f"Telegram所有 {len(batches)} 批次发送完成 [{report_type}]")
|
|
|
+ return True
|
|
|
|
|
|
|
|
|
class NewsAnalyzer:
|
|
|
@@ -2160,7 +2375,24 @@ class NewsAnalyzer:
|
|
|
)
|
|
|
print(f"当日HTML统计报告已生成: {html_file}")
|
|
|
|
|
|
- if self.report_type in ["daily", "both"]:
|
|
|
+ # 检查通知配置
|
|
|
+ has_webhook = any(
|
|
|
+ [
|
|
|
+ os.environ.get("FEISHU_WEBHOOK_URL", CONFIG["FEISHU_WEBHOOK_URL"]),
|
|
|
+ os.environ.get("DINGTALK_WEBHOOK_URL", CONFIG["DINGTALK_WEBHOOK_URL"]),
|
|
|
+ os.environ.get("WEWORK_WEBHOOK_URL", CONFIG["WEWORK_WEBHOOK_URL"]),
|
|
|
+ (
|
|
|
+ os.environ.get("TELEGRAM_BOT_TOKEN", CONFIG["TELEGRAM_BOT_TOKEN"])
|
|
|
+ and os.environ.get("TELEGRAM_CHAT_ID", CONFIG["TELEGRAM_CHAT_ID"])
|
|
|
+ ),
|
|
|
+ ]
|
|
|
+ )
|
|
|
+
|
|
|
+ if (
|
|
|
+ CONFIG["ENABLE_NOTIFICATION"]
|
|
|
+ and has_webhook
|
|
|
+ and self.report_type in ["daily", "both"]
|
|
|
+ ):
|
|
|
ReportGenerator.send_to_webhooks(
|
|
|
stats,
|
|
|
[],
|
|
|
@@ -2170,6 +2402,10 @@ class NewsAnalyzer:
|
|
|
self.update_info,
|
|
|
self.proxy_url,
|
|
|
)
|
|
|
+ elif CONFIG["ENABLE_NOTIFICATION"] and not has_webhook:
|
|
|
+ print("⚠️ 警告:通知功能已启用但未配置webhook URL,将跳过通知发送")
|
|
|
+ elif not CONFIG["ENABLE_NOTIFICATION"]:
|
|
|
+ print("跳过当日汇总通知:通知功能已禁用")
|
|
|
|
|
|
return html_file
|
|
|
|
|
|
@@ -2178,6 +2414,10 @@ class NewsAnalyzer:
|
|
|
now = TimeHelper.get_beijing_time()
|
|
|
print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
|
|
|
|
|
|
+ if not CONFIG["ENABLE_CRAWLER"]:
|
|
|
+ print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
|
|
|
+ return
|
|
|
+
|
|
|
# 检查是否配置了任何webhook URL
|
|
|
has_webhook = any(
|
|
|
[
|
|
|
@@ -2191,14 +2431,13 @@ class NewsAnalyzer:
|
|
|
]
|
|
|
)
|
|
|
|
|
|
- if not has_webhook and not CONFIG["CONTINUE_WITHOUT_WEBHOOK"]:
|
|
|
- print(
|
|
|
- "错误: 未配置任何webhook URL且CONTINUE_WITHOUT_WEBHOOK为False,程序退出"
|
|
|
- )
|
|
|
- return
|
|
|
-
|
|
|
- if not has_webhook:
|
|
|
- print("未配置任何webhook URL,将继续执行爬虫但不发送通知")
|
|
|
+ # 通知功能状态检查和提示
|
|
|
+ if not CONFIG["ENABLE_NOTIFICATION"]:
|
|
|
+ print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取")
|
|
|
+ elif not has_webhook:
|
|
|
+ print("未配置任何webhook URL,将只进行数据抓取,不发送通知")
|
|
|
+ else:
|
|
|
+ print("通知功能已启用,将发送webhook通知")
|
|
|
|
|
|
print(f"报告类型: {self.report_type}")
|
|
|
|
|
|
@@ -2259,7 +2498,12 @@ class NewsAnalyzer:
|
|
|
new_titles,
|
|
|
)
|
|
|
|
|
|
- if self.report_type in ["current", "both"]:
|
|
|
+ # 只有启用通知且配置了webhook时才发送通知
|
|
|
+ if (
|
|
|
+ CONFIG["ENABLE_NOTIFICATION"]
|
|
|
+ and has_webhook
|
|
|
+ and self.report_type in ["current", "both"]
|
|
|
+ ):
|
|
|
ReportGenerator.send_to_webhooks(
|
|
|
stats,
|
|
|
failed_ids,
|
|
|
@@ -2269,6 +2513,10 @@ class NewsAnalyzer:
|
|
|
self.update_info,
|
|
|
self.proxy_url,
|
|
|
)
|
|
|
+ elif CONFIG["ENABLE_NOTIFICATION"] and not has_webhook:
|
|
|
+ print("⚠️ 警告:通知功能已启用但未配置webhook URL,将跳过通知发送")
|
|
|
+ elif not CONFIG["ENABLE_NOTIFICATION"]:
|
|
|
+ print("跳过单次爬取通知:通知功能已禁用")
|
|
|
|
|
|
html_file = ReportGenerator.generate_html_report(
|
|
|
stats, total_titles, failed_ids, False, new_titles, id_to_alias
|