splitter.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052
  1. # coding=utf-8
  2. """
  3. 消息分批处理模块
  4. 提供消息内容分批拆分功能,确保消息大小不超过各平台限制
  5. """
  6. from datetime import datetime
  7. from typing import Dict, List, Optional, Callable
  8. from trendradar.report.formatter import format_title_for_platform
  9. from trendradar.utils.time import format_iso_time_friendly
  10. # 默认批次大小配置
  11. DEFAULT_BATCH_SIZES = {
  12. "dingtalk": 20000,
  13. "feishu": 29000,
  14. "ntfy": 3800,
  15. "default": 4000,
  16. }
  17. def split_content_into_batches(
  18. report_data: Dict,
  19. format_type: str,
  20. update_info: Optional[Dict] = None,
  21. max_bytes: Optional[int] = None,
  22. mode: str = "daily",
  23. batch_sizes: Optional[Dict[str, int]] = None,
  24. feishu_separator: str = "---",
  25. reverse_content_order: bool = False,
  26. get_time_func: Optional[Callable[[], datetime]] = None,
  27. rss_items: Optional[list] = None,
  28. rss_new_items: Optional[list] = None,
  29. timezone: str = "Asia/Shanghai",
  30. display_mode: str = "keyword",
  31. ) -> List[str]:
  32. """分批处理消息内容,确保词组标题+至少第一条新闻的完整性(支持热榜+RSS合并)
  33. 热榜统计与RSS统计并列显示,热榜新增与RSS新增并列显示。
  34. reverse_content_order 控制统计和新增的前后顺序。
  35. Args:
  36. report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count
  37. format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
  38. update_info: 版本更新信息(可选)
  39. max_bytes: 最大字节数(可选,如果不指定则使用默认配置)
  40. mode: 报告模式 (daily, incremental, current)
  41. batch_sizes: 批次大小配置字典(可选)
  42. feishu_separator: 飞书消息分隔符
  43. reverse_content_order: 是否反转内容顺序(新增在前,统计在后)
  44. get_time_func: 获取当前时间的函数(可选)
  45. rss_items: RSS 统计条目列表(按源分组,用于合并推送)
  46. rss_new_items: RSS 新增条目列表(可选,用于新增区块)
  47. timezone: 时区名称(用于 RSS 时间格式化)
  48. display_mode: 显示模式 (keyword=按关键词分组, platform=按平台分组)
  49. Returns:
  50. 分批后的消息内容列表
  51. """
  52. # 合并批次大小配置
  53. sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}
  54. if max_bytes is None:
  55. if format_type == "dingtalk":
  56. max_bytes = sizes.get("dingtalk", 20000)
  57. elif format_type == "feishu":
  58. max_bytes = sizes.get("feishu", 29000)
  59. elif format_type == "ntfy":
  60. max_bytes = sizes.get("ntfy", 3800)
  61. else:
  62. max_bytes = sizes.get("default", 4000)
  63. batches = []
  64. total_titles = sum(
  65. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  66. )
  67. now = get_time_func() if get_time_func else datetime.now()
  68. base_header = ""
  69. if format_type in ("wework", "bark"):
  70. base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
  71. elif format_type == "telegram":
  72. base_header = f"总新闻数: {total_titles}\n\n"
  73. elif format_type == "ntfy":
  74. base_header = f"**总新闻数:** {total_titles}\n\n"
  75. elif format_type == "feishu":
  76. base_header = ""
  77. elif format_type == "dingtalk":
  78. base_header = f"**总新闻数:** {total_titles}\n\n"
  79. base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
  80. base_header += f"**类型:** 热点分析报告\n\n"
  81. base_header += "---\n\n"
  82. elif format_type == "slack":
  83. base_header = f"*总新闻数:* {total_titles}\n\n"
  84. base_footer = ""
  85. if format_type in ("wework", "bark"):
  86. base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  87. if update_info:
  88. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  89. elif format_type == "telegram":
  90. base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  91. if update_info:
  92. base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
  93. elif format_type == "ntfy":
  94. base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  95. if update_info:
  96. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  97. elif format_type == "feishu":
  98. base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
  99. if update_info:
  100. base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
  101. elif format_type == "dingtalk":
  102. base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  103. if update_info:
  104. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  105. elif format_type == "slack":
  106. base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
  107. if update_info:
  108. base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
  109. # 根据 display_mode 选择统计标题
  110. stats_title = "热点词汇统计" if display_mode == "keyword" else "热点新闻统计"
  111. stats_header = ""
  112. if report_data["stats"]:
  113. if format_type in ("wework", "bark"):
  114. stats_header = f"📊 **{stats_title}**\n\n"
  115. elif format_type == "telegram":
  116. stats_header = f"📊 {stats_title}\n\n"
  117. elif format_type == "ntfy":
  118. stats_header = f"📊 **{stats_title}**\n\n"
  119. elif format_type == "feishu":
  120. stats_header = f"📊 **{stats_title}**\n\n"
  121. elif format_type == "dingtalk":
  122. stats_header = f"📊 **{stats_title}**\n\n"
  123. elif format_type == "slack":
  124. stats_header = f"📊 *{stats_title}*\n\n"
  125. current_batch = base_header
  126. current_batch_has_content = False
  127. if (
  128. not report_data["stats"]
  129. and not report_data["new_titles"]
  130. and not report_data["failed_ids"]
  131. ):
  132. if mode == "incremental":
  133. mode_text = "增量模式下暂无新增匹配的热点词汇"
  134. elif mode == "current":
  135. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  136. else:
  137. mode_text = "暂无匹配的热点词汇"
  138. simple_content = f"📭 {mode_text}\n\n"
  139. final_content = base_header + simple_content + base_footer
  140. batches.append(final_content)
  141. return batches
  142. # 定义处理热点词汇统计的函数
  143. def process_stats_section(current_batch, current_batch_has_content, batches):
  144. """处理热点词汇统计"""
  145. if not report_data["stats"]:
  146. return current_batch, current_batch_has_content, batches
  147. total_count = len(report_data["stats"])
  148. # 添加统计标题
  149. test_content = current_batch + stats_header
  150. if (
  151. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  152. < max_bytes
  153. ):
  154. current_batch = test_content
  155. current_batch_has_content = True
  156. else:
  157. if current_batch_has_content:
  158. batches.append(current_batch + base_footer)
  159. current_batch = base_header + stats_header
  160. current_batch_has_content = True
  161. # 逐个处理词组(确保词组标题+第一条新闻的原子性)
  162. for i, stat in enumerate(report_data["stats"]):
  163. word = stat["word"]
  164. count = stat["count"]
  165. sequence_display = f"[{i + 1}/{total_count}]"
  166. # 构建词组标题
  167. word_header = ""
  168. if format_type in ("wework", "bark"):
  169. if count >= 10:
  170. word_header = (
  171. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  172. )
  173. elif count >= 5:
  174. word_header = (
  175. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  176. )
  177. else:
  178. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  179. elif format_type == "telegram":
  180. if count >= 10:
  181. word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
  182. elif count >= 5:
  183. word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
  184. else:
  185. word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
  186. elif format_type == "ntfy":
  187. if count >= 10:
  188. word_header = (
  189. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  190. )
  191. elif count >= 5:
  192. word_header = (
  193. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  194. )
  195. else:
  196. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  197. elif format_type == "feishu":
  198. if count >= 10:
  199. word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
  200. elif count >= 5:
  201. word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
  202. else:
  203. word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
  204. elif format_type == "dingtalk":
  205. if count >= 10:
  206. word_header = (
  207. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  208. )
  209. elif count >= 5:
  210. word_header = (
  211. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  212. )
  213. else:
  214. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  215. elif format_type == "slack":
  216. if count >= 10:
  217. word_header = (
  218. f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
  219. )
  220. elif count >= 5:
  221. word_header = (
  222. f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
  223. )
  224. else:
  225. word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
  226. # 构建第一条新闻
  227. # display_mode: keyword=显示来源, platform=显示关键词
  228. show_source = display_mode == "keyword"
  229. show_keyword = display_mode == "platform"
  230. first_news_line = ""
  231. if stat["titles"]:
  232. first_title_data = stat["titles"][0]
  233. if format_type in ("wework", "bark"):
  234. formatted_title = format_title_for_platform(
  235. "wework", first_title_data, show_source=show_source, show_keyword=show_keyword
  236. )
  237. elif format_type == "telegram":
  238. formatted_title = format_title_for_platform(
  239. "telegram", first_title_data, show_source=show_source, show_keyword=show_keyword
  240. )
  241. elif format_type == "ntfy":
  242. formatted_title = format_title_for_platform(
  243. "ntfy", first_title_data, show_source=show_source, show_keyword=show_keyword
  244. )
  245. elif format_type == "feishu":
  246. formatted_title = format_title_for_platform(
  247. "feishu", first_title_data, show_source=show_source, show_keyword=show_keyword
  248. )
  249. elif format_type == "dingtalk":
  250. formatted_title = format_title_for_platform(
  251. "dingtalk", first_title_data, show_source=show_source, show_keyword=show_keyword
  252. )
  253. elif format_type == "slack":
  254. formatted_title = format_title_for_platform(
  255. "slack", first_title_data, show_source=show_source, show_keyword=show_keyword
  256. )
  257. else:
  258. formatted_title = f"{first_title_data['title']}"
  259. first_news_line = f" 1. {formatted_title}\n"
  260. if len(stat["titles"]) > 1:
  261. first_news_line += "\n"
  262. # 原子性检查:词组标题+第一条新闻必须一起处理
  263. word_with_first_news = word_header + first_news_line
  264. test_content = current_batch + word_with_first_news
  265. if (
  266. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  267. >= max_bytes
  268. ):
  269. # 当前批次容纳不下,开启新批次
  270. if current_batch_has_content:
  271. batches.append(current_batch + base_footer)
  272. current_batch = base_header + stats_header + word_with_first_news
  273. current_batch_has_content = True
  274. start_index = 1
  275. else:
  276. current_batch = test_content
  277. current_batch_has_content = True
  278. start_index = 1
  279. # 处理剩余新闻条目
  280. for j in range(start_index, len(stat["titles"])):
  281. title_data = stat["titles"][j]
  282. if format_type in ("wework", "bark"):
  283. formatted_title = format_title_for_platform(
  284. "wework", title_data, show_source=show_source, show_keyword=show_keyword
  285. )
  286. elif format_type == "telegram":
  287. formatted_title = format_title_for_platform(
  288. "telegram", title_data, show_source=show_source, show_keyword=show_keyword
  289. )
  290. elif format_type == "ntfy":
  291. formatted_title = format_title_for_platform(
  292. "ntfy", title_data, show_source=show_source, show_keyword=show_keyword
  293. )
  294. elif format_type == "feishu":
  295. formatted_title = format_title_for_platform(
  296. "feishu", title_data, show_source=show_source, show_keyword=show_keyword
  297. )
  298. elif format_type == "dingtalk":
  299. formatted_title = format_title_for_platform(
  300. "dingtalk", title_data, show_source=show_source, show_keyword=show_keyword
  301. )
  302. elif format_type == "slack":
  303. formatted_title = format_title_for_platform(
  304. "slack", title_data, show_source=show_source, show_keyword=show_keyword
  305. )
  306. else:
  307. formatted_title = f"{title_data['title']}"
  308. news_line = f" {j + 1}. {formatted_title}\n"
  309. if j < len(stat["titles"]) - 1:
  310. news_line += "\n"
  311. test_content = current_batch + news_line
  312. if (
  313. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  314. >= max_bytes
  315. ):
  316. if current_batch_has_content:
  317. batches.append(current_batch + base_footer)
  318. current_batch = base_header + stats_header + word_header + news_line
  319. current_batch_has_content = True
  320. else:
  321. current_batch = test_content
  322. current_batch_has_content = True
  323. # 词组间分隔符
  324. if i < len(report_data["stats"]) - 1:
  325. separator = ""
  326. if format_type in ("wework", "bark"):
  327. separator = f"\n\n\n\n"
  328. elif format_type == "telegram":
  329. separator = f"\n\n"
  330. elif format_type == "ntfy":
  331. separator = f"\n\n"
  332. elif format_type == "feishu":
  333. separator = f"\n{feishu_separator}\n\n"
  334. elif format_type == "dingtalk":
  335. separator = f"\n---\n\n"
  336. elif format_type == "slack":
  337. separator = f"\n\n"
  338. test_content = current_batch + separator
  339. if (
  340. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  341. < max_bytes
  342. ):
  343. current_batch = test_content
  344. return current_batch, current_batch_has_content, batches
  345. # 定义处理新增新闻的函数
  346. def process_new_titles_section(current_batch, current_batch_has_content, batches):
  347. """处理新增新闻"""
  348. if not report_data["new_titles"]:
  349. return current_batch, current_batch_has_content, batches
  350. new_header = ""
  351. if format_type in ("wework", "bark"):
  352. new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  353. elif format_type == "telegram":
  354. new_header = (
  355. f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
  356. )
  357. elif format_type == "ntfy":
  358. new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  359. elif format_type == "feishu":
  360. new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  361. elif format_type == "dingtalk":
  362. new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  363. elif format_type == "slack":
  364. new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
  365. test_content = current_batch + new_header
  366. if (
  367. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  368. >= max_bytes
  369. ):
  370. if current_batch_has_content:
  371. batches.append(current_batch + base_footer)
  372. current_batch = base_header + new_header
  373. current_batch_has_content = True
  374. else:
  375. current_batch = test_content
  376. current_batch_has_content = True
  377. # 逐个处理新增新闻来源
  378. for source_data in report_data["new_titles"]:
  379. source_header = ""
  380. if format_type in ("wework", "bark"):
  381. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  382. elif format_type == "telegram":
  383. source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
  384. elif format_type == "ntfy":
  385. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  386. elif format_type == "feishu":
  387. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  388. elif format_type == "dingtalk":
  389. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  390. elif format_type == "slack":
  391. source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
  392. # 构建第一条新增新闻
  393. first_news_line = ""
  394. if source_data["titles"]:
  395. first_title_data = source_data["titles"][0]
  396. title_data_copy = first_title_data.copy()
  397. title_data_copy["is_new"] = False
  398. if format_type in ("wework", "bark"):
  399. formatted_title = format_title_for_platform(
  400. "wework", title_data_copy, show_source=False
  401. )
  402. elif format_type == "telegram":
  403. formatted_title = format_title_for_platform(
  404. "telegram", title_data_copy, show_source=False
  405. )
  406. elif format_type == "feishu":
  407. formatted_title = format_title_for_platform(
  408. "feishu", title_data_copy, show_source=False
  409. )
  410. elif format_type == "dingtalk":
  411. formatted_title = format_title_for_platform(
  412. "dingtalk", title_data_copy, show_source=False
  413. )
  414. elif format_type == "slack":
  415. formatted_title = format_title_for_platform(
  416. "slack", title_data_copy, show_source=False
  417. )
  418. else:
  419. formatted_title = f"{title_data_copy['title']}"
  420. first_news_line = f" 1. {formatted_title}\n"
  421. # 原子性检查:来源标题+第一条新闻
  422. source_with_first_news = source_header + first_news_line
  423. test_content = current_batch + source_with_first_news
  424. if (
  425. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  426. >= max_bytes
  427. ):
  428. if current_batch_has_content:
  429. batches.append(current_batch + base_footer)
  430. current_batch = base_header + new_header + source_with_first_news
  431. current_batch_has_content = True
  432. start_index = 1
  433. else:
  434. current_batch = test_content
  435. current_batch_has_content = True
  436. start_index = 1
  437. # 处理剩余新增新闻
  438. for j in range(start_index, len(source_data["titles"])):
  439. title_data = source_data["titles"][j]
  440. title_data_copy = title_data.copy()
  441. title_data_copy["is_new"] = False
  442. if format_type == "wework":
  443. formatted_title = format_title_for_platform(
  444. "wework", title_data_copy, show_source=False
  445. )
  446. elif format_type == "telegram":
  447. formatted_title = format_title_for_platform(
  448. "telegram", title_data_copy, show_source=False
  449. )
  450. elif format_type == "feishu":
  451. formatted_title = format_title_for_platform(
  452. "feishu", title_data_copy, show_source=False
  453. )
  454. elif format_type == "dingtalk":
  455. formatted_title = format_title_for_platform(
  456. "dingtalk", title_data_copy, show_source=False
  457. )
  458. elif format_type == "slack":
  459. formatted_title = format_title_for_platform(
  460. "slack", title_data_copy, show_source=False
  461. )
  462. else:
  463. formatted_title = f"{title_data_copy['title']}"
  464. news_line = f" {j + 1}. {formatted_title}\n"
  465. test_content = current_batch + news_line
  466. if (
  467. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  468. >= max_bytes
  469. ):
  470. if current_batch_has_content:
  471. batches.append(current_batch + base_footer)
  472. current_batch = base_header + new_header + source_header + news_line
  473. current_batch_has_content = True
  474. else:
  475. current_batch = test_content
  476. current_batch_has_content = True
  477. current_batch += "\n"
  478. return current_batch, current_batch_has_content, batches
  479. # 根据配置决定处理顺序
  480. if reverse_content_order:
  481. # 新增热点在前,热点词汇统计在后
  482. # 1. 处理热榜新增
  483. current_batch, current_batch_has_content, batches = process_new_titles_section(
  484. current_batch, current_batch_has_content, batches
  485. )
  486. # 2. 处理 RSS 新增(如果有)
  487. if rss_new_items:
  488. current_batch, current_batch_has_content, batches = _process_rss_new_titles_section(
  489. rss_new_items, format_type, feishu_separator, base_header, base_footer,
  490. max_bytes, current_batch, current_batch_has_content, batches, timezone
  491. )
  492. # 3. 处理热榜统计
  493. current_batch, current_batch_has_content, batches = process_stats_section(
  494. current_batch, current_batch_has_content, batches
  495. )
  496. # 4. 处理 RSS 统计(如果有)
  497. if rss_items:
  498. current_batch, current_batch_has_content, batches = _process_rss_stats_section(
  499. rss_items, format_type, feishu_separator, base_header, base_footer,
  500. max_bytes, current_batch, current_batch_has_content, batches, timezone
  501. )
  502. else:
  503. # 默认:热点词汇统计在前,新增热点在后
  504. # 1. 处理热榜统计
  505. current_batch, current_batch_has_content, batches = process_stats_section(
  506. current_batch, current_batch_has_content, batches
  507. )
  508. # 2. 处理 RSS 统计(如果有)
  509. if rss_items:
  510. current_batch, current_batch_has_content, batches = _process_rss_stats_section(
  511. rss_items, format_type, feishu_separator, base_header, base_footer,
  512. max_bytes, current_batch, current_batch_has_content, batches, timezone
  513. )
  514. # 3. 处理热榜新增
  515. current_batch, current_batch_has_content, batches = process_new_titles_section(
  516. current_batch, current_batch_has_content, batches
  517. )
  518. # 4. 处理 RSS 新增(如果有)
  519. if rss_new_items:
  520. current_batch, current_batch_has_content, batches = _process_rss_new_titles_section(
  521. rss_new_items, format_type, feishu_separator, base_header, base_footer,
  522. max_bytes, current_batch, current_batch_has_content, batches, timezone
  523. )
  524. if report_data["failed_ids"]:
  525. failed_header = ""
  526. if format_type == "wework":
  527. failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
  528. elif format_type == "telegram":
  529. failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
  530. elif format_type == "ntfy":
  531. failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
  532. elif format_type == "feishu":
  533. failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
  534. elif format_type == "dingtalk":
  535. failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
  536. test_content = current_batch + failed_header
  537. if (
  538. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  539. >= max_bytes
  540. ):
  541. if current_batch_has_content:
  542. batches.append(current_batch + base_footer)
  543. current_batch = base_header + failed_header
  544. current_batch_has_content = True
  545. else:
  546. current_batch = test_content
  547. current_batch_has_content = True
  548. for i, id_value in enumerate(report_data["failed_ids"], 1):
  549. if format_type == "feishu":
  550. failed_line = f" • <font color='red'>{id_value}</font>\n"
  551. elif format_type == "dingtalk":
  552. failed_line = f" • **{id_value}**\n"
  553. else:
  554. failed_line = f" • {id_value}\n"
  555. test_content = current_batch + failed_line
  556. if (
  557. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  558. >= max_bytes
  559. ):
  560. if current_batch_has_content:
  561. batches.append(current_batch + base_footer)
  562. current_batch = base_header + failed_header + failed_line
  563. current_batch_has_content = True
  564. else:
  565. current_batch = test_content
  566. current_batch_has_content = True
  567. # 完成最后批次
  568. if current_batch_has_content:
  569. batches.append(current_batch + base_footer)
  570. return batches
  571. def _process_rss_stats_section(
  572. rss_stats: list,
  573. format_type: str,
  574. feishu_separator: str,
  575. base_header: str,
  576. base_footer: str,
  577. max_bytes: int,
  578. current_batch: str,
  579. current_batch_has_content: bool,
  580. batches: List[str],
  581. timezone: str = "Asia/Shanghai",
  582. ) -> tuple:
  583. """处理 RSS 统计区块(按关键词分组,与热榜统计格式一致)
  584. Args:
  585. rss_stats: RSS 关键词统计列表,格式与热榜 stats 一致:
  586. [{"word": "AI", "count": 5, "titles": [...]}]
  587. format_type: 格式类型
  588. feishu_separator: 飞书分隔符
  589. base_header: 基础头部
  590. base_footer: 基础尾部
  591. max_bytes: 最大字节数
  592. current_batch: 当前批次内容
  593. current_batch_has_content: 当前批次是否有内容
  594. batches: 已完成的批次列表
  595. timezone: 时区名称
  596. Returns:
  597. (current_batch, current_batch_has_content, batches) 元组
  598. """
  599. if not rss_stats:
  600. return current_batch, current_batch_has_content, batches
  601. # 计算总条目数
  602. total_items = sum(stat["count"] for stat in rss_stats)
  603. total_keywords = len(rss_stats)
  604. # RSS 统计区块标题
  605. rss_header = ""
  606. if format_type == "feishu":
  607. rss_header = f"\n{feishu_separator}\n\n📰 **RSS 订阅统计** (共 {total_items} 条)\n\n"
  608. elif format_type == "dingtalk":
  609. rss_header = f"\n---\n\n📰 **RSS 订阅统计** (共 {total_items} 条)\n\n"
  610. elif format_type == "telegram":
  611. rss_header = f"\n\n📰 RSS 订阅统计 (共 {total_items} 条)\n\n"
  612. elif format_type == "slack":
  613. rss_header = f"\n\n📰 *RSS 订阅统计* (共 {total_items} 条)\n\n"
  614. else:
  615. rss_header = f"\n\n📰 **RSS 订阅统计** (共 {total_items} 条)\n\n"
  616. # 添加 RSS 标题
  617. test_content = current_batch + rss_header
  618. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) < max_bytes:
  619. current_batch = test_content
  620. current_batch_has_content = True
  621. else:
  622. if current_batch_has_content:
  623. batches.append(current_batch + base_footer)
  624. current_batch = base_header + rss_header
  625. current_batch_has_content = True
  626. # 逐个处理关键词组(与热榜一致)
  627. for i, stat in enumerate(rss_stats):
  628. word = stat["word"]
  629. count = stat["count"]
  630. sequence_display = f"[{i + 1}/{total_keywords}]"
  631. # 构建关键词标题(与热榜格式一致)
  632. word_header = ""
  633. if format_type in ("wework", "bark"):
  634. if count >= 10:
  635. word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  636. elif count >= 5:
  637. word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  638. else:
  639. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  640. elif format_type == "telegram":
  641. if count >= 10:
  642. word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
  643. elif count >= 5:
  644. word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
  645. else:
  646. word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
  647. elif format_type == "ntfy":
  648. if count >= 10:
  649. word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  650. elif count >= 5:
  651. word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  652. else:
  653. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  654. elif format_type == "feishu":
  655. if count >= 10:
  656. word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
  657. elif count >= 5:
  658. word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
  659. else:
  660. word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
  661. elif format_type == "dingtalk":
  662. if count >= 10:
  663. word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  664. elif count >= 5:
  665. word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  666. else:
  667. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  668. elif format_type == "slack":
  669. if count >= 10:
  670. word_header = f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
  671. elif count >= 5:
  672. word_header = f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
  673. else:
  674. word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
  675. # 构建第一条新闻(使用 format_title_for_platform)
  676. first_news_line = ""
  677. if stat["titles"]:
  678. first_title_data = stat["titles"][0]
  679. if format_type in ("wework", "bark"):
  680. formatted_title = format_title_for_platform("wework", first_title_data, show_source=True)
  681. elif format_type == "telegram":
  682. formatted_title = format_title_for_platform("telegram", first_title_data, show_source=True)
  683. elif format_type == "ntfy":
  684. formatted_title = format_title_for_platform("ntfy", first_title_data, show_source=True)
  685. elif format_type == "feishu":
  686. formatted_title = format_title_for_platform("feishu", first_title_data, show_source=True)
  687. elif format_type == "dingtalk":
  688. formatted_title = format_title_for_platform("dingtalk", first_title_data, show_source=True)
  689. elif format_type == "slack":
  690. formatted_title = format_title_for_platform("slack", first_title_data, show_source=True)
  691. else:
  692. formatted_title = f"{first_title_data['title']}"
  693. first_news_line = f" 1. {formatted_title}\n"
  694. if len(stat["titles"]) > 1:
  695. first_news_line += "\n"
  696. # 原子性检查:关键词标题 + 第一条新闻必须一起处理
  697. word_with_first_news = word_header + first_news_line
  698. test_content = current_batch + word_with_first_news
  699. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  700. if current_batch_has_content:
  701. batches.append(current_batch + base_footer)
  702. current_batch = base_header + rss_header + word_with_first_news
  703. current_batch_has_content = True
  704. start_index = 1
  705. else:
  706. current_batch = test_content
  707. current_batch_has_content = True
  708. start_index = 1
  709. # 处理剩余新闻条目
  710. for j in range(start_index, len(stat["titles"])):
  711. title_data = stat["titles"][j]
  712. if format_type in ("wework", "bark"):
  713. formatted_title = format_title_for_platform("wework", title_data, show_source=True)
  714. elif format_type == "telegram":
  715. formatted_title = format_title_for_platform("telegram", title_data, show_source=True)
  716. elif format_type == "ntfy":
  717. formatted_title = format_title_for_platform("ntfy", title_data, show_source=True)
  718. elif format_type == "feishu":
  719. formatted_title = format_title_for_platform("feishu", title_data, show_source=True)
  720. elif format_type == "dingtalk":
  721. formatted_title = format_title_for_platform("dingtalk", title_data, show_source=True)
  722. elif format_type == "slack":
  723. formatted_title = format_title_for_platform("slack", title_data, show_source=True)
  724. else:
  725. formatted_title = f"{title_data['title']}"
  726. news_line = f" {j + 1}. {formatted_title}\n"
  727. if j < len(stat["titles"]) - 1:
  728. news_line += "\n"
  729. test_content = current_batch + news_line
  730. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  731. if current_batch_has_content:
  732. batches.append(current_batch + base_footer)
  733. current_batch = base_header + rss_header + word_header + news_line
  734. current_batch_has_content = True
  735. else:
  736. current_batch = test_content
  737. current_batch_has_content = True
  738. # 关键词间分隔符
  739. if i < len(rss_stats) - 1:
  740. separator = ""
  741. if format_type in ("wework", "bark"):
  742. separator = "\n\n\n\n"
  743. elif format_type == "telegram":
  744. separator = "\n\n"
  745. elif format_type == "ntfy":
  746. separator = "\n\n"
  747. elif format_type == "feishu":
  748. separator = f"\n{feishu_separator}\n\n"
  749. elif format_type == "dingtalk":
  750. separator = "\n---\n\n"
  751. elif format_type == "slack":
  752. separator = "\n\n"
  753. test_content = current_batch + separator
  754. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) < max_bytes:
  755. current_batch = test_content
  756. return current_batch, current_batch_has_content, batches
  757. def _process_rss_new_titles_section(
  758. rss_new_stats: list,
  759. format_type: str,
  760. feishu_separator: str,
  761. base_header: str,
  762. base_footer: str,
  763. max_bytes: int,
  764. current_batch: str,
  765. current_batch_has_content: bool,
  766. batches: List[str],
  767. timezone: str = "Asia/Shanghai",
  768. ) -> tuple:
  769. """处理 RSS 新增区块(按来源分组,与热榜新增格式一致)
  770. Args:
  771. rss_new_stats: RSS 新增关键词统计列表,格式与热榜 stats 一致:
  772. [{"word": "AI", "count": 5, "titles": [...]}]
  773. format_type: 格式类型
  774. feishu_separator: 飞书分隔符
  775. base_header: 基础头部
  776. base_footer: 基础尾部
  777. max_bytes: 最大字节数
  778. current_batch: 当前批次内容
  779. current_batch_has_content: 当前批次是否有内容
  780. batches: 已完成的批次列表
  781. timezone: 时区名称
  782. Returns:
  783. (current_batch, current_batch_has_content, batches) 元组
  784. """
  785. if not rss_new_stats:
  786. return current_batch, current_batch_has_content, batches
  787. # 从关键词分组中提取所有条目,重新按来源分组
  788. source_map = {}
  789. for stat in rss_new_stats:
  790. for title_data in stat.get("titles", []):
  791. source_name = title_data.get("source_name", "未知来源")
  792. if source_name not in source_map:
  793. source_map[source_name] = []
  794. source_map[source_name].append(title_data)
  795. if not source_map:
  796. return current_batch, current_batch_has_content, batches
  797. # 计算总条目数
  798. total_items = sum(len(titles) for titles in source_map.values())
  799. # RSS 新增区块标题
  800. new_header = ""
  801. if format_type in ("wework", "bark"):
  802. new_header = f"\n\n\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  803. elif format_type == "telegram":
  804. new_header = f"\n\n🆕 RSS 本次新增 (共 {total_items} 条)\n\n"
  805. elif format_type == "ntfy":
  806. new_header = f"\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  807. elif format_type == "feishu":
  808. new_header = f"\n{feishu_separator}\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  809. elif format_type == "dingtalk":
  810. new_header = f"\n---\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  811. elif format_type == "slack":
  812. new_header = f"\n\n🆕 *RSS 本次新增* (共 {total_items} 条)\n\n"
  813. # 添加 RSS 新增标题
  814. test_content = current_batch + new_header
  815. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  816. if current_batch_has_content:
  817. batches.append(current_batch + base_footer)
  818. current_batch = base_header + new_header
  819. current_batch_has_content = True
  820. else:
  821. current_batch = test_content
  822. current_batch_has_content = True
  823. # 按来源分组显示(与热榜新增格式一致)
  824. source_list = list(source_map.items())
  825. for i, (source_name, titles) in enumerate(source_list):
  826. count = len(titles)
  827. # 构建来源标题(与热榜新增格式一致)
  828. source_header = ""
  829. if format_type in ("wework", "bark"):
  830. source_header = f"**{source_name}** ({count} 条):\n\n"
  831. elif format_type == "telegram":
  832. source_header = f"{source_name} ({count} 条):\n\n"
  833. elif format_type == "ntfy":
  834. source_header = f"**{source_name}** ({count} 条):\n\n"
  835. elif format_type == "feishu":
  836. source_header = f"**{source_name}** ({count} 条):\n\n"
  837. elif format_type == "dingtalk":
  838. source_header = f"**{source_name}** ({count} 条):\n\n"
  839. elif format_type == "slack":
  840. source_header = f"*{source_name}* ({count} 条):\n\n"
  841. # 构建第一条新闻(不显示来源,禁用 new emoji)
  842. first_news_line = ""
  843. if titles:
  844. first_title_data = titles[0].copy()
  845. first_title_data["is_new"] = False
  846. if format_type in ("wework", "bark"):
  847. formatted_title = format_title_for_platform("wework", first_title_data, show_source=False)
  848. elif format_type == "telegram":
  849. formatted_title = format_title_for_platform("telegram", first_title_data, show_source=False)
  850. elif format_type == "ntfy":
  851. formatted_title = format_title_for_platform("ntfy", first_title_data, show_source=False)
  852. elif format_type == "feishu":
  853. formatted_title = format_title_for_platform("feishu", first_title_data, show_source=False)
  854. elif format_type == "dingtalk":
  855. formatted_title = format_title_for_platform("dingtalk", first_title_data, show_source=False)
  856. elif format_type == "slack":
  857. formatted_title = format_title_for_platform("slack", first_title_data, show_source=False)
  858. else:
  859. formatted_title = f"{first_title_data['title']}"
  860. first_news_line = f" 1. {formatted_title}\n"
  861. # 原子性检查:来源标题 + 第一条新闻必须一起处理
  862. source_with_first_news = source_header + first_news_line
  863. test_content = current_batch + source_with_first_news
  864. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  865. if current_batch_has_content:
  866. batches.append(current_batch + base_footer)
  867. current_batch = base_header + new_header + source_with_first_news
  868. current_batch_has_content = True
  869. start_index = 1
  870. else:
  871. current_batch = test_content
  872. current_batch_has_content = True
  873. start_index = 1
  874. # 处理剩余新闻条目(禁用 new emoji)
  875. for j in range(start_index, len(titles)):
  876. title_data = titles[j].copy()
  877. title_data["is_new"] = False
  878. if format_type in ("wework", "bark"):
  879. formatted_title = format_title_for_platform("wework", title_data, show_source=False)
  880. elif format_type == "telegram":
  881. formatted_title = format_title_for_platform("telegram", title_data, show_source=False)
  882. elif format_type == "ntfy":
  883. formatted_title = format_title_for_platform("ntfy", title_data, show_source=False)
  884. elif format_type == "feishu":
  885. formatted_title = format_title_for_platform("feishu", title_data, show_source=False)
  886. elif format_type == "dingtalk":
  887. formatted_title = format_title_for_platform("dingtalk", title_data, show_source=False)
  888. elif format_type == "slack":
  889. formatted_title = format_title_for_platform("slack", title_data, show_source=False)
  890. else:
  891. formatted_title = f"{title_data['title']}"
  892. news_line = f" {j + 1}. {formatted_title}\n"
  893. test_content = current_batch + news_line
  894. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  895. if current_batch_has_content:
  896. batches.append(current_batch + base_footer)
  897. current_batch = base_header + new_header + source_header + news_line
  898. current_batch_has_content = True
  899. else:
  900. current_batch = test_content
  901. current_batch_has_content = True
  902. # 来源间添加空行(与热榜新增格式一致)
  903. current_batch += "\n"
  904. return current_batch, current_batch_has_content, batches
  905. def _format_rss_item_line(
  906. item: Dict,
  907. index: int,
  908. format_type: str,
  909. timezone: str = "Asia/Shanghai",
  910. ) -> str:
  911. """格式化单条 RSS 条目
  912. Args:
  913. item: RSS 条目字典
  914. index: 序号
  915. format_type: 格式类型
  916. timezone: 时区名称
  917. Returns:
  918. 格式化后的条目行字符串
  919. """
  920. title = item.get("title", "")
  921. url = item.get("url", "")
  922. published_at = item.get("published_at", "")
  923. # 使用友好时间格式
  924. if published_at:
  925. friendly_time = format_iso_time_friendly(published_at, timezone, include_date=True)
  926. else:
  927. friendly_time = ""
  928. # 构建条目行
  929. if format_type == "feishu":
  930. if url:
  931. item_line = f" {index}. [{title}]({url})"
  932. else:
  933. item_line = f" {index}. {title}"
  934. if friendly_time:
  935. item_line += f" <font color='grey'>- {friendly_time}</font>"
  936. elif format_type == "telegram":
  937. if url:
  938. item_line = f" {index}. {title} ({url})"
  939. else:
  940. item_line = f" {index}. {title}"
  941. if friendly_time:
  942. item_line += f" - {friendly_time}"
  943. else:
  944. if url:
  945. item_line = f" {index}. [{title}]({url})"
  946. else:
  947. item_line = f" {index}. {title}"
  948. if friendly_time:
  949. item_line += f" `{friendly_time}`"
  950. item_line += "\n"
  951. return item_line