splitter.py 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514
  1. # coding=utf-8
  2. """
  3. 消息分批处理模块
  4. 提供消息内容分批拆分功能,确保消息大小不超过各平台限制
  5. """
  6. from datetime import datetime
  7. from typing import Dict, List, Optional, Callable
  8. from trendradar.report.formatter import format_title_for_platform
  9. from trendradar.report.helpers import format_rank_display
  10. from trendradar.utils.time import format_iso_time_friendly, convert_time_for_display
  11. # 默认批次大小配置
  12. DEFAULT_BATCH_SIZES = {
  13. "dingtalk": 20000,
  14. "feishu": 29000,
  15. "ntfy": 3800,
  16. "default": 4000,
  17. }
  18. def split_content_into_batches(
  19. report_data: Dict,
  20. format_type: str,
  21. update_info: Optional[Dict] = None,
  22. max_bytes: Optional[int] = None,
  23. mode: str = "daily",
  24. batch_sizes: Optional[Dict[str, int]] = None,
  25. feishu_separator: str = "---",
  26. reverse_content_order: bool = False,
  27. get_time_func: Optional[Callable[[], datetime]] = None,
  28. rss_items: Optional[list] = None,
  29. rss_new_items: Optional[list] = None,
  30. timezone: str = "Asia/Shanghai",
  31. display_mode: str = "keyword",
  32. ai_content: Optional[str] = None,
  33. standalone_data: Optional[Dict] = None,
  34. rank_threshold: int = 10,
  35. ai_stats: Optional[Dict] = None,
  36. report_type: str = "热点分析报告",
  37. ) -> List[str]:
  38. """分批处理消息内容,确保词组标题+至少第一条新闻的完整性(支持热榜+RSS合并+AI分析+独立展示区)
  39. 热榜统计与RSS统计并列显示,热榜新增与RSS新增并列显示。
  40. reverse_content_order 控制统计和新增的前后顺序。
  41. AI分析内容默认放在最后(footer之前)。
  42. 独立展示区放在新增区块之后、失败ID之前。
  43. Args:
  44. report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count
  45. format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
  46. update_info: 版本更新信息(可选)
  47. max_bytes: 最大字节数(可选,如果不指定则使用默认配置)
  48. mode: 报告模式 (daily, incremental, current)
  49. batch_sizes: 批次大小配置字典(可选)
  50. feishu_separator: 飞书消息分隔符
  51. reverse_content_order: 是否反转内容顺序(新增在前,统计在后)
  52. get_time_func: 获取当前时间的函数(可选)
  53. rss_items: RSS 统计条目列表(按源分组,用于合并推送)
  54. rss_new_items: RSS 新增条目列表(可选,用于新增区块)
  55. timezone: 时区名称(用于 RSS 时间格式化)
  56. display_mode: 显示模式 (keyword=按关键词分组, platform=按平台分组)
  57. ai_content: AI 分析内容(已渲染的字符串,可选)
  58. standalone_data: 独立展示区数据(可选),包含 platforms 和 rss_feeds 列表
  59. ai_stats: AI 分析统计数据(可选),包含 total_news, analyzed_news, max_news_limit 等
  60. Returns:
  61. 分批后的消息内容列表
  62. """
  63. # 合并批次大小配置
  64. sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}
  65. if max_bytes is None:
  66. if format_type == "dingtalk":
  67. max_bytes = sizes.get("dingtalk", 20000)
  68. elif format_type == "feishu":
  69. max_bytes = sizes.get("feishu", 29000)
  70. elif format_type == "ntfy":
  71. max_bytes = sizes.get("ntfy", 3800)
  72. else:
  73. max_bytes = sizes.get("default", 4000)
  74. batches = []
  75. total_hotlist_count = sum(
  76. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  77. )
  78. total_titles = total_hotlist_count
  79. # 累加 RSS 条目数
  80. if rss_items:
  81. total_titles += sum(stat.get("count", 0) for stat in rss_items)
  82. now = get_time_func() if get_time_func else datetime.now()
  83. # 构建头部信息
  84. base_header = ""
  85. # 准备 AI 分析统计行(如果存在)
  86. ai_stats_line = ""
  87. if ai_stats and ai_stats.get("analyzed_news", 0) > 0:
  88. analyzed_news = ai_stats.get("analyzed_news", 0)
  89. if format_type in ("wework", "bark", "ntfy", "feishu", "dingtalk"):
  90. ai_stats_line = f"**AI 分析数:** {analyzed_news}\n"
  91. elif format_type == "slack":
  92. ai_stats_line = f"*AI 分析数:* {analyzed_news}\n"
  93. elif format_type == "telegram":
  94. ai_stats_line = f"AI 分析数: {analyzed_news}\n"
  95. # 构建统一的头部(总是显示总新闻数、时间和类型)
  96. if format_type in ("wework", "bark"):
  97. base_header = f"**总新闻数:** {total_titles}\n"
  98. base_header += ai_stats_line
  99. base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
  100. base_header += f"**类型:** {report_type}\n\n"
  101. elif format_type == "telegram":
  102. base_header = f"总新闻数: {total_titles}\n"
  103. base_header += ai_stats_line
  104. base_header += f"时间: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
  105. base_header += f"类型: {report_type}\n\n"
  106. elif format_type == "ntfy":
  107. base_header = f"**总新闻数:** {total_titles}\n"
  108. base_header += ai_stats_line
  109. base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
  110. base_header += f"**类型:** {report_type}\n\n"
  111. elif format_type == "feishu":
  112. base_header = f"**总新闻数:** {total_titles}\n"
  113. base_header += ai_stats_line
  114. base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
  115. base_header += f"**类型:** {report_type}\n\n"
  116. base_header += "---\n\n"
  117. elif format_type == "dingtalk":
  118. base_header = f"**总新闻数:** {total_titles}\n"
  119. base_header += ai_stats_line
  120. base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
  121. base_header += f"**类型:** {report_type}\n\n"
  122. base_header += "---\n\n"
  123. elif format_type == "slack":
  124. base_header = f"*总新闻数:* {total_titles}\n"
  125. base_header += ai_stats_line
  126. base_header += f"*时间:* {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
  127. base_header += f"*类型:* {report_type}\n\n"
  128. base_footer = ""
  129. if format_type in ("wework", "bark"):
  130. base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  131. if update_info:
  132. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  133. elif format_type == "telegram":
  134. base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  135. if update_info:
  136. base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
  137. elif format_type == "ntfy":
  138. base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  139. if update_info:
  140. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  141. elif format_type == "feishu":
  142. base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
  143. if update_info:
  144. base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
  145. elif format_type == "dingtalk":
  146. base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  147. if update_info:
  148. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  149. elif format_type == "slack":
  150. base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
  151. if update_info:
  152. base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
  153. # 根据 display_mode 选择统计标题
  154. stats_title = "热点词汇统计" if display_mode == "keyword" else "热点新闻统计"
  155. stats_header = ""
  156. if report_data["stats"]:
  157. if format_type in ("wework", "bark"):
  158. stats_header = f"📊 **{stats_title}** (共 {total_hotlist_count} 条)\n\n"
  159. elif format_type == "telegram":
  160. stats_header = f"📊 {stats_title} (共 {total_hotlist_count} 条)\n\n"
  161. elif format_type == "ntfy":
  162. stats_header = f"📊 **{stats_title}** (共 {total_hotlist_count} 条)\n\n"
  163. elif format_type == "feishu":
  164. stats_header = f"📊 **{stats_title}** (共 {total_hotlist_count} 条)\n\n"
  165. elif format_type == "dingtalk":
  166. stats_header = f"📊 **{stats_title}** (共 {total_hotlist_count} 条)\n\n"
  167. elif format_type == "slack":
  168. stats_header = f"📊 *{stats_title}* (共 {total_hotlist_count} 条)\n\n"
  169. current_batch = base_header
  170. current_batch_has_content = False
  171. # 当没有热榜数据时的处理
  172. # 注意:如果有 ai_content,不应该返回"暂无匹配"消息,而应该继续处理 AI 内容
  173. if (
  174. not report_data["stats"]
  175. and not report_data["new_titles"]
  176. and not report_data["failed_ids"]
  177. and not ai_content # 有 AI 内容时不返回"暂无匹配"
  178. and not rss_items # 有 RSS 内容时也不返回
  179. and not standalone_data # 有独立展示区数据时也不返回
  180. ):
  181. if mode == "incremental":
  182. mode_text = "增量模式下暂无新增匹配的热点词汇"
  183. elif mode == "current":
  184. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  185. else:
  186. mode_text = "暂无匹配的热点词汇"
  187. simple_content = f"📭 {mode_text}\n\n"
  188. final_content = base_header + simple_content + base_footer
  189. batches.append(final_content)
  190. return batches
  191. # 定义处理热点词汇统计的函数
  192. def process_stats_section(current_batch, current_batch_has_content, batches):
  193. """处理热点词汇统计"""
  194. if not report_data["stats"]:
  195. return current_batch, current_batch_has_content, batches
  196. total_count = len(report_data["stats"])
  197. # 添加统计标题
  198. test_content = current_batch + stats_header
  199. if (
  200. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  201. < max_bytes
  202. ):
  203. current_batch = test_content
  204. current_batch_has_content = True
  205. else:
  206. if current_batch_has_content:
  207. batches.append(current_batch + base_footer)
  208. current_batch = base_header + stats_header
  209. current_batch_has_content = True
  210. # 逐个处理词组(确保词组标题+第一条新闻的原子性)
  211. for i, stat in enumerate(report_data["stats"]):
  212. word = stat["word"]
  213. count = stat["count"]
  214. sequence_display = f"[{i + 1}/{total_count}]"
  215. # 构建词组标题
  216. word_header = ""
  217. if format_type in ("wework", "bark"):
  218. if count >= 10:
  219. word_header = (
  220. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  221. )
  222. elif count >= 5:
  223. word_header = (
  224. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  225. )
  226. else:
  227. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  228. elif format_type == "telegram":
  229. if count >= 10:
  230. word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
  231. elif count >= 5:
  232. word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
  233. else:
  234. word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
  235. elif format_type == "ntfy":
  236. if count >= 10:
  237. word_header = (
  238. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  239. )
  240. elif count >= 5:
  241. word_header = (
  242. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  243. )
  244. else:
  245. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  246. elif format_type == "feishu":
  247. if count >= 10:
  248. word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
  249. elif count >= 5:
  250. word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
  251. else:
  252. word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
  253. elif format_type == "dingtalk":
  254. if count >= 10:
  255. word_header = (
  256. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  257. )
  258. elif count >= 5:
  259. word_header = (
  260. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  261. )
  262. else:
  263. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  264. elif format_type == "slack":
  265. if count >= 10:
  266. word_header = (
  267. f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
  268. )
  269. elif count >= 5:
  270. word_header = (
  271. f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
  272. )
  273. else:
  274. word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
  275. # 构建第一条新闻
  276. # display_mode: keyword=显示来源, platform=显示关键词
  277. show_source = display_mode == "keyword"
  278. show_keyword = display_mode == "platform"
  279. first_news_line = ""
  280. if stat["titles"]:
  281. first_title_data = stat["titles"][0]
  282. if format_type in ("wework", "bark"):
  283. formatted_title = format_title_for_platform(
  284. "wework", first_title_data, show_source=show_source, show_keyword=show_keyword
  285. )
  286. elif format_type == "telegram":
  287. formatted_title = format_title_for_platform(
  288. "telegram", first_title_data, show_source=show_source, show_keyword=show_keyword
  289. )
  290. elif format_type == "ntfy":
  291. formatted_title = format_title_for_platform(
  292. "ntfy", first_title_data, show_source=show_source, show_keyword=show_keyword
  293. )
  294. elif format_type == "feishu":
  295. formatted_title = format_title_for_platform(
  296. "feishu", first_title_data, show_source=show_source, show_keyword=show_keyword
  297. )
  298. elif format_type == "dingtalk":
  299. formatted_title = format_title_for_platform(
  300. "dingtalk", first_title_data, show_source=show_source, show_keyword=show_keyword
  301. )
  302. elif format_type == "slack":
  303. formatted_title = format_title_for_platform(
  304. "slack", first_title_data, show_source=show_source, show_keyword=show_keyword
  305. )
  306. else:
  307. formatted_title = f"{first_title_data['title']}"
  308. first_news_line = f" 1. {formatted_title}\n"
  309. if len(stat["titles"]) > 1:
  310. first_news_line += "\n"
  311. # 原子性检查:词组标题+第一条新闻必须一起处理
  312. word_with_first_news = word_header + first_news_line
  313. test_content = current_batch + word_with_first_news
  314. if (
  315. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  316. >= max_bytes
  317. ):
  318. # 当前批次容纳不下,开启新批次
  319. if current_batch_has_content:
  320. batches.append(current_batch + base_footer)
  321. current_batch = base_header + stats_header + word_with_first_news
  322. current_batch_has_content = True
  323. start_index = 1
  324. else:
  325. current_batch = test_content
  326. current_batch_has_content = True
  327. start_index = 1
  328. # 处理剩余新闻条目
  329. for j in range(start_index, len(stat["titles"])):
  330. title_data = stat["titles"][j]
  331. if format_type in ("wework", "bark"):
  332. formatted_title = format_title_for_platform(
  333. "wework", title_data, show_source=show_source, show_keyword=show_keyword
  334. )
  335. elif format_type == "telegram":
  336. formatted_title = format_title_for_platform(
  337. "telegram", title_data, show_source=show_source, show_keyword=show_keyword
  338. )
  339. elif format_type == "ntfy":
  340. formatted_title = format_title_for_platform(
  341. "ntfy", title_data, show_source=show_source, show_keyword=show_keyword
  342. )
  343. elif format_type == "feishu":
  344. formatted_title = format_title_for_platform(
  345. "feishu", title_data, show_source=show_source, show_keyword=show_keyword
  346. )
  347. elif format_type == "dingtalk":
  348. formatted_title = format_title_for_platform(
  349. "dingtalk", title_data, show_source=show_source, show_keyword=show_keyword
  350. )
  351. elif format_type == "slack":
  352. formatted_title = format_title_for_platform(
  353. "slack", title_data, show_source=show_source, show_keyword=show_keyword
  354. )
  355. else:
  356. formatted_title = f"{title_data['title']}"
  357. news_line = f" {j + 1}. {formatted_title}\n"
  358. if j < len(stat["titles"]) - 1:
  359. news_line += "\n"
  360. test_content = current_batch + news_line
  361. if (
  362. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  363. >= max_bytes
  364. ):
  365. if current_batch_has_content:
  366. batches.append(current_batch + base_footer)
  367. current_batch = base_header + stats_header + word_header + news_line
  368. current_batch_has_content = True
  369. else:
  370. current_batch = test_content
  371. current_batch_has_content = True
  372. # 词组间分隔符
  373. if i < len(report_data["stats"]) - 1:
  374. separator = ""
  375. if format_type in ("wework", "bark"):
  376. separator = f"\n\n\n\n"
  377. elif format_type == "telegram":
  378. separator = f"\n\n"
  379. elif format_type == "ntfy":
  380. separator = f"\n\n"
  381. elif format_type == "feishu":
  382. separator = f"\n{feishu_separator}\n\n"
  383. elif format_type == "dingtalk":
  384. separator = f"\n---\n\n"
  385. elif format_type == "slack":
  386. separator = f"\n\n"
  387. test_content = current_batch + separator
  388. if (
  389. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  390. < max_bytes
  391. ):
  392. current_batch = test_content
  393. return current_batch, current_batch_has_content, batches
  394. # 定义处理新增新闻的函数
  395. def process_new_titles_section(current_batch, current_batch_has_content, batches):
  396. """处理新增新闻"""
  397. if not report_data["new_titles"]:
  398. return current_batch, current_batch_has_content, batches
  399. new_header = ""
  400. if format_type in ("wework", "bark"):
  401. new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  402. elif format_type == "telegram":
  403. new_header = (
  404. f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
  405. )
  406. elif format_type == "ntfy":
  407. new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  408. elif format_type == "feishu":
  409. new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  410. elif format_type == "dingtalk":
  411. new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  412. elif format_type == "slack":
  413. new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
  414. test_content = current_batch + new_header
  415. if (
  416. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  417. >= max_bytes
  418. ):
  419. if current_batch_has_content:
  420. batches.append(current_batch + base_footer)
  421. current_batch = base_header + new_header
  422. current_batch_has_content = True
  423. else:
  424. current_batch = test_content
  425. current_batch_has_content = True
  426. # 逐个处理新增新闻来源
  427. for source_data in report_data["new_titles"]:
  428. source_header = ""
  429. if format_type in ("wework", "bark"):
  430. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  431. elif format_type == "telegram":
  432. source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
  433. elif format_type == "ntfy":
  434. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  435. elif format_type == "feishu":
  436. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  437. elif format_type == "dingtalk":
  438. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  439. elif format_type == "slack":
  440. source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
  441. # 构建第一条新增新闻
  442. first_news_line = ""
  443. if source_data["titles"]:
  444. first_title_data = source_data["titles"][0]
  445. title_data_copy = first_title_data.copy()
  446. title_data_copy["is_new"] = False
  447. if format_type in ("wework", "bark"):
  448. formatted_title = format_title_for_platform(
  449. "wework", title_data_copy, show_source=False
  450. )
  451. elif format_type == "telegram":
  452. formatted_title = format_title_for_platform(
  453. "telegram", title_data_copy, show_source=False
  454. )
  455. elif format_type == "feishu":
  456. formatted_title = format_title_for_platform(
  457. "feishu", title_data_copy, show_source=False
  458. )
  459. elif format_type == "dingtalk":
  460. formatted_title = format_title_for_platform(
  461. "dingtalk", title_data_copy, show_source=False
  462. )
  463. elif format_type == "slack":
  464. formatted_title = format_title_for_platform(
  465. "slack", title_data_copy, show_source=False
  466. )
  467. else:
  468. formatted_title = f"{title_data_copy['title']}"
  469. first_news_line = f" 1. {formatted_title}\n"
  470. # 原子性检查:来源标题+第一条新闻
  471. source_with_first_news = source_header + first_news_line
  472. test_content = current_batch + source_with_first_news
  473. if (
  474. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  475. >= max_bytes
  476. ):
  477. if current_batch_has_content:
  478. batches.append(current_batch + base_footer)
  479. current_batch = base_header + new_header + source_with_first_news
  480. current_batch_has_content = True
  481. start_index = 1
  482. else:
  483. current_batch = test_content
  484. current_batch_has_content = True
  485. start_index = 1
  486. # 处理剩余新增新闻
  487. for j in range(start_index, len(source_data["titles"])):
  488. title_data = source_data["titles"][j]
  489. title_data_copy = title_data.copy()
  490. title_data_copy["is_new"] = False
  491. if format_type == "wework":
  492. formatted_title = format_title_for_platform(
  493. "wework", title_data_copy, show_source=False
  494. )
  495. elif format_type == "telegram":
  496. formatted_title = format_title_for_platform(
  497. "telegram", title_data_copy, show_source=False
  498. )
  499. elif format_type == "feishu":
  500. formatted_title = format_title_for_platform(
  501. "feishu", title_data_copy, show_source=False
  502. )
  503. elif format_type == "dingtalk":
  504. formatted_title = format_title_for_platform(
  505. "dingtalk", title_data_copy, show_source=False
  506. )
  507. elif format_type == "slack":
  508. formatted_title = format_title_for_platform(
  509. "slack", title_data_copy, show_source=False
  510. )
  511. else:
  512. formatted_title = f"{title_data_copy['title']}"
  513. news_line = f" {j + 1}. {formatted_title}\n"
  514. test_content = current_batch + news_line
  515. if (
  516. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  517. >= max_bytes
  518. ):
  519. if current_batch_has_content:
  520. batches.append(current_batch + base_footer)
  521. current_batch = base_header + new_header + source_header + news_line
  522. current_batch_has_content = True
  523. else:
  524. current_batch = test_content
  525. current_batch_has_content = True
  526. current_batch += "\n"
  527. return current_batch, current_batch_has_content, batches
  528. # 根据配置决定处理顺序
  529. if reverse_content_order:
  530. # 新增热点在前,热点词汇统计在后
  531. # 1. 处理热榜新增
  532. current_batch, current_batch_has_content, batches = process_new_titles_section(
  533. current_batch, current_batch_has_content, batches
  534. )
  535. # 2. 处理 RSS 新增(如果有)
  536. if rss_new_items:
  537. current_batch, current_batch_has_content, batches = _process_rss_new_titles_section(
  538. rss_new_items, format_type, feishu_separator, base_header, base_footer,
  539. max_bytes, current_batch, current_batch_has_content, batches, timezone
  540. )
  541. # 3. 处理热榜统计
  542. current_batch, current_batch_has_content, batches = process_stats_section(
  543. current_batch, current_batch_has_content, batches
  544. )
  545. # 4. 处理 RSS 统计(如果有)
  546. if rss_items:
  547. current_batch, current_batch_has_content, batches = _process_rss_stats_section(
  548. rss_items, format_type, feishu_separator, base_header, base_footer,
  549. max_bytes, current_batch, current_batch_has_content, batches, timezone
  550. )
  551. else:
  552. # 默认:热点词汇统计在前,新增热点在后
  553. # 1. 处理热榜统计
  554. current_batch, current_batch_has_content, batches = process_stats_section(
  555. current_batch, current_batch_has_content, batches
  556. )
  557. # 2. 处理 RSS 统计(如果有)
  558. if rss_items:
  559. current_batch, current_batch_has_content, batches = _process_rss_stats_section(
  560. rss_items, format_type, feishu_separator, base_header, base_footer,
  561. max_bytes, current_batch, current_batch_has_content, batches, timezone
  562. )
  563. # 3. 处理热榜新增
  564. current_batch, current_batch_has_content, batches = process_new_titles_section(
  565. current_batch, current_batch_has_content, batches
  566. )
  567. # 4. 处理 RSS 新增(如果有)
  568. if rss_new_items:
  569. current_batch, current_batch_has_content, batches = _process_rss_new_titles_section(
  570. rss_new_items, format_type, feishu_separator, base_header, base_footer,
  571. max_bytes, current_batch, current_batch_has_content, batches, timezone
  572. )
  573. # 5. 处理独立展示区(如果有)
  574. if standalone_data:
  575. current_batch, current_batch_has_content, batches = _process_standalone_section(
  576. standalone_data, format_type, feishu_separator, base_header, base_footer,
  577. max_bytes, current_batch, current_batch_has_content, batches, timezone,
  578. rank_threshold
  579. )
  580. if report_data["failed_ids"]:
  581. failed_header = ""
  582. if format_type == "wework":
  583. failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
  584. elif format_type == "telegram":
  585. failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
  586. elif format_type == "ntfy":
  587. failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
  588. elif format_type == "feishu":
  589. failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
  590. elif format_type == "dingtalk":
  591. failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
  592. test_content = current_batch + failed_header
  593. if (
  594. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  595. >= max_bytes
  596. ):
  597. if current_batch_has_content:
  598. batches.append(current_batch + base_footer)
  599. current_batch = base_header + failed_header
  600. current_batch_has_content = True
  601. else:
  602. current_batch = test_content
  603. current_batch_has_content = True
  604. for i, id_value in enumerate(report_data["failed_ids"], 1):
  605. if format_type == "feishu":
  606. failed_line = f" • <font color='red'>{id_value}</font>\n"
  607. elif format_type == "dingtalk":
  608. failed_line = f" • **{id_value}**\n"
  609. else:
  610. failed_line = f" • {id_value}\n"
  611. test_content = current_batch + failed_line
  612. if (
  613. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  614. >= max_bytes
  615. ):
  616. if current_batch_has_content:
  617. batches.append(current_batch + base_footer)
  618. current_batch = base_header + failed_header + failed_line
  619. current_batch_has_content = True
  620. else:
  621. current_batch = test_content
  622. current_batch_has_content = True
  623. # 处理 AI 分析内容(放在最后,footer 之前)
  624. if ai_content:
  625. # 添加 AI 分析区块分隔符
  626. ai_separator = ""
  627. if format_type == "feishu":
  628. ai_separator = f"\n{feishu_separator}\n\n"
  629. elif format_type == "dingtalk":
  630. ai_separator = "\n---\n\n"
  631. elif format_type in ("wework", "bark"):
  632. ai_separator = "\n\n\n\n"
  633. elif format_type in ("telegram", "ntfy", "slack"):
  634. ai_separator = "\n\n"
  635. # 尝试将 AI 内容添加到当前批次
  636. test_content = current_batch + ai_separator + ai_content
  637. if (
  638. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  639. < max_bytes
  640. ):
  641. current_batch = test_content
  642. current_batch_has_content = True
  643. else:
  644. # 当前批次容纳不下,开启新批次
  645. if current_batch_has_content:
  646. batches.append(current_batch + base_footer)
  647. # AI 内容可能很长,需要考虑是否需要进一步分割
  648. ai_with_header = base_header + ai_content
  649. if len(ai_with_header.encode("utf-8")) + len(base_footer.encode("utf-8")) < max_bytes:
  650. current_batch = ai_with_header
  651. current_batch_has_content = True
  652. else:
  653. # AI 内容过长,直接添加(可能会超限,但保持完整性)
  654. current_batch = ai_with_header
  655. current_batch_has_content = True
  656. # 完成最后批次
  657. if current_batch_has_content:
  658. batches.append(current_batch + base_footer)
  659. return batches
  660. def _process_rss_stats_section(
  661. rss_stats: list,
  662. format_type: str,
  663. feishu_separator: str,
  664. base_header: str,
  665. base_footer: str,
  666. max_bytes: int,
  667. current_batch: str,
  668. current_batch_has_content: bool,
  669. batches: List[str],
  670. timezone: str = "Asia/Shanghai",
  671. ) -> tuple:
  672. """处理 RSS 统计区块(按关键词分组,与热榜统计格式一致)
  673. Args:
  674. rss_stats: RSS 关键词统计列表,格式与热榜 stats 一致:
  675. [{"word": "AI", "count": 5, "titles": [...]}]
  676. format_type: 格式类型
  677. feishu_separator: 飞书分隔符
  678. base_header: 基础头部
  679. base_footer: 基础尾部
  680. max_bytes: 最大字节数
  681. current_batch: 当前批次内容
  682. current_batch_has_content: 当前批次是否有内容
  683. batches: 已完成的批次列表
  684. timezone: 时区名称
  685. Returns:
  686. (current_batch, current_batch_has_content, batches) 元组
  687. """
  688. if not rss_stats:
  689. return current_batch, current_batch_has_content, batches
  690. # 计算总条目数
  691. total_items = sum(stat["count"] for stat in rss_stats)
  692. total_keywords = len(rss_stats)
  693. # RSS 统计区块标题
  694. rss_header = ""
  695. if format_type == "feishu":
  696. rss_header = f"\n{feishu_separator}\n\n📰 **RSS 订阅统计** (共 {total_items} 条)\n\n"
  697. elif format_type == "dingtalk":
  698. rss_header = f"\n---\n\n📰 **RSS 订阅统计** (共 {total_items} 条)\n\n"
  699. elif format_type == "telegram":
  700. rss_header = f"\n\n📰 RSS 订阅统计 (共 {total_items} 条)\n\n"
  701. elif format_type == "slack":
  702. rss_header = f"\n\n📰 *RSS 订阅统计* (共 {total_items} 条)\n\n"
  703. else:
  704. rss_header = f"\n\n📰 **RSS 订阅统计** (共 {total_items} 条)\n\n"
  705. # 添加 RSS 标题
  706. test_content = current_batch + rss_header
  707. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) < max_bytes:
  708. current_batch = test_content
  709. current_batch_has_content = True
  710. else:
  711. if current_batch_has_content:
  712. batches.append(current_batch + base_footer)
  713. current_batch = base_header + rss_header
  714. current_batch_has_content = True
  715. # 逐个处理关键词组(与热榜一致)
  716. for i, stat in enumerate(rss_stats):
  717. word = stat["word"]
  718. count = stat["count"]
  719. sequence_display = f"[{i + 1}/{total_keywords}]"
  720. # 构建关键词标题(与热榜格式一致)
  721. word_header = ""
  722. if format_type in ("wework", "bark"):
  723. if count >= 10:
  724. word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  725. elif count >= 5:
  726. word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  727. else:
  728. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  729. elif format_type == "telegram":
  730. if count >= 10:
  731. word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
  732. elif count >= 5:
  733. word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
  734. else:
  735. word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
  736. elif format_type == "ntfy":
  737. if count >= 10:
  738. word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  739. elif count >= 5:
  740. word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  741. else:
  742. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  743. elif format_type == "feishu":
  744. if count >= 10:
  745. word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
  746. elif count >= 5:
  747. word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
  748. else:
  749. word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
  750. elif format_type == "dingtalk":
  751. if count >= 10:
  752. word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  753. elif count >= 5:
  754. word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  755. else:
  756. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  757. elif format_type == "slack":
  758. if count >= 10:
  759. word_header = f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
  760. elif count >= 5:
  761. word_header = f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
  762. else:
  763. word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
  764. # 构建第一条新闻(使用 format_title_for_platform)
  765. first_news_line = ""
  766. if stat["titles"]:
  767. first_title_data = stat["titles"][0]
  768. if format_type in ("wework", "bark"):
  769. formatted_title = format_title_for_platform("wework", first_title_data, show_source=True)
  770. elif format_type == "telegram":
  771. formatted_title = format_title_for_platform("telegram", first_title_data, show_source=True)
  772. elif format_type == "ntfy":
  773. formatted_title = format_title_for_platform("ntfy", first_title_data, show_source=True)
  774. elif format_type == "feishu":
  775. formatted_title = format_title_for_platform("feishu", first_title_data, show_source=True)
  776. elif format_type == "dingtalk":
  777. formatted_title = format_title_for_platform("dingtalk", first_title_data, show_source=True)
  778. elif format_type == "slack":
  779. formatted_title = format_title_for_platform("slack", first_title_data, show_source=True)
  780. else:
  781. formatted_title = f"{first_title_data['title']}"
  782. first_news_line = f" 1. {formatted_title}\n"
  783. if len(stat["titles"]) > 1:
  784. first_news_line += "\n"
  785. # 原子性检查:关键词标题 + 第一条新闻必须一起处理
  786. word_with_first_news = word_header + first_news_line
  787. test_content = current_batch + word_with_first_news
  788. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  789. if current_batch_has_content:
  790. batches.append(current_batch + base_footer)
  791. current_batch = base_header + rss_header + word_with_first_news
  792. current_batch_has_content = True
  793. start_index = 1
  794. else:
  795. current_batch = test_content
  796. current_batch_has_content = True
  797. start_index = 1
  798. # 处理剩余新闻条目
  799. for j in range(start_index, len(stat["titles"])):
  800. title_data = stat["titles"][j]
  801. if format_type in ("wework", "bark"):
  802. formatted_title = format_title_for_platform("wework", title_data, show_source=True)
  803. elif format_type == "telegram":
  804. formatted_title = format_title_for_platform("telegram", title_data, show_source=True)
  805. elif format_type == "ntfy":
  806. formatted_title = format_title_for_platform("ntfy", title_data, show_source=True)
  807. elif format_type == "feishu":
  808. formatted_title = format_title_for_platform("feishu", title_data, show_source=True)
  809. elif format_type == "dingtalk":
  810. formatted_title = format_title_for_platform("dingtalk", title_data, show_source=True)
  811. elif format_type == "slack":
  812. formatted_title = format_title_for_platform("slack", title_data, show_source=True)
  813. else:
  814. formatted_title = f"{title_data['title']}"
  815. news_line = f" {j + 1}. {formatted_title}\n"
  816. if j < len(stat["titles"]) - 1:
  817. news_line += "\n"
  818. test_content = current_batch + news_line
  819. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  820. if current_batch_has_content:
  821. batches.append(current_batch + base_footer)
  822. current_batch = base_header + rss_header + word_header + news_line
  823. current_batch_has_content = True
  824. else:
  825. current_batch = test_content
  826. current_batch_has_content = True
  827. # 关键词间分隔符
  828. if i < len(rss_stats) - 1:
  829. separator = ""
  830. if format_type in ("wework", "bark"):
  831. separator = "\n\n\n\n"
  832. elif format_type == "telegram":
  833. separator = "\n\n"
  834. elif format_type == "ntfy":
  835. separator = "\n\n"
  836. elif format_type == "feishu":
  837. separator = f"\n{feishu_separator}\n\n"
  838. elif format_type == "dingtalk":
  839. separator = "\n---\n\n"
  840. elif format_type == "slack":
  841. separator = "\n\n"
  842. test_content = current_batch + separator
  843. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) < max_bytes:
  844. current_batch = test_content
  845. return current_batch, current_batch_has_content, batches
  846. def _process_rss_new_titles_section(
  847. rss_new_stats: list,
  848. format_type: str,
  849. feishu_separator: str,
  850. base_header: str,
  851. base_footer: str,
  852. max_bytes: int,
  853. current_batch: str,
  854. current_batch_has_content: bool,
  855. batches: List[str],
  856. timezone: str = "Asia/Shanghai",
  857. ) -> tuple:
  858. """处理 RSS 新增区块(按来源分组,与热榜新增格式一致)
  859. Args:
  860. rss_new_stats: RSS 新增关键词统计列表,格式与热榜 stats 一致:
  861. [{"word": "AI", "count": 5, "titles": [...]}]
  862. format_type: 格式类型
  863. feishu_separator: 飞书分隔符
  864. base_header: 基础头部
  865. base_footer: 基础尾部
  866. max_bytes: 最大字节数
  867. current_batch: 当前批次内容
  868. current_batch_has_content: 当前批次是否有内容
  869. batches: 已完成的批次列表
  870. timezone: 时区名称
  871. Returns:
  872. (current_batch, current_batch_has_content, batches) 元组
  873. """
  874. if not rss_new_stats:
  875. return current_batch, current_batch_has_content, batches
  876. # 从关键词分组中提取所有条目,重新按来源分组
  877. source_map = {}
  878. for stat in rss_new_stats:
  879. for title_data in stat.get("titles", []):
  880. source_name = title_data.get("source_name", "未知来源")
  881. if source_name not in source_map:
  882. source_map[source_name] = []
  883. source_map[source_name].append(title_data)
  884. if not source_map:
  885. return current_batch, current_batch_has_content, batches
  886. # 计算总条目数
  887. total_items = sum(len(titles) for titles in source_map.values())
  888. # RSS 新增区块标题
  889. new_header = ""
  890. if format_type in ("wework", "bark"):
  891. new_header = f"\n\n\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  892. elif format_type == "telegram":
  893. new_header = f"\n\n🆕 RSS 本次新增 (共 {total_items} 条)\n\n"
  894. elif format_type == "ntfy":
  895. new_header = f"\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  896. elif format_type == "feishu":
  897. new_header = f"\n{feishu_separator}\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  898. elif format_type == "dingtalk":
  899. new_header = f"\n---\n\n🆕 **RSS 本次新增** (共 {total_items} 条)\n\n"
  900. elif format_type == "slack":
  901. new_header = f"\n\n🆕 *RSS 本次新增* (共 {total_items} 条)\n\n"
  902. # 添加 RSS 新增标题
  903. test_content = current_batch + new_header
  904. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  905. if current_batch_has_content:
  906. batches.append(current_batch + base_footer)
  907. current_batch = base_header + new_header
  908. current_batch_has_content = True
  909. else:
  910. current_batch = test_content
  911. current_batch_has_content = True
  912. # 按来源分组显示(与热榜新增格式一致)
  913. source_list = list(source_map.items())
  914. for i, (source_name, titles) in enumerate(source_list):
  915. count = len(titles)
  916. # 构建来源标题(与热榜新增格式一致)
  917. source_header = ""
  918. if format_type in ("wework", "bark"):
  919. source_header = f"**{source_name}** ({count} 条):\n\n"
  920. elif format_type == "telegram":
  921. source_header = f"{source_name} ({count} 条):\n\n"
  922. elif format_type == "ntfy":
  923. source_header = f"**{source_name}** ({count} 条):\n\n"
  924. elif format_type == "feishu":
  925. source_header = f"**{source_name}** ({count} 条):\n\n"
  926. elif format_type == "dingtalk":
  927. source_header = f"**{source_name}** ({count} 条):\n\n"
  928. elif format_type == "slack":
  929. source_header = f"*{source_name}* ({count} 条):\n\n"
  930. # 构建第一条新闻(不显示来源,禁用 new emoji)
  931. first_news_line = ""
  932. if titles:
  933. first_title_data = titles[0].copy()
  934. first_title_data["is_new"] = False
  935. if format_type in ("wework", "bark"):
  936. formatted_title = format_title_for_platform("wework", first_title_data, show_source=False)
  937. elif format_type == "telegram":
  938. formatted_title = format_title_for_platform("telegram", first_title_data, show_source=False)
  939. elif format_type == "ntfy":
  940. formatted_title = format_title_for_platform("ntfy", first_title_data, show_source=False)
  941. elif format_type == "feishu":
  942. formatted_title = format_title_for_platform("feishu", first_title_data, show_source=False)
  943. elif format_type == "dingtalk":
  944. formatted_title = format_title_for_platform("dingtalk", first_title_data, show_source=False)
  945. elif format_type == "slack":
  946. formatted_title = format_title_for_platform("slack", first_title_data, show_source=False)
  947. else:
  948. formatted_title = f"{first_title_data['title']}"
  949. first_news_line = f" 1. {formatted_title}\n"
  950. # 原子性检查:来源标题 + 第一条新闻必须一起处理
  951. source_with_first_news = source_header + first_news_line
  952. test_content = current_batch + source_with_first_news
  953. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  954. if current_batch_has_content:
  955. batches.append(current_batch + base_footer)
  956. current_batch = base_header + new_header + source_with_first_news
  957. current_batch_has_content = True
  958. start_index = 1
  959. else:
  960. current_batch = test_content
  961. current_batch_has_content = True
  962. start_index = 1
  963. # 处理剩余新闻条目(禁用 new emoji)
  964. for j in range(start_index, len(titles)):
  965. title_data = titles[j].copy()
  966. title_data["is_new"] = False
  967. if format_type in ("wework", "bark"):
  968. formatted_title = format_title_for_platform("wework", title_data, show_source=False)
  969. elif format_type == "telegram":
  970. formatted_title = format_title_for_platform("telegram", title_data, show_source=False)
  971. elif format_type == "ntfy":
  972. formatted_title = format_title_for_platform("ntfy", title_data, show_source=False)
  973. elif format_type == "feishu":
  974. formatted_title = format_title_for_platform("feishu", title_data, show_source=False)
  975. elif format_type == "dingtalk":
  976. formatted_title = format_title_for_platform("dingtalk", title_data, show_source=False)
  977. elif format_type == "slack":
  978. formatted_title = format_title_for_platform("slack", title_data, show_source=False)
  979. else:
  980. formatted_title = f"{title_data['title']}"
  981. news_line = f" {j + 1}. {formatted_title}\n"
  982. test_content = current_batch + news_line
  983. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  984. if current_batch_has_content:
  985. batches.append(current_batch + base_footer)
  986. current_batch = base_header + new_header + source_header + news_line
  987. current_batch_has_content = True
  988. else:
  989. current_batch = test_content
  990. current_batch_has_content = True
  991. # 来源间添加空行(与热榜新增格式一致)
  992. current_batch += "\n"
  993. return current_batch, current_batch_has_content, batches
  994. def _format_rss_item_line(
  995. item: Dict,
  996. index: int,
  997. format_type: str,
  998. timezone: str = "Asia/Shanghai",
  999. ) -> str:
  1000. """格式化单条 RSS 条目
  1001. Args:
  1002. item: RSS 条目字典
  1003. index: 序号
  1004. format_type: 格式类型
  1005. timezone: 时区名称
  1006. Returns:
  1007. 格式化后的条目行字符串
  1008. """
  1009. title = item.get("title", "")
  1010. url = item.get("url", "")
  1011. published_at = item.get("published_at", "")
  1012. # 使用友好时间格式
  1013. if published_at:
  1014. friendly_time = format_iso_time_friendly(published_at, timezone, include_date=True)
  1015. else:
  1016. friendly_time = ""
  1017. # 构建条目行
  1018. if format_type == "feishu":
  1019. if url:
  1020. item_line = f" {index}. [{title}]({url})"
  1021. else:
  1022. item_line = f" {index}. {title}"
  1023. if friendly_time:
  1024. item_line += f" <font color='grey'>- {friendly_time}</font>"
  1025. elif format_type == "telegram":
  1026. if url:
  1027. item_line = f" {index}. {title} ({url})"
  1028. else:
  1029. item_line = f" {index}. {title}"
  1030. if friendly_time:
  1031. item_line += f" - {friendly_time}"
  1032. else:
  1033. if url:
  1034. item_line = f" {index}. [{title}]({url})"
  1035. else:
  1036. item_line = f" {index}. {title}"
  1037. if friendly_time:
  1038. item_line += f" `{friendly_time}`"
  1039. item_line += "\n"
  1040. return item_line
  1041. def _process_standalone_section(
  1042. standalone_data: Dict,
  1043. format_type: str,
  1044. feishu_separator: str,
  1045. base_header: str,
  1046. base_footer: str,
  1047. max_bytes: int,
  1048. current_batch: str,
  1049. current_batch_has_content: bool,
  1050. batches: List[str],
  1051. timezone: str = "Asia/Shanghai",
  1052. rank_threshold: int = 10,
  1053. ) -> tuple:
  1054. """处理独立展示区区块
  1055. 独立展示区显示指定平台的完整热榜或 RSS 源内容,不受关键词过滤影响。
  1056. 热榜按原始排名排序,RSS 按发布时间排序。
  1057. Args:
  1058. standalone_data: 独立展示数据,格式:
  1059. {
  1060. "platforms": [{"id": "zhihu", "name": "知乎热榜", "items": [...]}],
  1061. "rss_feeds": [{"id": "hacker-news", "name": "Hacker News", "items": [...]}]
  1062. }
  1063. format_type: 格式类型
  1064. feishu_separator: 飞书分隔符
  1065. base_header: 基础头部
  1066. base_footer: 基础尾部
  1067. max_bytes: 最大字节数
  1068. current_batch: 当前批次内容
  1069. current_batch_has_content: 当前批次是否有内容
  1070. batches: 已完成的批次列表
  1071. timezone: 时区名称
  1072. Returns:
  1073. (current_batch, current_batch_has_content, batches) 元组
  1074. """
  1075. if not standalone_data:
  1076. return current_batch, current_batch_has_content, batches
  1077. platforms = standalone_data.get("platforms", [])
  1078. rss_feeds = standalone_data.get("rss_feeds", [])
  1079. if not platforms and not rss_feeds:
  1080. return current_batch, current_batch_has_content, batches
  1081. # 计算总条目数
  1082. total_platform_items = sum(len(p.get("items", [])) for p in platforms)
  1083. total_rss_items = sum(len(f.get("items", [])) for f in rss_feeds)
  1084. total_items = total_platform_items + total_rss_items
  1085. # 独立展示区标题
  1086. section_header = ""
  1087. if format_type == "feishu":
  1088. section_header = f"\n{feishu_separator}\n\n📋 **独立展示区** (共 {total_items} 条)\n\n"
  1089. elif format_type == "dingtalk":
  1090. section_header = f"\n---\n\n📋 **独立展示区** (共 {total_items} 条)\n\n"
  1091. elif format_type == "telegram":
  1092. section_header = f"\n\n📋 独立展示区 (共 {total_items} 条)\n\n"
  1093. elif format_type == "slack":
  1094. section_header = f"\n\n📋 *独立展示区* (共 {total_items} 条)\n\n"
  1095. else:
  1096. section_header = f"\n\n📋 **独立展示区** (共 {total_items} 条)\n\n"
  1097. # 添加区块标题
  1098. test_content = current_batch + section_header
  1099. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) < max_bytes:
  1100. current_batch = test_content
  1101. current_batch_has_content = True
  1102. else:
  1103. if current_batch_has_content:
  1104. batches.append(current_batch + base_footer)
  1105. current_batch = base_header + section_header
  1106. current_batch_has_content = True
  1107. # 处理热榜平台
  1108. for platform in platforms:
  1109. platform_name = platform.get("name", platform.get("id", ""))
  1110. items = platform.get("items", [])
  1111. if not items:
  1112. continue
  1113. # 平台标题
  1114. platform_header = ""
  1115. if format_type in ("wework", "bark"):
  1116. platform_header = f"**{platform_name}** ({len(items)} 条):\n\n"
  1117. elif format_type == "telegram":
  1118. platform_header = f"{platform_name} ({len(items)} 条):\n\n"
  1119. elif format_type == "ntfy":
  1120. platform_header = f"**{platform_name}** ({len(items)} 条):\n\n"
  1121. elif format_type == "feishu":
  1122. platform_header = f"**{platform_name}** ({len(items)} 条):\n\n"
  1123. elif format_type == "dingtalk":
  1124. platform_header = f"**{platform_name}** ({len(items)} 条):\n\n"
  1125. elif format_type == "slack":
  1126. platform_header = f"*{platform_name}* ({len(items)} 条):\n\n"
  1127. # 构建第一条新闻
  1128. first_item_line = ""
  1129. if items:
  1130. first_item_line = _format_standalone_platform_item(items[0], 1, format_type, rank_threshold)
  1131. # 原子性检查
  1132. platform_with_first = platform_header + first_item_line
  1133. test_content = current_batch + platform_with_first
  1134. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  1135. if current_batch_has_content:
  1136. batches.append(current_batch + base_footer)
  1137. current_batch = base_header + section_header + platform_with_first
  1138. current_batch_has_content = True
  1139. start_index = 1
  1140. else:
  1141. current_batch = test_content
  1142. current_batch_has_content = True
  1143. start_index = 1
  1144. # 处理剩余条目
  1145. for j in range(start_index, len(items)):
  1146. item_line = _format_standalone_platform_item(items[j], j + 1, format_type, rank_threshold)
  1147. test_content = current_batch + item_line
  1148. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  1149. if current_batch_has_content:
  1150. batches.append(current_batch + base_footer)
  1151. current_batch = base_header + section_header + platform_header + item_line
  1152. current_batch_has_content = True
  1153. else:
  1154. current_batch = test_content
  1155. current_batch_has_content = True
  1156. current_batch += "\n"
  1157. # 处理 RSS 源
  1158. for feed in rss_feeds:
  1159. feed_name = feed.get("name", feed.get("id", ""))
  1160. items = feed.get("items", [])
  1161. if not items:
  1162. continue
  1163. # RSS 源标题
  1164. feed_header = ""
  1165. if format_type in ("wework", "bark"):
  1166. feed_header = f"**{feed_name}** ({len(items)} 条):\n\n"
  1167. elif format_type == "telegram":
  1168. feed_header = f"{feed_name} ({len(items)} 条):\n\n"
  1169. elif format_type == "ntfy":
  1170. feed_header = f"**{feed_name}** ({len(items)} 条):\n\n"
  1171. elif format_type == "feishu":
  1172. feed_header = f"**{feed_name}** ({len(items)} 条):\n\n"
  1173. elif format_type == "dingtalk":
  1174. feed_header = f"**{feed_name}** ({len(items)} 条):\n\n"
  1175. elif format_type == "slack":
  1176. feed_header = f"*{feed_name}* ({len(items)} 条):\n\n"
  1177. # 构建第一条 RSS
  1178. first_item_line = ""
  1179. if items:
  1180. first_item_line = _format_standalone_rss_item(items[0], 1, format_type, timezone)
  1181. # 原子性检查
  1182. feed_with_first = feed_header + first_item_line
  1183. test_content = current_batch + feed_with_first
  1184. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  1185. if current_batch_has_content:
  1186. batches.append(current_batch + base_footer)
  1187. current_batch = base_header + section_header + feed_with_first
  1188. current_batch_has_content = True
  1189. start_index = 1
  1190. else:
  1191. current_batch = test_content
  1192. current_batch_has_content = True
  1193. start_index = 1
  1194. # 处理剩余条目
  1195. for j in range(start_index, len(items)):
  1196. item_line = _format_standalone_rss_item(items[j], j + 1, format_type, timezone)
  1197. test_content = current_batch + item_line
  1198. if len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) >= max_bytes:
  1199. if current_batch_has_content:
  1200. batches.append(current_batch + base_footer)
  1201. current_batch = base_header + section_header + feed_header + item_line
  1202. current_batch_has_content = True
  1203. else:
  1204. current_batch = test_content
  1205. current_batch_has_content = True
  1206. current_batch += "\n"
  1207. return current_batch, current_batch_has_content, batches
  1208. def _format_standalone_platform_item(item: Dict, index: int, format_type: str, rank_threshold: int = 10) -> str:
  1209. """格式化独立展示区的热榜条目(复用热点词汇统计区样式)
  1210. Args:
  1211. item: 热榜条目,包含 title, url, rank, ranks, first_time, last_time, count
  1212. index: 序号
  1213. format_type: 格式类型
  1214. rank_threshold: 排名高亮阈值
  1215. Returns:
  1216. 格式化后的条目行字符串
  1217. """
  1218. title = item.get("title", "")
  1219. url = item.get("url", "") or item.get("mobileUrl", "")
  1220. ranks = item.get("ranks", [])
  1221. rank = item.get("rank", 0)
  1222. first_time = item.get("first_time", "")
  1223. last_time = item.get("last_time", "")
  1224. count = item.get("count", 1)
  1225. # 使用 format_rank_display 格式化排名(复用热点词汇统计区逻辑)
  1226. # 如果没有 ranks 列表,用单个 rank 构造
  1227. if not ranks and rank > 0:
  1228. ranks = [rank]
  1229. rank_display = format_rank_display(ranks, rank_threshold, format_type) if ranks else ""
  1230. # 构建时间显示(用 ~ 连接范围,与热点词汇统计区一致)
  1231. # 将 HH-MM 格式转换为 HH:MM 格式
  1232. time_display = ""
  1233. if first_time and last_time and first_time != last_time:
  1234. first_time_display = convert_time_for_display(first_time)
  1235. last_time_display = convert_time_for_display(last_time)
  1236. time_display = f"{first_time_display}~{last_time_display}"
  1237. elif first_time:
  1238. time_display = convert_time_for_display(first_time)
  1239. # 构建次数显示(格式为 (N次),与热点词汇统计区一致)
  1240. count_display = f"({count}次)" if count > 1 else ""
  1241. # 根据格式类型构建条目行(复用热点词汇统计区样式)
  1242. if format_type == "feishu":
  1243. if url:
  1244. item_line = f" {index}. [{title}]({url})"
  1245. else:
  1246. item_line = f" {index}. {title}"
  1247. if rank_display:
  1248. item_line += f" {rank_display}"
  1249. if time_display:
  1250. item_line += f" <font color='grey'>- {time_display}</font>"
  1251. if count_display:
  1252. item_line += f" <font color='green'>{count_display}</font>"
  1253. elif format_type == "dingtalk":
  1254. if url:
  1255. item_line = f" {index}. [{title}]({url})"
  1256. else:
  1257. item_line = f" {index}. {title}"
  1258. if rank_display:
  1259. item_line += f" {rank_display}"
  1260. if time_display:
  1261. item_line += f" - {time_display}"
  1262. if count_display:
  1263. item_line += f" {count_display}"
  1264. elif format_type == "telegram":
  1265. if url:
  1266. item_line = f" {index}. {title} ({url})"
  1267. else:
  1268. item_line = f" {index}. {title}"
  1269. if rank_display:
  1270. item_line += f" {rank_display}"
  1271. if time_display:
  1272. item_line += f" - {time_display}"
  1273. if count_display:
  1274. item_line += f" {count_display}"
  1275. elif format_type == "slack":
  1276. if url:
  1277. item_line = f" {index}. <{url}|{title}>"
  1278. else:
  1279. item_line = f" {index}. {title}"
  1280. if rank_display:
  1281. item_line += f" {rank_display}"
  1282. if time_display:
  1283. item_line += f" _{time_display}_"
  1284. if count_display:
  1285. item_line += f" {count_display}"
  1286. else:
  1287. # wework, bark, ntfy
  1288. if url:
  1289. item_line = f" {index}. [{title}]({url})"
  1290. else:
  1291. item_line = f" {index}. {title}"
  1292. if rank_display:
  1293. item_line += f" {rank_display}"
  1294. if time_display:
  1295. item_line += f" - {time_display}"
  1296. if count_display:
  1297. item_line += f" {count_display}"
  1298. item_line += "\n"
  1299. return item_line
  1300. def _format_standalone_rss_item(
  1301. item: Dict, index: int, format_type: str, timezone: str = "Asia/Shanghai"
  1302. ) -> str:
  1303. """格式化独立展示区的 RSS 条目
  1304. Args:
  1305. item: RSS 条目,包含 title, url, published_at, author
  1306. index: 序号
  1307. format_type: 格式类型
  1308. timezone: 时区名称
  1309. Returns:
  1310. 格式化后的条目行字符串
  1311. """
  1312. title = item.get("title", "")
  1313. url = item.get("url", "")
  1314. published_at = item.get("published_at", "")
  1315. author = item.get("author", "")
  1316. # 使用友好时间格式
  1317. friendly_time = ""
  1318. if published_at:
  1319. friendly_time = format_iso_time_friendly(published_at, timezone, include_date=True)
  1320. # 构建元信息
  1321. meta_parts = []
  1322. if friendly_time:
  1323. meta_parts.append(friendly_time)
  1324. if author:
  1325. meta_parts.append(author)
  1326. meta_str = ", ".join(meta_parts)
  1327. # 根据格式类型构建条目行
  1328. if format_type == "feishu":
  1329. if url:
  1330. item_line = f" {index}. [{title}]({url})"
  1331. else:
  1332. item_line = f" {index}. {title}"
  1333. if meta_str:
  1334. item_line += f" <font color='grey'>- {meta_str}</font>"
  1335. elif format_type == "telegram":
  1336. if url:
  1337. item_line = f" {index}. {title} ({url})"
  1338. else:
  1339. item_line = f" {index}. {title}"
  1340. if meta_str:
  1341. item_line += f" - {meta_str}"
  1342. elif format_type == "slack":
  1343. if url:
  1344. item_line = f" {index}. <{url}|{title}>"
  1345. else:
  1346. item_line = f" {index}. {title}"
  1347. if meta_str:
  1348. item_line += f" _{meta_str}_"
  1349. else:
  1350. # wework, bark, ntfy, dingtalk
  1351. if url:
  1352. item_line = f" {index}. [{title}]({url})"
  1353. else:
  1354. item_line = f" {index}. {title}"
  1355. if meta_str:
  1356. item_line += f" `{meta_str}`"
  1357. item_line += "\n"
  1358. return item_line