main.py 117 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131
  1. # coding=utf-8
  2. import json
  3. import os
  4. import random
  5. import re
  6. import time
  7. import webbrowser
  8. from dataclasses import dataclass
  9. from datetime import datetime
  10. from pathlib import Path
  11. from typing import Dict, List, Tuple, Optional, Union
  12. import pytz
  13. import requests
  14. import yaml
  15. class ConfigManager:
  16. """配置管理器"""
  17. @staticmethod
  18. def _load_config_file() -> Dict:
  19. """加载配置文件"""
  20. config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
  21. if not Path(config_path).exists():
  22. raise FileNotFoundError(f"配置文件 {config_path} 不存在")
  23. try:
  24. with open(config_path, "r", encoding="utf-8") as f:
  25. config_data = yaml.safe_load(f)
  26. print(f"配置文件加载成功: {config_path}")
  27. return config_data
  28. except Exception as e:
  29. raise RuntimeError(f"配置文件解析失败: {e}")
  30. def __init__(self):
  31. self.config_data = self._load_config_file()
  32. self.config = self._build_config()
  33. self.platforms = self.config_data["platforms"]
  34. def _get_webhook_config(self, config_key: str, env_key: str) -> str:
  35. """获取 Webhook 配置"""
  36. env_value = os.environ.get(env_key, "").strip()
  37. if env_value:
  38. return env_value
  39. return (
  40. self.config_data.get("notification", {})
  41. .get("webhooks", {})
  42. .get(config_key, "")
  43. )
  44. def _build_config(self) -> Dict:
  45. """构建配置字典,环境变量优先级高于配置文件"""
  46. feishu_url = self._get_webhook_config("feishu_url", "FEISHU_WEBHOOK_URL")
  47. dingtalk_url = self._get_webhook_config("dingtalk_url", "DINGTALK_WEBHOOK_URL")
  48. wework_url = self._get_webhook_config("wework_url", "WEWORK_WEBHOOK_URL")
  49. telegram_token = self._get_webhook_config(
  50. "telegram_bot_token", "TELEGRAM_BOT_TOKEN"
  51. )
  52. telegram_chat_id = self._get_webhook_config(
  53. "telegram_chat_id", "TELEGRAM_CHAT_ID"
  54. )
  55. # 输出配置来源信息
  56. webhook_sources = []
  57. if feishu_url:
  58. source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
  59. webhook_sources.append(f"飞书({source})")
  60. if dingtalk_url:
  61. source = (
  62. "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
  63. )
  64. webhook_sources.append(f"钉钉({source})")
  65. if wework_url:
  66. source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
  67. webhook_sources.append(f"企业微信({source})")
  68. if telegram_token and telegram_chat_id:
  69. token_source = (
  70. "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
  71. )
  72. chat_source = (
  73. "环境变量" if os.environ.get("TELEGRAM_CHAT_ID") else "配置文件"
  74. )
  75. webhook_sources.append(f"Telegram({token_source}/{chat_source})")
  76. if webhook_sources:
  77. print(f"Webhook 配置来源: {', '.join(webhook_sources)}")
  78. else:
  79. print("未配置任何 Webhook")
  80. config = {
  81. "VERSION": self.config_data["app"]["version"],
  82. "VERSION_CHECK_URL": self.config_data["app"]["version_check_url"],
  83. "SHOW_VERSION_UPDATE": self.config_data["app"]["show_version_update"],
  84. "FEISHU_MESSAGE_SEPARATOR": self.config_data["notification"][
  85. "feishu_message_separator"
  86. ],
  87. "REQUEST_INTERVAL": self.config_data["crawler"]["request_interval"],
  88. "REPORT_MODE": self.config_data["report"]["mode"],
  89. "RANK_THRESHOLD": self.config_data["report"]["rank_threshold"],
  90. "USE_PROXY": self.config_data["crawler"]["use_proxy"],
  91. "DEFAULT_PROXY": self.config_data["crawler"]["default_proxy"],
  92. "ENABLE_CRAWLER": self.config_data["crawler"]["enable_crawler"],
  93. "ENABLE_NOTIFICATION": self.config_data["notification"][
  94. "enable_notification"
  95. ],
  96. "MESSAGE_BATCH_SIZE": self.config_data["notification"][
  97. "message_batch_size"
  98. ],
  99. "BATCH_SEND_INTERVAL": self.config_data["notification"][
  100. "batch_send_interval"
  101. ],
  102. "FEISHU_WEBHOOK_URL": feishu_url,
  103. "DINGTALK_WEBHOOK_URL": dingtalk_url,
  104. "WEWORK_WEBHOOK_URL": wework_url,
  105. "TELEGRAM_BOT_TOKEN": telegram_token,
  106. "TELEGRAM_CHAT_ID": telegram_chat_id,
  107. "WEIGHT_CONFIG": {
  108. "RANK_WEIGHT": self.config_data["weight"]["rank_weight"],
  109. "FREQUENCY_WEIGHT": self.config_data["weight"]["frequency_weight"],
  110. "HOTNESS_WEIGHT": self.config_data["weight"]["hotness_weight"],
  111. },
  112. }
  113. return config
  114. def get_config(self) -> Dict:
  115. """获取配置字典"""
  116. return self.config
  117. def get_platforms(self) -> List:
  118. """获取平台列表"""
  119. return self.platforms
  120. print("正在加载配置...")
  121. config_manager = ConfigManager()
  122. CONFIG = config_manager.get_config()
  123. PLATFORMS = config_manager.get_platforms()
  124. print(f"TrendRadar v{CONFIG['VERSION']} 配置加载完成")
  125. print(f"监控平台数量: {len(PLATFORMS)}")
  126. class TimeHelper:
  127. """时间处理工具"""
  128. @staticmethod
  129. def get_beijing_time() -> datetime:
  130. return datetime.now(pytz.timezone("Asia/Shanghai"))
  131. @staticmethod
  132. def format_date_folder() -> str:
  133. return TimeHelper.get_beijing_time().strftime("%Y年%m月%d日")
  134. @staticmethod
  135. def format_time_filename() -> str:
  136. return TimeHelper.get_beijing_time().strftime("%H时%M分")
  137. class VersionChecker:
  138. """版本检查工具"""
  139. @staticmethod
  140. def parse_version(version_str: str) -> Tuple[int, int, int]:
  141. """解析版本号字符串为元组"""
  142. try:
  143. parts = version_str.strip().split(".")
  144. if len(parts) != 3:
  145. raise ValueError("版本号格式不正确")
  146. return int(parts[0]), int(parts[1]), int(parts[2])
  147. except (ValueError, AttributeError):
  148. print(f"无法解析版本号: {version_str}")
  149. return 0, 0, 0
  150. @staticmethod
  151. def compare_versions(current: str, remote: str) -> int:
  152. """比较版本号"""
  153. current_tuple = VersionChecker.parse_version(current)
  154. remote_tuple = VersionChecker.parse_version(remote)
  155. if current_tuple < remote_tuple:
  156. return -1 # 需要更新
  157. elif current_tuple > remote_tuple:
  158. return 1 # 当前版本更新
  159. else:
  160. return 0 # 版本相同
  161. @staticmethod
  162. def check_for_updates(
  163. current_version: str,
  164. version_url: str,
  165. proxy_url: Optional[str] = None,
  166. timeout: int = 10,
  167. ) -> Tuple[bool, Optional[str]]:
  168. """检查是否有新版本"""
  169. try:
  170. proxies = None
  171. if proxy_url:
  172. proxies = {"http": proxy_url, "https": proxy_url}
  173. headers = {
  174. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  175. "Accept": "text/plain, */*",
  176. "Cache-Control": "no-cache",
  177. }
  178. response = requests.get(
  179. version_url, proxies=proxies, headers=headers, timeout=timeout
  180. )
  181. response.raise_for_status()
  182. remote_version = response.text.strip()
  183. print(f"当前版本: {current_version}, 远程版本: {remote_version}")
  184. comparison = VersionChecker.compare_versions(
  185. current_version, remote_version
  186. )
  187. need_update = comparison == -1
  188. return need_update, remote_version if need_update else None
  189. except Exception as e:
  190. print(f"版本检查失败: {e}")
  191. return False, None
  192. class FileHelper:
  193. """文件操作工具"""
  194. @staticmethod
  195. def ensure_directory_exists(directory: str) -> None:
  196. Path(directory).mkdir(parents=True, exist_ok=True)
  197. @staticmethod
  198. def get_output_path(subfolder: str, filename: str) -> str:
  199. date_folder = TimeHelper.format_date_folder()
  200. output_dir = Path("output") / date_folder / subfolder
  201. FileHelper.ensure_directory_exists(str(output_dir))
  202. return str(output_dir / filename)
  203. class DataFetcher:
  204. """数据获取器"""
  205. def __init__(self, proxy_url: Optional[str] = None):
  206. self.proxy_url = proxy_url
  207. def fetch_data(
  208. self,
  209. id_info: Union[str, Tuple[str, str]],
  210. max_retries: int = 2,
  211. min_retry_wait: int = 3,
  212. max_retry_wait: int = 5,
  213. ) -> Tuple[Optional[str], str, str]:
  214. """获取指定ID数据,支持重试"""
  215. if isinstance(id_info, tuple):
  216. id_value, alias = id_info
  217. else:
  218. id_value = id_info
  219. alias = id_value
  220. url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
  221. proxies = None
  222. if self.proxy_url:
  223. proxies = {"http": self.proxy_url, "https": self.proxy_url}
  224. headers = {
  225. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  226. "Accept": "application/json, text/plain, */*",
  227. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  228. "Connection": "keep-alive",
  229. "Cache-Control": "no-cache",
  230. }
  231. retries = 0
  232. while retries <= max_retries:
  233. try:
  234. response = requests.get(
  235. url, proxies=proxies, headers=headers, timeout=10
  236. )
  237. response.raise_for_status()
  238. data_text = response.text
  239. data_json = json.loads(data_text)
  240. status = data_json.get("status", "未知")
  241. if status not in ["success", "cache"]:
  242. raise ValueError(f"响应状态异常: {status}")
  243. status_info = "最新数据" if status == "success" else "缓存数据"
  244. print(f"获取 {id_value} 成功({status_info})")
  245. return data_text, id_value, alias
  246. except Exception as e:
  247. retries += 1
  248. if retries <= max_retries:
  249. base_wait = random.uniform(min_retry_wait, max_retry_wait)
  250. additional_wait = (retries - 1) * random.uniform(1, 2)
  251. wait_time = base_wait + additional_wait
  252. print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
  253. time.sleep(wait_time)
  254. else:
  255. print(f"请求 {id_value} 失败: {e}")
  256. return None, id_value, alias
  257. return None, id_value, alias
  258. def crawl_websites(
  259. self,
  260. ids_list: List[Union[str, Tuple[str, str]]],
  261. request_interval: int = CONFIG["REQUEST_INTERVAL"],
  262. ) -> Tuple[Dict, Dict, List]:
  263. """爬取多个网站数据"""
  264. results = {}
  265. id_to_name = {}
  266. failed_ids = []
  267. for i, id_info in enumerate(ids_list):
  268. if isinstance(id_info, tuple):
  269. id_value, name = id_info
  270. else:
  271. id_value = id_info
  272. name = id_value
  273. id_to_name[id_value] = name
  274. response, _, _ = self.fetch_data(id_info)
  275. if response:
  276. try:
  277. data = json.loads(response)
  278. results[id_value] = {}
  279. for index, item in enumerate(data.get("items", []), 1):
  280. title = item["title"]
  281. url = item.get("url", "")
  282. mobile_url = item.get("mobileUrl", "")
  283. if title in results[id_value]:
  284. results[id_value][title]["ranks"].append(index)
  285. else:
  286. results[id_value][title] = {
  287. "ranks": [index],
  288. "url": url,
  289. "mobileUrl": mobile_url,
  290. }
  291. except json.JSONDecodeError:
  292. print(f"解析 {id_value} 响应失败")
  293. failed_ids.append(id_value)
  294. except Exception as e:
  295. print(f"处理 {id_value} 数据出错: {e}")
  296. failed_ids.append(id_value)
  297. else:
  298. failed_ids.append(id_value)
  299. if i < len(ids_list) - 1:
  300. actual_interval = request_interval + random.randint(-10, 20)
  301. actual_interval = max(50, actual_interval)
  302. time.sleep(actual_interval / 1000)
  303. print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
  304. return results, id_to_name, failed_ids
  305. class DataProcessor:
  306. """数据处理器"""
  307. @staticmethod
  308. def clean_title(title: str) -> str:
  309. """清理标题中的特殊字符"""
  310. if not isinstance(title, str):
  311. title = str(title)
  312. cleaned_title = title.replace("\n", " ").replace("\r", " ")
  313. cleaned_title = re.sub(r"\s+", " ", cleaned_title)
  314. cleaned_title = cleaned_title.strip()
  315. return cleaned_title
  316. @staticmethod
  317. def is_first_crawl_today() -> bool:
  318. """检测是否是当天第一次爬取"""
  319. date_folder = TimeHelper.format_date_folder()
  320. txt_dir = Path("output") / date_folder / "txt"
  321. if not txt_dir.exists():
  322. return True
  323. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  324. return len(files) <= 1 # 0个文件或1个文件都算第一次
  325. @staticmethod
  326. def detect_latest_new_titles(
  327. current_platform_ids: Optional[List[str]] = None,
  328. ) -> Dict:
  329. """检测当日最新批次的新增标题,支持按当前监控平台过滤"""
  330. date_folder = TimeHelper.format_date_folder()
  331. txt_dir = Path("output") / date_folder / "txt"
  332. if not txt_dir.exists():
  333. return {}
  334. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  335. if len(files) < 2:
  336. return {}
  337. # 解析最新文件
  338. latest_file = files[-1]
  339. latest_titles, _ = DataProcessor._parse_file_titles(latest_file)
  340. # 如果指定了当前平台列表,过滤最新文件数据
  341. if current_platform_ids is not None:
  342. filtered_latest_titles = {}
  343. for source_id, title_data in latest_titles.items():
  344. if source_id in current_platform_ids:
  345. filtered_latest_titles[source_id] = title_data
  346. latest_titles = filtered_latest_titles
  347. # 汇总历史标题(按平台过滤)
  348. historical_titles = {}
  349. for file_path in files[:-1]:
  350. historical_data, _ = DataProcessor._parse_file_titles(file_path)
  351. # 过滤历史数据
  352. if current_platform_ids is not None:
  353. filtered_historical_data = {}
  354. for source_id, title_data in historical_data.items():
  355. if source_id in current_platform_ids:
  356. filtered_historical_data[source_id] = title_data
  357. historical_data = filtered_historical_data
  358. for source_id, titles_data in historical_data.items():
  359. if source_id not in historical_titles:
  360. historical_titles[source_id] = set()
  361. for title in titles_data.keys():
  362. historical_titles[source_id].add(title)
  363. # 找出新增标题
  364. new_titles = {}
  365. for source_id, latest_source_titles in latest_titles.items():
  366. historical_set = historical_titles.get(source_id, set())
  367. source_new_titles = {}
  368. for title, title_data in latest_source_titles.items():
  369. if title not in historical_set:
  370. source_new_titles[title] = title_data
  371. if source_new_titles:
  372. new_titles[source_id] = source_new_titles
  373. return new_titles
  374. @staticmethod
  375. def _parse_file_titles(file_path: Path) -> Tuple[Dict, Dict]:
  376. """解析单个txt文件的标题数据,返回(titles_by_id, id_to_name)"""
  377. titles_by_id = {}
  378. id_to_name = {}
  379. with open(file_path, "r", encoding="utf-8") as f:
  380. content = f.read()
  381. sections = content.split("\n\n")
  382. for section in sections:
  383. if not section.strip() or "==== 以下ID请求失败 ====" in section:
  384. continue
  385. lines = section.strip().split("\n")
  386. if len(lines) < 2:
  387. continue
  388. # id | name 或 id
  389. header_line = lines[0].strip()
  390. if " | " in header_line:
  391. parts = header_line.split(" | ", 1)
  392. source_id = parts[0].strip()
  393. name = parts[1].strip()
  394. id_to_name[source_id] = name
  395. else:
  396. source_id = header_line
  397. id_to_name[source_id] = source_id
  398. titles_by_id[source_id] = {}
  399. for line in lines[1:]:
  400. if line.strip():
  401. try:
  402. title_part = line.strip()
  403. rank = None
  404. # 提取排名
  405. if (
  406. ". " in title_part
  407. and title_part.split(". ")[0].isdigit()
  408. ):
  409. rank_str, title_part = title_part.split(". ", 1)
  410. rank = int(rank_str)
  411. # 提取 MOBILE URL
  412. mobile_url = ""
  413. if " [MOBILE:" in title_part:
  414. title_part, mobile_part = title_part.rsplit(
  415. " [MOBILE:", 1
  416. )
  417. if mobile_part.endswith("]"):
  418. mobile_url = mobile_part[:-1]
  419. # 提取 URL
  420. url = ""
  421. if " [URL:" in title_part:
  422. title_part, url_part = title_part.rsplit(" [URL:", 1)
  423. if url_part.endswith("]"):
  424. url = url_part[:-1]
  425. title = DataProcessor.clean_title(title_part.strip())
  426. ranks = [rank] if rank is not None else [1]
  427. titles_by_id[source_id][title] = {
  428. "ranks": ranks,
  429. "url": url,
  430. "mobileUrl": mobile_url,
  431. }
  432. except Exception as e:
  433. print(f"解析标题行出错: {line}, 错误: {e}")
  434. return titles_by_id, id_to_name
  435. @staticmethod
  436. def save_titles_to_file(results: Dict, id_to_name: Dict, failed_ids: List) -> str:
  437. """保存标题到文件"""
  438. file_path = FileHelper.get_output_path(
  439. "txt", f"{TimeHelper.format_time_filename()}.txt"
  440. )
  441. with open(file_path, "w", encoding="utf-8") as f:
  442. for id_value, title_data in results.items():
  443. # id | name 或 id
  444. name = id_to_name.get(id_value)
  445. if name and name != id_value:
  446. f.write(f"{id_value} | {name}\n")
  447. else:
  448. f.write(f"{id_value}\n")
  449. # 按排名排序标题
  450. sorted_titles = []
  451. for title, info in title_data.items():
  452. cleaned_title = DataProcessor.clean_title(title)
  453. if isinstance(info, dict):
  454. ranks = info.get("ranks", [])
  455. url = info.get("url", "")
  456. mobile_url = info.get("mobileUrl", "")
  457. else:
  458. ranks = info if isinstance(info, list) else []
  459. url = ""
  460. mobile_url = ""
  461. rank = ranks[0] if ranks else 1
  462. sorted_titles.append((rank, cleaned_title, url, mobile_url))
  463. sorted_titles.sort(key=lambda x: x[0])
  464. for rank, cleaned_title, url, mobile_url in sorted_titles:
  465. line = f"{rank}. {cleaned_title}"
  466. if url:
  467. line += f" [URL:{url}]"
  468. if mobile_url:
  469. line += f" [MOBILE:{mobile_url}]"
  470. f.write(line + "\n")
  471. f.write("\n")
  472. if failed_ids:
  473. f.write("==== 以下ID请求失败 ====\n")
  474. for id_value in failed_ids:
  475. f.write(f"{id_value}\n")
  476. return file_path
  477. @staticmethod
  478. def load_frequency_words(
  479. frequency_file: Optional[str] = None,
  480. ) -> Tuple[List[Dict], List[str]]:
  481. """加载频率词配置"""
  482. if frequency_file is None:
  483. frequency_file = os.environ.get(
  484. "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
  485. )
  486. frequency_path = Path(frequency_file)
  487. if not frequency_path.exists():
  488. raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
  489. with open(frequency_path, "r", encoding="utf-8") as f:
  490. content = f.read()
  491. word_groups = [
  492. group.strip() for group in content.split("\n\n") if group.strip()
  493. ]
  494. processed_groups = []
  495. filter_words = []
  496. for group in word_groups:
  497. words = [word.strip() for word in group.split("\n") if word.strip()]
  498. group_required_words = []
  499. group_normal_words = []
  500. group_filter_words = []
  501. for word in words:
  502. if word.startswith("!"):
  503. filter_words.append(word[1:])
  504. group_filter_words.append(word[1:])
  505. elif word.startswith("+"):
  506. group_required_words.append(word[1:])
  507. else:
  508. group_normal_words.append(word)
  509. if group_required_words or group_normal_words:
  510. if group_normal_words:
  511. group_key = " ".join(group_normal_words)
  512. else:
  513. group_key = " ".join(group_required_words)
  514. processed_groups.append(
  515. {
  516. "required": group_required_words,
  517. "normal": group_normal_words,
  518. "group_key": group_key,
  519. }
  520. )
  521. return processed_groups, filter_words
  522. @staticmethod
  523. def read_all_today_titles(
  524. current_platform_ids: Optional[List[str]] = None,
  525. ) -> Tuple[Dict, Dict, Dict]:
  526. """读取当天所有标题文件,支持按当前监控平台过滤"""
  527. date_folder = TimeHelper.format_date_folder()
  528. txt_dir = Path("output") / date_folder / "txt"
  529. if not txt_dir.exists():
  530. return {}, {}, {}
  531. all_results = {}
  532. final_id_to_name = {}
  533. title_info = {}
  534. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  535. for file_path in files:
  536. time_info = file_path.stem
  537. titles_by_id, file_id_to_name = DataProcessor._parse_file_titles(file_path)
  538. if current_platform_ids is not None:
  539. filtered_titles_by_id = {}
  540. filtered_id_to_name = {}
  541. for source_id, title_data in titles_by_id.items():
  542. if source_id in current_platform_ids:
  543. filtered_titles_by_id[source_id] = title_data
  544. if source_id in file_id_to_name:
  545. filtered_id_to_name[source_id] = file_id_to_name[source_id]
  546. titles_by_id = filtered_titles_by_id
  547. file_id_to_name = filtered_id_to_name
  548. final_id_to_name.update(file_id_to_name)
  549. for source_id, title_data in titles_by_id.items():
  550. DataProcessor._process_source_data(
  551. source_id,
  552. title_data,
  553. time_info,
  554. all_results,
  555. title_info,
  556. )
  557. return all_results, final_id_to_name, title_info
  558. @staticmethod
  559. def _process_source_data(
  560. source_id: str,
  561. title_data: Dict,
  562. time_info: str,
  563. all_results: Dict,
  564. title_info: Dict,
  565. ) -> None:
  566. """处理来源数据,合并重复标题"""
  567. if source_id not in all_results:
  568. all_results[source_id] = title_data
  569. if source_id not in title_info:
  570. title_info[source_id] = {}
  571. for title, data in title_data.items():
  572. ranks = data.get("ranks", [])
  573. url = data.get("url", "")
  574. mobile_url = data.get("mobileUrl", "")
  575. title_info[source_id][title] = {
  576. "first_time": time_info,
  577. "last_time": time_info,
  578. "count": 1,
  579. "ranks": ranks,
  580. "url": url,
  581. "mobileUrl": mobile_url,
  582. }
  583. else:
  584. for title, data in title_data.items():
  585. ranks = data.get("ranks", [])
  586. url = data.get("url", "")
  587. mobile_url = data.get("mobileUrl", "")
  588. if title not in all_results[source_id]:
  589. all_results[source_id][title] = {
  590. "ranks": ranks,
  591. "url": url,
  592. "mobileUrl": mobile_url,
  593. }
  594. title_info[source_id][title] = {
  595. "first_time": time_info,
  596. "last_time": time_info,
  597. "count": 1,
  598. "ranks": ranks,
  599. "url": url,
  600. "mobileUrl": mobile_url,
  601. }
  602. else:
  603. existing_data = all_results[source_id][title]
  604. existing_ranks = existing_data.get("ranks", [])
  605. existing_url = existing_data.get("url", "")
  606. existing_mobile_url = existing_data.get("mobileUrl", "")
  607. merged_ranks = existing_ranks.copy()
  608. for rank in ranks:
  609. if rank not in merged_ranks:
  610. merged_ranks.append(rank)
  611. all_results[source_id][title] = {
  612. "ranks": merged_ranks,
  613. "url": existing_url or url,
  614. "mobileUrl": existing_mobile_url or mobile_url,
  615. }
  616. title_info[source_id][title]["last_time"] = time_info
  617. title_info[source_id][title]["ranks"] = merged_ranks
  618. title_info[source_id][title]["count"] += 1
  619. if not title_info[source_id][title].get("url"):
  620. title_info[source_id][title]["url"] = url
  621. if not title_info[source_id][title].get("mobileUrl"):
  622. title_info[source_id][title]["mobileUrl"] = mobile_url
  623. class StatisticsCalculator:
  624. """统计计算器"""
  625. @staticmethod
  626. def calculate_news_weight(
  627. title_data: Dict, rank_threshold: int = CONFIG["RANK_THRESHOLD"]
  628. ) -> float:
  629. """计算新闻权重,用于排序"""
  630. ranks = title_data.get("ranks", [])
  631. if not ranks:
  632. return 0.0
  633. count = title_data.get("count", len(ranks))
  634. weight_config = CONFIG["WEIGHT_CONFIG"]
  635. # 排名权重:Σ(11 - min(rank, 10)) / 出现次数
  636. rank_scores = []
  637. for rank in ranks:
  638. score = 11 - min(rank, 10)
  639. rank_scores.append(score)
  640. rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
  641. # 频次权重:min(出现次数, 10) × 10
  642. frequency_weight = min(count, 10) * 10
  643. # 热度加成:高排名次数 / 总出现次数 × 100
  644. high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
  645. hotness_ratio = high_rank_count / len(ranks) if ranks else 0
  646. hotness_weight = hotness_ratio * 100
  647. total_weight = (
  648. rank_weight * weight_config["RANK_WEIGHT"]
  649. + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
  650. + hotness_weight * weight_config["HOTNESS_WEIGHT"]
  651. )
  652. return total_weight
  653. @staticmethod
  654. def sort_titles_by_weight(
  655. titles_list: List[Dict], rank_threshold: int = CONFIG["RANK_THRESHOLD"]
  656. ) -> List[Dict]:
  657. """按权重对新闻标题列表进行排序"""
  658. def get_sort_key(title_data):
  659. weight = StatisticsCalculator.calculate_news_weight(
  660. title_data, rank_threshold
  661. )
  662. ranks = title_data.get("ranks", [])
  663. count = title_data.get("count", 1)
  664. # 主要按权重排序,权重相同时按最高排名排序,再相同时按出现次数排序
  665. min_rank = min(ranks) if ranks else 999
  666. return -weight, min_rank, -count
  667. return sorted(titles_list, key=get_sort_key)
  668. @staticmethod
  669. def matches_word_groups(
  670. title: str, word_groups: List[Dict], filter_words: List[str]
  671. ) -> bool:
  672. """检查标题是否匹配词组规则"""
  673. # 如果没有配置词组,则匹配所有标题(支持显示全部新闻)
  674. if not word_groups:
  675. return True
  676. title_lower = title.lower()
  677. # 过滤词检查
  678. if any(filter_word.lower() in title_lower for filter_word in filter_words):
  679. return False
  680. # 词组匹配检查
  681. for group in word_groups:
  682. required_words = group["required"]
  683. normal_words = group["normal"]
  684. # 必须词检查
  685. if required_words:
  686. all_required_present = all(
  687. req_word.lower() in title_lower for req_word in required_words
  688. )
  689. if not all_required_present:
  690. continue
  691. # 普通词检查
  692. if normal_words:
  693. any_normal_present = any(
  694. normal_word.lower() in title_lower for normal_word in normal_words
  695. )
  696. if not any_normal_present:
  697. continue
  698. return True
  699. return False
  700. @staticmethod
  701. def count_word_frequency(
  702. results: Dict,
  703. word_groups: List[Dict],
  704. filter_words: List[str],
  705. id_to_name: Dict,
  706. title_info: Optional[Dict] = None,
  707. rank_threshold: int = CONFIG["RANK_THRESHOLD"],
  708. new_titles: Optional[Dict] = None,
  709. mode: str = "daily",
  710. ) -> Tuple[List[Dict], int]:
  711. """统计词频,支持必须词、频率词、过滤词,并标记新增标题"""
  712. # 如果没有配置词组,创建一个包含所有新闻的虚拟词组
  713. if not word_groups:
  714. print("频率词配置为空,将显示所有新闻")
  715. word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
  716. filter_words = [] # 清空过滤词,显示所有新闻
  717. is_first_today = DataProcessor.is_first_crawl_today()
  718. # 确定处理的数据源和新增标记逻辑
  719. if mode == "incremental":
  720. if is_first_today:
  721. # 增量模式 + 当天第一次:处理所有新闻,都标记为新增
  722. results_to_process = results
  723. all_news_are_new = True
  724. else:
  725. # 增量模式 + 当天非第一次:只处理新增的新闻
  726. results_to_process = new_titles if new_titles else {}
  727. all_news_are_new = True
  728. elif mode == "current":
  729. # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
  730. if title_info:
  731. latest_time = None
  732. for source_titles in title_info.values():
  733. for title_data in source_titles.values():
  734. last_time = title_data.get("last_time", "")
  735. if last_time:
  736. if latest_time is None or last_time > latest_time:
  737. latest_time = last_time
  738. # 只处理 last_time 等于最新时间的新闻
  739. if latest_time:
  740. results_to_process = {}
  741. for source_id, source_titles in results.items():
  742. if source_id in title_info:
  743. filtered_titles = {}
  744. for title, title_data in source_titles.items():
  745. if title in title_info[source_id]:
  746. info = title_info[source_id][title]
  747. if info.get("last_time") == latest_time:
  748. filtered_titles[title] = title_data
  749. if filtered_titles:
  750. results_to_process[source_id] = filtered_titles
  751. print(
  752. f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
  753. )
  754. else:
  755. results_to_process = results
  756. else:
  757. results_to_process = results
  758. all_news_are_new = False
  759. else:
  760. # 当日汇总模式:处理所有新闻
  761. results_to_process = results
  762. all_news_are_new = False
  763. total_input_news = sum(len(titles) for titles in results.values())
  764. filter_status = "全部显示" if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" else "频率词过滤"
  765. print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
  766. word_stats = {}
  767. total_titles = 0
  768. processed_titles = {}
  769. matched_new_count = 0
  770. if title_info is None:
  771. title_info = {}
  772. if new_titles is None:
  773. new_titles = {}
  774. for group in word_groups:
  775. group_key = group["group_key"]
  776. word_stats[group_key] = {"count": 0, "titles": {}}
  777. for source_id, titles_data in results_to_process.items():
  778. total_titles += len(titles_data)
  779. if source_id not in processed_titles:
  780. processed_titles[source_id] = {}
  781. for title, title_data in titles_data.items():
  782. if title in processed_titles.get(source_id, {}):
  783. continue
  784. # 使用统一的匹配逻辑
  785. matches_frequency_words = StatisticsCalculator.matches_word_groups(
  786. title, word_groups, filter_words
  787. )
  788. if not matches_frequency_words:
  789. continue
  790. # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
  791. if (mode == "incremental" and all_news_are_new) or (
  792. mode == "current" and is_first_today
  793. ):
  794. matched_new_count += 1
  795. source_ranks = title_data.get("ranks", [])
  796. source_url = title_data.get("url", "")
  797. source_mobile_url = title_data.get("mobileUrl", "")
  798. # 找到匹配的词组
  799. title_lower = title.lower()
  800. for group in word_groups:
  801. required_words = group["required"]
  802. normal_words = group["normal"]
  803. # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
  804. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
  805. group_key = group["group_key"]
  806. word_stats[group_key]["count"] += 1
  807. if source_id not in word_stats[group_key]["titles"]:
  808. word_stats[group_key]["titles"][source_id] = []
  809. else:
  810. # 原有的匹配逻辑
  811. if required_words:
  812. all_required_present = all(
  813. req_word.lower() in title_lower
  814. for req_word in required_words
  815. )
  816. if not all_required_present:
  817. continue
  818. if normal_words:
  819. any_normal_present = any(
  820. normal_word.lower() in title_lower
  821. for normal_word in normal_words
  822. )
  823. if not any_normal_present:
  824. continue
  825. group_key = group["group_key"]
  826. word_stats[group_key]["count"] += 1
  827. if source_id not in word_stats[group_key]["titles"]:
  828. word_stats[group_key]["titles"][source_id] = []
  829. first_time = ""
  830. last_time = ""
  831. count_info = 1
  832. ranks = source_ranks if source_ranks else []
  833. url = source_url
  834. mobile_url = source_mobile_url
  835. # 对于 current 模式,从历史统计信息中获取完整数据
  836. if (
  837. mode == "current"
  838. and title_info
  839. and source_id in title_info
  840. and title in title_info[source_id]
  841. ):
  842. info = title_info[source_id][title]
  843. first_time = info.get("first_time", "")
  844. last_time = info.get("last_time", "")
  845. count_info = info.get("count", 1)
  846. if "ranks" in info and info["ranks"]:
  847. ranks = info["ranks"]
  848. url = info.get("url", source_url)
  849. mobile_url = info.get("mobileUrl", source_mobile_url)
  850. elif (
  851. title_info
  852. and source_id in title_info
  853. and title in title_info[source_id]
  854. ):
  855. info = title_info[source_id][title]
  856. first_time = info.get("first_time", "")
  857. last_time = info.get("last_time", "")
  858. count_info = info.get("count", 1)
  859. if "ranks" in info and info["ranks"]:
  860. ranks = info["ranks"]
  861. url = info.get("url", source_url)
  862. mobile_url = info.get("mobileUrl", source_mobile_url)
  863. if not ranks:
  864. ranks = [99]
  865. time_display = StatisticsCalculator._format_time_display(
  866. first_time, last_time
  867. )
  868. source_name = id_to_name.get(source_id, source_id)
  869. # 判断是否为新增
  870. is_new = False
  871. if all_news_are_new:
  872. # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
  873. is_new = True
  874. elif new_titles and source_id in new_titles:
  875. # 检查是否在新增列表中
  876. new_titles_for_source = new_titles[source_id]
  877. is_new = title in new_titles_for_source
  878. word_stats[group_key]["titles"][source_id].append(
  879. {
  880. "title": title,
  881. "source_name": source_name,
  882. "first_time": first_time,
  883. "last_time": last_time,
  884. "time_display": time_display,
  885. "count": count_info,
  886. "ranks": ranks,
  887. "rank_threshold": rank_threshold,
  888. "url": url,
  889. "mobileUrl": mobile_url,
  890. "is_new": is_new,
  891. }
  892. )
  893. if source_id not in processed_titles:
  894. processed_titles[source_id] = {}
  895. processed_titles[source_id][title] = True
  896. break
  897. # 最后统一打印汇总信息
  898. if mode == "incremental":
  899. if is_first_today:
  900. total_input_news = sum(len(titles) for titles in results.values())
  901. filter_status = "全部显示" if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" else "频率词匹配"
  902. print(
  903. f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
  904. )
  905. else:
  906. if new_titles:
  907. total_new_count = sum(len(titles) for titles in new_titles.values())
  908. filter_status = "全部显示" if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" else "匹配频率词"
  909. print(
  910. f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
  911. )
  912. if matched_new_count == 0 and len(word_groups) > 1:
  913. print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
  914. else:
  915. print("增量模式:未检测到新增新闻")
  916. elif mode == "current":
  917. total_input_news = sum(
  918. len(titles) for titles in results_to_process.values()
  919. )
  920. if is_first_today:
  921. filter_status = "全部显示" if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" else "频率词匹配"
  922. print(
  923. f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
  924. )
  925. else:
  926. matched_count = sum(stat["count"] for stat in word_stats.values())
  927. filter_status = "全部显示" if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" else "频率词匹配"
  928. print(
  929. f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
  930. )
  931. stats = []
  932. for group_key, data in word_stats.items():
  933. all_titles = []
  934. for source_id, title_list in data["titles"].items():
  935. all_titles.extend(title_list)
  936. sorted_titles = StatisticsCalculator.sort_titles_by_weight(
  937. all_titles, rank_threshold
  938. )
  939. stats.append(
  940. {
  941. "word": group_key,
  942. "count": data["count"],
  943. "titles": sorted_titles,
  944. "percentage": (
  945. round(data["count"] / total_titles * 100, 2)
  946. if total_titles > 0
  947. else 0
  948. ),
  949. }
  950. )
  951. stats.sort(key=lambda x: x["count"], reverse=True)
  952. return stats, total_titles
  953. @staticmethod
  954. def _format_rank_base(
  955. ranks: List[int], rank_threshold: int = 5, format_type: str = "html"
  956. ) -> str:
  957. """基础排名格式化方法"""
  958. if not ranks:
  959. return ""
  960. unique_ranks = sorted(set(ranks))
  961. min_rank = unique_ranks[0]
  962. max_rank = unique_ranks[-1]
  963. if format_type == "html":
  964. highlight_start = "<font color='red'><strong>"
  965. highlight_end = "</strong></font>"
  966. elif format_type == "feishu":
  967. highlight_start = "<font color='red'>**"
  968. highlight_end = "**</font>"
  969. elif format_type == "dingtalk":
  970. highlight_start = "**"
  971. highlight_end = "**"
  972. elif format_type == "wework":
  973. highlight_start = "**"
  974. highlight_end = "**"
  975. elif format_type == "telegram":
  976. highlight_start = "<b>"
  977. highlight_end = "</b>"
  978. else:
  979. highlight_start = "**"
  980. highlight_end = "**"
  981. if min_rank <= rank_threshold:
  982. if min_rank == max_rank:
  983. return f"{highlight_start}[{min_rank}]{highlight_end}"
  984. else:
  985. return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}"
  986. else:
  987. if min_rank == max_rank:
  988. return f"[{min_rank}]"
  989. else:
  990. return f"[{min_rank} - {max_rank}]"
  991. @staticmethod
  992. def format_rank_for_html(ranks: List[int], rank_threshold: int = 5) -> str:
  993. """格式化HTML排名显示"""
  994. return StatisticsCalculator._format_rank_base(ranks, rank_threshold, "html")
  995. @staticmethod
  996. def format_rank_for_feishu(ranks: List[int], rank_threshold: int = 5) -> str:
  997. """格式化飞书排名显示"""
  998. return StatisticsCalculator._format_rank_base(ranks, rank_threshold, "feishu")
  999. @staticmethod
  1000. def format_rank_for_dingtalk(ranks: List[int], rank_threshold: int = 5) -> str:
  1001. """格式化钉钉排名显示"""
  1002. return StatisticsCalculator._format_rank_base(ranks, rank_threshold, "dingtalk")
  1003. @staticmethod
  1004. def format_rank_for_wework(ranks: List[int], rank_threshold: int = 5) -> str:
  1005. """格式化企业微信排名显示"""
  1006. return StatisticsCalculator._format_rank_base(ranks, rank_threshold, "wework")
  1007. @staticmethod
  1008. def format_rank_for_telegram(ranks: List[int], rank_threshold: int = 5) -> str:
  1009. """格式化Telegram排名显示"""
  1010. return StatisticsCalculator._format_rank_base(ranks, rank_threshold, "telegram")
  1011. @staticmethod
  1012. def _format_time_display(first_time: str, last_time: str) -> str:
  1013. """格式化时间显示"""
  1014. if not first_time:
  1015. return ""
  1016. if first_time == last_time or not last_time:
  1017. return first_time
  1018. else:
  1019. return f"[{first_time} ~ {last_time}]"
  1020. class ReportGenerator:
  1021. """报告生成器"""
  1022. @staticmethod
  1023. def generate_html_report(
  1024. stats: List[Dict],
  1025. total_titles: int,
  1026. failed_ids: Optional[List] = None,
  1027. new_titles: Optional[Dict] = None,
  1028. id_to_name: Optional[Dict] = None,
  1029. mode: str = "daily",
  1030. is_daily_summary: bool = False,
  1031. ) -> str:
  1032. """生成HTML报告"""
  1033. if is_daily_summary:
  1034. if mode == "current":
  1035. filename = "当前榜单汇总.html"
  1036. elif mode == "incremental":
  1037. filename = "当日增量.html"
  1038. else:
  1039. filename = "当日汇总.html"
  1040. else:
  1041. filename = f"{TimeHelper.format_time_filename()}.html"
  1042. file_path = FileHelper.get_output_path("html", filename)
  1043. report_data = ReportGenerator._prepare_report_data(
  1044. stats, failed_ids, new_titles, id_to_name, mode
  1045. )
  1046. html_content = ReportGenerator._render_html_content(
  1047. report_data, total_titles, is_daily_summary, mode
  1048. )
  1049. with open(file_path, "w", encoding="utf-8") as f:
  1050. f.write(html_content)
  1051. if is_daily_summary:
  1052. root_file_path = Path("index.html")
  1053. with open(root_file_path, "w", encoding="utf-8") as f:
  1054. f.write(html_content)
  1055. return file_path
  1056. @staticmethod
  1057. def _prepare_report_data(
  1058. stats: List[Dict],
  1059. failed_ids: Optional[List] = None,
  1060. new_titles: Optional[Dict] = None,
  1061. id_to_name: Optional[Dict] = None,
  1062. mode: str = "daily",
  1063. ) -> Dict:
  1064. """准备报告数据"""
  1065. processed_new_titles = []
  1066. # 在增量模式下隐藏新增新闻区域
  1067. hide_new_section = mode == "incremental"
  1068. # 只有在非隐藏模式下才处理新增新闻部分
  1069. if not hide_new_section:
  1070. filtered_new_titles = {}
  1071. if new_titles and id_to_name:
  1072. word_groups, filter_words = DataProcessor.load_frequency_words()
  1073. for source_id, titles_data in new_titles.items():
  1074. filtered_titles = ReportGenerator._apply_frequency_filter(
  1075. titles_data, word_groups, filter_words
  1076. )
  1077. if filtered_titles:
  1078. filtered_new_titles[source_id] = filtered_titles
  1079. if filtered_new_titles and id_to_name:
  1080. for source_id, titles_data in filtered_new_titles.items():
  1081. source_name = id_to_name.get(source_id, source_id)
  1082. source_titles = []
  1083. for title, title_data in titles_data.items():
  1084. url, mobile_url, ranks = (
  1085. ReportGenerator._extract_title_data_fields(title_data)
  1086. )
  1087. processed_title = {
  1088. "title": title,
  1089. "source_name": source_name,
  1090. "time_display": "",
  1091. "count": 1,
  1092. "ranks": ranks,
  1093. "rank_threshold": CONFIG["RANK_THRESHOLD"],
  1094. "url": url,
  1095. "mobile_url": mobile_url,
  1096. "is_new": True,
  1097. }
  1098. source_titles.append(processed_title)
  1099. if source_titles:
  1100. processed_new_titles.append(
  1101. {
  1102. "source_id": source_id,
  1103. "source_name": source_name,
  1104. "titles": source_titles,
  1105. }
  1106. )
  1107. processed_stats = []
  1108. for stat in stats:
  1109. if stat["count"] <= 0:
  1110. continue
  1111. processed_titles = []
  1112. for title_data in stat["titles"]:
  1113. processed_title = {
  1114. "title": title_data["title"],
  1115. "source_name": title_data["source_name"],
  1116. "time_display": title_data["time_display"],
  1117. "count": title_data["count"],
  1118. "ranks": title_data["ranks"],
  1119. "rank_threshold": title_data["rank_threshold"],
  1120. "url": title_data.get("url", ""),
  1121. "mobile_url": title_data.get("mobileUrl", ""),
  1122. "is_new": title_data.get("is_new", False),
  1123. }
  1124. processed_titles.append(processed_title)
  1125. processed_stats.append(
  1126. {
  1127. "word": stat["word"],
  1128. "count": stat["count"],
  1129. "percentage": stat.get("percentage", 0),
  1130. "titles": processed_titles,
  1131. }
  1132. )
  1133. return {
  1134. "stats": processed_stats,
  1135. "new_titles": processed_new_titles,
  1136. "failed_ids": failed_ids or [],
  1137. "total_new_count": sum(
  1138. len(source["titles"]) for source in processed_new_titles
  1139. ),
  1140. }
  1141. @staticmethod
  1142. def _extract_title_data_fields(title_data) -> Tuple[str, str, List[int]]:
  1143. """提取标题数据的通用字段"""
  1144. url = title_data.get("url", "")
  1145. mobile_url = title_data.get("mobileUrl", "")
  1146. ranks = title_data.get("ranks", [])
  1147. return url, mobile_url, ranks
  1148. @staticmethod
  1149. def _apply_frequency_filter(
  1150. titles_data: Dict, word_groups: List[Dict], filter_words: List[str]
  1151. ) -> Dict:
  1152. """应用频率词过滤逻辑"""
  1153. filtered_titles = {}
  1154. for title, title_data in titles_data.items():
  1155. if StatisticsCalculator.matches_word_groups(
  1156. title, word_groups, filter_words
  1157. ):
  1158. filtered_titles[title] = title_data
  1159. return filtered_titles
  1160. @staticmethod
  1161. def _html_escape(text: str) -> str:
  1162. """HTML转义"""
  1163. if not isinstance(text, str):
  1164. text = str(text)
  1165. return (
  1166. text.replace("&", "&amp;")
  1167. .replace("<", "&lt;")
  1168. .replace(">", "&gt;")
  1169. .replace('"', "&quot;")
  1170. .replace("'", "&#x27;")
  1171. )
  1172. @staticmethod
  1173. def _format_title_html(title_data: Dict) -> str:
  1174. """格式化HTML标题显示"""
  1175. rank_display = StatisticsCalculator.format_rank_for_html(
  1176. title_data["ranks"], title_data["rank_threshold"]
  1177. )
  1178. link_url = title_data["mobile_url"] or title_data["url"]
  1179. cleaned_title = DataProcessor.clean_title(title_data["title"])
  1180. escaped_title = ReportGenerator._html_escape(cleaned_title)
  1181. escaped_source_name = ReportGenerator._html_escape(title_data["source_name"])
  1182. if link_url:
  1183. escaped_url = ReportGenerator._html_escape(link_url)
  1184. formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
  1185. else:
  1186. formatted_title = (
  1187. f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
  1188. )
  1189. if rank_display:
  1190. formatted_title += f" {rank_display}"
  1191. if title_data["time_display"]:
  1192. escaped_time = ReportGenerator._html_escape(title_data["time_display"])
  1193. formatted_title += f" <font color='grey'>- {escaped_time}</font>"
  1194. if title_data["count"] > 1:
  1195. formatted_title += f" <font color='green'>({title_data['count']}次)</font>"
  1196. if title_data["is_new"]:
  1197. formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"
  1198. return formatted_title
  1199. @staticmethod
  1200. def _render_html_content(
  1201. report_data: Dict,
  1202. total_titles: int,
  1203. is_daily_summary: bool = False,
  1204. mode: str = "daily",
  1205. ) -> str:
  1206. """渲染HTML内容"""
  1207. html = """
  1208. <!DOCTYPE html>
  1209. <html>
  1210. <head>
  1211. <meta charset="UTF-8">
  1212. <title>频率词统计报告</title>
  1213. <style>
  1214. body { font-family: Arial, sans-serif; margin: 20px; }
  1215. h1, h2 { color: #333; }
  1216. table { border-collapse: collapse; width: 100%; margin-top: 20px; }
  1217. th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
  1218. th { background-color: #f2f2f2; }
  1219. tr:nth-child(even) { background-color: #f9f9f9; }
  1220. .word { font-weight: bold; }
  1221. .count { text-align: center; }
  1222. .percentage { text-align: center; }
  1223. .titles { max-width: 500px; }
  1224. .source { color: #666; font-style: italic; }
  1225. .error { color: #d9534f; }
  1226. .news-link {
  1227. color: #007bff;
  1228. text-decoration: none;
  1229. border-bottom: 1px dotted #007bff;
  1230. }
  1231. .news-link:hover {
  1232. color: #0056b3;
  1233. text-decoration: underline;
  1234. }
  1235. .news-link:visited {
  1236. color: #6f42c1;
  1237. }
  1238. .no-link {
  1239. color: #333;
  1240. }
  1241. .new-title {
  1242. background-color: #fff3cd;
  1243. border: 1px solid #ffc107;
  1244. border-radius: 3px;
  1245. padding: 2px 6px;
  1246. margin: 2px 0;
  1247. }
  1248. .new-section {
  1249. background-color: #d1ecf1;
  1250. border: 1px solid #bee5eb;
  1251. border-radius: 5px;
  1252. padding: 10px;
  1253. margin-top: 10px;
  1254. }
  1255. .new-section h3 {
  1256. color: #0c5460;
  1257. margin-top: 0;
  1258. }
  1259. </style>
  1260. </head>
  1261. <body>
  1262. <h1>频率词统计报告</h1>
  1263. """
  1264. if is_daily_summary:
  1265. if mode == "current":
  1266. html += "<p>报告类型: 当前榜单模式</p>"
  1267. elif mode == "incremental":
  1268. html += "<p>报告类型: 增量模式</p>"
  1269. else:
  1270. html += "<p>报告类型: 当日汇总</p>"
  1271. else:
  1272. html += "<p>报告类型: 实时分析</p>"
  1273. now = TimeHelper.get_beijing_time()
  1274. html += f"<p>总标题数: {total_titles}</p>"
  1275. html += f"<p>生成时间: {now.strftime('%Y-%m-%d %H:%M:%S')}</p>"
  1276. if report_data["failed_ids"]:
  1277. html += """
  1278. <div class="error">
  1279. <h2>请求失败的平台</h2>
  1280. <ul>
  1281. """
  1282. for id_value in report_data["failed_ids"]:
  1283. html += f"<li>{ReportGenerator._html_escape(id_value)}</li>"
  1284. html += """
  1285. </ul>
  1286. </div>
  1287. """
  1288. html += """
  1289. <table>
  1290. <tr>
  1291. <th>排名</th>
  1292. <th>频率词</th>
  1293. <th>出现次数</th>
  1294. <th>占比</th>
  1295. <th>相关标题</th>
  1296. </tr>
  1297. """
  1298. for i, stat in enumerate(report_data["stats"], 1):
  1299. formatted_titles = []
  1300. for title_data in stat["titles"]:
  1301. formatted_title = ReportGenerator._format_title_html(title_data)
  1302. formatted_titles.append(formatted_title)
  1303. escaped_word = ReportGenerator._html_escape(stat["word"])
  1304. html += f"""
  1305. <tr>
  1306. <td>{i}</td>
  1307. <td class="word">{escaped_word}</td>
  1308. <td class="count">{stat['count']}</td>
  1309. <td class="percentage">{stat.get('percentage', 0)}%</td>
  1310. <td class="titles">{"<br>".join(formatted_titles)}</td>
  1311. </tr>
  1312. """
  1313. html += """
  1314. </table>
  1315. """
  1316. if report_data["new_titles"]:
  1317. html += f"""
  1318. <div class="new-section">
  1319. <h3>🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)</h3>
  1320. """
  1321. for source_data in report_data["new_titles"]:
  1322. escaped_source = ReportGenerator._html_escape(
  1323. source_data["source_name"]
  1324. )
  1325. html += (
  1326. f"<h4>{escaped_source} ({len(source_data['titles'])} 条)</h4><ul>"
  1327. )
  1328. for title_data in source_data["titles"]:
  1329. title_data_copy = title_data.copy()
  1330. title_data_copy["is_new"] = False
  1331. formatted_title = ReportGenerator._format_title_html(
  1332. title_data_copy
  1333. )
  1334. if "] " in formatted_title:
  1335. formatted_title = formatted_title.split("] ", 1)[1]
  1336. html += f"<li>{formatted_title}</li>"
  1337. html += "</ul>"
  1338. html += "</div>"
  1339. html += """
  1340. </body>
  1341. </html>
  1342. """
  1343. return html
  1344. @staticmethod
  1345. def _format_title_feishu(title_data: Dict, show_source: bool = True) -> str:
  1346. """格式化飞书标题显示"""
  1347. rank_display = StatisticsCalculator.format_rank_for_feishu(
  1348. title_data["ranks"], title_data["rank_threshold"]
  1349. )
  1350. link_url = title_data["mobile_url"] or title_data["url"]
  1351. cleaned_title = DataProcessor.clean_title(title_data["title"])
  1352. if link_url:
  1353. formatted_title = f"[{cleaned_title}]({link_url})"
  1354. else:
  1355. formatted_title = cleaned_title
  1356. title_prefix = "🆕 " if title_data["is_new"] else ""
  1357. if show_source:
  1358. result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
  1359. else:
  1360. result = f"{title_prefix}{formatted_title}"
  1361. if rank_display:
  1362. result += f" {rank_display}"
  1363. if title_data["time_display"]:
  1364. result += f" <font color='grey'>- {title_data['time_display']}</font>"
  1365. if title_data["count"] > 1:
  1366. result += f" <font color='green'>({title_data['count']}次)</font>"
  1367. return result
  1368. @staticmethod
  1369. def _format_title_dingtalk(title_data: Dict, show_source: bool = True) -> str:
  1370. """格式化钉钉标题显示"""
  1371. rank_display = StatisticsCalculator.format_rank_for_dingtalk(
  1372. title_data["ranks"], title_data["rank_threshold"]
  1373. )
  1374. link_url = title_data["mobile_url"] or title_data["url"]
  1375. cleaned_title = DataProcessor.clean_title(title_data["title"])
  1376. if link_url:
  1377. formatted_title = f"[{cleaned_title}]({link_url})"
  1378. else:
  1379. formatted_title = cleaned_title
  1380. title_prefix = "🆕 " if title_data["is_new"] else ""
  1381. if show_source:
  1382. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1383. else:
  1384. result = f"{title_prefix}{formatted_title}"
  1385. if rank_display:
  1386. result += f" {rank_display}"
  1387. if title_data["time_display"]:
  1388. result += f" - {title_data['time_display']}"
  1389. if title_data["count"] > 1:
  1390. result += f" ({title_data['count']}次)"
  1391. return result
  1392. @staticmethod
  1393. def _format_title_wework(title_data: Dict, show_source: bool = True) -> str:
  1394. """格式化企业微信标题显示"""
  1395. rank_display = StatisticsCalculator.format_rank_for_wework(
  1396. title_data["ranks"], title_data["rank_threshold"]
  1397. )
  1398. link_url = title_data["mobile_url"] or title_data["url"]
  1399. cleaned_title = DataProcessor.clean_title(title_data["title"])
  1400. if link_url:
  1401. formatted_title = f"[{cleaned_title}]({link_url})"
  1402. else:
  1403. formatted_title = cleaned_title
  1404. title_prefix = "🆕 " if title_data["is_new"] else ""
  1405. if show_source:
  1406. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1407. else:
  1408. result = f"{title_prefix}{formatted_title}"
  1409. if rank_display:
  1410. result += f" {rank_display}"
  1411. if title_data["time_display"]:
  1412. result += f" - {title_data['time_display']}"
  1413. if title_data["count"] > 1:
  1414. result += f" ({title_data['count']}次)"
  1415. return result
  1416. @staticmethod
  1417. def _format_title_telegram(title_data: Dict, show_source: bool = True) -> str:
  1418. """格式化Telegram标题显示"""
  1419. rank_display = StatisticsCalculator.format_rank_for_telegram(
  1420. title_data["ranks"], title_data["rank_threshold"]
  1421. )
  1422. link_url = title_data["mobile_url"] or title_data["url"]
  1423. cleaned_title = DataProcessor.clean_title(title_data["title"])
  1424. if link_url:
  1425. formatted_title = f'<a href="{link_url}">{ReportGenerator._html_escape(cleaned_title)}</a>'
  1426. else:
  1427. formatted_title = cleaned_title
  1428. title_prefix = "🆕 " if title_data["is_new"] else ""
  1429. if show_source:
  1430. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1431. else:
  1432. result = f"{title_prefix}{formatted_title}"
  1433. if rank_display:
  1434. result += f" {rank_display}"
  1435. if title_data["time_display"]:
  1436. result += f" <code>- {title_data['time_display']}</code>"
  1437. if title_data["count"] > 1:
  1438. result += f" <code>({title_data['count']}次)</code>"
  1439. return result
  1440. @staticmethod
  1441. def _render_feishu_content(
  1442. report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily"
  1443. ) -> str:
  1444. """渲染飞书内容"""
  1445. text_content = ""
  1446. if report_data["stats"]:
  1447. text_content += f"📊 **热点词汇统计**\n\n"
  1448. total_count = len(report_data["stats"])
  1449. for i, stat in enumerate(report_data["stats"]):
  1450. word = stat["word"]
  1451. count = stat["count"]
  1452. sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
  1453. if count >= 10:
  1454. text_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
  1455. elif count >= 5:
  1456. text_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
  1457. else:
  1458. text_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  1459. for j, title_data in enumerate(stat["titles"], 1):
  1460. formatted_title = ReportGenerator._format_title_feishu(
  1461. title_data, show_source=True
  1462. )
  1463. text_content += f" {j}. {formatted_title}\n"
  1464. if j < len(stat["titles"]):
  1465. text_content += "\n"
  1466. if i < len(report_data["stats"]) - 1:
  1467. text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  1468. if not text_content:
  1469. if mode == "incremental":
  1470. mode_text = "增量模式下暂无新增匹配的热点词汇"
  1471. elif mode == "current":
  1472. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  1473. else:
  1474. mode_text = "暂无匹配的热点词汇"
  1475. text_content = f"📭 {mode_text}\n\n"
  1476. if report_data["new_titles"]:
  1477. if text_content and "暂无匹配" not in text_content:
  1478. text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  1479. text_content += (
  1480. f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  1481. )
  1482. for source_data in report_data["new_titles"]:
  1483. text_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
  1484. for j, title_data in enumerate(source_data["titles"], 1):
  1485. title_data_copy = title_data.copy()
  1486. title_data_copy["is_new"] = False
  1487. formatted_title = ReportGenerator._format_title_feishu(
  1488. title_data_copy, show_source=False
  1489. )
  1490. text_content += f" {j}. {formatted_title}\n"
  1491. text_content += "\n"
  1492. if report_data["failed_ids"]:
  1493. if text_content and "暂无匹配" not in text_content:
  1494. text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  1495. text_content += "⚠️ **数据获取失败的平台:**\n\n"
  1496. for i, id_value in enumerate(report_data["failed_ids"], 1):
  1497. text_content += f" • <font color='red'>{id_value}</font>\n"
  1498. now = TimeHelper.get_beijing_time()
  1499. text_content += f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
  1500. if update_info:
  1501. text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
  1502. return text_content
  1503. @staticmethod
  1504. def _render_dingtalk_content(
  1505. report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily"
  1506. ) -> str:
  1507. """渲染钉钉内容"""
  1508. text_content = ""
  1509. total_titles = sum(
  1510. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  1511. )
  1512. now = TimeHelper.get_beijing_time()
  1513. text_content += f"**总新闻数:** {total_titles}\n\n"
  1514. text_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
  1515. text_content += f"**类型:** 热点分析报告\n\n"
  1516. text_content += "---\n\n"
  1517. if report_data["stats"]:
  1518. text_content += f"📊 **热点词汇统计**\n\n"
  1519. total_count = len(report_data["stats"])
  1520. for i, stat in enumerate(report_data["stats"]):
  1521. word = stat["word"]
  1522. count = stat["count"]
  1523. sequence_display = f"[{i + 1}/{total_count}]"
  1524. if count >= 10:
  1525. text_content += (
  1526. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  1527. )
  1528. elif count >= 5:
  1529. text_content += (
  1530. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  1531. )
  1532. else:
  1533. text_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  1534. for j, title_data in enumerate(stat["titles"], 1):
  1535. formatted_title = ReportGenerator._format_title_dingtalk(
  1536. title_data, show_source=True
  1537. )
  1538. text_content += f" {j}. {formatted_title}\n"
  1539. if j < len(stat["titles"]):
  1540. text_content += "\n"
  1541. if i < len(report_data["stats"]) - 1:
  1542. text_content += f"\n---\n\n"
  1543. if not report_data["stats"]:
  1544. if mode == "incremental":
  1545. mode_text = "增量模式下暂无新增匹配的热点词汇"
  1546. elif mode == "current":
  1547. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  1548. else:
  1549. mode_text = "暂无匹配的热点词汇"
  1550. text_content += f"📭 {mode_text}\n\n"
  1551. if report_data["new_titles"]:
  1552. if text_content and "暂无匹配" not in text_content:
  1553. text_content += f"\n---\n\n"
  1554. text_content += (
  1555. f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  1556. )
  1557. for source_data in report_data["new_titles"]:
  1558. text_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  1559. for j, title_data in enumerate(source_data["titles"], 1):
  1560. title_data_copy = title_data.copy()
  1561. title_data_copy["is_new"] = False
  1562. formatted_title = ReportGenerator._format_title_dingtalk(
  1563. title_data_copy, show_source=False
  1564. )
  1565. text_content += f" {j}. {formatted_title}\n"
  1566. text_content += "\n"
  1567. if report_data["failed_ids"]:
  1568. if text_content and "暂无匹配" not in text_content:
  1569. text_content += f"\n---\n\n"
  1570. text_content += "⚠️ **数据获取失败的平台:**\n\n"
  1571. for i, id_value in enumerate(report_data["failed_ids"], 1):
  1572. text_content += f" • **{id_value}**\n"
  1573. text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  1574. if update_info:
  1575. text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  1576. return text_content
  1577. @staticmethod
  1578. def _split_content_into_batches(
  1579. report_data: Dict,
  1580. format_type: str,
  1581. update_info: Optional[Dict] = None,
  1582. max_bytes: int = CONFIG["MESSAGE_BATCH_SIZE"],
  1583. mode: str = "daily",
  1584. ) -> List[str]:
  1585. """分批处理消息内容,确保词组标题+至少第一条新闻的完整性"""
  1586. batches = []
  1587. total_titles = sum(
  1588. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  1589. )
  1590. now = TimeHelper.get_beijing_time()
  1591. base_header = ""
  1592. if format_type == "wework":
  1593. base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
  1594. elif format_type == "telegram":
  1595. base_header = f"总新闻数: {total_titles}\n\n"
  1596. base_footer = ""
  1597. if format_type == "wework":
  1598. base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  1599. if update_info:
  1600. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  1601. elif format_type == "telegram":
  1602. base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  1603. if update_info:
  1604. base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
  1605. stats_header = ""
  1606. if report_data["stats"]:
  1607. if format_type == "wework":
  1608. stats_header = f"📊 **热点词汇统计**\n\n"
  1609. elif format_type == "telegram":
  1610. stats_header = f"📊 热点词汇统计\n\n"
  1611. current_batch = base_header
  1612. current_batch_has_content = False
  1613. if (
  1614. not report_data["stats"]
  1615. and not report_data["new_titles"]
  1616. and not report_data["failed_ids"]
  1617. ):
  1618. if mode == "incremental":
  1619. mode_text = "增量模式下暂无新增匹配的热点词汇"
  1620. elif mode == "current":
  1621. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  1622. else:
  1623. mode_text = "暂无匹配的热点词汇"
  1624. simple_content = f"📭 {mode_text}\n\n"
  1625. final_content = base_header + simple_content + base_footer
  1626. batches.append(final_content)
  1627. return batches
  1628. # 处理热点词汇统计
  1629. if report_data["stats"]:
  1630. total_count = len(report_data["stats"])
  1631. # 添加统计标题
  1632. test_content = current_batch + stats_header
  1633. if (
  1634. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  1635. < max_bytes
  1636. ):
  1637. current_batch = test_content
  1638. current_batch_has_content = True
  1639. else:
  1640. if current_batch_has_content:
  1641. batches.append(current_batch + base_footer)
  1642. current_batch = base_header + stats_header
  1643. current_batch_has_content = True
  1644. # 逐个处理词组(确保词组标题+第一条新闻的原子性)
  1645. for i, stat in enumerate(report_data["stats"]):
  1646. word = stat["word"]
  1647. count = stat["count"]
  1648. sequence_display = f"[{i + 1}/{total_count}]"
  1649. # 构建词组标题
  1650. word_header = ""
  1651. if format_type == "wework":
  1652. if count >= 10:
  1653. word_header = (
  1654. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  1655. )
  1656. elif count >= 5:
  1657. word_header = (
  1658. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  1659. )
  1660. else:
  1661. word_header = (
  1662. f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  1663. )
  1664. elif format_type == "telegram":
  1665. if count >= 10:
  1666. word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
  1667. elif count >= 5:
  1668. word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
  1669. else:
  1670. word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
  1671. # 构建第一条新闻
  1672. first_news_line = ""
  1673. if stat["titles"]:
  1674. first_title_data = stat["titles"][0]
  1675. if format_type == "wework":
  1676. formatted_title = ReportGenerator._format_title_wework(
  1677. first_title_data, show_source=True
  1678. )
  1679. elif format_type == "telegram":
  1680. formatted_title = ReportGenerator._format_title_telegram(
  1681. first_title_data, show_source=True
  1682. )
  1683. else:
  1684. formatted_title = f"{first_title_data['title']}"
  1685. first_news_line = f" 1. {formatted_title}\n"
  1686. if len(stat["titles"]) > 1:
  1687. first_news_line += "\n"
  1688. # 原子性检查:词组标题+第一条新闻必须一起处理
  1689. word_with_first_news = word_header + first_news_line
  1690. test_content = current_batch + word_with_first_news
  1691. if (
  1692. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  1693. >= max_bytes
  1694. ):
  1695. # 当前批次容纳不下,开启新批次
  1696. if current_batch_has_content:
  1697. batches.append(current_batch + base_footer)
  1698. current_batch = base_header + stats_header + word_with_first_news
  1699. current_batch_has_content = True
  1700. start_index = 1
  1701. else:
  1702. current_batch = test_content
  1703. current_batch_has_content = True
  1704. start_index = 1
  1705. # 处理剩余新闻条目
  1706. for j in range(start_index, len(stat["titles"])):
  1707. title_data = stat["titles"][j]
  1708. if format_type == "wework":
  1709. formatted_title = ReportGenerator._format_title_wework(
  1710. title_data, show_source=True
  1711. )
  1712. elif format_type == "telegram":
  1713. formatted_title = ReportGenerator._format_title_telegram(
  1714. title_data, show_source=True
  1715. )
  1716. else:
  1717. formatted_title = f"{title_data['title']}"
  1718. news_line = f" {j + 1}. {formatted_title}\n"
  1719. if j < len(stat["titles"]) - 1:
  1720. news_line += "\n"
  1721. test_content = current_batch + news_line
  1722. if (
  1723. len(test_content.encode("utf-8"))
  1724. + len(base_footer.encode("utf-8"))
  1725. >= max_bytes
  1726. ):
  1727. if current_batch_has_content:
  1728. batches.append(current_batch + base_footer)
  1729. current_batch = (
  1730. base_header + stats_header + word_header + news_line
  1731. )
  1732. current_batch_has_content = True
  1733. else:
  1734. current_batch = test_content
  1735. current_batch_has_content = True
  1736. # 词组间分隔符
  1737. if i < len(report_data["stats"]) - 1:
  1738. separator = ""
  1739. if format_type == "wework":
  1740. separator = f"\n\n\n\n"
  1741. elif format_type == "telegram":
  1742. separator = f"\n\n"
  1743. test_content = current_batch + separator
  1744. if (
  1745. len(test_content.encode("utf-8"))
  1746. + len(base_footer.encode("utf-8"))
  1747. < max_bytes
  1748. ):
  1749. current_batch = test_content
  1750. # 处理新增新闻(同样确保来源标题+第一条新闻的原子性)
  1751. if report_data["new_titles"]:
  1752. new_header = ""
  1753. if format_type == "wework":
  1754. new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  1755. elif format_type == "telegram":
  1756. new_header = f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
  1757. test_content = current_batch + new_header
  1758. if (
  1759. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  1760. >= max_bytes
  1761. ):
  1762. if current_batch_has_content:
  1763. batches.append(current_batch + base_footer)
  1764. current_batch = base_header + new_header
  1765. current_batch_has_content = True
  1766. else:
  1767. current_batch = test_content
  1768. current_batch_has_content = True
  1769. # 逐个处理新增新闻来源
  1770. for source_data in report_data["new_titles"]:
  1771. source_header = ""
  1772. if format_type == "wework":
  1773. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  1774. elif format_type == "telegram":
  1775. source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
  1776. # 构建第一条新增新闻
  1777. first_news_line = ""
  1778. if source_data["titles"]:
  1779. first_title_data = source_data["titles"][0]
  1780. title_data_copy = first_title_data.copy()
  1781. title_data_copy["is_new"] = False
  1782. if format_type == "wework":
  1783. formatted_title = ReportGenerator._format_title_wework(
  1784. title_data_copy, show_source=False
  1785. )
  1786. elif format_type == "telegram":
  1787. formatted_title = ReportGenerator._format_title_telegram(
  1788. title_data_copy, show_source=False
  1789. )
  1790. else:
  1791. formatted_title = f"{title_data_copy['title']}"
  1792. first_news_line = f" 1. {formatted_title}\n"
  1793. # 原子性检查:来源标题+第一条新闻
  1794. source_with_first_news = source_header + first_news_line
  1795. test_content = current_batch + source_with_first_news
  1796. if (
  1797. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  1798. >= max_bytes
  1799. ):
  1800. if current_batch_has_content:
  1801. batches.append(current_batch + base_footer)
  1802. current_batch = base_header + new_header + source_with_first_news
  1803. current_batch_has_content = True
  1804. start_index = 1
  1805. else:
  1806. current_batch = test_content
  1807. current_batch_has_content = True
  1808. start_index = 1
  1809. # 处理剩余新增新闻
  1810. for j in range(start_index, len(source_data["titles"])):
  1811. title_data = source_data["titles"][j]
  1812. title_data_copy = title_data.copy()
  1813. title_data_copy["is_new"] = False
  1814. if format_type == "wework":
  1815. formatted_title = ReportGenerator._format_title_wework(
  1816. title_data_copy, show_source=False
  1817. )
  1818. elif format_type == "telegram":
  1819. formatted_title = ReportGenerator._format_title_telegram(
  1820. title_data_copy, show_source=False
  1821. )
  1822. else:
  1823. formatted_title = f"{title_data_copy['title']}"
  1824. news_line = f" {j + 1}. {formatted_title}\n"
  1825. test_content = current_batch + news_line
  1826. if (
  1827. len(test_content.encode("utf-8"))
  1828. + len(base_footer.encode("utf-8"))
  1829. >= max_bytes
  1830. ):
  1831. if current_batch_has_content:
  1832. batches.append(current_batch + base_footer)
  1833. current_batch = (
  1834. base_header + new_header + source_header + news_line
  1835. )
  1836. current_batch_has_content = True
  1837. else:
  1838. current_batch = test_content
  1839. current_batch_has_content = True
  1840. current_batch += "\n"
  1841. if report_data["failed_ids"]:
  1842. failed_header = ""
  1843. if format_type == "wework":
  1844. failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
  1845. elif format_type == "telegram":
  1846. failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
  1847. test_content = current_batch + failed_header
  1848. if (
  1849. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  1850. >= max_bytes
  1851. ):
  1852. if current_batch_has_content:
  1853. batches.append(current_batch + base_footer)
  1854. current_batch = base_header + failed_header
  1855. current_batch_has_content = True
  1856. else:
  1857. current_batch = test_content
  1858. current_batch_has_content = True
  1859. for i, id_value in enumerate(report_data["failed_ids"], 1):
  1860. failed_line = f" • {id_value}\n"
  1861. test_content = current_batch + failed_line
  1862. if (
  1863. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  1864. >= max_bytes
  1865. ):
  1866. if current_batch_has_content:
  1867. batches.append(current_batch + base_footer)
  1868. current_batch = base_header + failed_header + failed_line
  1869. current_batch_has_content = True
  1870. else:
  1871. current_batch = test_content
  1872. current_batch_has_content = True
  1873. # 完成最后批次
  1874. if current_batch_has_content:
  1875. batches.append(current_batch + base_footer)
  1876. return batches
  1877. @staticmethod
  1878. def send_to_webhooks(
  1879. stats: List[Dict],
  1880. failed_ids: Optional[List] = None,
  1881. report_type: str = "当日汇总",
  1882. new_titles: Optional[Dict] = None,
  1883. id_to_name: Optional[Dict] = None,
  1884. update_info: Optional[Dict] = None,
  1885. proxy_url: Optional[str] = None,
  1886. mode: str = "daily",
  1887. ) -> Dict[str, bool]:
  1888. """发送数据到多个webhook平台"""
  1889. results = {}
  1890. report_data = ReportGenerator._prepare_report_data(
  1891. stats, failed_ids, new_titles, id_to_name, mode
  1892. )
  1893. feishu_url = CONFIG["FEISHU_WEBHOOK_URL"]
  1894. dingtalk_url = CONFIG["DINGTALK_WEBHOOK_URL"]
  1895. wework_url = CONFIG["WEWORK_WEBHOOK_URL"]
  1896. telegram_token = CONFIG["TELEGRAM_BOT_TOKEN"]
  1897. telegram_chat_id = CONFIG["TELEGRAM_CHAT_ID"]
  1898. update_info_to_send = update_info if CONFIG["SHOW_VERSION_UPDATE"] else None
  1899. # 发送到飞书
  1900. if feishu_url:
  1901. results["feishu"] = ReportGenerator._send_to_feishu(
  1902. feishu_url,
  1903. report_data,
  1904. report_type,
  1905. update_info_to_send,
  1906. proxy_url,
  1907. mode,
  1908. )
  1909. # 发送到钉钉
  1910. if dingtalk_url:
  1911. results["dingtalk"] = ReportGenerator._send_to_dingtalk(
  1912. dingtalk_url,
  1913. report_data,
  1914. report_type,
  1915. update_info_to_send,
  1916. proxy_url,
  1917. mode,
  1918. )
  1919. # 发送到企业微信
  1920. if wework_url:
  1921. results["wework"] = ReportGenerator._send_to_wework(
  1922. wework_url,
  1923. report_data,
  1924. report_type,
  1925. update_info_to_send,
  1926. proxy_url,
  1927. mode,
  1928. )
  1929. # 发送到 Telegram
  1930. if telegram_token and telegram_chat_id:
  1931. results["telegram"] = ReportGenerator._send_to_telegram(
  1932. telegram_token,
  1933. telegram_chat_id,
  1934. report_data,
  1935. report_type,
  1936. update_info_to_send,
  1937. proxy_url,
  1938. mode,
  1939. )
  1940. if not results:
  1941. print("未配置任何webhook URL,跳过通知发送")
  1942. return results
  1943. @staticmethod
  1944. def _send_to_feishu(
  1945. webhook_url: str,
  1946. report_data: Dict,
  1947. report_type: str,
  1948. update_info: Optional[Dict] = None,
  1949. proxy_url: Optional[str] = None,
  1950. mode: str = "daily",
  1951. ) -> bool:
  1952. """发送到飞书"""
  1953. headers = {"Content-Type": "application/json"}
  1954. text_content = ReportGenerator._render_feishu_content(
  1955. report_data, update_info, mode
  1956. )
  1957. total_titles = sum(
  1958. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  1959. )
  1960. now = TimeHelper.get_beijing_time()
  1961. payload = {
  1962. "msg_type": "text",
  1963. "content": {
  1964. "total_titles": total_titles,
  1965. "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
  1966. "report_type": report_type,
  1967. "text": text_content,
  1968. },
  1969. }
  1970. proxies = None
  1971. if proxy_url:
  1972. proxies = {"http": proxy_url, "https": proxy_url}
  1973. try:
  1974. response = requests.post(
  1975. webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
  1976. )
  1977. if response.status_code == 200:
  1978. print(f"飞书通知发送成功 [{report_type}]")
  1979. return True
  1980. else:
  1981. print(
  1982. f"飞书通知发送失败 [{report_type}],状态码:{response.status_code}"
  1983. )
  1984. return False
  1985. except Exception as e:
  1986. print(f"飞书通知发送出错 [{report_type}]:{e}")
  1987. return False
  1988. @staticmethod
  1989. def _send_to_dingtalk(
  1990. webhook_url: str,
  1991. report_data: Dict,
  1992. report_type: str,
  1993. update_info: Optional[Dict] = None,
  1994. proxy_url: Optional[str] = None,
  1995. mode: str = "daily",
  1996. ) -> bool:
  1997. """发送到钉钉"""
  1998. headers = {"Content-Type": "application/json"}
  1999. text_content = ReportGenerator._render_dingtalk_content(
  2000. report_data, update_info, mode
  2001. )
  2002. payload = {
  2003. "msgtype": "markdown",
  2004. "markdown": {
  2005. "title": f"TrendRadar 热点分析报告 - {report_type}",
  2006. "text": text_content,
  2007. },
  2008. }
  2009. proxies = None
  2010. if proxy_url:
  2011. proxies = {"http": proxy_url, "https": proxy_url}
  2012. try:
  2013. response = requests.post(
  2014. webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
  2015. )
  2016. if response.status_code == 200:
  2017. result = response.json()
  2018. if result.get("errcode") == 0:
  2019. print(f"钉钉通知发送成功 [{report_type}]")
  2020. return True
  2021. else:
  2022. print(
  2023. f"钉钉通知发送失败 [{report_type}],错误:{result.get('errmsg')}"
  2024. )
  2025. return False
  2026. else:
  2027. print(
  2028. f"钉钉通知发送失败 [{report_type}],状态码:{response.status_code}"
  2029. )
  2030. return False
  2031. except Exception as e:
  2032. print(f"钉钉通知发送出错 [{report_type}]:{e}")
  2033. return False
  2034. @staticmethod
  2035. def _send_to_wework(
  2036. webhook_url: str,
  2037. report_data: Dict,
  2038. report_type: str,
  2039. update_info: Optional[Dict] = None,
  2040. proxy_url: Optional[str] = None,
  2041. mode: str = "daily",
  2042. ) -> bool:
  2043. """发送到企业微信(支持分批发送)"""
  2044. headers = {"Content-Type": "application/json"}
  2045. proxies = None
  2046. if proxy_url:
  2047. proxies = {"http": proxy_url, "https": proxy_url}
  2048. # 获取分批内容
  2049. batches = ReportGenerator._split_content_into_batches(
  2050. report_data, "wework", update_info, mode=mode
  2051. )
  2052. print(f"企业微信消息分为 {len(batches)} 批次发送 [{report_type}]")
  2053. # 逐批发送
  2054. for i, batch_content in enumerate(batches, 1):
  2055. batch_size = len(batch_content.encode("utf-8"))
  2056. print(
  2057. f"发送企业微信第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  2058. )
  2059. # 添加批次标识
  2060. if len(batches) > 1:
  2061. batch_header = f"**[第 {i}/{len(batches)} 批次]**\n\n"
  2062. batch_content = batch_header + batch_content
  2063. payload = {"msgtype": "markdown", "markdown": {"content": batch_content}}
  2064. try:
  2065. response = requests.post(
  2066. webhook_url,
  2067. headers=headers,
  2068. json=payload,
  2069. proxies=proxies,
  2070. timeout=30,
  2071. )
  2072. if response.status_code == 200:
  2073. result = response.json()
  2074. if result.get("errcode") == 0:
  2075. print(
  2076. f"企业微信第 {i}/{len(batches)} 批次发送成功 [{report_type}]"
  2077. )
  2078. # 批次间间隔
  2079. if i < len(batches):
  2080. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  2081. else:
  2082. print(
  2083. f"企业微信第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
  2084. )
  2085. return False
  2086. else:
  2087. print(
  2088. f"企业微信第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  2089. )
  2090. return False
  2091. except Exception as e:
  2092. print(
  2093. f"企业微信第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}"
  2094. )
  2095. return False
  2096. print(f"企业微信所有 {len(batches)} 批次发送完成 [{report_type}]")
  2097. return True
  2098. @staticmethod
  2099. def _send_to_telegram(
  2100. bot_token: str,
  2101. chat_id: str,
  2102. report_data: Dict,
  2103. report_type: str,
  2104. update_info: Optional[Dict] = None,
  2105. proxy_url: Optional[str] = None,
  2106. mode: str = "daily",
  2107. ) -> bool:
  2108. """发送到Telegram(支持分批发送)"""
  2109. headers = {"Content-Type": "application/json"}
  2110. url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
  2111. proxies = None
  2112. if proxy_url:
  2113. proxies = {"http": proxy_url, "https": proxy_url}
  2114. # 获取分批内容
  2115. batches = ReportGenerator._split_content_into_batches(
  2116. report_data, "telegram", update_info, mode=mode
  2117. )
  2118. print(f"Telegram消息分为 {len(batches)} 批次发送 [{report_type}]")
  2119. # 逐批发送
  2120. for i, batch_content in enumerate(batches, 1):
  2121. batch_size = len(batch_content.encode("utf-8"))
  2122. print(
  2123. f"发送Telegram第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  2124. )
  2125. # 添加批次标识
  2126. if len(batches) > 1:
  2127. batch_header = f"<b>[第 {i}/{len(batches)} 批次]</b>\n\n"
  2128. batch_content = batch_header + batch_content
  2129. payload = {
  2130. "chat_id": chat_id,
  2131. "text": batch_content,
  2132. "parse_mode": "HTML",
  2133. "disable_web_page_preview": True,
  2134. }
  2135. try:
  2136. response = requests.post(
  2137. url, headers=headers, json=payload, proxies=proxies, timeout=30
  2138. )
  2139. if response.status_code == 200:
  2140. result = response.json()
  2141. if result.get("ok"):
  2142. print(
  2143. f"Telegram第 {i}/{len(batches)} 批次发送成功 [{report_type}]"
  2144. )
  2145. # 批次间间隔
  2146. if i < len(batches):
  2147. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  2148. else:
  2149. print(
  2150. f"Telegram第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}"
  2151. )
  2152. return False
  2153. else:
  2154. print(
  2155. f"Telegram第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  2156. )
  2157. return False
  2158. except Exception as e:
  2159. print(
  2160. f"Telegram第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}"
  2161. )
  2162. return False
  2163. print(f"Telegram所有 {len(batches)} 批次发送完成 [{report_type}]")
  2164. return True
  2165. @dataclass
  2166. class ModeStrategy:
  2167. """模式策略配置"""
  2168. mode_name: str
  2169. description: str
  2170. realtime_report_type: str
  2171. summary_report_type: str
  2172. should_send_realtime: bool
  2173. should_generate_summary: bool
  2174. summary_mode: str
  2175. def get_log_messages(self) -> Dict[str, str]:
  2176. """返回该模式的各种日志消息"""
  2177. return {
  2178. "mode_description": self.description,
  2179. "realtime_skip": f"跳过实时推送通知:{self.mode_name}下未检测到匹配的新闻",
  2180. "summary_skip": f"跳过{self.summary_report_type}通知:未匹配到有效的新闻内容",
  2181. }
  2182. class NewsAnalyzer:
  2183. """新闻分析器"""
  2184. MODE_STRATEGIES = {
  2185. "incremental": ModeStrategy(
  2186. mode_name="增量模式",
  2187. description="增量模式(只关注新增新闻,无新增时不推送)",
  2188. realtime_report_type="实时增量",
  2189. summary_report_type="当日汇总",
  2190. should_send_realtime=True,
  2191. should_generate_summary=True,
  2192. summary_mode="daily",
  2193. ),
  2194. "current": ModeStrategy(
  2195. mode_name="当前榜单模式",
  2196. description="当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
  2197. realtime_report_type="实时当前榜单",
  2198. summary_report_type="当前榜单汇总",
  2199. should_send_realtime=True,
  2200. should_generate_summary=True,
  2201. summary_mode="current",
  2202. ),
  2203. "daily": ModeStrategy(
  2204. mode_name="当日汇总模式",
  2205. description="当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
  2206. realtime_report_type="",
  2207. summary_report_type="当日汇总",
  2208. should_send_realtime=False,
  2209. should_generate_summary=True,
  2210. summary_mode="daily",
  2211. ),
  2212. }
  2213. def __init__(
  2214. self,
  2215. request_interval: int = CONFIG["REQUEST_INTERVAL"],
  2216. report_mode: str = CONFIG["REPORT_MODE"],
  2217. rank_threshold: int = CONFIG["RANK_THRESHOLD"],
  2218. ):
  2219. self.request_interval = request_interval
  2220. self.report_mode = report_mode
  2221. self.rank_threshold = rank_threshold
  2222. self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
  2223. self.is_docker_container = self._detect_docker_environment()
  2224. self.update_info = None
  2225. self.proxy_url = None
  2226. self._setup_proxy()
  2227. self.data_fetcher = DataFetcher(self.proxy_url)
  2228. if self.is_github_actions:
  2229. self._check_version_update()
  2230. def _detect_docker_environment(self) -> bool:
  2231. """检测是否运行在 Docker 容器中"""
  2232. try:
  2233. if os.environ.get("DOCKER_CONTAINER") == "true":
  2234. return True
  2235. if os.path.exists("/.dockerenv"):
  2236. return True
  2237. return False
  2238. except Exception:
  2239. return False
  2240. def _should_open_browser(self) -> bool:
  2241. """判断是否应该打开浏览器"""
  2242. return not self.is_github_actions and not self.is_docker_container
  2243. def _setup_proxy(self) -> None:
  2244. """设置代理配置"""
  2245. if not self.is_github_actions and CONFIG["USE_PROXY"]:
  2246. self.proxy_url = CONFIG["DEFAULT_PROXY"]
  2247. print("本地环境,使用代理")
  2248. elif not self.is_github_actions and not CONFIG["USE_PROXY"]:
  2249. print("本地环境,未启用代理")
  2250. else:
  2251. print("GitHub Actions环境,不使用代理")
  2252. def _check_version_update(self) -> None:
  2253. """检查版本更新"""
  2254. try:
  2255. need_update, remote_version = VersionChecker.check_for_updates(
  2256. CONFIG["VERSION"], CONFIG["VERSION_CHECK_URL"], self.proxy_url
  2257. )
  2258. if need_update and remote_version:
  2259. self.update_info = {
  2260. "current_version": CONFIG["VERSION"],
  2261. "remote_version": remote_version,
  2262. }
  2263. print(f"发现新版本: {remote_version} (当前: {CONFIG['VERSION']})")
  2264. else:
  2265. print("版本检查完成,当前为最新版本")
  2266. except Exception as e:
  2267. print(f"版本检查出错: {e}")
  2268. def _get_mode_strategy(self) -> ModeStrategy:
  2269. """获取当前模式的策略配置"""
  2270. return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])
  2271. def _has_webhook_configured(self) -> bool:
  2272. """检查是否配置了webhook"""
  2273. return any(
  2274. [
  2275. CONFIG["FEISHU_WEBHOOK_URL"],
  2276. CONFIG["DINGTALK_WEBHOOK_URL"],
  2277. CONFIG["WEWORK_WEBHOOK_URL"],
  2278. (CONFIG["TELEGRAM_BOT_TOKEN"] and CONFIG["TELEGRAM_CHAT_ID"]),
  2279. ]
  2280. )
  2281. def _has_valid_content(
  2282. self, stats: List[Dict], new_titles: Optional[Dict] = None
  2283. ) -> bool:
  2284. """检查是否有有效的新闻内容"""
  2285. if self.report_mode in ["incremental", "current"]:
  2286. # 增量模式和current模式下,只要stats有内容就说明有匹配的新闻
  2287. return any(stat["count"] > 0 for stat in stats)
  2288. else:
  2289. # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻
  2290. has_matched_news = any(stat["count"] > 0 for stat in stats)
  2291. has_new_news = bool(
  2292. new_titles and any(len(titles) > 0 for titles in new_titles.values())
  2293. )
  2294. return has_matched_news or has_new_news
  2295. def _load_analysis_data(
  2296. self,
  2297. ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
  2298. """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据"""
  2299. try:
  2300. # 获取当前配置的监控平台ID列表
  2301. current_platform_ids = []
  2302. for platform in PLATFORMS:
  2303. current_platform_ids.append(platform["id"])
  2304. print(f"当前监控平台: {current_platform_ids}")
  2305. all_results, id_to_name, title_info = DataProcessor.read_all_today_titles(
  2306. current_platform_ids
  2307. )
  2308. if not all_results:
  2309. print("没有找到当天的数据")
  2310. return None
  2311. total_titles = sum(len(titles) for titles in all_results.values())
  2312. print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
  2313. new_titles = DataProcessor.detect_latest_new_titles(current_platform_ids)
  2314. word_groups, filter_words = DataProcessor.load_frequency_words()
  2315. return (
  2316. all_results,
  2317. id_to_name,
  2318. title_info,
  2319. new_titles,
  2320. word_groups,
  2321. filter_words,
  2322. )
  2323. except Exception as e:
  2324. print(f"数据加载失败: {e}")
  2325. return None
  2326. def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
  2327. """从当前抓取结果构建标题信息"""
  2328. title_info = {}
  2329. for source_id, titles_data in results.items():
  2330. title_info[source_id] = {}
  2331. for title, title_data in titles_data.items():
  2332. ranks = title_data.get("ranks", [])
  2333. url = title_data.get("url", "")
  2334. mobile_url = title_data.get("mobileUrl", "")
  2335. title_info[source_id][title] = {
  2336. "first_time": time_info,
  2337. "last_time": time_info,
  2338. "count": 1,
  2339. "ranks": ranks,
  2340. "url": url,
  2341. "mobileUrl": mobile_url,
  2342. }
  2343. return title_info
  2344. def _run_analysis_pipeline(
  2345. self,
  2346. data_source: Dict,
  2347. mode: str,
  2348. title_info: Dict,
  2349. new_titles: Dict,
  2350. word_groups: List[Dict],
  2351. filter_words: List[str],
  2352. id_to_name: Dict,
  2353. failed_ids: Optional[List] = None,
  2354. is_daily_summary: bool = False,
  2355. ) -> Tuple[List[Dict], str]:
  2356. """统一的分析流水线:数据处理 → 统计计算 → HTML生成"""
  2357. # 统计计算
  2358. stats, total_titles = StatisticsCalculator.count_word_frequency(
  2359. data_source,
  2360. word_groups,
  2361. filter_words,
  2362. id_to_name,
  2363. title_info,
  2364. self.rank_threshold,
  2365. new_titles,
  2366. mode=mode,
  2367. )
  2368. # HTML生成
  2369. html_file = ReportGenerator.generate_html_report(
  2370. stats,
  2371. total_titles,
  2372. failed_ids=failed_ids,
  2373. new_titles=new_titles,
  2374. id_to_name=id_to_name,
  2375. mode=mode,
  2376. is_daily_summary=is_daily_summary,
  2377. )
  2378. return stats, html_file
  2379. def _send_notification_if_needed(
  2380. self,
  2381. stats: List[Dict],
  2382. report_type: str,
  2383. mode: str,
  2384. failed_ids: Optional[List] = None,
  2385. new_titles: Optional[Dict] = None,
  2386. id_to_name: Optional[Dict] = None,
  2387. ) -> bool:
  2388. """统一的通知发送逻辑,包含所有判断条件"""
  2389. has_webhook = self._has_webhook_configured()
  2390. if (
  2391. CONFIG["ENABLE_NOTIFICATION"]
  2392. and has_webhook
  2393. and self._has_valid_content(stats, new_titles)
  2394. ):
  2395. ReportGenerator.send_to_webhooks(
  2396. stats,
  2397. failed_ids or [],
  2398. report_type,
  2399. new_titles,
  2400. id_to_name,
  2401. self.update_info,
  2402. self.proxy_url,
  2403. mode=mode,
  2404. )
  2405. return True
  2406. elif CONFIG["ENABLE_NOTIFICATION"] and not has_webhook:
  2407. print("⚠️ 警告:通知功能已启用但未配置webhook URL,将跳过通知发送")
  2408. elif not CONFIG["ENABLE_NOTIFICATION"]:
  2409. print(f"跳过{report_type}通知:通知功能已禁用")
  2410. elif (
  2411. CONFIG["ENABLE_NOTIFICATION"]
  2412. and has_webhook
  2413. and not self._has_valid_content(stats, new_titles)
  2414. ):
  2415. mode_strategy = self._get_mode_strategy()
  2416. log_messages = mode_strategy.get_log_messages()
  2417. if "实时" in report_type:
  2418. print(log_messages["realtime_skip"])
  2419. else:
  2420. print(log_messages["summary_skip"])
  2421. return False
  2422. def _generate_summary_report(self, mode_strategy: ModeStrategy) -> Optional[str]:
  2423. """生成汇总报告(带通知)"""
  2424. summary_type = (
  2425. "当前榜单汇总" if mode_strategy.summary_mode == "current" else "当日汇总"
  2426. )
  2427. print(f"生成{summary_type}报告...")
  2428. # 加载分析数据
  2429. analysis_data = self._load_analysis_data()
  2430. if not analysis_data:
  2431. return None
  2432. all_results, id_to_name, title_info, new_titles, word_groups, filter_words = (
  2433. analysis_data
  2434. )
  2435. # 运行分析流水线
  2436. stats, html_file = self._run_analysis_pipeline(
  2437. all_results,
  2438. mode_strategy.summary_mode,
  2439. title_info,
  2440. new_titles,
  2441. word_groups,
  2442. filter_words,
  2443. id_to_name,
  2444. is_daily_summary=True,
  2445. )
  2446. print(f"{summary_type}报告已生成: {html_file}")
  2447. # 发送通知
  2448. self._send_notification_if_needed(
  2449. stats,
  2450. mode_strategy.summary_report_type,
  2451. mode_strategy.summary_mode,
  2452. new_titles=new_titles,
  2453. id_to_name=id_to_name,
  2454. )
  2455. return html_file
  2456. def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
  2457. """生成汇总HTML"""
  2458. summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
  2459. print(f"生成{summary_type}HTML...")
  2460. # 加载分析数据
  2461. analysis_data = self._load_analysis_data()
  2462. if not analysis_data:
  2463. return None
  2464. all_results, id_to_name, title_info, new_titles, word_groups, filter_words = (
  2465. analysis_data
  2466. )
  2467. # 运行分析流水线
  2468. _, html_file = self._run_analysis_pipeline(
  2469. all_results,
  2470. mode,
  2471. title_info,
  2472. new_titles,
  2473. word_groups,
  2474. filter_words,
  2475. id_to_name,
  2476. is_daily_summary=True,
  2477. )
  2478. print(f"{summary_type}HTML已生成: {html_file}")
  2479. return html_file
  2480. def _initialize_and_check_config(self) -> None:
  2481. """通用初始化和配置检查"""
  2482. now = TimeHelper.get_beijing_time()
  2483. print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
  2484. if not CONFIG["ENABLE_CRAWLER"]:
  2485. print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
  2486. return
  2487. has_webhook = self._has_webhook_configured()
  2488. if not CONFIG["ENABLE_NOTIFICATION"]:
  2489. print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取")
  2490. elif not has_webhook:
  2491. print("未配置任何webhook URL,将只进行数据抓取,不发送通知")
  2492. else:
  2493. print("通知功能已启用,将发送webhook通知")
  2494. mode_strategy = self._get_mode_strategy()
  2495. print(f"报告模式: {self.report_mode}")
  2496. print(f"运行模式: {mode_strategy.description}")
  2497. def _crawl_data(self) -> Tuple[Dict, Dict, List]:
  2498. """执行数据爬取"""
  2499. ids = []
  2500. for platform in PLATFORMS:
  2501. if "name" in platform:
  2502. ids.append((platform["id"], platform["name"]))
  2503. else:
  2504. ids.append(platform["id"])
  2505. print(f"配置的监控平台: {[p.get('name', p['id']) for p in PLATFORMS]}")
  2506. print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
  2507. FileHelper.ensure_directory_exists("output")
  2508. results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
  2509. ids, self.request_interval
  2510. )
  2511. title_file = DataProcessor.save_titles_to_file(results, id_to_name, failed_ids)
  2512. print(f"标题已保存到: {title_file}")
  2513. return results, id_to_name, failed_ids
  2514. def _execute_mode_strategy(
  2515. self,
  2516. mode_strategy: ModeStrategy,
  2517. results: Dict,
  2518. id_to_name: Dict,
  2519. failed_ids: List,
  2520. ) -> Optional[str]:
  2521. """执行模式特定逻辑"""
  2522. # 获取当前监控平台ID列表
  2523. current_platform_ids = [platform["id"] for platform in PLATFORMS]
  2524. new_titles = DataProcessor.detect_latest_new_titles(current_platform_ids)
  2525. time_info = Path(
  2526. DataProcessor.save_titles_to_file(results, id_to_name, failed_ids)
  2527. ).stem
  2528. word_groups, filter_words = DataProcessor.load_frequency_words()
  2529. # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性
  2530. if self.report_mode == "current":
  2531. # 加载完整的历史数据(已按当前平台过滤)
  2532. analysis_data = self._load_analysis_data()
  2533. if analysis_data:
  2534. (
  2535. all_results,
  2536. historical_id_to_name,
  2537. historical_title_info,
  2538. historical_new_titles,
  2539. _,
  2540. _,
  2541. ) = analysis_data
  2542. print(
  2543. f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}"
  2544. )
  2545. stats, html_file = self._run_analysis_pipeline(
  2546. all_results,
  2547. self.report_mode,
  2548. historical_title_info,
  2549. historical_new_titles,
  2550. word_groups,
  2551. filter_words,
  2552. historical_id_to_name,
  2553. failed_ids=failed_ids,
  2554. )
  2555. combined_id_to_name = {**historical_id_to_name, **id_to_name}
  2556. print(f"HTML报告已生成: {html_file}")
  2557. # 发送实时通知(使用完整历史数据的统计结果)
  2558. summary_html = None
  2559. if mode_strategy.should_send_realtime:
  2560. self._send_notification_if_needed(
  2561. stats,
  2562. mode_strategy.realtime_report_type,
  2563. self.report_mode,
  2564. failed_ids=failed_ids,
  2565. new_titles=historical_new_titles,
  2566. id_to_name=combined_id_to_name,
  2567. )
  2568. else:
  2569. print("❌ 严重错误:无法读取刚保存的数据文件")
  2570. raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
  2571. else:
  2572. title_info = self._prepare_current_title_info(results, time_info)
  2573. stats, html_file = self._run_analysis_pipeline(
  2574. results,
  2575. self.report_mode,
  2576. title_info,
  2577. new_titles,
  2578. word_groups,
  2579. filter_words,
  2580. id_to_name,
  2581. failed_ids=failed_ids,
  2582. )
  2583. print(f"HTML报告已生成: {html_file}")
  2584. # 发送实时通知(如果需要)
  2585. summary_html = None
  2586. if mode_strategy.should_send_realtime:
  2587. self._send_notification_if_needed(
  2588. stats,
  2589. mode_strategy.realtime_report_type,
  2590. self.report_mode,
  2591. failed_ids=failed_ids,
  2592. new_titles=new_titles,
  2593. id_to_name=id_to_name,
  2594. )
  2595. # 生成汇总报告(如果需要)
  2596. summary_html = None
  2597. if mode_strategy.should_generate_summary:
  2598. if mode_strategy.should_send_realtime:
  2599. # 如果已经发送了实时通知,汇总只生成HTML不发送通知
  2600. summary_html = self._generate_summary_html(mode_strategy.summary_mode)
  2601. else:
  2602. # daily模式:直接生成汇总报告并发送通知
  2603. summary_html = self._generate_summary_report(mode_strategy)
  2604. # 打开浏览器(仅在非容器环境)
  2605. if self._should_open_browser() and html_file:
  2606. if summary_html:
  2607. summary_url = "file://" + str(Path(summary_html).resolve())
  2608. print(f"正在打开汇总报告: {summary_url}")
  2609. webbrowser.open(summary_url)
  2610. else:
  2611. file_url = "file://" + str(Path(html_file).resolve())
  2612. print(f"正在打开HTML报告: {file_url}")
  2613. webbrowser.open(file_url)
  2614. elif self.is_docker_container and html_file:
  2615. if summary_html:
  2616. print(f"汇总报告已生成(Docker环境): {summary_html}")
  2617. else:
  2618. print(f"HTML报告已生成(Docker环境): {html_file}")
  2619. return summary_html
  2620. def run(self) -> None:
  2621. """执行分析流程"""
  2622. try:
  2623. self._initialize_and_check_config()
  2624. mode_strategy = self._get_mode_strategy()
  2625. results, id_to_name, failed_ids = self._crawl_data()
  2626. self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
  2627. except Exception as e:
  2628. print(f"分析流程执行出错: {e}")
  2629. raise
  2630. def main():
  2631. try:
  2632. analyzer = NewsAnalyzer(
  2633. request_interval=CONFIG["REQUEST_INTERVAL"],
  2634. report_mode=CONFIG["REPORT_MODE"],
  2635. rank_threshold=CONFIG["RANK_THRESHOLD"],
  2636. )
  2637. analyzer.run()
  2638. except FileNotFoundError as e:
  2639. print(f"❌ 配置文件错误: {e}")
  2640. print("\n请确保以下文件存在:")
  2641. print(" • config/config.yaml")
  2642. print(" • config/frequency_words.txt")
  2643. print("\n参考项目文档进行正确配置")
  2644. except Exception as e:
  2645. print(f"❌ 程序运行错误: {e}")
  2646. raise
  2647. if __name__ == "__main__":
  2648. main()