main.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148
  1. # coding=utf-8
  2. import json
  3. import time
  4. import random
  5. from datetime import datetime
  6. import webbrowser
  7. from typing import Dict, List, Tuple, Optional, Union
  8. from pathlib import Path
  9. import os
  10. import requests
  11. import pytz
  12. # 配置常量
  13. CONFIG = {
  14. "FEISHU_SEPARATOR": "━━━━━━━━━━━━━━━━━━━", # 飞书消息分割线,注意,其它类型的分割线可能会被飞书过滤而不显示
  15. "REQUEST_INTERVAL": 1000, # 请求间隔(毫秒)
  16. "FEISHU_REPORT_TYPE": "daily", # 飞书报告类型: "current"|"daily"|"both"
  17. "RANK_THRESHOLD": 5, # 排名高亮阈值
  18. "USE_PROXY": True, # 是否启用代理
  19. "DEFAULT_PROXY": "http://127.0.0.1:10086",
  20. "CONTINUE_WITHOUT_FEISHU": True, # 控制在没有飞书 webhook URL 时是否继续执行爬虫, 如果 True ,会依然进行爬虫行为,并在 github 上持续的生成爬取的新闻数据
  21. "FEISHU_WEBHOOK_URL": "", # 飞书机器人的 webhook URL,大概长这样:https://www.feishu.cn/flow/api/trigger-webhook/xxxx, 默认为空,推荐通过GitHub Secrets设置
  22. }
  23. class TimeHelper:
  24. """时间处理工具"""
  25. @staticmethod
  26. def get_beijing_time() -> datetime:
  27. """获取北京时间"""
  28. return datetime.now(pytz.timezone("Asia/Shanghai"))
  29. @staticmethod
  30. def format_date_folder() -> str:
  31. """返回日期文件夹格式"""
  32. return TimeHelper.get_beijing_time().strftime("%Y年%m月%d日")
  33. @staticmethod
  34. def format_time_filename() -> str:
  35. """返回时间文件名格式"""
  36. return TimeHelper.get_beijing_time().strftime("%H时%M分")
  37. class FileHelper:
  38. """文件操作工具"""
  39. @staticmethod
  40. def ensure_directory_exists(directory: str) -> None:
  41. """确保目录存在"""
  42. Path(directory).mkdir(parents=True, exist_ok=True)
  43. @staticmethod
  44. def get_output_path(subfolder: str, filename: str) -> str:
  45. """获取输出文件路径"""
  46. date_folder = TimeHelper.format_date_folder()
  47. output_dir = Path("output") / date_folder / subfolder
  48. FileHelper.ensure_directory_exists(str(output_dir))
  49. return str(output_dir / filename)
  50. class DataFetcher:
  51. """数据获取器"""
  52. def __init__(self, proxy_url: Optional[str] = None):
  53. self.proxy_url = proxy_url
  54. def fetch_data(
  55. self,
  56. id_info: Union[str, Tuple[str, str]],
  57. max_retries: int = 2,
  58. min_retry_wait: int = 3,
  59. max_retry_wait: int = 5,
  60. ) -> Tuple[Optional[str], str, str]:
  61. """获取指定ID数据,支持重试"""
  62. # 解析ID和别名
  63. if isinstance(id_info, tuple):
  64. id_value, alias = id_info
  65. else:
  66. id_value = id_info
  67. alias = id_value
  68. url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
  69. # 设置代理
  70. proxies = None
  71. if self.proxy_url:
  72. proxies = {"http": self.proxy_url, "https": self.proxy_url}
  73. headers = {
  74. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  75. "Accept": "application/json, text/plain, */*",
  76. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  77. "Connection": "keep-alive",
  78. "Cache-Control": "no-cache",
  79. }
  80. retries = 0
  81. while retries <= max_retries:
  82. try:
  83. print(f"正在请求 {id_value} 数据... (尝试 {retries + 1}/{max_retries + 1})")
  84. response = requests.get(url, proxies=proxies, headers=headers, timeout=10)
  85. response.raise_for_status()
  86. data_text = response.text
  87. data_json = json.loads(data_text)
  88. # 检查响应状态,接受success和cache
  89. status = data_json.get("status", "未知")
  90. if status not in ["success", "cache"]:
  91. raise ValueError(f"响应状态异常: {status}")
  92. status_info = "最新数据" if status == "success" else "缓存数据"
  93. print(f"成功获取 {id_value} 数据({status_info})")
  94. return data_text, id_value, alias
  95. except Exception as e:
  96. retries += 1
  97. if retries <= max_retries:
  98. # 计算重试等待时间:基础时间+递增时间
  99. base_wait = random.uniform(min_retry_wait, max_retry_wait)
  100. additional_wait = (retries - 1) * random.uniform(1, 2)
  101. wait_time = base_wait + additional_wait
  102. print(f"请求 {id_value} 失败: {e}. 将在 {wait_time:.2f} 秒后重试...")
  103. time.sleep(wait_time)
  104. else:
  105. print(f"请求 {id_value} 失败: {e}. 已达到最大重试次数。")
  106. return None, id_value, alias
  107. return None, id_value, alias
  108. def crawl_websites(
  109. self,
  110. ids_list: List[Union[str, Tuple[str, str]]],
  111. request_interval: int = CONFIG["REQUEST_INTERVAL"],
  112. ) -> Tuple[Dict, Dict, List]:
  113. """爬取多个网站数据"""
  114. results = {}
  115. id_to_alias = {}
  116. failed_ids = []
  117. for i, id_info in enumerate(ids_list):
  118. # 解析ID和别名
  119. if isinstance(id_info, tuple):
  120. id_value, alias = id_info
  121. else:
  122. id_value = id_info
  123. alias = id_value
  124. id_to_alias[id_value] = alias
  125. # 获取数据
  126. response, _, _ = self.fetch_data(id_info)
  127. if response:
  128. try:
  129. data = json.loads(response)
  130. results[id_value] = {}
  131. for index, item in enumerate(data.get("items", []), 1):
  132. title = item["title"]
  133. url = item.get("url", "")
  134. mobile_url = item.get("mobileUrl", "")
  135. if title in results[id_value]:
  136. # 标题已存在,更新排名
  137. results[id_value][title]["ranks"].append(index)
  138. else:
  139. # 新标题
  140. results[id_value][title] = {
  141. "ranks": [index],
  142. "url": url,
  143. "mobileUrl": mobile_url
  144. }
  145. except json.JSONDecodeError:
  146. print(f"解析 {id_value} 响应失败,非有效JSON")
  147. failed_ids.append(id_value)
  148. except Exception as e:
  149. print(f"处理 {id_value} 数据出错: {e}")
  150. failed_ids.append(id_value)
  151. else:
  152. failed_ids.append(id_value)
  153. # 添加请求间隔
  154. if i < len(ids_list) - 1:
  155. actual_interval = request_interval + random.randint(-10, 20)
  156. actual_interval = max(50, actual_interval) # 最少50毫秒
  157. print(f"等待 {actual_interval} 毫秒后发送下一个请求...")
  158. time.sleep(actual_interval / 1000)
  159. print(f"\n请求总结:")
  160. print(f"- 成功获取数据: {list(results.keys())}")
  161. print(f"- 请求失败: {failed_ids}")
  162. return results, id_to_alias, failed_ids
  163. class DataProcessor:
  164. """数据处理器"""
  165. @staticmethod
  166. def save_titles_to_file(results: Dict, id_to_alias: Dict, failed_ids: List) -> str:
  167. """保存标题到文件"""
  168. file_path = FileHelper.get_output_path("txt", f"{TimeHelper.format_time_filename()}.txt")
  169. with open(file_path, "w", encoding="utf-8") as f:
  170. # 写入成功数据
  171. for id_value, title_data in results.items():
  172. display_name = id_to_alias.get(id_value, id_value)
  173. f.write(f"{display_name}\n")
  174. for i, (title, info) in enumerate(title_data.items(), 1):
  175. if isinstance(info, dict):
  176. ranks = info.get("ranks", [])
  177. url = info.get("url", "")
  178. mobile_url = info.get("mobileUrl", "")
  179. rank_str = ",".join(map(str, ranks))
  180. line = f"{i}. {title} (排名:{rank_str})"
  181. if url:
  182. line += f" [URL:{url}]"
  183. if mobile_url:
  184. line += f" [MOBILE:{mobile_url}]"
  185. f.write(line + "\n")
  186. else:
  187. # 兼容旧格式
  188. rank_str = ",".join(map(str, info))
  189. f.write(f"{i}. {title} (排名:{rank_str})\n")
  190. f.write("\n")
  191. # 写入失败信息
  192. if failed_ids:
  193. f.write("==== 以下ID请求失败 ====\n")
  194. for id_value in failed_ids:
  195. display_name = id_to_alias.get(id_value, id_value)
  196. f.write(f"{display_name} (ID: {id_value})\n")
  197. return file_path
  198. @staticmethod
  199. def load_frequency_words(frequency_file: str = "frequency_words.txt") -> Tuple[List[Dict], List[str]]:
  200. """加载频率词配置"""
  201. frequency_path = Path(frequency_file)
  202. if not frequency_path.exists():
  203. print(f"频率词文件 {frequency_file} 不存在")
  204. return [], []
  205. with open(frequency_path, "r", encoding="utf-8") as f:
  206. content = f.read()
  207. # 按双空行分割词组
  208. word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
  209. processed_groups = []
  210. filter_words = []
  211. for group in word_groups:
  212. words = [word.strip() for word in group.split("\n") if word.strip()]
  213. # 分类词汇
  214. group_required_words = [] # +开头必须词
  215. group_normal_words = [] # 普通频率词
  216. group_filter_words = [] # !开头过滤词
  217. for word in words:
  218. if word.startswith("!"):
  219. filter_words.append(word[1:])
  220. group_filter_words.append(word[1:])
  221. elif word.startswith("+"):
  222. group_required_words.append(word[1:])
  223. else:
  224. group_normal_words.append(word)
  225. # 只处理包含有效词的组
  226. if group_required_words or group_normal_words:
  227. # 生成组标识
  228. if group_normal_words:
  229. group_key = " ".join(group_normal_words)
  230. else:
  231. group_key = " ".join(group_required_words)
  232. processed_groups.append({
  233. 'required': group_required_words,
  234. 'normal': group_normal_words,
  235. 'group_key': group_key
  236. })
  237. return processed_groups, filter_words
  238. @staticmethod
  239. def read_all_today_titles() -> Tuple[Dict, Dict, Dict]:
  240. """读取当天所有标题文件"""
  241. date_folder = TimeHelper.format_date_folder()
  242. txt_dir = Path("output") / date_folder / "txt"
  243. if not txt_dir.exists():
  244. print(f"今日文件夹 {txt_dir} 不存在")
  245. return {}, {}, {}
  246. all_results = {}
  247. id_to_alias = {}
  248. title_info = {}
  249. # 按时间排序处理文件
  250. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  251. for file_path in files:
  252. time_info = file_path.stem
  253. with open(file_path, "r", encoding="utf-8") as f:
  254. content = f.read()
  255. sections = content.split("\n\n")
  256. for section in sections:
  257. if not section.strip() or "==== 以下ID请求失败 ====" in section:
  258. continue
  259. lines = section.strip().split("\n")
  260. if len(lines) < 2:
  261. continue
  262. source_name = lines[0].strip()
  263. # 解析标题数据
  264. title_data = {}
  265. for line in lines[1:]:
  266. if line.strip():
  267. try:
  268. match_num = None
  269. title_part = line.strip()
  270. # 提取序号
  271. if ". " in title_part and title_part.split(". ")[0].isdigit():
  272. parts = title_part.split(". ", 1)
  273. match_num = int(parts[0])
  274. title_part = parts[1]
  275. # 提取mobileUrl
  276. mobile_url = ""
  277. if " [MOBILE:" in title_part:
  278. title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1)
  279. if mobile_part.endswith("]"):
  280. mobile_url = mobile_part[:-1]
  281. # 提取url
  282. url = ""
  283. if " [URL:" in title_part:
  284. title_part, url_part = title_part.rsplit(" [URL:", 1)
  285. if url_part.endswith("]"):
  286. url = url_part[:-1]
  287. # 提取排名
  288. ranks = []
  289. if " (排名:" in title_part:
  290. title, rank_str = title_part.rsplit(" (排名:", 1)
  291. rank_str = rank_str.rstrip(")")
  292. ranks = [int(r) for r in rank_str.split(",") if r.strip() and r.isdigit()]
  293. else:
  294. title = title_part
  295. if not ranks and match_num is not None:
  296. ranks = [match_num]
  297. if not ranks:
  298. ranks = [99]
  299. title_data[title] = {
  300. "ranks": ranks,
  301. "url": url,
  302. "mobileUrl": mobile_url
  303. }
  304. except Exception as e:
  305. print(f"解析标题行出错: {line}, 错误: {e}")
  306. DataProcessor._process_source_data(
  307. source_name, title_data, time_info,
  308. all_results, title_info, id_to_alias
  309. )
  310. # 转换为ID结果
  311. id_results = {}
  312. id_title_info = {}
  313. for name, titles in all_results.items():
  314. for id_value, alias in id_to_alias.items():
  315. if alias == name:
  316. id_results[id_value] = titles
  317. id_title_info[id_value] = title_info[name]
  318. break
  319. return id_results, id_to_alias, id_title_info
  320. @staticmethod
  321. def _process_source_data(
  322. source_name: str, title_data: Dict, time_info: str,
  323. all_results: Dict, title_info: Dict, id_to_alias: Dict,
  324. ) -> None:
  325. """处理来源数据,合并重复标题"""
  326. if source_name not in all_results:
  327. # 首次遇到此来源
  328. all_results[source_name] = title_data
  329. if source_name not in title_info:
  330. title_info[source_name] = {}
  331. # 记录标题信息
  332. for title, data in title_data.items():
  333. if isinstance(data, dict):
  334. ranks = data.get("ranks", [])
  335. url = data.get("url", "")
  336. mobile_url = data.get("mobileUrl", "")
  337. else:
  338. ranks = data if isinstance(data, list) else []
  339. url = ""
  340. mobile_url = ""
  341. title_info[source_name][title] = {
  342. "first_time": time_info,
  343. "last_time": time_info,
  344. "count": 1,
  345. "ranks": ranks,
  346. "url": url,
  347. "mobileUrl": mobile_url,
  348. }
  349. # 生成反向ID映射
  350. reversed_id = source_name.lower().replace(" ", "-")
  351. id_to_alias[reversed_id] = source_name
  352. else:
  353. # 更新已有来源
  354. for title, data in title_data.items():
  355. if isinstance(data, dict):
  356. ranks = data.get("ranks", [])
  357. url = data.get("url", "")
  358. mobile_url = data.get("mobileUrl", "")
  359. else:
  360. ranks = data if isinstance(data, list) else []
  361. url = ""
  362. mobile_url = ""
  363. if title not in all_results[source_name]:
  364. # 新标题
  365. all_results[source_name][title] = {
  366. "ranks": ranks,
  367. "url": url,
  368. "mobileUrl": mobile_url
  369. }
  370. title_info[source_name][title] = {
  371. "first_time": time_info,
  372. "last_time": time_info,
  373. "count": 1,
  374. "ranks": ranks,
  375. "url": url,
  376. "mobileUrl": mobile_url,
  377. }
  378. else:
  379. # 更新已有标题
  380. existing_data = all_results[source_name][title]
  381. existing_ranks = existing_data.get("ranks", [])
  382. existing_url = existing_data.get("url", "")
  383. existing_mobile_url = existing_data.get("mobileUrl", "")
  384. merged_ranks = existing_ranks.copy()
  385. for rank in ranks:
  386. if rank not in merged_ranks:
  387. merged_ranks.append(rank)
  388. all_results[source_name][title] = {
  389. "ranks": merged_ranks,
  390. "url": existing_url or url,
  391. "mobileUrl": existing_mobile_url or mobile_url
  392. }
  393. title_info[source_name][title]["last_time"] = time_info
  394. title_info[source_name][title]["ranks"] = merged_ranks
  395. title_info[source_name][title]["count"] += 1
  396. # 保留第一个有效URL
  397. if not title_info[source_name][title].get("url"):
  398. title_info[source_name][title]["url"] = url
  399. if not title_info[source_name][title].get("mobileUrl"):
  400. title_info[source_name][title]["mobileUrl"] = mobile_url
  401. class StatisticsCalculator:
  402. """统计计算器"""
  403. @staticmethod
  404. def count_word_frequency(
  405. results: Dict,
  406. word_groups: List[Dict],
  407. filter_words: List[str],
  408. id_to_alias: Dict,
  409. title_info: Optional[Dict] = None,
  410. rank_threshold: int = CONFIG["RANK_THRESHOLD"],
  411. ) -> Tuple[List[Dict], int]:
  412. """统计词频,支持必须词、频率词、过滤词"""
  413. word_stats = {}
  414. total_titles = 0
  415. processed_titles = {} # 跟踪已处理标题
  416. if title_info is None:
  417. title_info = {}
  418. # 初始化统计对象
  419. for group in word_groups:
  420. group_key = group['group_key']
  421. word_stats[group_key] = {"count": 0, "titles": {}}
  422. # 遍历标题进行统计
  423. for source_id, titles_data in results.items():
  424. total_titles += len(titles_data)
  425. if source_id not in processed_titles:
  426. processed_titles[source_id] = {}
  427. for title, title_data in titles_data.items():
  428. if title in processed_titles.get(source_id, {}):
  429. continue
  430. title_lower = title.lower()
  431. # 优先级1:过滤词检查
  432. contains_filter_word = any(
  433. filter_word.lower() in title_lower for filter_word in filter_words
  434. )
  435. if contains_filter_word:
  436. continue
  437. # 兼容数据格式
  438. if isinstance(title_data, dict):
  439. source_ranks = title_data.get("ranks", [])
  440. source_url = title_data.get("url", "")
  441. source_mobile_url = title_data.get("mobileUrl", "")
  442. else:
  443. source_ranks = title_data if isinstance(title_data, list) else []
  444. source_url = ""
  445. source_mobile_url = ""
  446. # 检查每个词组
  447. for group in word_groups:
  448. group_key = group['group_key']
  449. required_words = group['required']
  450. normal_words = group['normal']
  451. # 优先级2:必须词检查
  452. if required_words:
  453. all_required_present = all(
  454. req_word.lower() in title_lower for req_word in required_words
  455. )
  456. if not all_required_present:
  457. continue
  458. # 优先级3:频率词检查
  459. if normal_words:
  460. any_normal_present = any(
  461. normal_word.lower() in title_lower for normal_word in normal_words
  462. )
  463. if not any_normal_present:
  464. continue
  465. # 如果只有必须词没有频率词,且所有必须词都匹配了,那么也算匹配
  466. # 如果既有必须词又有频率词,那么必须词全部匹配且至少一个频率词匹配
  467. # 如果只有频率词,那么至少一个频率词匹配
  468. # 匹配成功,记录数据
  469. word_stats[group_key]["count"] += 1
  470. if source_id not in word_stats[group_key]["titles"]:
  471. word_stats[group_key]["titles"][source_id] = []
  472. # 获取标题详细信息
  473. first_time = ""
  474. last_time = ""
  475. count_info = 1
  476. ranks = source_ranks if source_ranks else []
  477. url = source_url
  478. mobile_url = source_mobile_url
  479. if (title_info and source_id in title_info and title in title_info[source_id]):
  480. info = title_info[source_id][title]
  481. first_time = info.get("first_time", "")
  482. last_time = info.get("last_time", "")
  483. count_info = info.get("count", 1)
  484. if "ranks" in info and info["ranks"]:
  485. ranks = info["ranks"]
  486. url = info.get("url", source_url)
  487. mobile_url = info.get("mobileUrl", source_mobile_url)
  488. if not ranks:
  489. ranks = [99]
  490. time_display = StatisticsCalculator._format_time_display(first_time, last_time)
  491. source_alias = id_to_alias.get(source_id, source_id)
  492. word_stats[group_key]["titles"][source_id].append({
  493. "title": title,
  494. "source_alias": source_alias,
  495. "first_time": first_time,
  496. "last_time": last_time,
  497. "time_display": time_display,
  498. "count": count_info,
  499. "ranks": ranks,
  500. "rank_threshold": rank_threshold,
  501. "url": url,
  502. "mobileUrl": mobile_url,
  503. })
  504. # 标记已处理
  505. if source_id not in processed_titles:
  506. processed_titles[source_id] = {}
  507. processed_titles[source_id][title] = True
  508. break # 只匹配第一个词组
  509. # 转换统计结果
  510. stats = []
  511. for group_key, data in word_stats.items():
  512. all_titles = []
  513. for source_id, title_list in data["titles"].items():
  514. all_titles.extend(title_list)
  515. stats.append({
  516. "word": group_key,
  517. "count": data["count"],
  518. "titles": all_titles,
  519. "percentage": (
  520. round(data["count"] / total_titles * 100, 2)
  521. if total_titles > 0 else 0
  522. ),
  523. })
  524. stats.sort(key=lambda x: x["count"], reverse=True)
  525. return stats, total_titles
  526. @staticmethod
  527. def _format_rank_for_html(ranks: List[int], rank_threshold: int = 5) -> str:
  528. """格式化HTML排名显示"""
  529. if not ranks:
  530. return ""
  531. unique_ranks = sorted(set(ranks))
  532. min_rank = unique_ranks[0]
  533. max_rank = unique_ranks[-1]
  534. if min_rank <= rank_threshold:
  535. if min_rank == max_rank:
  536. return f"<font color='red'><strong>[{min_rank}]</strong></font>"
  537. else:
  538. return f"<font color='red'><strong>[{min_rank} - {max_rank}]</strong></font>"
  539. else:
  540. if min_rank == max_rank:
  541. return f"[{min_rank}]"
  542. else:
  543. return f"[{min_rank} - {max_rank}]"
  544. @staticmethod
  545. def _format_rank_for_feishu(ranks: List[int], rank_threshold: int = 5) -> str:
  546. """格式化飞书排名显示"""
  547. if not ranks:
  548. return ""
  549. unique_ranks = sorted(set(ranks))
  550. min_rank = unique_ranks[0]
  551. max_rank = unique_ranks[-1]
  552. if min_rank <= rank_threshold:
  553. if min_rank == max_rank:
  554. return f"<font color='red'>**[{min_rank}]**</font>"
  555. else:
  556. return f"<font color='red'>**[{min_rank} - {max_rank}]**</font>"
  557. else:
  558. if min_rank == max_rank:
  559. return f"[{min_rank}]"
  560. else:
  561. return f"[{min_rank} - {max_rank}]"
  562. @staticmethod
  563. def _format_time_display(first_time: str, last_time: str) -> str:
  564. """格式化时间显示"""
  565. if not first_time:
  566. return ""
  567. if first_time == last_time or not last_time:
  568. return first_time
  569. else:
  570. return f"[{first_time} ~ {last_time}]"
  571. class ReportGenerator:
  572. """报告生成器"""
  573. @staticmethod
  574. def generate_html_report(
  575. stats: List[Dict],
  576. total_titles: int,
  577. failed_ids: Optional[List] = None,
  578. is_daily: bool = False,
  579. ) -> str:
  580. """生成HTML报告"""
  581. if is_daily:
  582. filename = "当日统计.html"
  583. else:
  584. filename = f"{TimeHelper.format_time_filename()}.html"
  585. file_path = FileHelper.get_output_path("html", filename)
  586. html_content = ReportGenerator._create_html_content(
  587. stats, total_titles, failed_ids, is_daily
  588. )
  589. with open(file_path, "w", encoding="utf-8") as f:
  590. f.write(html_content)
  591. # 当日统计同时生成根目录index.html
  592. if is_daily:
  593. root_file_path = Path("index.html")
  594. with open(root_file_path, "w", encoding="utf-8") as f:
  595. f.write(html_content)
  596. print(f"当日统计报告已保存到根目录: {root_file_path.resolve()}")
  597. return file_path
  598. @staticmethod
  599. def _create_html_content(
  600. stats: List[Dict],
  601. total_titles: int,
  602. failed_ids: Optional[List] = None,
  603. is_daily: bool = False,
  604. ) -> str:
  605. """创建HTML内容"""
  606. html = """
  607. <!DOCTYPE html>
  608. <html>
  609. <head>
  610. <meta charset="UTF-8">
  611. <title>频率词统计报告</title>
  612. <style>
  613. body { font-family: Arial, sans-serif; margin: 20px; }
  614. h1, h2 { color: #333; }
  615. table { border-collapse: collapse; width: 100%; margin-top: 20px; }
  616. th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
  617. th { background-color: #f2f2f2; }
  618. tr:nth-child(even) { background-color: #f9f9f9; }
  619. .word { font-weight: bold; }
  620. .count { text-align: center; }
  621. .percentage { text-align: center; }
  622. .titles { max-width: 500px; }
  623. .source { color: #666; font-style: italic; }
  624. .error { color: #d9534f; }
  625. .news-link {
  626. color: #007bff;
  627. text-decoration: none;
  628. border-bottom: 1px dotted #007bff;
  629. }
  630. .news-link:hover {
  631. color: #0056b3;
  632. text-decoration: underline;
  633. }
  634. .news-link:visited {
  635. color: #6f42c1;
  636. }
  637. .no-link {
  638. color: #333;
  639. }
  640. </style>
  641. </head>
  642. <body>
  643. <h1>频率词统计报告</h1>
  644. """
  645. if is_daily:
  646. html += "<p>报告类型: 当日汇总</p>"
  647. now = TimeHelper.get_beijing_time()
  648. html += f"<p>总标题数: {total_titles}</p>"
  649. html += f"<p>生成时间: {now.strftime('%Y-%m-%d %H:%M:%S')}</p>"
  650. # 失败信息
  651. if failed_ids and len(failed_ids) > 0:
  652. html += """
  653. <div class="error">
  654. <h2>请求失败的平台</h2>
  655. <ul>
  656. """
  657. for id_value in failed_ids:
  658. html += f"<li>{ReportGenerator._html_escape(id_value)}</li>"
  659. html += """
  660. </ul>
  661. </div>
  662. """
  663. html += """
  664. <table>
  665. <tr>
  666. <th>排名</th>
  667. <th>频率词</th>
  668. <th>出现次数</th>
  669. <th>占比</th>
  670. <th>相关标题</th>
  671. </tr>
  672. """
  673. # 表格内容
  674. for i, stat in enumerate(stats, 1):
  675. formatted_titles = []
  676. for title_data in stat["titles"]:
  677. title = title_data["title"]
  678. source_alias = title_data["source_alias"]
  679. time_display = title_data["time_display"]
  680. count_info = title_data["count"]
  681. ranks = title_data["ranks"]
  682. rank_threshold = title_data["rank_threshold"]
  683. url = title_data.get("url", "")
  684. mobile_url = title_data.get("mobileUrl", "")
  685. rank_display = StatisticsCalculator._format_rank_for_html(ranks, rank_threshold)
  686. link_url = mobile_url or url
  687. escaped_title = ReportGenerator._html_escape(title)
  688. escaped_source_alias = ReportGenerator._html_escape(source_alias)
  689. if link_url:
  690. escaped_url = ReportGenerator._html_escape(link_url)
  691. formatted_title = f"[{escaped_source_alias}] <a href=\"{escaped_url}\" target=\"_blank\" class=\"news-link\">{escaped_title}</a>"
  692. else:
  693. formatted_title = f"[{escaped_source_alias}] <span class=\"no-link\">{escaped_title}</span>"
  694. if rank_display:
  695. formatted_title += f" {rank_display}"
  696. if time_display:
  697. escaped_time_display = ReportGenerator._html_escape(time_display)
  698. formatted_title += f" <font color='grey'>- {escaped_time_display}</font>"
  699. if count_info > 1:
  700. formatted_title += f" <font color='green'>({count_info}次)</font>"
  701. formatted_titles.append(formatted_title)
  702. escaped_word = ReportGenerator._html_escape(stat['word'])
  703. html += f"""
  704. <tr>
  705. <td>{i}</td>
  706. <td class="word">{escaped_word}</td>
  707. <td class="count">{stat['count']}</td>
  708. <td class="percentage">{stat['percentage']}%</td>
  709. <td class="titles">{"<br>".join(formatted_titles)}</td>
  710. </tr>
  711. """
  712. html += """
  713. </table>
  714. </body>
  715. </html>
  716. """
  717. return html
  718. @staticmethod
  719. def _html_escape(text: str) -> str:
  720. """HTML转义"""
  721. if not isinstance(text, str):
  722. text = str(text)
  723. return (text.replace("&", "&amp;")
  724. .replace("<", "&lt;")
  725. .replace(">", "&gt;")
  726. .replace('"', "&quot;")
  727. .replace("'", "&#x27;"))
  728. @staticmethod
  729. def send_to_feishu(
  730. stats: List[Dict],
  731. failed_ids: Optional[List] = None,
  732. report_type: str = "单次爬取",
  733. ) -> bool:
  734. """发送数据到飞书"""
  735. webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", CONFIG["FEISHU_WEBHOOK_URL"])
  736. if not webhook_url:
  737. print(f"警告: FEISHU_WEBHOOK_URL未设置,跳过飞书通知")
  738. return False
  739. headers = {"Content-Type": "application/json"}
  740. total_titles = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
  741. text_content = ReportGenerator._build_feishu_content(stats, failed_ids)
  742. now = TimeHelper.get_beijing_time()
  743. payload = {
  744. "msg_type": "text",
  745. "content": {
  746. "total_titles": total_titles,
  747. "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
  748. "report_type": report_type,
  749. "text": text_content,
  750. },
  751. }
  752. try:
  753. response = requests.post(webhook_url, headers=headers, json=payload)
  754. if response.status_code == 200:
  755. print(f"数据发送到飞书成功 [{report_type}]")
  756. return True
  757. else:
  758. print(f"发送到飞书失败 [{report_type}],状态码:{response.status_code},响应:{response.text}")
  759. return False
  760. except Exception as e:
  761. print(f"发送到飞书时出错 [{report_type}]:{e}")
  762. return False
  763. @staticmethod
  764. def _build_feishu_content(stats: List[Dict], failed_ids: Optional[List] = None) -> str:
  765. """构建飞书消息内容"""
  766. text_content = ""
  767. filtered_stats = [stat for stat in stats if stat["count"] > 0]
  768. if filtered_stats:
  769. text_content += "📊 **热点词汇统计**\n\n"
  770. total_count = len(filtered_stats)
  771. for i, stat in enumerate(filtered_stats):
  772. word = stat["word"]
  773. count = stat["count"]
  774. sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
  775. # 频次颜色分级
  776. if count >= 10:
  777. text_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
  778. elif count >= 5:
  779. text_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
  780. else:
  781. text_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  782. # 标题列表
  783. for j, title_data in enumerate(stat["titles"], 1):
  784. title = title_data["title"]
  785. source_alias = title_data["source_alias"]
  786. time_display = title_data["time_display"]
  787. count_info = title_data["count"]
  788. ranks = title_data["ranks"]
  789. rank_threshold = title_data["rank_threshold"]
  790. url = title_data.get("url", "")
  791. mobile_url = title_data.get("mobileUrl", "")
  792. rank_display = StatisticsCalculator._format_rank_for_feishu(ranks, rank_threshold)
  793. link_url = mobile_url or url
  794. if link_url:
  795. formatted_title = f"[{title}]({link_url})"
  796. else:
  797. formatted_title = title
  798. text_content += f" {j}. <font color='grey'>[{source_alias}]</font> {formatted_title}"
  799. if rank_display:
  800. text_content += f" {rank_display}"
  801. if time_display:
  802. text_content += f" <font color='grey'>- {time_display}</font>"
  803. if count_info > 1:
  804. text_content += f" <font color='green'>({count_info}次)</font>"
  805. text_content += "\n"
  806. if j < len(stat["titles"]):
  807. text_content += "\n"
  808. # 分割线
  809. if i < len(filtered_stats) - 1:
  810. text_content += f"\n{CONFIG['FEISHU_SEPARATOR']}\n\n"
  811. if not text_content:
  812. text_content = "📭 暂无匹配的热点词汇\n\n"
  813. # 失败平台信息
  814. if failed_ids and len(failed_ids) > 0:
  815. if text_content and "暂无匹配" not in text_content:
  816. text_content += f"\n{CONFIG['FEISHU_SEPARATOR']}\n\n"
  817. text_content += "⚠️ **数据获取失败的平台:**\n\n"
  818. for i, id_value in enumerate(failed_ids, 1):
  819. text_content += f" • <font color='red'>{id_value}</font>\n"
  820. now = TimeHelper.get_beijing_time()
  821. text_content += f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
  822. return text_content
  823. class NewsAnalyzer:
  824. """新闻分析器"""
  825. def __init__(
  826. self,
  827. request_interval: int = CONFIG["REQUEST_INTERVAL"],
  828. feishu_report_type: str = CONFIG["FEISHU_REPORT_TYPE"],
  829. rank_threshold: int = CONFIG["RANK_THRESHOLD"],
  830. ):
  831. """初始化分析器"""
  832. self.request_interval = request_interval
  833. self.feishu_report_type = feishu_report_type
  834. self.rank_threshold = rank_threshold
  835. self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
  836. # 设置代理
  837. self.proxy_url = None
  838. if not self.is_github_actions and CONFIG["USE_PROXY"]:
  839. self.proxy_url = CONFIG["DEFAULT_PROXY"]
  840. print("本地环境,使用代理")
  841. elif not self.is_github_actions and not CONFIG["USE_PROXY"]:
  842. print("本地环境,未启用代理")
  843. else:
  844. print("GitHub Actions环境,不使用代理")
  845. self.data_fetcher = DataFetcher(self.proxy_url)
  846. def generate_daily_summary(self) -> Optional[str]:
  847. """生成当日统计报告"""
  848. print("开始生成当日统计报告...")
  849. all_results, id_to_alias, title_info = DataProcessor.read_all_today_titles()
  850. if not all_results:
  851. print("没有找到当天的数据")
  852. return None
  853. total_titles = sum(len(titles) for titles in all_results.values())
  854. print(f"读取到 {total_titles} 个标题")
  855. word_groups, filter_words = DataProcessor.load_frequency_words()
  856. stats, total_titles = StatisticsCalculator.count_word_frequency(
  857. all_results, word_groups, filter_words,
  858. id_to_alias, title_info, self.rank_threshold,
  859. )
  860. html_file = ReportGenerator.generate_html_report(
  861. stats, total_titles, is_daily=True
  862. )
  863. print(f"当日HTML统计报告已生成: {html_file}")
  864. if self.feishu_report_type in ["daily", "both"]:
  865. ReportGenerator.send_to_feishu(stats, [], "当日汇总")
  866. return html_file
  867. def run(self) -> None:
  868. """执行分析流程"""
  869. now = TimeHelper.get_beijing_time()
  870. print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
  871. webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", CONFIG["FEISHU_WEBHOOK_URL"])
  872. if not webhook_url and not CONFIG["CONTINUE_WITHOUT_FEISHU"]:
  873. print("错误: FEISHU_WEBHOOK_URL未设置且CONTINUE_WITHOUT_FEISHU为False,程序退出")
  874. return
  875. if not webhook_url:
  876. print("警告: FEISHU_WEBHOOK_URL未设置,将继续执行爬虫但不发送飞书通知")
  877. print(f"飞书报告类型: {self.feishu_report_type}")
  878. print(f"排名阈值: {self.rank_threshold}")
  879. # 爬取目标列表
  880. ids = [
  881. ("toutiao", "今日头条"),
  882. ("baidu", "百度热搜"),
  883. ("wallstreetcn-hot", "华尔街见闻"),
  884. ("thepaper", "澎湃新闻"),
  885. ("bilibili-hot-search", "bilibili 热搜"),
  886. ("cls-hot", "财联社热门"),
  887. ("ifeng", "凤凰网"),
  888. "tieba",
  889. "weibo",
  890. "douyin",
  891. "zhihu",
  892. ]
  893. print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
  894. FileHelper.ensure_directory_exists("output")
  895. # 爬取数据
  896. results, id_to_alias, failed_ids = self.data_fetcher.crawl_websites(ids, self.request_interval)
  897. # 保存文件
  898. title_file = DataProcessor.save_titles_to_file(results, id_to_alias, failed_ids)
  899. print(f"标题已保存到: {title_file}")
  900. time_info = Path(title_file).stem
  901. # 创建标题信息
  902. title_info = {}
  903. for source_id, titles_data in results.items():
  904. title_info[source_id] = {}
  905. for title, title_data in titles_data.items():
  906. if isinstance(title_data, dict):
  907. ranks = title_data.get("ranks", [])
  908. url = title_data.get("url", "")
  909. mobile_url = title_data.get("mobileUrl", "")
  910. else:
  911. ranks = title_data if isinstance(title_data, list) else []
  912. url = ""
  913. mobile_url = ""
  914. title_info[source_id][title] = {
  915. "first_time": time_info,
  916. "last_time": time_info,
  917. "count": 1,
  918. "ranks": ranks,
  919. "url": url,
  920. "mobileUrl": mobile_url,
  921. }
  922. word_groups, filter_words = DataProcessor.load_frequency_words()
  923. stats, total_titles = StatisticsCalculator.count_word_frequency(
  924. results, word_groups, filter_words,
  925. id_to_alias, title_info, self.rank_threshold,
  926. )
  927. # 发送报告
  928. if self.feishu_report_type in ["current", "both"]:
  929. ReportGenerator.send_to_feishu(stats, failed_ids, "单次爬取")
  930. html_file = ReportGenerator.generate_html_report(stats, total_titles, failed_ids)
  931. print(f"HTML报告已生成: {html_file}")
  932. daily_html = self.generate_daily_summary()
  933. # 本地环境自动打开HTML
  934. if not self.is_github_actions and html_file:
  935. file_url = "file://" + str(Path(html_file).resolve())
  936. print(f"正在打开HTML报告: {file_url}")
  937. webbrowser.open(file_url)
  938. if daily_html:
  939. daily_url = "file://" + str(Path(daily_html).resolve())
  940. print(f"正在打开当日统计报告: {daily_url}")
  941. webbrowser.open(daily_url)
  942. def main():
  943. """程序入口"""
  944. analyzer = NewsAnalyzer(
  945. request_interval=CONFIG["REQUEST_INTERVAL"],
  946. feishu_report_type=CONFIG["FEISHU_REPORT_TYPE"],
  947. rank_threshold=CONFIG["RANK_THRESHOLD"],
  948. )
  949. analyzer.run()
  950. if __name__ == "__main__":
  951. main()