Ver código fonte

feat:feishu推送有跳转链接

sansan 11 meses atrás
pai
commit
a19ad13be4
1 arquivo alterado com 212 adições e 52 exclusões
  1. 212 52
      main.py

+ 212 - 52
main.py

@@ -189,14 +189,24 @@ class DataFetcher:
             if response:
                 try:
                     data = json.loads(response)
-                    # 获取标题列表,同时记录排名
+                    # 获取标题列表,同时记录排名、url和mobileUrl
                     results[id_value] = {}
                     for index, item in enumerate(data.get("items", []), 1):
                         title = item["title"]
+                        # 获取url和mobileUrl,提供容错处理
+                        url = item.get("url", "")
+                        mobile_url = item.get("mobileUrl", "")
+                    
                         if title in results[id_value]:
-                            results[id_value][title].append(index)
+                            # 如果标题已存在,更新排名列表,保持第一个URL
+                            results[id_value][title]["ranks"].append(index)
                         else:
-                            results[id_value][title] = [index]
+                            # 新标题,存储完整信息
+                            results[id_value][title] = {
+                                "ranks": [index],
+                                "url": url,
+                                "mobileUrl": mobile_url
+                            }
                 except json.JSONDecodeError:
                     print(f"解析 {id_value} 的响应失败,不是有效的JSON")
                     failed_ids.append(id_value)
@@ -226,7 +236,7 @@ class DataProcessor:
 
     @staticmethod
     def save_titles_to_file(results: Dict, id_to_alias: Dict, failed_ids: List) -> str:
-        """将标题保存到文件,包括失败的请求信息"""
+        """将标题保存到文件,包括失败的请求信息、url和mobileUrl"""
         file_path = FileHelper.get_output_path(
             "txt", f"{TimeHelper.format_time_filename()}.txt"
         )
@@ -236,9 +246,24 @@ class DataProcessor:
             for id_value, title_data in results.items():
                 display_name = id_to_alias.get(id_value, id_value)
                 f.write(f"{display_name}\n")
-                for i, (title, ranks) in enumerate(title_data.items(), 1):
-                    rank_str = ",".join(map(str, ranks))
-                    f.write(f"{i}. {title} (排名:{rank_str})\n")
+                for i, (title, info) in enumerate(title_data.items(), 1):
+                    # 处理新格式数据(包含ranks、url和mobileUrl)
+                    if isinstance(info, dict):
+                        ranks = info.get("ranks", [])
+                        url = info.get("url", "")
+                        mobile_url = info.get("mobileUrl", "")
+                        rank_str = ",".join(map(str, ranks))
+                        # 格式:序号. 标题 (排名:1,2,3) [URL:url] [MOBILE:mobile_url]
+                        line = f"{i}. {title} (排名:{rank_str})"
+                        if url:
+                            line += f" [URL:{url}]"
+                        if mobile_url:
+                            line += f" [MOBILE:{mobile_url}]"
+                        f.write(line + "\n")
+                    else:
+                        # 兼容旧格式数据(只有ranks列表)
+                        rank_str = ",".join(map(str, info))
+                        f.write(f"{i}. {title} (排名:{rank_str})\n")
                 f.write("\n")
 
             # 如果有失败的请求,写入失败信息
@@ -300,6 +325,7 @@ class DataProcessor:
     def read_all_today_titles() -> Tuple[Dict, Dict, Dict]:
         """
         读取当天所有txt文件的标题,并按来源合并,去除重复,记录时间和出现次数
+        兼容新格式(包含url和mobileUrl)和旧格式数据
 
         Returns:
             (all_results, id_to_alias, title_info)元组
@@ -311,11 +337,9 @@ class DataProcessor:
             print(f"今日文件夹 {txt_dir} 不存在")
             return {}, {}, {}
 
-        all_results = {}  # 所有源的所有标题 {source_id: {title: [ranks]}}
+        all_results = {}  # 所有源的所有标题 {source_id: {title: {"ranks": [排名列表], "url": "链接", "mobileUrl": "移动链接"}}}
         id_to_alias = {}  # ID到别名的映射
-        title_info = (
-            {}
-        )  # 标题信息 {source_id: {title: {"first_time": 首次时间, "last_time": 最后时间, "count": 出现次数, "ranks": [排名列表]}}}
+        title_info = {}  # 标题信息
 
         # 读取所有txt文件,按时间排序确保早的时间优先处理
         files = sorted([f for f in os.listdir(txt_dir) if f.endswith(".txt")])
@@ -341,8 +365,8 @@ class DataProcessor:
                     # 第一行是来源名
                     source_name = lines[0].strip()
 
-                    # 提取标题和排名
-                    title_ranks = {}
+                    # 提取标题和排名,兼容新旧格式
+                    title_data = {}
                     for line in lines[1:]:
                         if line.strip():
                             try:
@@ -351,14 +375,25 @@ class DataProcessor:
                                 title_part = line.strip()
 
                                 # 处理格式 "数字. 标题"
-                                if (
-                                    ". " in title_part
-                                    and title_part.split(". ")[0].isdigit()
-                                ):
+                                if ". " in title_part and title_part.split(". ")[0].isdigit():
                                     parts = title_part.split(". ", 1)
                                     match_num = int(parts[0])  # 序号可能是排名
                                     title_part = parts[1]
 
+                                # 提取mobileUrl信息 "[MOBILE:mobile_url]"
+                                mobile_url = ""
+                                if " [MOBILE:" in title_part:
+                                    title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1)
+                                    if mobile_part.endswith("]"):
+                                        mobile_url = mobile_part[:-1]
+
+                                # 提取url信息 "[URL:url]"
+                                url = ""
+                                if " [URL:" in title_part:
+                                    title_part, url_part = title_part.rsplit(" [URL:", 1)
+                                    if url_part.endswith("]"):
+                                        url = url_part[:-1]
+
                                 # 提取排名信息 "标题 (排名:1,2,3)"
                                 ranks = []
                                 if " (排名:" in title_part:
@@ -380,7 +415,11 @@ class DataProcessor:
                                 if not ranks:
                                     ranks = [99]  # 默认排名
 
-                                title_ranks[title] = ranks
+                                title_data[title] = {
+                                    "ranks": ranks,
+                                    "url": url,
+                                    "mobileUrl": mobile_url
+                                }
 
                             except Exception as e:
                                 print(f"解析标题行出错: {line}, 错误: {e}")
@@ -388,14 +427,14 @@ class DataProcessor:
                     # 处理来源数据
                     DataProcessor._process_source_data(
                         source_name,
-                        title_ranks,
+                        title_data,
                         time_info,
                         all_results,
                         title_info,
                         id_to_alias,
                     )
 
-        # 将结果从 {source_name: {title: [ranks]}} 转换为 {source_id: {title: [ranks]}}
+        # 将结果从 {source_name: {title: data}} 转换为 {source_id: {title: data}}
         id_results = {}
         id_title_info = {}
         for name, titles in all_results.items():
@@ -410,28 +449,41 @@ class DataProcessor:
     @staticmethod
     def _process_source_data(
         source_name: str,
-        title_ranks: Dict,
+        title_data: Dict,
         time_info: str,
         all_results: Dict,
         title_info: Dict,
         id_to_alias: Dict,
     ) -> None:
-        """处理来源数据,更新结果和标题信息"""
+        """处理来源数据,更新结果和标题信息,兼容新旧数据格式"""
         if source_name not in all_results:
             # 首次遇到此来源
-            all_results[source_name] = title_ranks
+            all_results[source_name] = title_data
 
             # 初始化标题信息
             if source_name not in title_info:
                 title_info[source_name] = {}
 
-            # 记录每个标题的时间、次数和排名
-            for title, ranks in title_ranks.items():
+            # 记录每个标题的时间、次数、排名、url和mobileUrl
+            for title, data in title_data.items():
+                # 兼容新旧格式
+                if isinstance(data, dict):
+                    ranks = data.get("ranks", [])
+                    url = data.get("url", "")
+                    mobile_url = data.get("mobileUrl", "")
+                else:
+                    # 旧格式兼容
+                    ranks = data if isinstance(data, list) else []
+                    url = ""
+                    mobile_url = ""
+
                 title_info[source_name][title] = {
                     "first_time": time_info,  # 记录首次时间
                     "last_time": time_info,  # 最后时间初始同首次时间
                     "count": 1,
                     "ranks": ranks,
+                    "url": url,
+                    "mobileUrl": mobile_url,
                 }
 
             # 尝试反向生成ID
@@ -439,28 +491,59 @@ class DataProcessor:
             id_to_alias[reversed_id] = source_name
         else:
             # 已有此来源,更新标题
-            for title, ranks in title_ranks.items():
+            for title, data in title_data.items():
+                # 兼容新旧格式
+                if isinstance(data, dict):
+                    ranks = data.get("ranks", [])
+                    url = data.get("url", "")
+                    mobile_url = data.get("mobileUrl", "")
+                else:
+                    # 旧格式兼容
+                    ranks = data if isinstance(data, list) else []
+                    url = ""
+                    mobile_url = ""
+
                 if title not in all_results[source_name]:
-                    all_results[source_name][title] = ranks
+                    all_results[source_name][title] = {
+                        "ranks": ranks,
+                        "url": url,
+                        "mobileUrl": mobile_url
+                    }
                     title_info[source_name][title] = {
                         "first_time": time_info,  # 新标题的首次和最后时间都设为当前
                         "last_time": time_info,
                         "count": 1,
                         "ranks": ranks,
+                        "url": url,
+                        "mobileUrl": mobile_url,
                     }
                 else:
                     # 已存在的标题,更新最后时间,合并排名信息并增加计数
-                    existing_ranks = title_info[source_name][title]["ranks"]
+                    existing_data = all_results[source_name][title]
+                    existing_ranks = existing_data.get("ranks", [])
+                    existing_url = existing_data.get("url", "")
+                    existing_mobile_url = existing_data.get("mobileUrl", "")
+                
                     merged_ranks = existing_ranks.copy()
                     for rank in ranks:
                         if rank not in merged_ranks:
                             merged_ranks.append(rank)
 
-                    title_info[source_name][title][
-                        "last_time"
-                    ] = time_info  # 更新最后时间
+                    # 更新数据,保持第一个有效的URL
+                    all_results[source_name][title] = {
+                        "ranks": merged_ranks,
+                        "url": existing_url or url,
+                        "mobileUrl": existing_mobile_url or mobile_url
+                    }
+
+                    title_info[source_name][title]["last_time"] = time_info  # 更新最后时间
                     title_info[source_name][title]["ranks"] = merged_ranks
                     title_info[source_name][title]["count"] += 1
+                    # 保持第一个有效的URL
+                    if not title_info[source_name][title].get("url"):
+                        title_info[source_name][title]["url"] = url
+                    if not title_info[source_name][title].get("mobileUrl"):
+                        title_info[source_name][title]["mobileUrl"] = mobile_url
 
 
 class StatisticsCalculator:
@@ -477,6 +560,7 @@ class StatisticsCalculator:
     ) -> Tuple[List[Dict], int]:
         """
         统计词频,处理关联词和大小写不敏感,每个标题只计入首个匹配词组,并应用过滤词
+        支持新格式数据(包含url和mobileUrl)
 
         Returns:
             (stats, total_titles)元组
@@ -502,7 +586,7 @@ class StatisticsCalculator:
             if source_id not in processed_titles:
                 processed_titles[source_id] = {}
 
-            for title, source_ranks in titles_data.items():
+            for title, title_data in titles_data.items():
                 # 跳过已处理的标题
                 if title in processed_titles.get(source_id, {}):
                     continue
@@ -518,6 +602,17 @@ class StatisticsCalculator:
                 if contains_filter_word:
                     continue
 
+                # 兼容新旧数据格式
+                if isinstance(title_data, dict):
+                    source_ranks = title_data.get("ranks", [])
+                    source_url = title_data.get("url", "")
+                    source_mobile_url = title_data.get("mobileUrl", "")
+                else:
+                    # 旧格式兼容
+                    source_ranks = title_data if isinstance(title_data, list) else []
+                    source_url = ""
+                    source_mobile_url = ""
+
                 # 按顺序检查每个词组
                 for group in word_groups:
                     group_key = " ".join(group)
@@ -536,6 +631,8 @@ class StatisticsCalculator:
                         last_time = ""
                         count_info = 1
                         ranks = source_ranks if source_ranks else []
+                        url = source_url
+                        mobile_url = source_mobile_url
 
                         if (
                             title_info
@@ -548,6 +645,8 @@ class StatisticsCalculator:
                             count_info = info.get("count", 1)
                             if "ranks" in info and info["ranks"]:
                                 ranks = info["ranks"]
+                            url = info.get("url", source_url)
+                            mobile_url = info.get("mobileUrl", source_mobile_url)
 
                         # 确保排名是有效的
                         if not ranks:
@@ -570,6 +669,8 @@ class StatisticsCalculator:
                                 "count": count_info,
                                 "ranks": ranks,
                                 "rank_threshold": rank_threshold,
+                                "url": url,  # 新增url字段
+                                "mobileUrl": mobile_url,  # 新增mobileUrl字段
                             }
                         )
 
@@ -719,7 +820,7 @@ class ReportGenerator:
         failed_ids: Optional[List] = None,
         is_daily: bool = False,
     ) -> str:
-        """创建HTML内容"""
+        """创建HTML内容,支持可点击的新闻链接"""
         # HTML头部
         html = """
         <!DOCTYPE html>
@@ -740,6 +841,21 @@ class ReportGenerator:
                 .titles { max-width: 500px; }
                 .source { color: #666; font-style: italic; }
                 .error { color: #d9534f; }
+                .news-link { 
+                    color: #007bff; 
+                    text-decoration: none; 
+                    border-bottom: 1px dotted #007bff;
+                }
+                .news-link:hover { 
+                    color: #0056b3; 
+                    text-decoration: underline; 
+                }
+                .news-link:visited { 
+                    color: #6f42c1; 
+                }
+                .no-link { 
+                    color: #333; 
+                }
             </style>
         </head>
         <body>
@@ -763,7 +879,7 @@ class ReportGenerator:
                 <ul>
             """
             for id_value in failed_ids:
-                html += f"<li>{id_value}</li>"
+                html += f"<li>{ReportGenerator._html_escape(id_value)}</li>"
             html += """
                 </ul>
             </div>
@@ -792,27 +908,45 @@ class ReportGenerator:
                 count_info = title_data["count"]
                 ranks = title_data["ranks"]
                 rank_threshold = title_data["rank_threshold"]
+                url = title_data.get("url", "")
+                mobile_url = title_data.get("mobileUrl", "")
 
                 # 使用HTML格式化排名
                 rank_display = StatisticsCalculator._format_rank_for_html(
                     ranks, rank_threshold
                 )
 
-                # 格式化标题信息
-                formatted_title = f"[{source_alias}] {title}"
+                # 优先使用mobileUrl,然后是url,最后无链接
+                link_url = mobile_url or url
+            
+                # 格式化标题信息,添加链接支持
+                escaped_title = ReportGenerator._html_escape(title)
+                escaped_source_alias = ReportGenerator._html_escape(source_alias)
+            
+                if link_url:
+                    # 转义URL
+                    escaped_url = ReportGenerator._html_escape(link_url)
+                    # 有链接时,使用a标签包装标题
+                    formatted_title = f"[{escaped_source_alias}] <a href=\"{escaped_url}\" target=\"_blank\" class=\"news-link\">{escaped_title}</a>"
+                else:
+                    # 没有链接时,使用普通文本
+                    formatted_title = f"[{escaped_source_alias}] <span class=\"no-link\">{escaped_title}</span>"
+            
                 if rank_display:
                     formatted_title += f" {rank_display}"
                 if time_display:
-                    formatted_title += f" <font color='grey'>- {time_display}</font>"
+                    escaped_time_display = ReportGenerator._html_escape(time_display)
+                    formatted_title += f" <font color='grey'>- {escaped_time_display}</font>"
                 if count_info > 1:
                     formatted_title += f" <font color='green'>({count_info}次)</font>"
 
                 formatted_titles.append(formatted_title)
 
+            escaped_word = ReportGenerator._html_escape(stat['word'])
             html += f"""
                 <tr>
                     <td>{i}</td>
-                    <td class="word">{stat['word']}</td>
+                    <td class="word">{escaped_word}</td>
                     <td class="count">{stat['count']}</td>
                     <td class="percentage">{stat['percentage']}%</td>
                     <td class="titles">{"<br>".join(formatted_titles)}</td>
@@ -828,6 +962,18 @@ class ReportGenerator:
 
         return html
 
+    @staticmethod
+    def _html_escape(text: str) -> str:
+        """Return text (converted with str() if it is not a str) with &, <, >, double and single quotes replaced by HTML entities."""
+        if not isinstance(text, str):
+            text = str(text)
+    
+        return (text.replace("&", "&amp;")  # "&" goes first so the entities produced below are not escaped again
+                    .replace("<", "&lt;")
+                    .replace(">", "&gt;")
+                    .replace('"', "&quot;")
+                    .replace("'", "&#x27;"))
+
     @staticmethod
     def send_to_feishu(
         stats: List[Dict],
@@ -887,7 +1033,7 @@ class ReportGenerator:
     def _build_feishu_content(
         stats: List[Dict], failed_ids: Optional[List] = None
     ) -> str:
-        """构建飞书消息内容,使用富文本格式"""
+        """构建飞书消息内容,使用富文本格式和markdown链接,优先使用mobileUrl"""
         text_content = ""
 
         # 添加频率词统计信息
@@ -926,25 +1072,26 @@ class ReportGenerator:
                 count_info = title_data["count"]
                 ranks = title_data["ranks"]
                 rank_threshold = title_data["rank_threshold"]
+                url = title_data.get("url", "")
+                mobile_url = title_data.get("mobileUrl", "")
 
                 # 使用飞书格式化排名
                 rank_display = StatisticsCalculator._format_rank_for_feishu(
                     ranks, rank_threshold
                 )
 
-                # 格式化标题信息
-                formatted_title = f"[{source_alias}] {title}"
-                if rank_display:
-                    formatted_title += f" {rank_display}"
-                if time_display:
-                    formatted_title += f" <font color='grey'>- {time_display}</font>"
-                if count_info > 1:
-                    formatted_title += f" <font color='green'>({count_info}次)</font>"
+                # 格式化标题信息,优先使用mobileUrl,然后是url
+                link_url = mobile_url or url  # 优先使用mobileUrl,没有则使用url
+                if link_url:
+                    # 如果有链接,使用markdown链接格式
+                    formatted_title = f"[{title}]({link_url})"
+                else:
+                    # 如果都没有链接,只显示标题
+                    formatted_title = title
 
-                # 使用灰色显示来源
-                text_content += (
-                    f"  {j}. <font color='grey'>[{source_alias}]</font> {title}"
-                )
+                # 构建完整的标题行
+                text_content += f"  {j}. <font color='grey'>[{source_alias}]</font> {formatted_title}"
+            
                 if rank_display:
                     text_content += f" {rank_display}"
                 if time_display:
@@ -1121,12 +1268,25 @@ class NewsAnalyzer:
         title_info = {}
         for source_id, titles_data in results.items():
             title_info[source_id] = {}
-            for title, ranks in titles_data.items():
+            for title, title_data in titles_data.items():
+                # 兼容新格式数据
+                if isinstance(title_data, dict):
+                    ranks = title_data.get("ranks", [])
+                    url = title_data.get("url", "")
+                    mobile_url = title_data.get("mobileUrl", "")
+                else:
+                    # 兼容旧格式数据
+                    ranks = title_data if isinstance(title_data, list) else []
+                    url = ""
+                    mobile_url = ""
+
                 title_info[source_id][title] = {
                     "first_time": time_info,
                     "last_time": time_info,
                     "count": 1,
                     "ranks": ranks,
+                    "url": url,
+                    "mobileUrl": mobile_url,
                 }
 
         # 加载频率词和过滤词