|
@@ -6,13 +6,12 @@
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
import sqlite3
|
|
import sqlite3
|
|
|
-import os
|
|
|
|
|
import shutil
|
|
import shutil
|
|
|
import pytz
|
|
import pytz
|
|
|
import re
|
|
import re
|
|
|
from datetime import datetime, timedelta
|
|
from datetime import datetime, timedelta
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
-from typing import Dict, List, Optional, Any
|
|
|
|
|
|
|
+from typing import Dict, List, Optional
|
|
|
|
|
|
|
|
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
|
|
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
|
|
|
from trendradar.utils.time import (
|
|
from trendradar.utils.time import (
|
|
@@ -20,6 +19,7 @@ from trendradar.utils.time import (
|
|
|
format_date_folder,
|
|
format_date_folder,
|
|
|
format_time_filename,
|
|
format_time_filename,
|
|
|
)
|
|
)
|
|
|
|
|
+from trendradar.utils.url import normalize_url
|
|
|
|
|
|
|
|
|
|
|
|
|
class LocalStorageBackend(StorageBackend):
|
|
class LocalStorageBackend(StorageBackend):
|
|
@@ -148,12 +148,15 @@ class LocalStorageBackend(StorageBackend):
|
|
|
|
|
|
|
|
for item in news_list:
|
|
for item in news_list:
|
|
|
try:
|
|
try:
|
|
|
- # 检查是否已存在(通过 URL + platform_id)
|
|
|
|
|
- if item.url:
|
|
|
|
|
|
|
+ # 标准化 URL(去除动态参数,如微博的 band_rank)
|
|
|
|
|
+ normalized_url = normalize_url(item.url, source_id) if item.url else ""
|
|
|
|
|
+
|
|
|
|
|
+ # 检查是否已存在(通过标准化 URL + platform_id)
|
|
|
|
|
+ if normalized_url:
|
|
|
cursor.execute("""
|
|
cursor.execute("""
|
|
|
SELECT id, title FROM news_items
|
|
SELECT id, title FROM news_items
|
|
|
WHERE url = ? AND platform_id = ?
|
|
WHERE url = ? AND platform_id = ?
|
|
|
- """, (item.url, source_id))
|
|
|
|
|
|
|
+ """, (normalized_url, source_id))
|
|
|
existing = cursor.fetchone()
|
|
existing = cursor.fetchone()
|
|
|
|
|
|
|
|
if existing:
|
|
if existing:
|
|
@@ -191,14 +194,14 @@ class LocalStorageBackend(StorageBackend):
|
|
|
data.crawl_time, now_str, existing_id))
|
|
data.crawl_time, now_str, existing_id))
|
|
|
updated_count += 1
|
|
updated_count += 1
|
|
|
else:
|
|
else:
|
|
|
- # 不存在,插入新记录
|
|
|
|
|
|
|
+ # 不存在,插入新记录(存储标准化后的 URL)
|
|
|
cursor.execute("""
|
|
cursor.execute("""
|
|
|
INSERT INTO news_items
|
|
INSERT INTO news_items
|
|
|
(title, platform_id, rank, url, mobile_url,
|
|
(title, platform_id, rank, url, mobile_url,
|
|
|
first_crawl_time, last_crawl_time, crawl_count,
|
|
first_crawl_time, last_crawl_time, crawl_count,
|
|
|
created_at, updated_at)
|
|
created_at, updated_at)
|
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
|
|
|
- """, (item.title, source_id, item.rank, item.url,
|
|
|
|
|
|
|
+ """, (item.title, source_id, item.rank, normalized_url,
|
|
|
item.mobile_url, data.crawl_time, data.crawl_time,
|
|
item.mobile_url, data.crawl_time, data.crawl_time,
|
|
|
now_str, now_str))
|
|
now_str, now_str))
|
|
|
new_id = cursor.lastrowid
|
|
new_id = cursor.lastrowid
|
|
@@ -217,7 +220,7 @@ class LocalStorageBackend(StorageBackend):
|
|
|
first_crawl_time, last_crawl_time, crawl_count,
|
|
first_crawl_time, last_crawl_time, crawl_count,
|
|
|
created_at, updated_at)
|
|
created_at, updated_at)
|
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
|
|
|
- """, (item.title, source_id, item.rank, item.url,
|
|
|
|
|
|
|
+ """, (item.title, source_id, item.rank, "",
|
|
|
item.mobile_url, data.crawl_time, data.crawl_time,
|
|
item.mobile_url, data.crawl_time, data.crawl_time,
|
|
|
now_str, now_str))
|
|
now_str, now_str))
|
|
|
new_id = cursor.lastrowid
|
|
new_id = cursor.lastrowid
|
|
@@ -524,6 +527,9 @@ class LocalStorageBackend(StorageBackend):
|
|
|
"""
|
|
"""
|
|
|
检测新增的标题
|
|
检测新增的标题
|
|
|
|
|
|
|
|
|
|
+ 该方法比较当前抓取数据与历史数据,找出新增的标题。
|
|
|
|
|
+ 关键逻辑:只有在历史批次中从未出现过的标题才算新增。
|
|
|
|
|
+
|
|
|
Args:
|
|
Args:
|
|
|
current_data: 当前抓取的数据
|
|
current_data: 当前抓取的数据
|
|
|
|
|
|
|
@@ -541,10 +547,24 @@ class LocalStorageBackend(StorageBackend):
|
|
|
new_titles[source_id] = {item.title: item for item in news_list}
|
|
new_titles[source_id] = {item.title: item for item in news_list}
|
|
|
return new_titles
|
|
return new_titles
|
|
|
|
|
|
|
|
- # 收集历史标题
|
|
|
|
|
|
|
+ # 获取当前批次时间
|
|
|
|
|
+ current_time = current_data.crawl_time
|
|
|
|
|
+
|
|
|
|
|
+ # 收集历史标题(first_time < current_time 的标题)
|
|
|
|
|
+ # 这样可以正确处理同一标题因 URL 变化而产生多条记录的情况
|
|
|
historical_titles: Dict[str, set] = {}
|
|
historical_titles: Dict[str, set] = {}
|
|
|
for source_id, news_list in historical_data.items.items():
|
|
for source_id, news_list in historical_data.items.items():
|
|
|
- historical_titles[source_id] = {item.title for item in news_list}
|
|
|
|
|
|
|
+ historical_titles[source_id] = set()
|
|
|
|
|
+ for item in news_list:
|
|
|
|
|
+ first_time = getattr(item, 'first_time', item.crawl_time)
|
|
|
|
|
+ if first_time < current_time:
|
|
|
|
|
+ historical_titles[source_id].add(item.title)
|
|
|
|
|
+
|
|
|
|
|
+ # 检查是否有历史数据
|
|
|
|
|
+ has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
|
|
|
|
|
+ if not has_historical_data:
|
|
|
|
|
+ # 第一次抓取,没有"新增"概念
|
|
|
|
|
+ return {}
|
|
|
|
|
|
|
|
# 检测新增
|
|
# 检测新增
|
|
|
new_titles = {}
|
|
new_titles = {}
|