# services/crawler_106.py
|
|
import os
|
|
import requests
|
|
import logging
|
|
from datetime import datetime
|
|
from config import Config
|
|
|
|
CONFIG = Config.CRAWLER_CONFIG["106"]
|
|
|
|
|
|
def get_temp_dir():
    """Return the absolute path of the temp-file directory, creating it if needed.

    The directory is ``<project_root>/instance/temp``, where the project root
    is the parent of the directory containing this module.

    Returns:
        str: absolute path to the temp directory (guaranteed to exist on return).
    """
    base_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    temp_dir = os.path.join(base_dir, 'instance', 'temp')
    # exist_ok avoids the check-then-create race when several crawler runs start at once
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
|
|
|
|
|
|
def get_106_dynamic_token(port):
    """Fetch a dynamic login token from the device behind the given port.

    POSTs the configured login payload to the device's ``/api/login`` endpoint.

    Args:
        port: remote port of the target device (as reported by the proxy list).

    Returns:
        str | None: the response body stripped of whitespace and double quotes
        on HTTP 200, or None on any network error / non-200 response.
    """
    login_url = f"http://106.75.72.40:{port}/api/login"
    try:
        resp = requests.post(login_url, json=CONFIG["login_payload"], timeout=10)
    except requests.RequestException:
        # Network failures are routine for offline devices; callers treat None as "no token".
        return None
    if resp.status_code != 200:
        return None
    return resp.text.strip().replace('"', '')
|
|
|
|
|
|
def find_closest_item(items, is_date_level=True):
    """Find the entry in *items* whose date is closest to the current time.

    Args:
        items: list of dicts from the file-browser API. At date level the
            relevant field is ``name`` (format ``YYYY_MM_DD``, falling back to
            the last segment of ``path``); at file level it is ``modified``
            (ISO-8601 timestamp, possibly with a trailing ``Z``).
        is_date_level: True to parse folder names as dates, False to parse
            file modification timestamps.

    Returns:
        tuple | None: ``(seconds_from_now, item_dict, name_string)`` for the
        closest entry, or None when the input is empty/invalid or no entry
        parses.
    """
    if not items or not isinstance(items, list):
        return None
    today = datetime.now()
    scored_items = []
    for item in items:
        name_val = item.get('name', '')
        path_val = item.get('path', '')
        # At date level the folder name is usually of the form 2026_02_08.
        target_str = name_val if name_val else path_val.split('/')[-1]
        try:
            if is_date_level:
                # Parse the folder-name date format: YYYY_MM_DD
                current_date = datetime.strptime(target_str, "%Y_%m_%d")
            else:
                # Parse the file modification timestamp
                mod_str = item.get('modified', '')
                current_date = datetime.fromisoformat(mod_str.replace('Z', '+00:00'))
            # Distance from "now"; compare as naive datetimes by dropping tzinfo.
            diff = abs((today - current_date.replace(tzinfo=None)).total_seconds())
        except (ValueError, TypeError, AttributeError):
            # Skip entries whose name/timestamp does not parse instead of
            # aborting the whole scan (narrowed from a bare except so real
            # programming errors still surface).
            continue
        scored_items.append((diff, item, target_str))
    if not scored_items:
        return None
    # Sort by time difference and take the smallest.
    scored_items.sort(key=lambda x: x[0])
    return scored_items[0]
|
|
|
|
|
|
def run_106_logic():
    """
    Main logic of the 106 crawler.

    Walks the proxy (device) list exposed by the 106 host, and for each
    eligible device locates the date folder and file closest to "now",
    then either downloads the binary file or extracts its JSON content.

    Returns:
        list[dict]: one data packet per processed device. Keys: 'source',
        'name', 'status', 'value', 'target_time', 'raw_json', 'temp_file',
        'num_files'. Failures are reported as packets with status '离线'
        (offline) or '异常' (error) rather than raised.
    """
    results = []
    print(">>> [106爬虫] 启动...")
    main_headers = {"Authorization": CONFIG["primary_auth"], "User-Agent": "Mozilla/5.0"}

    try:
        # 0. Fetch the proxy list (list of devices).
        resp = requests.get(CONFIG["base_url"], headers=main_headers, timeout=20)
        proxies = resp.json().get('proxies', [])

        for item in proxies:
            name = item.get('name', '')
            # Filter rule: name must end with _data.
            if not name.lower().endswith('_data'): continue

            name_upper = name.upper()
            is_tower_underscore = "TOWER_" in name_upper
            is_tower_i = "TOWER" in name_upper and not is_tower_underscore

            # Filter rule: must carry a TOWER-related marker.
            if not (is_tower_underscore or is_tower_i): continue

            # --- Build the base data packet ---
            # Default to the standard current time as a fallback, so the
            # timestamp is never empty if a later step fails.
            current_standard_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            data_packet = {
                'source': '106网站',
                'name': name,
                'status': '正常',
                'value': '',
                'target_time': current_standard_time,
                'raw_json': {},
                'temp_file': None,
                'num_files': 0
            }

            # Check online status; offline devices are reported and skipped.
            if str(item.get('status')).lower() != 'online':
                data_packet['status'] = '离线'
                data_packet['value'] = f"状态: {item.get('status')}"
                results.append(data_packet)
                continue

            try:
                # Obtain the remote port and a dynamic token for this device.
                port = item.get('conf', {}).get('remote_port')
                token = get_106_dynamic_token(port)
                if not token:
                    data_packet['status'] = '异常'
                    data_packet['value'] = "Token获取失败"
                    results.append(data_packet)
                    continue

                headers = {"Authorization": CONFIG["primary_auth"], "x-auth": token}
                # TOWER_ devices expose a capitalized "Data" root; others use "data".
                api_root = "/api/resources/Data/" if is_tower_underscore else "/api/resources/data/"

                # --- 1. Fetch the list of date folders ---
                res1 = requests.get(f"http://106.75.72.40:{port}{api_root}", headers=headers, timeout=10)
                best_date = find_closest_item(res1.json().get('items', []), True)

                if not best_date:
                    data_packet['value'] = "未找到任何日期文件夹"
                    results.append(data_packet)
                    continue

                # ==============================================================
                # [Core fix] Timestamp normalization.
                # Old logic: data_packet['target_time'] = best_date[2]
                #            (yields "2026_02_08")
                # New logic: convert "2026_02_08" into "2026-02-08 HH:MM:SS"
                # ==============================================================
                raw_folder_name = best_date[2]  # e.g. "2026_02_08"
                formatted_date_part = raw_folder_name.replace('_', '-')  # -> "2026-02-08"
                current_time_part = datetime.now().strftime("%H:%M:%S")

                # Override the default time so the database receives a
                # standard timestamp format.
                data_packet['target_time'] = f"{formatted_date_part} {current_time_part}"

                date_path = f"{api_root}{raw_folder_name}/"

                # --- 2. Fetch that date folder's contents (to get numFiles) ---
                res2 = requests.get(f"http://106.75.72.40:{port}{date_path}", headers=headers, timeout=10)
                folder_data = res2.json()

                file_count = folder_data.get('numFiles', 0)
                data_packet['num_files'] = file_count
                print(f" -> {name}: 找到日期 {formatted_date_part}, 文件数: {file_count}")

                # --- 3. Find the newest file inside that folder ---
                best_file = find_closest_item(folder_data.get('items', []), False)

                if not best_file:
                    data_packet['value'] = "文件夹为空"
                    results.append(data_packet)
                    continue

                file_item = best_file[1]
                full_path = file_item.get('path') or f"{date_path}{file_item.get('name')}"

                # --- 4. Download / read the content ---
                if is_tower_i:
                    # [Binary file] download path.
                    download_url = f"http://106.75.72.40:{port}/api/raw{full_path}"
                    res3 = requests.get(download_url, headers=headers, timeout=20, stream=True)
                    if res3.status_code == 200:
                        safe_name = f"{name}_{datetime.now().strftime('%H%M%S')}.db"
                        temp_path = os.path.join(get_temp_dir(), safe_name)
                        with open(temp_path, 'wb') as f:
                            f.write(res3.content)

                        data_packet['temp_file'] = temp_path
                        data_packet['value'] = f"Binary Downloaded: {len(res3.content)} bytes"
                        data_packet['raw_json'] = file_item  # reuse file_item as raw_json
                    else:
                        data_packet['status'] = '异常'
                        data_packet['value'] = f"下载失败: {res3.status_code}"
                else:
                    # [Text file] JSON parsing path.
                    file_api_url = f"http://106.75.72.40:{port}/api/resources{full_path}"
                    res3 = requests.get(file_api_url, headers=headers, timeout=20)
                    try:
                        json_content = res3.json()
                        data_packet['raw_json'] = json_content
                        # Try to extract 'content'; otherwise keep a JSON-string excerpt.
                        data_packet['value'] = json_content.get('content', str(json_content)[:100])
                    except:
                        data_packet['value'] = "JSON解析失败"

                results.append(data_packet)

            except Exception as e:
                # Per-device failures are captured into the packet so one bad
                # device does not abort the whole run.
                data_packet['status'] = '异常'
                data_packet['value'] = str(e)[:100]
                results.append(data_packet)

    except Exception as e:
        # Top-level failure (e.g. proxy list unreachable): log and return what we have.
        logging.error(f"106 Crawler Error: {e}")

    return results