Files
ZDXX/2.1版本/services/crawler_106.py

199 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import logging
from datetime import datetime
from config import Config
# Load the "106" crawler section from the project-wide configuration.
CONFIG = Config.CRAWLER_CONFIG["106"]
def get_today_str():
    """Return today's date formatted as ``YYYY_MM_DD`` (the remote folder-name convention)."""
    return f"{datetime.now():%Y_%m_%d}"
def get_106_dynamic_token(port):
    """
    Log in to the site behind *port* and return a fresh ``x-auth`` token.

    The login endpoint returns the token directly in the response body as a
    quoted JSON string, so the surrounding quotes are stripped.

    Args:
        port: Remote port of the target site on host 106.75.72.40.

    Returns:
        The token string on success, ``None`` on any failure (non-200 status
        or network error).  Callers treat ``None`` as "token unavailable",
        not as a fatal error.
    """
    login_url = f"http://106.75.72.40:{port}/api/login"
    try:
        # Credentials come from the shared crawler configuration.
        resp = requests.post(login_url, json=CONFIG["login_payload"], timeout=10)
    except Exception as e:
        # Best-effort login: log the failure instead of silently swallowing
        # it (the original discarded the exception entirely).
        logging.warning("106 login failed on port %s: %s", port, e)
        return None
    if resp.status_code == 200:
        # Token is returned as a quoted JSON string -> strip the quotes.
        return resp.text.strip().replace('"', '')
    logging.warning("106 login on port %s returned HTTP %s", port, resp.status_code)
    return None
def find_closest_item(items, is_date_level=True):
    """
    Pick the list entry whose date is closest to "now".

    Args:
        items: File-browser entries (dicts with ``name``/``path`` keys, and a
            ``modified`` ISO-8601 timestamp for files).
        is_date_level: When True, parse each entry's name as a ``YYYY_MM_DD``
            folder name; when False, parse its ``modified`` timestamp.

    Returns:
        A ``(seconds_diff, item, display_name)`` tuple for the closest entry,
        or ``None`` when the input is not a non-empty list or nothing parses.
    """
    if not items or not isinstance(items, list):
        return None
    today = datetime.now()
    scored_items = []
    for item in items:
        name_val = item.get('name', '')
        path_val = item.get('path', '')
        # Fall back to the last path segment when 'name' is absent.
        target_str = name_val if name_val else path_val.split('/')[-1]
        try:
            if is_date_level:
                # Folder names follow the YYYY_MM_DD convention.
                current_date = datetime.strptime(target_str, "%Y_%m_%d")
            else:
                # Files carry an ISO-8601 'modified' timestamp ('Z' = UTC).
                mod_str = item.get('modified', '')
                current_date = datetime.fromisoformat(mod_str.replace('Z', '+00:00'))
            # NOTE(review): tzinfo is dropped, so aware UTC timestamps are
            # compared against naive local time; the "closest" heuristic
            # tolerates that skew.
            diff = abs((today - current_date.replace(tzinfo=None)).total_seconds())
            scored_items.append((diff, item, target_str))
        except (ValueError, TypeError, AttributeError):
            # Skip unparseable entries instead of aborting the whole scan.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.)
            continue
    if not scored_items:
        return None
    # Smallest time difference wins; key avoids comparing the dicts on ties.
    scored_items.sort(key=lambda x: x[0])
    return scored_items[0]
def run_106_logic(active_set, save_callback):
    """
    Main crawl loop for the "106" site.

    Args:
        active_set: Set collecting device keys seen during this scan; every
            truthy value returned by ``save_callback`` is added to it.
        save_callback: Persistence callback with signature
            ``(site, name, status, message, latest_time=..., content=...) -> key``.

    Failures on a single device are caught and recorded through the callback;
    only a top-level failure (e.g. fetching the proxy list) aborts the run.
    """
    print(">>> [106爬虫] 启动...")
    today_str = get_today_str()
    # Global auth header used only to fetch the proxy (device) list.
    main_headers = {"Authorization": CONFIG["primary_auth"], "User-Agent": "Mozilla/5.0"}
    try:
        # Fetch the proxy list.
        resp = requests.get(CONFIG["base_url"], headers=main_headers, timeout=20)
        proxies = resp.json().get('proxies', [])
        for item in proxies:
            name = item.get('name', '')
            # --- 1. Strict filtering (mirrors the reference implementation) ---
            if not name.lower().endswith('_data'):
                continue
            name_upper = name.upper()
            is_tower_underscore = "TOWER_" in name_upper
            is_tower_i = "TOWER" in name_upper and not is_tower_underscore
            # Skip anything that is neither TOWER_ nor plain TOWER (TowerI).
            if not (is_tower_underscore or is_tower_i):
                continue
            # --- 2. Online-status check ---
            if str(item.get('status')).lower() != 'online':
                key = save_callback("106网站", name, "离线", f"设备状态: {item.get('status')}")
                if key: active_set.add(key)
                continue
            try:
                # --- 3. Resolve the port and fetch a per-site token ---
                port = item.get('conf', {}).get('remote_port')
                if not port: continue
                token = get_106_dynamic_token(port)
                if not token:
                    key = save_callback("106网站", name, "异常", "Token获取失败")
                    if key: active_set.add(key)
                    continue
                # Headers for this specific site.
                headers = {
                    "Authorization": CONFIG["primary_auth"],
                    "x-auth": token,
                    "User-Agent": "Mozilla/5.0"
                }
                # --- 4. Path selection (the key difference between variants) ---
                # Tower_ uses uppercase "Data"; TowerI uses lowercase "data".
                api_root = "/api/resources/Data/" if is_tower_underscore else "/api/resources/data/"
                # Step A: list the root directory.
                res1 = requests.get(f"http://106.75.72.40:{port}{api_root}", headers=headers, timeout=10)
                items1 = res1.json().get('items', [])
                # Step B: locate today's folder.
                best_date = find_closest_item(items1, is_date_level=True)
                # Verify the closest folder actually is today's.
                if not best_date or best_date[2] != today_str:
                    key = save_callback("106网站", name, "正常", "未找到今日文件夹",
                                        latest_time=best_date[2] if best_date else "N/A")
                    if key: active_set.add(key)
                    continue
                # Step C: descend into the date folder.
                date_path = f"{api_root}{best_date[2]}/"
                res2 = requests.get(f"http://106.75.72.40:{port}{date_path}", headers=headers, timeout=10)
                items2 = res2.json().get('items', [])
                # Step D: find the most recently modified file inside.
                best_file = find_closest_item(items2, is_date_level=False)
                if not best_file:
                    key = save_callback("106网站", name, "正常", "今日文件夹为空", latest_time=today_str)
                    if key: active_set.add(key)
                    continue
                # Build the file's full path (fall back to date_path + name).
                file_item = best_file[1]
                full_path = file_item.get('path')
                if not full_path:
                    full_path = f"{date_path}{file_item.get('name')}"
                # --- 5. Download content (endpoint differs per variant) ---
                final_content = ""
                if is_tower_i:
                    # [TowerI mode] /api/raw returns a binary stream.
                    download_url = f"http://106.75.72.40:{port}/api/raw{full_path}"
                    res3 = requests.get(download_url, headers=headers, timeout=20, stream=True)
                    if res3.status_code == 200:
                        # Binary won't fit the DB column; store a summary instead.
                        size_bytes = len(res3.content)
                        final_content = f"[Binary Data] 成功获取,大小: {size_bytes} 字节"
                    else:
                        raise Exception(f"二进制下载失败 Code: {res3.status_code}")
                else:
                    # [Tower_ mode] /api/resources returns JSON with a 'content' field.
                    file_api_url = f"http://106.75.72.40:{port}/api/resources{full_path}"
                    res3 = requests.get(file_api_url, headers=headers, timeout=20)
                    try:
                        # Try to extract the 'content' field from the JSON body.
                        final_content = res3.json().get('content', '')
                        if not final_content:
                            final_content = "[Warning] JSON返回内容为空"
                    except:
                        final_content = "[Error] 无法解析JSON内容"
                # --- 6. Persist the successful result ---
                key = save_callback("106网站", name, "正常", "同步成功",
                                    latest_time=today_str, content=final_content)
                if key: active_set.add(key)
            except Exception as e:
                # Per-device catch: one failing device must not break the loop.
                err_msg = str(e)[:100]  # truncate so the stored message stays short
                key = save_callback("106网站", name, "异常", f"采集错误: {err_msg}")
                if key: active_set.add(key)
    except Exception as e:
        logging.error(f"106 Crawler Global Error: {e}")