import logging
from datetime import datetime

import requests

from config import Config

# Crawler settings for site "106", read from the project configuration.
CONFIG = Config.CRAWLER_CONFIG["106"]

# Host used by every per-device URL in this module; only the port varies.
_SITE_HOST = "106.75.72.40"
# Site label passed as the first argument of every save_callback call.
_SITE_LABEL = "106网站"
# Chunk size used when streaming a binary download just to count its bytes.
_DOWNLOAD_CHUNK_SIZE = 65536


def get_today_str():
    """Return today's local date formatted as ``YYYY_MM_DD``."""
    return datetime.now().strftime("%Y_%m_%d")


def get_106_dynamic_token(port):
    """Log in to the site served on *port* and return its ``x-auth`` token.

    Posts ``CONFIG["login_payload"]`` as JSON to ``/api/login``. On HTTP 200
    the response body, with surrounding whitespace and all double quotes
    removed, is used as the token.

    Args:
        port: Remote port of the device's site on the shared host.

    Returns:
        The token string, or ``None`` when the login did not yield one
        (non-200 status, empty body, or any error while logging in).
    """
    login_url = f"http://{_SITE_HOST}:{port}/api/login"
    try:
        resp = requests.post(login_url, json=CONFIG["login_payload"], timeout=10)
        if resp.status_code != 200:
            logging.warning(
                "106 login on port %s returned HTTP %s", port, resp.status_code
            )
            return None
        return resp.text.strip().replace('"', '') or None
    except Exception as exc:
        # Deliberately best-effort: callers treat None as "login failed",
        # so the failure is logged here instead of being raised.
        logging.warning("106 login on port %s failed: %s", port, exc)
        return None


def find_closest_item(items, is_date_level=True):
    """Pick the entry whose timestamp is closest to the current local time.

    Args:
        items: List of dict entries. Each entry is labelled by its ``name``,
            or by the last ``/``-separated segment of ``path`` when ``name``
            is empty.
        is_date_level: When true, the label itself is parsed as a
            ``YYYY_MM_DD`` date. Otherwise the entry's ``modified`` field is
            parsed as an ISO-8601 timestamp (a ``Z`` suffix is accepted).

    Returns:
        A ``(seconds_from_now, item, label)`` tuple for the closest entry, or
        ``None`` when *items* is empty, is not a list, or holds no entry with
        a parseable timestamp. On ties the earliest entry in *items* wins.
    """
    if not items or not isinstance(items, list):
        return None

    now = datetime.now()
    scored = []
    for item in items:
        try:
            label = item.get('name') or (item.get('path') or '').split('/')[-1]
            if is_date_level:
                moment = datetime.strptime(label, "%Y_%m_%d")
            else:
                stamp = item.get('modified', '')
                moment = datetime.fromisoformat(stamp.replace('Z', '+00:00'))
                if moment.tzinfo is not None:
                    # Convert to local wall-clock time before dropping the
                    # offset, so the value is comparable with the naive
                    # local `now` above.
                    moment = moment.astimezone().replace(tzinfo=None)
        except (ValueError, TypeError, AttributeError, OverflowError, OSError):
            # Malformed entry or unparseable timestamp: skip just this one.
            continue
        scored.append((abs((now - moment).total_seconds()), item, label))

    # min() returns the first minimal element, matching a stable sort.
    return min(scored, key=lambda entry: entry[0], default=None)


def _record(active_set, save_callback, *args, **kwargs):
    """Store one result via save_callback and remember the key it returns."""
    key = save_callback(*args, **kwargs)
    if key:
        active_set.add(key)


def _device_url(port, path):
    """Build the absolute URL for *path* on the device site at *port*."""
    return f"http://{_SITE_HOST}:{port}{path}"


def _list_items(port, resource_path, headers):
    """Return the ``items`` list of the directory at *resource_path*."""
    resp = requests.get(
        _device_url(port, f"/api/resources{resource_path}"),
        headers=headers,
        timeout=10,
    )
    return resp.json().get('items', [])


def _download_content(port, full_path, headers, as_binary):
    """Fetch the file at *full_path* and return the text to be stored."""
    if as_binary:
        # Binary mode: /api/raw returns the file bytes. Only the size is
        # stored, so the body is streamed and counted, never held in memory.
        with requests.get(
            _device_url(port, f"/api/raw{full_path}"),
            headers=headers,
            timeout=20,
            stream=True,
        ) as resp:
            if resp.status_code != 200:
                raise RuntimeError(f"二进制下载失败 Code: {resp.status_code}")
            size_bytes = sum(
                len(chunk)
                for chunk in resp.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE)
            )
        return f"[Binary Data] 成功获取,大小: {size_bytes} 字节"

    # JSON mode: /api/resources returns a document with a "content" field.
    resp = requests.get(
        _device_url(port, f"/api/resources{full_path}"),
        headers=headers,
        timeout=20,
    )
    try:
        content = resp.json().get('content', '')
    except (ValueError, AttributeError):
        # Body is not JSON, or is JSON but not an object.
        return "[Error] 无法解析JSON内容"
    return content or "[Warning] JSON返回内容为空"


def _collect_device(item, name, is_tower_underscore, today_str,
                    active_set, save_callback):
    """Log in to one online device, locate today's newest file and store it."""
    port = item.get('conf', {}).get('remote_port')
    if not port:
        return

    token = get_106_dynamic_token(port)
    if not token:
        _record(active_set, save_callback,
                _SITE_LABEL, name, "异常", "Token获取失败")
        return

    headers = {
        "Authorization": CONFIG["primary_auth"],
        "x-auth": token,
        "User-Agent": "Mozilla/5.0",
    }

    # "TOWER_" devices use an upper-case "Data" root; the others use "data".
    resource_root = "/Data/" if is_tower_underscore else "/data/"

    # Step A/B: list the root directory and look for today's date folder.
    best_date = find_closest_item(
        _list_items(port, resource_root, headers), is_date_level=True
    )
    if not best_date or best_date[2] != today_str:
        _record(active_set, save_callback,
                _SITE_LABEL, name, "正常", "未找到今日文件夹",
                latest_time=best_date[2] if best_date else "N/A")
        return

    # Step C/D: list the date folder and pick the most recently modified file.
    date_dir = f"{resource_root}{best_date[2]}/"
    best_file = find_closest_item(
        _list_items(port, date_dir, headers), is_date_level=False
    )
    if not best_file:
        _record(active_set, save_callback,
                _SITE_LABEL, name, "正常", "今日文件夹为空",
                latest_time=today_str)
        return

    file_item = best_file[1]
    # The fallback must stay relative to the resource root: the download
    # helpers prepend "/api/raw" or "/api/resources" themselves, so including
    # that prefix here would double it in the final URL.
    full_path = file_item.get('path') or f"{date_dir}{file_item.get('name')}"

    final_content = _download_content(
        port, full_path, headers, as_binary=not is_tower_underscore
    )
    _record(active_set, save_callback,
            _SITE_LABEL, name, "正常", "同步成功",
            latest_time=today_str, content=final_content)


def run_106_logic(active_set, save_callback):
    """Run the crawler for site 106 over every matching device.

    Fetches the proxy list from ``CONFIG["base_url"]``, keeps the entries
    whose name ends with ``_data`` and contains ``TOWER`` (both
    case-insensitive), and stores one result per device via *save_callback*.

    Args:
        active_set: Set that receives every truthy key returned by
            *save_callback*.
        save_callback: Called as ``save_callback(site, name, status,
            message)``, optionally with ``latest_time`` and ``content``
            keyword arguments. Its return value is treated as the device key.

    A failure on one device is stored as an error record for that device and
    does not stop the loop; a failure outside the per-device step is logged.
    """
    print(">>> [106爬虫] 启动...")
    today_str = get_today_str()

    # Site-wide credentials, used only to fetch the proxy list.
    main_headers = {
        "Authorization": CONFIG["primary_auth"],
        "User-Agent": "Mozilla/5.0",
    }

    try:
        resp = requests.get(CONFIG["base_url"], headers=main_headers, timeout=20)
        proxies = resp.json().get('proxies') or []

        for item in proxies:
            name = item.get('name') or ''

            # 1. Name filter: must end with "_data" and mention "TOWER".
            if not name.lower().endswith('_data'):
                continue
            name_upper = name.upper()
            if "TOWER" not in name_upper:
                continue
            is_tower_underscore = "TOWER_" in name_upper

            # 2. Offline devices are recorded but not crawled.
            status = item.get('status')
            if str(status).lower() != 'online':
                _record(active_set, save_callback,
                        _SITE_LABEL, name, "离线", f"设备状态: {status}")
                continue

            # 3-6. Login, locate today's newest file, download and store.
            try:
                _collect_device(item, name, is_tower_underscore, today_str,
                                active_set, save_callback)
            except Exception as e:
                # Isolate per-device failures so one bad device cannot stop
                # the loop; the message is truncated to keep the record short.
                err_msg = str(e)[:100]
                _record(active_set, save_callback,
                        _SITE_LABEL, name, "异常", f"采集错误: {err_msg}")

    except Exception as e:
        logging.error("106 Crawler Global Error: %s", e)