# services/crawler_106.py
"""Crawler for the "106" site.

Polls the proxy list, and for every online ``*_data`` TOWER instance finds the
date folder closest to today, counts its files, and downloads/reads the most
recently modified file.  Each instance yields one result dict.
"""

import logging
import os
from datetime import datetime

import requests

from config import Config

CONFIG = Config.CRAWLER_CONFIG["106"]


def get_temp_dir():
    """Return the project's ``instance/temp`` directory, creating it if needed."""
    base_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    temp_dir = os.path.join(base_dir, 'instance', 'temp')
    # exist_ok=True avoids the TOCTOU race of a separate exists() check.
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir


def get_106_dynamic_token(port):
    """Log in to the instance on *port* and return its auth token.

    Returns:
        The token string (quotes stripped) on HTTP 200, otherwise ``None``.
        Network errors are swallowed — callers treat ``None`` as "no token".
    """
    try:
        login_url = f"http://106.75.72.40:{port}/api/login"
        resp = requests.post(login_url, json=CONFIG["login_payload"], timeout=10)
        return resp.text.strip().replace('"', '') if resp.status_code == 200 else None
    except requests.RequestException:
        # Only network/HTTP failures are expected here; anything else should surface.
        return None


def find_closest_item(items, is_date_level=True):
    """Pick the entry whose date is closest to now.

    Args:
        items: list of dicts with ``name``/``path`` keys (and ``modified``
            at file level).
        is_date_level: ``True`` → parse the name as a ``%Y_%m_%d`` folder
            name; ``False`` → parse the ISO-8601 ``modified`` timestamp.

    Returns:
        Tuple ``(seconds_diff, item, parsed_name_str)`` for the closest
        entry, or ``None`` when the list is empty/invalid or nothing parses.
    """
    if not items or not isinstance(items, list):
        return None
    today = datetime.now()
    scored_items = []
    for item in items:
        name_val = item.get('name', '')
        path_val = item.get('path', '')
        target_str = name_val if name_val else path_val.split('/')[-1]
        try:
            if is_date_level:
                current_date = datetime.strptime(target_str, "%Y_%m_%d")
            else:
                mod_str = item.get('modified', '')
                current_date = datetime.fromisoformat(mod_str.replace('Z', '+00:00'))
            # Compare naive-to-naive; drop tzinfo from aware timestamps.
            diff = abs((today - current_date.replace(tzinfo=None)).total_seconds())
            scored_items.append((diff, item, target_str))
        except (ValueError, TypeError, AttributeError):
            # Unparseable entries (wrong name format, missing/odd 'modified')
            # are simply skipped.
            continue
    if not scored_items:
        return None
    scored_items.sort(key=lambda x: x[0])
    return scored_items[0]


def run_106_logic():
    """Run one crawl sweep; return a list of result dicts, one per instance."""
    results = []
    print(">>> [106爬虫] 启动...")
    main_headers = {"Authorization": CONFIG["primary_auth"], "User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(CONFIG["base_url"], headers=main_headers, timeout=20)
        proxies = resp.json().get('proxies', [])
        for item in proxies:
            name = item.get('name', '')
            if not name.lower().endswith('_data'):
                continue
            name_upper = name.upper()
            # "TOWER_..." instances serve JSON; bare "TOWER..." serve binary files.
            is_tower_underscore = "TOWER_" in name_upper
            is_tower_i = "TOWER" in name_upper and not is_tower_underscore
            if not (is_tower_underscore or is_tower_i):
                continue

            # Base result packet for this instance.
            data_packet = {
                'source': '106网站',
                'name': name,
                'status': '正常',
                'value': '',
                'target_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'raw_json': {},
                'temp_file': None,
                'num_files': 0,  # default until the folder listing is fetched
            }

            if str(item.get('status')).lower() != 'online':
                data_packet['status'] = '离线'
                data_packet['value'] = f"状态: {item.get('status')}"
                results.append(data_packet)
                continue

            try:
                port = item.get('conf', {}).get('remote_port')
                token = get_106_dynamic_token(port)
                if not token:
                    data_packet['status'] = '异常'
                    data_packet['value'] = "Token获取失败"
                    results.append(data_packet)
                    continue

                headers = {"Authorization": CONFIG["primary_auth"], "x-auth": token}
                api_root = "/api/resources/Data/" if is_tower_underscore else "/api/resources/data/"

                # 1. List the date folders and pick the one closest to today.
                #    Old dates are accepted; only a total absence is an error.
                res1 = requests.get(f"http://106.75.72.40:{port}{api_root}",
                                    headers=headers, timeout=10)
                best_date = find_closest_item(res1.json().get('items', []), True)
                if not best_date:
                    data_packet['value'] = "未找到任何日期文件夹"
                    results.append(data_packet)
                    continue
                data_packet['target_time'] = best_date[2]  # e.g. "2026_02_02"
                date_path = f"{api_root}{best_date[2]}/"

                # 2. Fetch the folder listing; this also yields numFiles.
                res2 = requests.get(f"http://106.75.72.40:{port}{date_path}",
                                    headers=headers, timeout=10)
                folder_data = res2.json()
                file_count = folder_data.get('numFiles', 0)
                data_packet['num_files'] = file_count
                print(f" -> {name}: 找到日期 {best_date[2]}, 文件数: {file_count}")

                # 3. Pick the most recently modified file inside the folder.
                best_file = find_closest_item(folder_data.get('items', []), False)
                if not best_file:
                    data_packet['value'] = "文件夹为空"  # numFiles should be 0 here
                    results.append(data_packet)
                    continue
                file_item = best_file[1]
                full_path = file_item.get('path') or f"{date_path}{file_item.get('name')}"

                # 4. Download (binary) or read (JSON) the file content.
                if is_tower_i:
                    download_url = f"http://106.75.72.40:{port}/api/raw{full_path}"
                    res3 = requests.get(download_url, headers=headers, timeout=20, stream=True)
                    if res3.status_code == 200:
                        safe_name = f"{name}_{datetime.now().strftime('%H%M%S')}.db"
                        temp_path = os.path.join(get_temp_dir(), safe_name)
                        with open(temp_path, 'wb') as f:
                            f.write(res3.content)
                        data_packet['temp_file'] = temp_path
                        data_packet['value'] = f"Binary Downloaded: {len(res3.content)} bytes"
                        # Reuse the file's metadata dict as raw_json.
                        data_packet['raw_json'] = file_item
                    else:
                        data_packet['status'] = '异常'
                        data_packet['value'] = f"下载失败: {res3.status_code}"
                else:
                    file_api_url = f"http://106.75.72.40:{port}/api/resources{full_path}"
                    res3 = requests.get(file_api_url, headers=headers, timeout=20)
                    try:
                        json_content = res3.json()
                        data_packet['raw_json'] = json_content
                        data_packet['value'] = json_content.get('content', '')
                    except ValueError:
                        # requests raises a ValueError subclass on bad JSON bodies.
                        data_packet['value'] = "JSON解析失败"
                results.append(data_packet)
            except Exception as e:
                # A single instance failing must not abort the whole sweep.
                data_packet['status'] = '异常'
                data_packet['value'] = str(e)[:50]
                results.append(data_packet)
    except Exception as e:
        logging.error(f"106 Crawler Error: {e}")
    return results