自动写入修改,除了文件个数外,其他信息不展示问题

This commit is contained in:
YueL1331
2026-02-08 10:53:00 +08:00
parent f167bbc2f2
commit 51deee1493
3 changed files with 183 additions and 96 deletions

View File

@ -4,11 +4,13 @@ import threading
import traceback
from datetime import datetime
# 动态导入,防止文件缺失导致整个程序启动失败
# ==============================================================================
# 1. 动态导入模块
# ==============================================================================
try:
from .crawler_106 import run_106_logic
except ImportError:
print("⚠️ 警告: 未找到 crawler_106 模块")
except ImportError as e:
print(f"⚠️ [系统警告] 无法导入 crawler_106: {e}")
def run_106_logic():
@ -16,13 +18,14 @@ except ImportError:
try:
from .crawler_82 import run_82_logic
except ImportError:
print("⚠️ 警告: 未找到 crawler_82 模块")
except ImportError as e:
print(f"⚠️ [系统警告] 无法导入 crawler_82: {e}")
def run_82_logic():
return []
# 全局任务锁
task_lock = threading.Lock()
@ -34,12 +37,13 @@ def execute_monitor_task():
# 1. 锁机制:防止任务重复运行
if task_lock.locked():
logging.warning(">>> 任务正在运行中,跳过")
print(">>> ⚠️ 任务正在运行中,本次请求跳过")
print(">>> ⚠️ [调度] 任务正在运行中,本次请求跳过")
return None
with task_lock:
start_time = datetime.now()
logging.info(">>> 开始执行监控任务...")
print(f"--- [任务开始] {datetime.now().strftime('%H:%M:%S')} ---")
print(f"--- [任务开始] {start_time.strftime('%H:%M:%S')} ---")
all_results = []
@ -47,28 +51,27 @@ def execute_monitor_task():
# 2. 执行 106 爬虫
# ==========================
try:
print(f">>> [106爬虫] 启动...")
list_106 = run_106_logic()
if list_106:
count = len(list_106)
print(f"✅ 106爬虫获取数据: {count}")
# 🔍 [调试] 打印第一条数据,确认 num_files 是否存在
if count > 0:
first = list_106[0]
print(f" [调试检查] 106样本: {first.get('name')} | num_files={first.get('num_files')}")
all_results.extend(list_106)
else:
print("⚠️ 106爬虫未返回数据")
print("⚠️ 106爬虫运行完成,但未返回任何数据 (空列表)")
except Exception as e:
print(f"❌ 106爬虫执行失败: {e}")
print(f"❌ 106爬虫执行严重失败: {e}")
traceback.print_exc()
# ==========================
# 3. 执行 82 爬虫
# ==========================
try:
print(f">>> [82爬虫] 启动...")
list_82 = run_82_logic()
if list_82:
print(f"✅ 82爬虫获取数据: {len(list_82)}")
@ -76,20 +79,26 @@ def execute_monitor_task():
for item in list_82:
if 'num_files' not in item:
item['num_files'] = 0
if 'status' not in item:
item['status'] = 'Unknown'
all_results.extend(list_82)
else:
print("⚠️ 82爬虫运行完成但未返回数据")
except Exception as e:
print(f"❌ 82爬虫执行失败: {e}")
print(f"❌ 82爬虫执行严重失败: {e}")
traceback.print_exc()
# ==========================
# 4. 汇总返回
# ==========================
duration = (datetime.now() - start_time).total_seconds()
logging.info(f">>> 任务完成,共获取 {len(all_results)} 条数据")
print(f"--- [任务结束] 总计获取: {len(all_results)} 台设备 ---")
print(f"--- [任务结束] 总耗时: {duration:.2f}秒 | 总计获取: {len(all_results)} 台设备 ---")
return {
'device_list': all_results,
'target_time': None, # 具体时间已在 item['target_time'] 里
'temp_file_path': None # 废弃旧逻辑,文件路径已在 item['temp_file'] 里
'temp_file_path': None # 废弃旧逻辑
}

View File

@ -9,6 +9,7 @@ CONFIG = Config.CRAWLER_CONFIG["106"]
def get_temp_dir():
"""获取临时文件存储目录"""
base_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
temp_dir = os.path.join(base_dir, 'instance', 'temp')
if not os.path.exists(temp_dir):
@ -17,6 +18,7 @@ def get_temp_dir():
def get_106_dynamic_token(port):
"""获取动态登录 Token"""
try:
login_url = f"http://106.75.72.40:{port}/api/login"
resp = requests.post(login_url, json=CONFIG["login_payload"], timeout=10)
@ -26,59 +28,82 @@ def get_106_dynamic_token(port):
def find_closest_item(items, is_date_level=True):
    """Return the entry in *items* whose time is closest to now.

    Args:
        items: List of dicts describing folders or files. Each dict may
            carry ``name`` and ``path``; file entries additionally carry a
            ``modified`` ISO-8601 timestamp (possibly with a ``Z`` suffix).
        is_date_level: When True, parse the entry name as a date folder
            named ``YYYY_MM_DD``; when False, parse the entry's
            ``modified`` timestamp instead.

    Returns:
        A tuple ``(diff_seconds, item, target_str)`` for the closest
        entry, or ``None`` when *items* is empty/not a list or no entry
        could be parsed.
    """
    if not items or not isinstance(items, list):
        return None

    now = datetime.now()
    scored_items = []
    for item in items:
        name_val = item.get('name', '')
        path_val = item.get('path', '')
        # Date-level folders are typically named like "2026_02_08"; fall
        # back to the last path segment when no explicit name is present.
        target_str = name_val if name_val else path_val.split('/')[-1]
        try:
            if is_date_level:
                # Folder names follow the YYYY_MM_DD convention.
                current_date = datetime.strptime(target_str, "%Y_%m_%d")
            else:
                # File entries carry an ISO timestamp; normalize a
                # trailing "Z" so fromisoformat accepts it.
                mod_str = item.get('modified', '')
                current_date = datetime.fromisoformat(mod_str.replace('Z', '+00:00'))
            # Compare as naive datetimes: drop tzinfo before subtracting
            # to avoid naive/aware arithmetic errors.
            diff = abs((now - current_date.replace(tzinfo=None)).total_seconds())
            scored_items.append((diff, item, target_str))
        except (ValueError, TypeError, AttributeError):
            # Unparseable names/timestamps are skipped rather than
            # aborting the scan. (Was a bare `except:`, which also hid
            # programming errors such as KeyboardInterrupt/SystemExit.)
            continue

    if not scored_items:
        return None
    # Smallest time difference wins; key on diff only so dict items are
    # never compared to each other on ties.
    scored_items.sort(key=lambda entry: entry[0])
    return scored_items[0]
def run_106_logic():
"""返回 result_list, 每个元素是一个字典"""
"""
106 爬虫主逻辑
返回 result_list, 每个元素是一个字典
"""
results = []
print(">>> [106爬虫] 启动...")
# today_str = datetime.now().strftime("%Y_%m_%d") # ❌ 移除严格的“今天”判断
main_headers = {"Authorization": CONFIG["primary_auth"], "User-Agent": "Mozilla/5.0"}
try:
# 0. 获取代理列表 (设备列表)
resp = requests.get(CONFIG["base_url"], headers=main_headers, timeout=20)
proxies = resp.json().get('proxies', [])
for item in proxies:
name = item.get('name', '')
# 过滤规则:必须以 _data 结尾
if not name.lower().endswith('_data'): continue
name_upper = name.upper()
is_tower_underscore = "TOWER_" in name_upper
is_tower_i = "TOWER" in name_upper and not is_tower_underscore
# 过滤规则:必须包含 TOWER 相关标识
if not (is_tower_underscore or is_tower_i): continue
# 构建基础数据包
# --- 构建基础数据包 ---
# 默认使用标准当前时间作为兜底,防止后续步骤失败时时间为空
current_standard_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
data_packet = {
'source': '106网站',
'name': name,
'status': '正常',
'value': '',
'target_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
'target_time': current_standard_time,
'raw_json': {},
'temp_file': None,
'num_files': 0 # ✅ 默认值
'num_files': 0
}
# 检查在线状态
if str(item.get('status')).lower() != 'online':
data_packet['status'] = '离线'
data_packet['value'] = f"状态: {item.get('status')}"
@ -86,6 +111,7 @@ def run_106_logic():
continue
try:
# 获取端口和 Token
port = item.get('conf', {}).get('remote_port')
token = get_106_dynamic_token(port)
if not token:
@ -97,42 +123,51 @@ def run_106_logic():
headers = {"Authorization": CONFIG["primary_auth"], "x-auth": token}
api_root = "/api/resources/Data/" if is_tower_underscore else "/api/resources/data/"
# 1. 获取日期列表
# --- 1. 获取日期文件夹列表 ---
res1 = requests.get(f"http://106.75.72.40:{port}{api_root}", headers=headers, timeout=10)
best_date = find_closest_item(res1.json().get('items', []), True)
# ✅ 修改点:如果找不到任何日期文件夹,才报错。否则,即使是旧日期也继续往下走。
if not best_date:
data_packet['value'] = "未找到任何日期文件夹"
results.append(data_packet)
continue
data_packet['target_time'] = best_date[2] # 记录找到的那个日期 (比如 2026_02_02)
date_path = f"{api_root}{best_date[2]}/"
# ==============================================================================
# ✅ [核心修复] 时间格式标准化
# 原逻辑: data_packet['target_time'] = best_date[2] (得到 "2026_02_08")
# 新逻辑: 将 "2026_02_08" 转换为 "2026-02-08 HH:MM:SS"
# ==============================================================================
raw_folder_name = best_date[2] # 例如 "2026_02_08"
formatted_date_part = raw_folder_name.replace('_', '-') # 变成 "2026-02-08"
current_time_part = datetime.now().strftime("%H:%M:%S")
# 2. 请求具体日期的文件夹内容 (这一步能获取 numFiles)
# 覆盖默认时间,确保数据库存入的是标准时间戳格式
data_packet['target_time'] = f"{formatted_date_part} {current_time_part}"
date_path = f"{api_root}{raw_folder_name}/"
# --- 2. 请求具体日期的文件夹内容 (获取 numFiles) ---
res2 = requests.get(f"http://106.75.72.40:{port}{date_path}", headers=headers, timeout=10)
folder_data = res2.json() # 获取完整JSON
folder_data = res2.json()
# ✅ 核心:提取 numFiles (只要请求成功,这里一定能拿到)
file_count = folder_data.get('numFiles', 0)
data_packet['num_files'] = file_count
print(f" -> {name}: 找到日期 {best_date[2]}, 文件数: {file_count}")
print(f" -> {name}: 找到日期 {formatted_date_part}, 文件数: {file_count}")
# 3. 找该文件夹里最新的文件
# --- 3. 找该文件夹里最新的文件 ---
best_file = find_closest_item(folder_data.get('items', []), False)
if not best_file:
data_packet['value'] = "文件夹为空" # 这种情况下 numFiles 应该是 0
data_packet['value'] = "文件夹为空"
results.append(data_packet)
continue
file_item = best_file[1]
full_path = file_item.get('path') or f"{date_path}{file_item.get('name')}"
# 4. 下载/读取内容逻辑
# --- 4. 下载/读取内容逻辑 ---
if is_tower_i:
# 下载二进制文件
# [二进制文件] 下载逻辑
download_url = f"http://106.75.72.40:{port}/api/raw{full_path}"
res3 = requests.get(download_url, headers=headers, timeout=20, stream=True)
if res3.status_code == 200:
@ -143,18 +178,19 @@ def run_106_logic():
data_packet['temp_file'] = temp_path
data_packet['value'] = f"Binary Downloaded: {len(res3.content)} bytes"
data_packet['raw_json'] = file_item # 借用 file_item 充当 raw_json
data_packet['raw_json'] = file_item # 借用 file_item 充当 raw_json
else:
data_packet['status'] = '异常'
data_packet['value'] = f"下载失败: {res3.status_code}"
else:
# JSON 内容
# [文本文件] JSON 解析逻辑
file_api_url = f"http://106.75.72.40:{port}/api/resources{full_path}"
res3 = requests.get(file_api_url, headers=headers, timeout=20)
try:
json_content = res3.json()
data_packet['raw_json'] = json_content
data_packet['value'] = json_content.get('content', '')
# 尝试提取 content 内容,如果没有则截取部分 JSON 字符串
data_packet['value'] = json_content.get('content', str(json_content)[:100])
except:
data_packet['value'] = "JSON解析失败"
@ -162,7 +198,7 @@ def run_106_logic():
except Exception as e:
data_packet['status'] = '异常'
data_packet['value'] = str(e)[:50]
data_packet['value'] = str(e)[:100]
results.append(data_packet)
except Exception as e: