Fix data fetching so JSON files are retrieved in full

YueL1331 committed 2026-01-08 14:26:34 +08:00
parent a5b0b71d26
commit a8984a156c
9 changed files with 304 additions and 479 deletions
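This commit drops the DB-callback persistence inside the crawlers: each crawler now builds and returns a list of per-device dicts, and core.py merely merges the two lists. A minimal sketch of one such item, read off the data_packet literals in the diffs below (the field values here are illustrative, not from a real run):

# Illustrative shape of one entry in the returned device_list.
example_item = {
    'source': 'Site 106',                  # or 'Site 82'
    'name': 'TOWER_EXAMPLE_DATA',          # hypothetical station name
    'status': 'normal',                    # 'normal' / 'offline' / 'error'
    'value': '',                           # detail text, byte count, or the JSON 'content' field
    'target_time': '2026-01-08 14:26:34',  # or the YYYY_MM_DD folder date
    'raw_json': {},                        # the complete JSON body, kept intact
    'temp_file': None                      # path of a downloaded binary, if any
}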

services/core.py

@@ -1,135 +1,37 @@
 # services/core.py
 import logging
 from datetime import datetime
 import threading
 from extensions import db
 # Import the models
 from models import Device, DeviceHistory
 # Import the crawler logic (relative imports kept unchanged)
 from .crawler_106 import run_106_logic
 from .crawler_82 import run_82_logic
 task_lock = threading.Lock()
-def calculate_offset(latest_time_str):
-    """Compute how many days the data lags behind (original logic kept)."""
-    if not latest_time_str or latest_time_str == "N/A":
-        return "never synced"
-    try:
-        clean_date_str = str(latest_time_str).split()[0].replace('_', '-')
-        target_date = datetime.strptime(clean_date_str, "%Y-%m-%d").date()
-        diff = (datetime.now().date() - target_date).days
-        if diff == 0: return "synced today"
-        return f"lagging by {diff} day(s)"
-    except Exception:
-        return "time parse failed"
-def save_record_to_db(source, name, status, reason, latest_time="N/A", content=None):
-    """
-    Smart persistence logic:
-    1. Make sure the master Device row exists
-    2. Write a DeviceHistory row only when latest_time has changed
-    """
-    try:
-        # 1. Look up or create the master device (Device)
-        device = Device.query.filter_by(name=name).first()
-        now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        current_offset = calculate_offset(latest_time)
-        is_new_data = False
-        if not device:
-            # === New device discovered ===
-            device = Device(
-                name=name,
-                source=source,
-                install_site="",  # empty by default
-                is_maintaining=False,
-                is_hidden=False
-            )
-            db.session.add(device)
-            # The add needs a flush so an ID exists for the history row;
-            # for performance, commit once at the end.
-            # Mark as new data so one history row is always written.
-            is_new_data = True
-            logging.info(f"New device discovered: {name}")
-        else:
-            # === Existing device ===
-            # Core check: if latest_time on the site changed, there is new data
-            if latest_time != "N/A" and device.latest_time != latest_time:
-                is_new_data = True
-            # If no time was scraped (N/A) but an old one is stored, refresh the
-            # offset (e.g. 1 day behind yesterday becomes 2 days behind today)
-            if latest_time == "N/A" and device.latest_time:
-                current_offset = calculate_offset(device.latest_time)
-        # 2. Update the master snapshot (check time and status are refreshed
-        #    whether or not there is new data)
-        device.check_time = now_str
-        device.status = status
-        device.reason = reason
-        device.offset = current_offset
-        # Only update the displayed time when a valid one was scraped
-        if latest_time != "N/A":
-            device.latest_time = latest_time
-        # 3. On new data, write to the history table (this is what saves space)
-        if is_new_data and latest_time != "N/A":
-            # Flush first so device.id exists
-            db.session.flush()
-            history = DeviceHistory(
-                device_id=device.id,
-                data_time=latest_time,
-                status=status
-            )
-            db.session.add(history)
-            logging.info(f"[{name}] data updated: {latest_time} -> stored in history")
-        db.session.commit()
-        return f"{source}_{name}"
-    except Exception as e:
-        db.session.rollback()
-        logging.error(f"DB Error [{name}]: {e}")
-        return None
 def execute_monitor_task():
-    """Main entry point that runs every crawler task."""
+    """
+    Run all crawlers and return one big payload:
+    {'device_list': [item1, item2, ...], 'target_time': '...'}
+    """
     if task_lock.locked():
-        logging.warning(">>> Task already running, skipping this schedule")
-        return
+        logging.warning(">>> Task already running, skipping")
+        return None
     with task_lock:
-        logging.info(">>> Starting monitor task...")
-        active_set = set()
-        # 1. Run the crawlers (passing the new save_record_to_db)
-        run_106_logic(active_set, save_record_to_db)
-        run_82_logic(active_set, save_record_to_db)
+        # 1. Fetch the Site 106 data list
+        list_106 = run_106_logic()
-        # 2. Handle offline devices (update the master table only; do not pile
-        #    junk rows into the history table)
-        try:
-            # Query every device (maintenance is filtered in the loop below)
-            all_devices = Device.query.all()
-            now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        # 2. Fetch the Site 82 data list
+        list_82 = run_82_logic()
-            for dev in all_devices:
-                key = f"{dev.source}_{dev.name}"
+        # 3. Merge
+        combined_list = list_106 + list_82
-                # Skip devices under maintenance, and ones just scraped
-                if dev.is_maintaining or (key in active_set):
-                    continue
+        logging.info(f">>> Task finished, {len(combined_list)} records fetched")
-                # Not scraped this round -> mark offline
-                dev.status = "offline"
-                dev.reason = "device did not respond in this scan"
-                dev.check_time = now_str
-            # Note: only the status changes; nothing is inserted into History,
-            # so an offline device cannot flood the table with duplicate rows
-            db.session.commit()
-        except Exception as e:
-            db.session.rollback()
-            logging.error(f"Offline status update failed: {e}")
-        logging.info(">>> Monitor task finished.")
+        return {
+            'device_list': combined_list,
+            'target_time': None,    # per-item time lives inside each item
+            'temp_file_path': None  # legacy field; file paths live inside each item
+        }

services/crawler_106.py

@@ -1,199 +1,159 @@
 # services/crawler_106.py
 import os
 import requests
 import logging
 from datetime import datetime
 from config import Config
 # Load the config
 CONFIG = Config.CRAWLER_CONFIG["106"]
 def get_today_str():
     return datetime.now().strftime("%Y_%m_%d")
 def get_temp_dir():
     base_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
     temp_dir = os.path.join(base_dir, 'instance', 'temp')
     if not os.path.exists(temp_dir):
         os.makedirs(temp_dir)
     return temp_dir
 def get_106_dynamic_token(port):
     """
     Log in to the station behind the given port and fetch a fresh x-auth token.
     Follows the reference code exactly.
     """
     try:
         login_url = f"http://106.75.72.40:{port}/api/login"
         # Use login_payload from Config
         resp = requests.post(login_url, json=CONFIG["login_payload"], timeout=10)
-        if resp.status_code == 200:
-            # On success the token comes back directly in the response body
-            return resp.text.strip().replace('"', '')
-        else:
-            return None
-    except Exception as e:
+        return resp.text.strip().replace('"', '') if resp.status_code == 200 else None
+    except Exception:
         return None
 def find_closest_item(items, is_date_level=True):
     """
     Find the newest date folder or file.
     Logic reused verbatim from the reference code.
     """
     if not items or not isinstance(items, list): return None
     today = datetime.now()
     scored_items = []
     for item in items:
         name_val = item.get('name', '')
         path_val = item.get('path', '')
         target_str = name_val if name_val else path_val.split('/')[-1]
         try:
             if is_date_level:
                 # Match the YYYY_MM_DD folder-name format
                 current_date = datetime.strptime(target_str, "%Y_%m_%d")
             else:
                 # Match on the file's modification time
                 mod_str = item.get('modified', '')
                 # Handle the ISO timestamp format
                 current_date = datetime.fromisoformat(mod_str.replace('Z', '+00:00'))
             diff = abs((today - current_date.replace(tzinfo=None)).total_seconds())
             scored_items.append((diff, item, target_str))
         except Exception:
             continue
     if not scored_items: return None
     # Sort by time difference and take the smallest
     scored_items.sort(key=lambda x: x[0])
     return scored_items[0]
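# Editor's note (not part of the commit): find_closest_item returns the
# (seconds_diff, item, name) tuple closest to now, which is why the callers
# below read best_date[2] for the folder name and best_file[1] for the file
# dict. With hypothetical folders on 2026-01-08:
#   find_closest_item([{'name': '2026_01_07'}, {'name': '2026_01_08'}], True)
# returns (diff_seconds, {'name': '2026_01_08'}, '2026_01_08').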
-def run_106_logic(active_set, save_callback):
-    """
-    Main logic for the 106 crawler.
-    active_set: records the keys of every device that was scanned
-    save_callback: persistence callback
-    """
+def run_106_logic():
+    """Return result_list; every element is a dict."""
+    results = []
     print(">>> [106 crawler] starting...")
-    today_str = get_today_str()
+    today_str = datetime.now().strftime("%Y_%m_%d")
     # Global auth, used to fetch the proxy list
     main_headers = {"Authorization": CONFIG["primary_auth"], "User-Agent": "Mozilla/5.0"}
     try:
         # Fetch the proxy list
         resp = requests.get(CONFIG["base_url"], headers=main_headers, timeout=20)
         proxies = resp.json().get('proxies', [])
         for item in proxies:
             name = item.get('name', '')
             # --- 1. Strict filtering (reused from the reference code) ---
-            if not name.lower().endswith('_data'):
-                continue
+            if not name.lower().endswith('_data'): continue
             name_upper = name.upper()
             is_tower_underscore = "TOWER_" in name_upper
             is_tower_i = "TOWER" in name_upper and not is_tower_underscore
-            # Skip anything that is neither TOWER_ nor TOWER (TowerI)
-            if not (is_tower_underscore or is_tower_i):
-                continue
+            if not (is_tower_underscore or is_tower_i): continue
+            # Build the base data packet
+            data_packet = {
+                'source': 'Site 106',
+                'name': name,
+                'status': 'normal',
+                'value': '',
+                'target_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                'raw_json': {},
+                'temp_file': None
+            }
             # --- 2. Check the online status ---
             if str(item.get('status')).lower() != 'online':
-                key = save_callback("Site 106", name, "offline", f"device status: {item.get('status')}")
-                if key: active_set.add(key)
+                data_packet['status'] = 'offline'
+                data_packet['value'] = f"status: {item.get('status')}"
+                results.append(data_packet)
                 continue
             try:
                 # --- 3. Get the port and token ---
                 port = item.get('conf', {}).get('remote_port')
                 if not port: continue
                 token = get_106_dynamic_token(port)
                 if not token:
-                    key = save_callback("Site 106", name, "error", "token fetch failed")
-                    if key: active_set.add(key)
+                    data_packet['status'] = 'error'
+                    data_packet['value'] = "token fetch failed"
+                    results.append(data_packet)
                     continue
-                # Build the headers for this station
-                headers = {
-                    "Authorization": CONFIG["primary_auth"],
-                    "x-auth": token,
-                    "User-Agent": "Mozilla/5.0"
-                }
+                headers = {"Authorization": CONFIG["primary_auth"], "x-auth": token}
                 # --- 4. Path selection (the key difference) ---
                 # Tower_ uses uppercase /Data/; TowerI uses lowercase /data/
                 api_root = "/api/resources/Data/" if is_tower_underscore else "/api/resources/data/"
                 # Step A: list the root directory
                 res1 = requests.get(f"http://106.75.72.40:{port}{api_root}", headers=headers, timeout=10)
-                items1 = res1.json().get('items', [])
-                # Step B: look for today's folder
-                best_date = find_closest_item(items1, is_date_level=True)
+                best_date = find_closest_item(res1.json().get('items', []), True)
                 # Check that the folder date matches today
                 if not best_date or best_date[2] != today_str:
-                    key = save_callback("Site 106", name, "normal", "today's folder not found",
-                                        latest_time=best_date[2] if best_date else "N/A")
-                    if key: active_set.add(key)
+                    data_packet['value'] = "today's folder not found"
+                    data_packet['target_time'] = best_date[2] if best_date else "N/A"
+                    results.append(data_packet)
                     continue
                 # Step C: enter the date folder
+                data_packet['target_time'] = best_date[2]  # the actual data time
                 date_path = f"{api_root}{best_date[2]}/"
                 res2 = requests.get(f"http://106.75.72.40:{port}{date_path}", headers=headers, timeout=10)
-                items2 = res2.json().get('items', [])
-                # Step D: look for the newest file
-                best_file = find_closest_item(items2, is_date_level=False)
+                best_file = find_closest_item(res2.json().get('items', []), False)
                 if not best_file:
-                    key = save_callback("Site 106", name, "normal", "today's folder is empty", latest_time=today_str)
-                    if key: active_set.add(key)
+                    data_packet['value'] = "today's folder is empty"
+                    results.append(data_packet)
                     continue
                 # Get the file's full path
                 file_item = best_file[1]
-                full_path = file_item.get('path')
-                if not full_path:
-                    full_path = f"{date_path}{file_item.get('name')}"
-                # --- 5. Download the content (endpoint depends on the type) ---
-                final_content = ""
+                full_path = file_item.get('path') or f"{date_path}{file_item.get('name')}"
+                # Core logic: fetch the content
                 if is_tower_i:
-                    # [TowerI mode] fetch the binary via the /api/raw endpoint
+                    # Download the binary file
                     download_url = f"http://106.75.72.40:{port}/api/raw{full_path}"
                     res3 = requests.get(download_url, headers=headers, timeout=20, stream=True)
                     if res3.status_code == 200:
-                        # The DB cannot hold the binary, so store a description
-                        size_bytes = len(res3.content)
-                        final_content = f"[Binary Data] fetched OK, size: {size_bytes} bytes"
+                        safe_name = f"{name}_{datetime.now().strftime('%H%M%S')}.db"
+                        temp_path = os.path.join(get_temp_dir(), safe_name)
+                        with open(temp_path, 'wb') as f:
+                            f.write(res3.content)
+                        data_packet['temp_file'] = temp_path  # 🔥 handed on to the API layer
+                        data_packet['value'] = f"Binary Downloaded: {len(res3.content)} bytes"
+                        data_packet['raw_json'] = file_item  # file attributes stand in for the raw data
                     else:
-                        raise Exception(f"binary download failed, code: {res3.status_code}")
+                        data_packet['status'] = 'error'
+                        data_packet['value'] = f"download failed: {res3.status_code}"
                 else:
-                    # [Tower_ mode] fetch the JSON content via /api/resources
+                    # JSON content
                     file_api_url = f"http://106.75.72.40:{port}/api/resources{full_path}"
                     res3 = requests.get(file_api_url, headers=headers, timeout=20)
                     try:
-                        # Try the 'content' field inside the JSON
-                        final_content = res3.json().get('content', '')
-                        if not final_content:
-                            final_content = "[Warning] JSON body was empty"
+                        json_content = res3.json()
+                        data_packet['raw_json'] = json_content  # 🔥 keep the complete JSON
+                        data_packet['value'] = json_content.get('content', '')
                     except Exception:
-                        final_content = "[Error] could not parse the JSON body"
+                        data_packet['value'] = "JSON parse failed"
-                # --- 6. Final persistence ---
-                key = save_callback("Site 106", name, "normal", "sync OK",
-                                    latest_time=today_str, content=final_content)
-                if key: active_set.add(key)
+                results.append(data_packet)
             except Exception as e:
-                # Catch per-device errors so one failure cannot break the loop
-                err_msg = str(e)[:100]  # truncate overly long messages
-                key = save_callback("Site 106", name, "error", f"scrape error: {err_msg}")
-                if key: active_set.add(key)
+                data_packet['status'] = 'error'
+                data_packet['value'] = str(e)[:50]
+                results.append(data_packet)
     except Exception as e:
-        logging.error(f"106 Crawler Global Error: {e}")
+        logging.error(f"106 Crawler Error: {e}")
+    return results
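A condensed sketch of the two download modes the loop above distinguishes (assumed host and endpoints exactly as in the diff; the helper name is hypothetical, not part of the commit):

# Hypothetical helper isolating the TowerI-vs-Tower_ download branch.
import requests

def fetch_station_payload(port, full_path, headers, is_tower_i):
    base = f"http://106.75.72.40:{port}"
    if is_tower_i:
        # TowerI stations: /api/raw streams the file bytes
        r = requests.get(f"{base}/api/raw{full_path}", headers=headers, timeout=20)
        return {'binary': r.content if r.status_code == 200 else None}
    # Tower_ stations: /api/resources wraps the file in JSON with a 'content' field
    r = requests.get(f"{base}/api/resources{full_path}", headers=headers, timeout=20)
    return {'json': r.json()}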

services/crawler_82.py

@@ -1,56 +1,62 @@
 # services/crawler_82.py
 import requests
 import json
 import logging
 from lxml import etree
 from config import Config
 from datetime import datetime
 # Load the config
 CONFIG = Config.CRAWLER_CONFIG["82"]
-def run_82_logic(active_set, save_callback):
-    session = requests.Session()
+def run_82_logic():
+    """Return result_list"""
+    results = []
     print(">>> [82 crawler] starting...")
+    session = requests.Session()
     try:
         # 1. Log in
         session.post(f"{CONFIG['base_url']}/login.php", data=CONFIG["login"], timeout=10)
         # 2. Fetch the station list
         resp = session.post(f"{CONFIG['base_url']}/GetStationList.php", timeout=10)
         # Parse with lxml
         html = etree.HTML(resp.content)
-        if html is None:
-            print(">>> [82 crawler] failed to parse the page")
-            return
+        if html is None: return []
         stations = html.xpath('//option/@value')
         for sid in [s for s in stations if s]:
+            data_packet = {
+                'source': 'Site 82',
+                'name': str(sid),
+                'status': 'normal',
+                'value': '',
+                'target_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                'raw_json': {},
+                'temp_file': None
+            }
             try:
                 # 3. Fetch the data for one station
                 r = session.post(f"{CONFIG['base_url']}/getLastWeatherData.php", data=str(sid),
                                  headers={'Content-Type': 'text/plain'}, timeout=10)
                 # Try to parse the JSON
                 try:
                     data = r.json()
-                except ValueError:
+                except Exception:
                     data = None
                 if data:
                     d_list = data.get('date', [])
                     latest = str(d_list[-1]) if d_list else "N/A"
-                    # Persist the record
-                    key = save_callback("Site 82", sid, "normal", "sync OK", latest_time=latest,
-                                        content=json.dumps(data, ensure_ascii=False))
-                    if key: active_set.add(key)
+                    data_packet['target_time'] = latest
+                    data_packet['value'] = f"Data Points: {len(d_list)}"
+                    data_packet['raw_json'] = data  # 🔥 keep the complete JSON
                 else:
-                    key = save_callback("Site 82", sid, "error", "empty response")
-                    if key: active_set.add(key)
+                    data_packet['status'] = 'error'
+                    data_packet['value'] = "empty response"
             except Exception as e:
-                # One failed station must not break the whole run
-                key = save_callback("Site 82", sid, "error", "single station scrape failed")
-                if key: active_set.add(key)
+                data_packet['status'] = 'error'
+                data_packet['value'] = "single station scrape failed"
+            results.append(data_packet)
     except Exception as e:
         logging.error(f"82 Crawler Error: {e}")
+    return results