1297 lines
52 KiB
Python
1297 lines
52 KiB
Python
import numpy as np
|
||
import pandas as pd
|
||
import joblib
|
||
import os
|
||
from pathlib import Path
|
||
from typing import List, Dict, Union, Tuple, Optional
|
||
import warnings
|
||
|
||
warnings.filterwarnings('ignore')
|
||
|
||
# 导入预处理模块 - 动态添加路径支持
|
||
import sys
|
||
import os
|
||
|
||
from src.preprocessing.spectral_Preprocessing import Preprocessing
|
||
|
||
# try:
|
||
# from modeling import WaterQualityModeling
|
||
# except ImportError:
|
||
# from src.core.modeling.modeling_batch import WaterQualityModeling
|
||
|
||
# 机器学习相关导入
|
||
from sklearn.model_selection import train_test_split
|
||
|
||
|
||
class WaterQualityInference:
|
||
"""水质参数反演推理类"""
|
||
|
||
def __init__(self, artifacts_dir: str = "models/artifacts",
|
||
external_model=None, external_model_path=None):
|
||
"""
|
||
初始化推理类
|
||
|
||
Args:
|
||
artifacts_dir: 模型保存目录
|
||
external_model: 外部预训练模型对象(来自 GUI 导入,跳过磁盘加载)
|
||
external_model_path: 外部模型文件路径(仅用于日志)
|
||
"""
|
||
self.artifacts_dir = Path(artifacts_dir)
|
||
if not self.artifacts_dir.exists():
|
||
print(f"警告: 模型目录不存在: {artifacts_dir},将在需要时创建")
|
||
|
||
self.best_model_info = None
|
||
self.external_model = external_model
|
||
self.external_model_path = external_model_path
|
||
|
||
# 规范化 loaded_model_data:始终为 dict,确保 ['model'] 访问不崩溃
|
||
if external_model is not None:
|
||
# 外部传入的是裸模型对象 → 包装为 dict,统一后续 .get('model') 访问
|
||
self.loaded_model_data = {'model': external_model, 'preprocess_method': 'None'}
|
||
print(f" 外部模型已规范化: type={type(external_model).__name__}")
|
||
else:
|
||
self.loaded_model_data = None
|
||
|
||
def load_sampling_data(self, csv_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Read a sampling CSV and split it into coordinates, spectra and WQI columns.

    Two layouts are handled:
        legacy:  x_coord, y_coord, pixel_x, pixel_y, <wavelength columns...>
        current: x_coord, y_coord, WQI_..., <wavelength columns...>

    Args:
        csv_path: Path of the CSV file.

    Returns:
        coords: The first two columns, renamed to ``longitude`` / ``latitude``.
        spectra: Columns whose header is a plain number (wavelengths). If no
            such header exists, every column from position 4 onwards.
        wqi_df: Columns whose header starts with ``WQI_`` (empty DataFrame
            when there are none).

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: The file has fewer than four columns.
    """
    print(f"正在加载采样数据: {csv_path}")

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"采样数据文件不存在: {csv_path}")

    table = pd.read_csv(csv_path)
    column_count = table.shape[1]

    print(f"采样数据加载完成:")
    print(f" 数据形状: {table.shape}")
    print(f" 列名: {list(table.columns[:5])}...")  # only the first five headers

    if column_count < 4:
        raise ValueError(f"数据列数不足,期望至少4列(经度、纬度、其他列、光谱数据),实际得到{column_count}列")

    # The first two columns are always taken as longitude / latitude.
    coords = table.iloc[:, :2].copy()
    coords.columns = ['longitude', 'latitude']

    # Classify the headers: "WQI_*" are index columns, purely numeric headers
    # (dots removed, leading minus signs stripped) are wavelengths, and any
    # other header is metadata that is ignored here.
    headers = [str(name) for name in table.columns]
    wqi_positions = [pos for pos, name in enumerate(headers) if name.startswith('WQI_')]
    band_positions = [
        pos for pos, name in enumerate(headers)
        if not name.startswith('WQI_') and name.replace('.', '').lstrip('-').isdigit()
    ]

    if band_positions:
        spectra = table.iloc[:, band_positions].copy()
    else:
        # No numeric header found: fall back to the legacy fixed offset.
        spectra = table.iloc[:, 4:].copy()

    if wqi_positions:
        wqi_df = table.iloc[:, wqi_positions].copy()
    else:
        wqi_df = pd.DataFrame()

    lon = coords['longitude']
    lat = coords['latitude']
    print(f" 经纬度数据形状: {coords.shape}")
    print(f" 光谱数据形状: {spectra.shape} (自动识别波长列,排除 {len(wqi_positions)} 个WQI列)")
    print(f" 经纬度范围: 经度[{lon.min():.6f}, {lon.max():.6f}], "
          f"纬度[{lat.min():.6f}, {lat.max():.6f}]")

    return coords, spectra, wqi_df
|
||
|
||
def random(self, data, label, test_ratio=0.2, random_state=123):
    """
    Randomly split samples into a training and a test part.

    Thin wrapper around ``train_test_split``.

    Args:
        data: Feature matrix, one row per sample.
        label: Target values, one entry per sample.
        test_ratio: Fraction of samples placed in the test part (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 123).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    pieces = train_test_split(
        data,
        label,
        test_size=test_ratio,
        random_state=random_state,
    )
    # Two input arrays always yield exactly four pieces.
    train_x, test_x, train_y, test_y = pieces
    return train_x, test_x, train_y, test_y
|
||
|
||
def spxy(self, data, label, test_size=0.2):
    """
    Split samples with the SPXY algorithm (joint distance in X and Y space).

    The two samples that are farthest apart (normalised X distance plus
    normalised Y distance) seed the training set. After that, the sample
    whose distance to its nearest already-selected sample is largest is
    added, until the training set holds ``round((1 - test_size) * n)``
    samples. The remaining samples form the test set.

    Args:
        data: Feature matrix (DataFrame or array-like), one row per sample.
        label: Target values (Series, DataFrame or array-like), one entry
            per sample.
        test_size: Fraction of samples left for the test set (default 0.2).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` as NumPy arrays. The
        training rows are in selection order; the labels are returned in
        their original (un-normalised) scale.

    Raises:
        ValueError: ``data`` contains no samples.
    """
    data = data.to_numpy() if isinstance(data, pd.DataFrame) else np.asarray(data)
    if isinstance(label, (pd.Series, pd.DataFrame)):
        label = label.to_numpy()
    else:
        label = np.asarray(label)

    n_samples = data.shape[0]
    if n_samples == 0:
        raise ValueError("spxy requires at least one sample")

    # Clamp so that extreme test_size values cannot index out of range.
    n_train = int(min(max(round((1 - test_size) * n_samples), 0), n_samples))

    features = data.reshape(n_samples, -1).astype(float)
    targets = label.reshape(n_samples, -1).astype(float)

    # Standardise the labels; constant labels carry no distance information,
    # so they are zeroed instead of dividing by a zero standard deviation.
    spread = np.std(targets)
    if spread > 0:
        targets = (targets - np.mean(targets)) / spread
    else:
        targets = np.zeros_like(targets)

    # Full symmetric Euclidean distance matrices, filled one row at a time
    # to keep the temporary memory at O(n_samples * n_features).
    dist_x = np.empty((n_samples, n_samples))
    dist_y = np.empty((n_samples, n_samples))
    for row in range(n_samples):
        dist_x[row] = np.linalg.norm(features - features[row], axis=1)
        dist_y[row] = np.linalg.norm(targets - targets[row], axis=1)

    # Joint distance: each part is scaled by its maximum; a part whose
    # maximum is zero is skipped instead of producing NaN.
    joint = np.zeros((n_samples, n_samples))
    max_x = dist_x.max()
    max_y = dist_y.max()
    if max_x > 0:
        joint += dist_x / max_x
    if max_y > 0:
        joint += dist_y / max_y

    # Seed pair: the farthest pair, located on the upper triangle so that
    # ties resolve towards the same (row, column) as a pairwise scan.
    upper = np.triu(joint, k=1)
    second = int(upper.max(axis=0).argmax())
    first = int(upper[:, second].argmax())
    if first == second and n_samples > 1:
        # All distances are zero: still pick two distinct samples.
        second = (first + 1) % n_samples

    chosen = [first, second][:n_train]
    if chosen:
        available = np.ones(n_samples, dtype=bool)
        available[chosen] = False
        # nearest[k] = distance from sample k to its closest selected sample
        nearest = joint[chosen].min(axis=0)
        while len(chosen) < n_train:
            scores = np.where(available, nearest, -np.inf)
            pick = int(scores.argmax())
            chosen.append(pick)
            available[pick] = False
            nearest = np.minimum(nearest, joint[pick])

    train_idx = np.asarray(chosen, dtype=int)
    test_idx = np.delete(np.arange(n_samples), train_idx)

    X_train = data[train_idx]
    y_train = label[train_idx]
    X_test = data[test_idx]
    y_test = label[test_idx]

    return X_train, X_test, y_train, y_test
|
||
|
||
def ks(self, data, label, test_size=0.2):
    """
    Split samples with the Kennard-Stone algorithm.

    The two samples with the largest Euclidean distance in feature space
    seed the training set. After that, the sample whose distance to its
    nearest already-selected sample is largest is added, until the training
    set holds ``round((1 - test_size) * n)`` samples. The remaining samples
    form the test set.

    Args:
        data: Feature matrix (DataFrame or array-like), one row per sample.
        label: Target values (Series, DataFrame or array-like), one entry
            per sample.
        test_size: Fraction of samples left for the test set (default 0.2).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` as NumPy arrays; the
        training rows are in selection order.

    Raises:
        ValueError: ``data`` contains no samples.
    """
    data = data.to_numpy() if isinstance(data, pd.DataFrame) else np.asarray(data)
    if isinstance(label, (pd.Series, pd.DataFrame)):
        label = label.to_numpy()
    else:
        label = np.asarray(label)

    n_samples = data.shape[0]
    if n_samples == 0:
        raise ValueError("ks requires at least one sample")

    # Clamp so that extreme test_size values cannot index out of range.
    n_train = int(min(max(round((1 - test_size) * n_samples), 0), n_samples))

    features = data.reshape(n_samples, -1).astype(float)

    # Full symmetric Euclidean distance matrix, filled one row at a time to
    # keep the temporary memory at O(n_samples * n_features).
    dist = np.empty((n_samples, n_samples))
    for row in range(n_samples):
        dist[row] = np.linalg.norm(features - features[row], axis=1)

    # Seed pair: the farthest pair, located on the upper triangle so that
    # ties resolve towards the same (row, column) as a pairwise scan.
    upper = np.triu(dist, k=1)
    second = int(upper.max(axis=0).argmax())
    first = int(upper[:, second].argmax())
    if first == second and n_samples > 1:
        # All distances are zero: still pick two distinct samples.
        second = (first + 1) % n_samples

    chosen = [first, second][:n_train]
    if chosen:
        available = np.ones(n_samples, dtype=bool)
        available[chosen] = False
        # nearest[k] = distance from sample k to its closest selected sample
        nearest = dist[chosen].min(axis=0)
        while len(chosen) < n_train:
            scores = np.where(available, nearest, -np.inf)
            pick = int(scores.argmax())
            chosen.append(pick)
            available[pick] = False
            nearest = np.minimum(nearest, dist[pick])

    train_idx = np.asarray(chosen, dtype=int)
    test_idx = np.delete(np.arange(n_samples), train_idx)

    X_train = data[train_idx]
    y_train = label[train_idx]
    X_test = data[test_idx]
    y_test = label[test_idx]

    return X_train, X_test, y_train, y_test
|
||
|
||
def split_data(self, X: np.ndarray, y: pd.Series, method: str = "random",
               test_size: float = 0.2, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split a data set with the requested strategy.

    Args:
        X: Feature data.
        y: Target values.
        method: One of ``"random"``, ``"spxy"`` or ``"ks"``.
        test_size: Fraction of samples placed in the test set.
        random_state: Seed; only used by the ``"random"`` strategy.

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by the selected
        splitting method.

    Raises:
        ValueError: ``method`` is not one of the supported names.
    """
    print(f"使用 {method} 方法划分数据集")

    # Reject unknown strategies first, then dispatch.
    if method not in ("random", "spxy", "ks"):
        raise ValueError(f"不支持的划分方法: {method}. 支持的方法: ['random', 'spxy', 'ks']")

    if method == "ks":
        return self.ks(X, y, test_size=test_size)
    if method == "spxy":
        return self.spxy(X, y, test_size=test_size)
    return self.random(X, y, test_ratio=test_size, random_state=random_state)
|
||
|
||
def get_best_model_from_summary(self, metric: str = 'test_r2') -> Tuple[str, str]:
    """
    Pick the best model combination from the training result files.

    The result files are looked up inside ``self.artifacts_dir`` in this
    order, and the first existing one is used:
        1. ``<folder>_detailed_results.csv``
        2. ``<folder>_training_summary.csv``
        3. ``detailed_results.csv``
        4. ``training_summary.csv``
    where ``<folder>`` is the name of ``self.artifacts_dir``.

    For the "detailed" files the English metric name is translated to the
    Chinese column header when that header is present. A metric whose name
    contains ``r2`` or ``score`` is maximised; any other metric is
    minimised.

    Args:
        metric: Metric used for ranking (default ``test_r2``).

    Returns:
        ``(model_file_prefix, model_name)`` where ``model_file_prefix`` is
        ``"<split_method>_<preprocess_method>"``.

    Raises:
        FileNotFoundError: None of the four files exists.
        ValueError: The file is empty, the metric column is missing, or a
            ``combination`` value has fewer than three ``_``-separated parts.

    Side effects:
        Stores the selection in ``self.best_model_info``.
    """
    # The artifacts folder name doubles as the target column name.
    target_name = self.artifacts_dir.name

    detailed_path = self.artifacts_dir / f"{target_name}_detailed_results.csv"
    summary_path = self.artifacts_dir / f"{target_name}_training_summary.csv"
    old_detailed_path = self.artifacts_dir / "detailed_results.csv"
    old_summary_path = self.artifacts_dir / "training_summary.csv"

    # English metric name -> Chinese header used by the detailed files.
    chinese_headers = {
        'test_r2': '测试集R²',
        'train_r2': '训练集R²',
        'test_rmse': '测试集RMSE',
        'train_rmse': '训练集RMSE',
        'cv_mean': 'CV均值'
    }

    # (path, log prefix, whether the file uses the detailed layout)
    candidates = (
        (detailed_path, "使用详细结果文件", True),
        (summary_path, "使用训练摘要文件", False),
        (old_detailed_path, "使用旧格式详细结果文件", True),
        (old_summary_path, "使用旧格式训练摘要文件", False),
    )

    summary_df = None
    metric_col = metric
    for candidate, log_prefix, is_detailed in candidates:
        if not candidate.exists():
            continue
        print(f"{log_prefix}: {candidate}")
        summary_df = pd.read_csv(candidate)
        if is_detailed:
            translated = chinese_headers.get(metric)
            if translated is not None and translated in summary_df.columns:
                metric_col = translated
        break
    else:
        raise FileNotFoundError(f"训练摘要文件不存在,尝试的路径:\n"
                                f" - {detailed_path}\n"
                                f" - {summary_path}\n"
                                f" - {old_detailed_path}\n"
                                f" - {old_summary_path}")

    if summary_df.empty:
        raise ValueError("训练摘要为空")

    if metric_col not in summary_df.columns:
        available_cols = list(summary_df.columns)
        raise ValueError(f"指标 '{metric_col}' 不存在。可用列: {available_cols}")

    # R2 / score style metrics: larger is better; everything else: smaller.
    metric_key = metric.lower()
    scores = summary_df[metric_col]
    if 'r2' in metric_key or 'score' in metric_key:
        best_idx = scores.idxmax()
    else:
        best_idx = scores.idxmin()
    best_row = summary_df.loc[best_idx]

    # The detailed layout carries split / preprocess / model in separate
    # columns; the summary layout encodes them in one 'combination' string.
    uses_detailed_layout = (
        (detailed_path.exists() or old_detailed_path.exists())
        and '划分方法' in summary_df.columns
    )
    if uses_detailed_layout:
        split_method = best_row['划分方法']
        preprocess_method = best_row['预处理方法']
        model_name = best_row['建模方法']
    else:
        best_combination = best_row['combination']
        # Format: <split>_<preprocess>_<model name, may contain '_'>
        tokens = best_combination.split('_')
        if len(tokens) < 3:
            raise ValueError(f"无效的模型组合名称格式: {best_combination}")
        split_method = tokens[0]
        preprocess_method = tokens[1]
        model_name = '_'.join(tokens[2:])

    # Missing / nan / none preprocessing is normalised to the string "None".
    if pd.isna(preprocess_method) or str(preprocess_method).lower() in ['nan', 'none', '']:
        preprocess_method = "None"

    if uses_detailed_layout:
        best_combination = f"{split_method}_{preprocess_method}_{model_name}"

    best_value = best_row[metric_col]
    print(f"最佳模型组合: {best_combination}")
    print(f" 划分方法: {split_method}")
    print(f" 预处理方法: {preprocess_method}")
    print(f" 模型名称: {model_name}")
    print(f" {metric_col}: {best_value:.4f}")

    self.best_model_info = {
        'combination': best_combination,
        'split_method': split_method,
        'preprocess_method': preprocess_method,
        'model_name': model_name,
        'metric_value': best_value
    }

    # Prefix used to build the model file name.
    model_file_prefix = f"{split_method}_{preprocess_method}"
    return model_file_prefix, model_name
|
||
|
||
def load_best_model(self, metric: str = 'test_r2'):
    """
    Load the best model (according to ``metric``) from ``self.artifacts_dir``.

    The file name is derived from ``get_best_model_from_summary``. The new
    naming scheme ``<folder>_<prefix>_<model>.joblib`` is tried first, then
    the legacy ``<prefix>_<model>.joblib``.

    The loaded payload is normalised to a dict so that later
    ``['model']`` / ``['preprocess_method']`` lookups cannot fail:
    a bare model object is wrapped, and missing ``preprocess_method`` /
    ``model_name`` entries are filled from the summary selection.

    Args:
        metric: Metric used to pick the best model.

    Raises:
        FileNotFoundError: Neither model file exists.
        KeyError: The loaded dict has no ``'model'`` entry.
    """
    model_file_prefix, model_name = self.get_best_model_from_summary(metric)

    # The artifacts folder name doubles as the target column name.
    folder_name = self.artifacts_dir.name

    filepath = self.artifacts_dir / f"{folder_name}_{model_file_prefix}_{model_name}.joblib"
    if filepath.exists():
        print(f"使用新格式模型文件: {filepath}")
    else:
        old_filepath = self.artifacts_dir / f"{model_file_prefix}_{model_name}.joblib"
        if not old_filepath.exists():
            raise FileNotFoundError(f"模型文件不存在,尝试的路径:\n"
                                    f" - {filepath}\n"
                                    f" - {old_filepath}")
        filepath = old_filepath
        print(f"使用旧格式模型文件: {filepath}")

    print(f"正在加载模型: {filepath}")

    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # model files that come from a trusted source.
    loaded = joblib.load(filepath)

    # Normalise: always keep a dict with 'model', 'preprocess_method' and
    # 'model_name' so the prints below and later lookups do not crash.
    if not isinstance(loaded, dict):
        loaded = {'model': loaded}
    if 'model' not in loaded:
        raise KeyError(f"'model' entry missing in model file: {filepath}")
    # The prefix is "<split>_<preprocess>"; preprocess_spectra strips the
    # leading split-method token when it parses this value.
    loaded.setdefault('preprocess_method', model_file_prefix)
    loaded.setdefault('model_name', model_name)
    self.loaded_model_data = loaded

    print("模型加载完成:")
    print(f" 预处理方法: {loaded['preprocess_method']}")
    print(f" 模型名称: {loaded['model_name']}")
    print(f" 模型类型: {type(loaded['model'])}")

    metadata = loaded.get('metadata')
    if isinstance(metadata, dict):
        print(f" 数据形状: {metadata.get('data_shape', 'Unknown')}")
        print(f" 目标范围: {metadata.get('target_range', 'Unknown')}")
        if 'test_r2' in metadata:
            print(f" 测试集R²: {metadata['test_r2']:.4f}")
        if 'test_rmse' in metadata:
            print(f" 测试集RMSE: {metadata['test_rmse']:.4f}")
|
||
|
||
def load_specific_model(self, model_file_path: str):
    """
    Load a model from an explicitly given joblib file.

    The loaded payload is normalised to a dict: a bare model object is
    wrapped as ``{'model': obj, 'preprocess_method': 'None'}`` (the same
    convention ``__init__`` uses for external models), and a missing
    ``model_name`` is filled with the model's class name.

    Args:
        model_file_path: Path of the model file.

    Raises:
        FileNotFoundError: ``model_file_path`` does not exist.
        KeyError: The loaded dict has no ``'model'`` entry.
    """
    if not os.path.exists(model_file_path):
        raise FileNotFoundError(f"模型文件不存在: {model_file_path}")

    print(f"正在加载指定模型: {model_file_path}")

    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # model files that come from a trusted source.
    loaded = joblib.load(model_file_path)

    # Normalise so that later ['model'] / ['preprocess_method'] lookups work
    # even when the file contains a bare estimator.
    if not isinstance(loaded, dict):
        loaded = {'model': loaded}
    if 'model' not in loaded:
        raise KeyError(f"'model' entry missing in model file: {model_file_path}")
    loaded.setdefault('preprocess_method', 'None')
    loaded.setdefault('model_name', type(loaded['model']).__name__)
    self.loaded_model_data = loaded

    print("模型加载完成:")
    print(f" 预处理方法: {loaded['preprocess_method']}")
    print(f" 模型名称: {loaded['model_name']}")
    print(f" 模型类型: {type(loaded['model'])}")
|
||
|
||
def preprocess_spectra(self, spectra: pd.DataFrame) -> np.ndarray:
    """
    Align the spectra with the loaded model's input width and preprocess them.

    Steps:
        1. Resolve the preprocessing method stored with the model. A value
           containing ``_`` is treated as ``<split>_<preprocess>`` and the
           first token is dropped; nan / none / empty become ``"None"``.
        2. If the model exposes ``n_features_in_`` and the spectra have
           fewer columns, try to append water-quality indices computed by
           ``WaterQualityIndexCalculator`` (best effort).
        3. Force the column count to ``n_features_in_``: extra columns are
           cut off, missing ones are zero-filled.
        4. Replace +/-inf and NaN with 0.
        5. Run ``Preprocessing``; if it raises, the aligned but otherwise
           unprocessed values are returned instead.

    The caller's DataFrame is never modified; all work happens on a copy.

    Args:
        spectra: Raw spectral data, one row per sample.

    Returns:
        The preprocessed data. A DataFrame result of ``Preprocessing`` is
        converted to a NumPy array.

    Raises:
        ValueError: No model has been loaded yet.
    """
    if self.loaded_model_data is None:
        raise ValueError("请先加载模型")

    # Work on a private copy: feature completion and padding add columns,
    # and the caller's frame must not change as a side effect.
    spectra = spectra.copy()

    preprocess_method = self.loaded_model_data['preprocess_method']
    if pd.isna(preprocess_method) or str(preprocess_method).lower() in ['nan', 'none', '']:
        preprocess_method = "None"

    # The stored value may carry a split-method prefix ("<split>_<method>").
    method_text = str(preprocess_method)
    if '_' in method_text:
        actual_preprocess_method = '_'.join(method_text.split('_')[1:])
    else:
        actual_preprocess_method = method_text

    if actual_preprocess_method.lower() in ['nan', 'none', '']:
        actual_preprocess_method = "None"

    print(f"正在应用预处理方法: {actual_preprocess_method}")
    print(f"原始光谱数据形状: {spectra.shape}")

    model = self.loaded_model_data['model']
    expected_features = getattr(model, 'n_features_in_', None)

    # ---- Feature completion: the model expects more columns than given ----
    if expected_features is not None and spectra.shape[1] < expected_features:
        print(f"[特征补全] 检测到特征缺口:当前 {spectra.shape[1]} 列 < 模型期望 {expected_features} 列,"
              f"正在从光谱数据实时计算 WQI 指数...")
        try:
            from src.utils.water_index import WaterQualityIndexCalculator
            calc = WaterQualityIndexCalculator()

            # Every public callable except the two helpers below is treated
            # as an index algorithm. dir() yields the names alphabetically.
            # NOTE(review): assumes this order matches the training-side
            # column order — confirm against the training code.
            skipped = ('find_closest_wavelength', 'calculate_all_indices')
            algorithm_methods = [
                name for name in dir(calc)
                if not name.startswith('_')
                and name not in skipped
                and callable(getattr(calc, name))
            ]

            original_col_count = spectra.shape[1]
            for algo_name in algorithm_methods:
                # Deliberate best effort: a failing or malformed index
                # becomes a NaN column (zeroed during the cleaning step).
                try:
                    result = getattr(calc, algo_name)(spectra)
                    if isinstance(result, pd.Series) and len(result) == len(spectra):
                        spectra[algo_name] = result.values
                    else:
                        spectra[algo_name] = np.nan
                except Exception:
                    spectra[algo_name] = np.nan

            print(f"[特征补全] 完成!光谱列已扩充至 {spectra.shape[1]} 列"
                  f"(追加了 {spectra.shape[1] - original_col_count} 个 WQI 指数)")
        except Exception as e:
            print(f"[特征补全] 失败,将使用原始光谱特征: {e}")

    # ---- Guard 1: force the column count to the model's input width ----
    if expected_features is not None and spectra.shape[1] > expected_features:
        print(f"[精准对齐] 正在将 {spectra.shape[1]} 维特征截断为模型要求的 {expected_features} 维")
        spectra = spectra.iloc[:, :expected_features]
    elif expected_features is not None and spectra.shape[1] < expected_features:
        padding_cols = expected_features - spectra.shape[1]
        for i in range(padding_cols):
            spectra[f'_padding_{i}'] = 0.0
        print(f"[精准对齐] 特征不足,填充 {padding_cols} 列 0")

    # ---- Guard 2: remove inf / NaN (e.g. from divisions by zero) ----
    spectra = spectra.replace([np.inf, -np.inf], np.nan)
    spectra = spectra.fillna(0)

    print(f"[特征对齐] 最终输入维度: {spectra.shape}")

    try:
        spectra_processed = Preprocessing(actual_preprocess_method, spectra)

        # Always hand a NumPy array to the caller.
        if isinstance(spectra_processed, pd.DataFrame):
            spectra_processed = spectra_processed.values

        print(f"预处理后数据形状: {spectra_processed.shape}")

        return spectra_processed

    except Exception as e:
        # Best effort: fall back to the aligned, unprocessed values.
        print(f"预处理失败: {e}")
        print("使用原始数据")
        return spectra.values
|
||
|
||
def predict(self, spectra_processed: np.ndarray) -> np.ndarray:
    """
    Run the loaded model on preprocessed spectra.

    NaN and +/-inf values in the input are replaced by 0 before the model
    is called, so estimators that reject non-finite input do not fail.

    Args:
        spectra_processed: Preprocessed spectral data.

    Returns:
        The value returned by the model's ``predict`` method.

    Raises:
        ValueError: No model has been loaded yet.
        Exception: Any error raised during cleaning or prediction is logged
            and re-raised unchanged.
    """
    if self.loaded_model_data is None:
        raise ValueError("请先加载模型")

    model = self.loaded_model_data['model']

    print(f"正在进行预测...")
    print(f"输入数据形状: {spectra_processed.shape}")

    try:
        # A single nan_to_num pass removes every NaN / inf, so no second
        # check is needed afterwards.
        spectra_clean = np.nan_to_num(spectra_processed, nan=0.0, posinf=0.0, neginf=0.0)

        predictions = model.predict(spectra_clean)
        print(f"预测完成,结果形状: {predictions.shape}")
        print(f"预测值范围: [{np.min(predictions):.4f}, {np.max(predictions):.4f}]")
        print(f"预测值统计: 均值={np.mean(predictions):.4f}, 标准差={np.std(predictions):.4f}")

        return predictions

    except Exception as e:
        print(f"预测失败: {e}")
        raise
|
||
|
||
def save_predictions(self, coords: pd.DataFrame, predictions: np.ndarray,
                     output_path: str, prediction_column: str = 'prediction',
                     wqi_columns: Optional[pd.DataFrame] = None):
    """
    Write coordinates, optional WQI columns and predictions to a file.

    The format follows the extension of ``output_path`` (case-insensitive):
    ``.xls`` uses the ``xlwt`` engine, ``.xlsx`` uses ``openpyxl``, and any
    other extension is written as UTF-8 (with BOM) CSV. When the Excel
    engine is not usable, the result is written as CSV next to the
    requested file, with the suffix replaced by ``.csv``.

    Args:
        coords: Coordinate data, one row per sample.
        predictions: Predicted values, one per sample.
        output_path: Destination file path.
        prediction_column: Name of the prediction column.
        wqi_columns: Optional WQI index columns that are inserted between
            the coordinates and the prediction column.

    Returns:
        The DataFrame that was written.
    """
    print(f"正在保存预测结果到: {output_path}")

    result_df = coords.copy()
    if wqi_columns is not None and not wqi_columns.empty:
        # Align both frames by position; concatenating frames with
        # different indexes would otherwise create NaN-filled extra rows.
        result_df = pd.concat(
            [result_df.reset_index(drop=True), wqi_columns.reset_index(drop=True)],
            axis=1,
        )
    result_df[prediction_column] = predictions

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    file_ext = Path(output_path).suffix.lower()

    # extension -> (pandas Excel engine, format label for the log)
    excel_engines = {
        '.xls': ('xlwt', "Excel 97-2003 (.xls)"),
        '.xlsx': ('openpyxl', "Excel 2007+ (.xlsx)"),
    }

    if file_ext in excel_engines:
        engine, format_label = excel_engines[file_ext]
        try:
            result_df.to_excel(output_path, index=False, engine=engine)
            print(f" 格式: {format_label}")
        except (ImportError, ValueError):
            # ImportError: the engine package is not installed.
            # ValueError: pandas does not know the engine at all (xlwt
            # support was removed in pandas 2.0).
            print(f"警告: {engine}库未安装,无法保存为{file_ext}格式,改为保存CSV格式")
            # Swap only the real suffix, whatever its letter case.
            csv_path = str(Path(output_path).with_suffix('.csv'))
            result_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            output_path = csv_path
    else:
        result_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f" 格式: CSV (.csv)")

    print(f"预测结果保存完成:")
    print(f" 输出文件: {output_path}")
    print(f" 数据形状: {result_df.shape}")
    print(f" 列名: {list(result_df.columns)}")

    print(f"\n预测结果统计:")
    print(result_df[prediction_column].describe())

    return result_df
|
||
|
||
def inference_pipeline(self, sampling_csv_path: str, output_csv_path: str,
                       metric: str = 'test_r2', prediction_column: str = 'prediction',
                       model_file_path: str = None):
    """
    Run the full inference flow: load model, load data, preprocess, predict, save.

    Model selection order: an external model set on the instance wins, then
    an explicit ``model_file_path``, otherwise the best model according to
    ``metric`` is loaded.

    Args:
        sampling_csv_path: Path of the sampling CSV.
        output_csv_path: Path of the prediction output file.
        metric: Metric used to choose the best model.
        prediction_column: Name of the prediction column.
        model_file_path: Optional explicit model file.

    Returns:
        ``(predictions, result_df)``.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    banner = "=" * 80
    rule = "-" * 40

    print(banner)
    print("开始水质参数反演推理流程")
    print(banner)

    try:
        # Step 1: make sure a model is available.
        print("\n步骤1: 加载模型")
        print(rule)
        if self.external_model is not None:
            # Already wrapped into loaded_model_data by __init__.
            print(f" 使用外部预训练模型: type={type(self.external_model).__name__}")
        elif model_file_path:
            self.load_specific_model(model_file_path)
        else:
            self.load_best_model(metric=metric)

        # Step 2: coordinates, pure spectra and WQI columns.
        print("\n步骤2: 加载采样数据")
        print(rule)
        loaded = self.load_sampling_data(sampling_csv_path)
        coords, spectra, wqi_df = loaded

        # Step 3: preprocessing.
        print("\n步骤3: 数据预处理")
        print(rule)
        spectra_processed = self.preprocess_spectra(spectra)

        # Step 4: prediction.
        print("\n步骤4: 模型预测")
        print(rule)
        predictions = self.predict(spectra_processed)

        # Step 5: persist; the WQI columns are passed through to the output.
        print("\n步骤5: 保存预测结果")
        print(rule)
        result_df = self.save_predictions(
            coords, predictions, output_csv_path, prediction_column, wqi_df
        )

        print("\n" + banner)
        print("推理流程完成!")
        print(banner)

        return predictions, result_df

    except Exception as e:
        print(f"\n推理流程失败: {e}")
        raise
|
||
|
||
def get_model_info(self) -> Dict:
    """
    Describe the currently loaded model.

    Returns:
        ``{"status": "no_model_loaded"}`` when nothing is loaded. Otherwise
        a dict with ``status``, ``preprocess_method``, ``model_name``,
        ``model_type`` and ``metadata``, updated with the entries of
        ``self.best_model_info`` when that is set.
    """
    model_data = self.loaded_model_data
    if model_data is None:
        return {"status": "no_model_loaded"}

    # Fallback name for payloads without 'model_name'. The external model is
    # compared against None explicitly: relying on its truthiness would
    # call __len__ / __bool__ on the model object, which can be False or
    # even raise for a perfectly valid model.
    if self.external_model is not None:
        fallback_name = type(self.external_model).__name__
    else:
        fallback_name = 'Unknown'

    info = {
        "status": "model_loaded",
        "preprocess_method": model_data.get('preprocess_method', 'Unknown'),
        "model_name": model_data.get('model_name', fallback_name),
        "model_type": str(type(model_data['model'])),
        "metadata": model_data.get('metadata', {})
    }

    if self.best_model_info:
        info.update(self.best_model_info)

    return info
|
||
|
||
def batch_inference(self, input_dir: str, output_dir: str,
                    metric: str = 'test_r2', prediction_column: str = 'prediction'):
    """
    Run inference on every ``*.csv`` file of a directory with one model.

    The model is prepared once: an external model set on the instance is
    used as-is (the same rule ``inference_pipeline`` applies); otherwise
    the best model according to ``metric`` is loaded from disk. Each input
    file produces ``prediction_<input name>`` in ``output_dir``. A failure
    on one file is recorded and does not stop the batch.

    Args:
        input_dir: Directory containing the sampling CSV files.
        output_dir: Directory for the prediction files (created if needed).
        metric: Metric used to choose the best model.
        prediction_column: Name of the prediction column.

    Returns:
        ``None`` when the directory holds no CSV file. Otherwise a dict
        keyed by input file name with either the output path, sample count
        and prediction statistics, or ``{'error': message}``.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    csv_files = sorted(input_path.glob("*.csv"))

    if not csv_files:
        print(f"在目录 {input_dir} 中未找到CSV文件")
        return

    print(f"找到 {len(csv_files)} 个CSV文件进行批量推理")

    # Prepare the model once. An injected external model must not be
    # replaced by (or fail on) a disk lookup.
    if self.external_model is not None and self.loaded_model_data is not None:
        print(f" 使用外部预训练模型: type={type(self.external_model).__name__}")
    else:
        self.load_best_model(metric=metric)

    results = {}

    for csv_file in csv_files:
        try:
            print(f"\n处理文件: {csv_file.name}")
            output_file = output_path / f"prediction_{csv_file.name}"

            coords, spectra, wqi_df = self.load_sampling_data(str(csv_file))
            spectra_processed = self.preprocess_spectra(spectra)
            predictions = self.predict(spectra_processed)
            self.save_predictions(coords, predictions, str(output_file),
                                  prediction_column, wqi_df)

            results[csv_file.name] = {
                'output_file': str(output_file),
                'sample_count': len(predictions),
                'prediction_stats': {
                    'mean': np.mean(predictions),
                    'std': np.std(predictions),
                    'min': np.min(predictions),
                    'max': np.max(predictions)
                }
            }

        except Exception as e:
            # Keep going: one bad file must not abort the whole batch.
            print(f"处理文件 {csv_file.name} 失败: {e}")
            results[csv_file.name] = {'error': str(e)}

    print(f"\n批量推理完成,共处理 {len(csv_files)} 个文件")
    return results
|
||
|
||
def batch_inference_multi_models(self, models_root_dir: str, sampling_csv_path: str,
|
||
output_dir: str, metric: str = 'test_r2',
|
||
prediction_column: str = 'prediction',
|
||
output_format: str = 'csv',
|
||
external_model=None,
|
||
external_model_path=None,
|
||
external_models_dict=None):
|
||
"""
|
||
使用多个子文件夹中的模型进行批量推理
|
||
|
||
Args:
|
||
models_root_dir: 包含多个子文件夹的根目录,每个子文件夹作为artifacts_dir
|
||
sampling_csv_path: 采样数据CSV路径
|
||
output_dir: 输出目录
|
||
metric: 选择最佳模型的指标
|
||
prediction_column: 预测列名称
|
||
output_format: 输出文件格式 ('csv', 'xls', 'xlsx')
|
||
"""
|
||
models_root = Path(models_root_dir)
|
||
output_path = Path(output_dir)
|
||
output_path.mkdir(parents=True, exist_ok=True)
|
||
|
||
all_results = {}
|
||
|
||
# 优先级 1:_external_models_dict 非空 → 直接用字典的 keys 作为 targets,不扫描磁盘
|
||
print(f"[BatchInference] 终于收到字典啦!包含模型: {list(external_models_dict.keys()) if external_models_dict else 'None'}")
|
||
if external_models_dict is not None and len(external_models_dict) > 0:
|
||
targets = list(external_models_dict.keys())
|
||
print(f"\n使用外部导入模型字典({len(targets)} 个模型)")
|
||
print(f"检测到外部导入模型,将预测以下参数: {targets}")
|
||
elif external_model is not None:
|
||
print(f"\n使用外部预训练模型: {external_model_path or 'unknown'}")
|
||
subdirs = [d for d in models_root.iterdir() if d.is_dir()]
|
||
if not subdirs:
|
||
print(f"在目录 {models_root_dir} 中未找到子文件夹")
|
||
return {}
|
||
print(f"找到 {len(subdirs)} 个模型子文件夹进行批量推理")
|
||
targets = [d.name for d in subdirs]
|
||
else:
|
||
subdirs = [d for d in models_root.iterdir() if d.is_dir()]
|
||
if not subdirs:
|
||
print(f"在目录 {models_root_dir} 中未找到子文件夹")
|
||
return {}
|
||
print(f"找到 {len(subdirs)} 个模型子文件夹进行批量推理")
|
||
targets = [d.name for d in subdirs]
|
||
|
||
print(f"输出格式: {output_format.upper()}")
|
||
|
||
for subdir_name in targets:
|
||
try:
|
||
print(f"\n{'='*60}")
|
||
print(f"处理模型: {subdir_name}")
|
||
print(f"{'='*60}")
|
||
|
||
# 优先级:字典中该 target 的模型 > 共享单模型 > 磁盘加载
|
||
effective_model = None
|
||
if external_models_dict and subdir_name in external_models_dict:
|
||
effective_model = external_models_dict[subdir_name]
|
||
print(f" → 使用字典中模型: {type(effective_model).__name__}")
|
||
elif external_model is not None:
|
||
effective_model = external_model
|
||
print(f" → 使用共享外部模型: {type(effective_model).__name__}")
|
||
|
||
# artifacts_dir:字典模式优先用 placeholder "./",否则用真实子目录
|
||
artifacts_dir = (
|
||
str(models_root / subdir_name)
|
||
if (models_root / subdir_name).is_dir()
|
||
else str(models_root)
|
||
)
|
||
if effective_model is not None:
|
||
model_inferencer = WaterQualityInference(
|
||
artifacts_dir,
|
||
external_model=effective_model,
|
||
external_model_path=external_model_path or "",
|
||
)
|
||
else:
|
||
model_inferencer = WaterQualityInference(artifacts_dir)
|
||
|
||
# 根据输出格式设置文件扩展名
|
||
file_ext = f".{output_format}"
|
||
output_file = output_path / f"{subdir_name}{file_ext}"
|
||
|
||
# 执行推理流程
|
||
predictions, result_df = model_inferencer.inference_pipeline(
|
||
sampling_csv_path=sampling_csv_path,
|
||
output_csv_path=str(output_file),
|
||
metric=metric,
|
||
prediction_column=prediction_column
|
||
)
|
||
|
||
# 收集结果信息
|
||
model_info = model_inferencer.get_model_info()
|
||
all_results[subdir_name] = {
|
||
'status': 'success',
|
||
'output_file': str(output_file),
|
||
'sample_count': len(predictions),
|
||
'model_info': model_info,
|
||
'prediction_stats': {
|
||
'mean': np.mean(predictions),
|
||
'std': np.std(predictions),
|
||
'min': np.min(predictions),
|
||
'max': np.max(predictions)
|
||
}
|
||
}
|
||
|
||
print(f"模型 {subdir_name} 处理完成")
|
||
|
||
except Exception as e:
|
||
print(f"处理模型 {subdir_name} 失败: {e}")
|
||
all_results[subdir_name] = {
|
||
'status': 'error',
|
||
'error': str(e)
|
||
}
|
||
|
||
print(f"\n{'='*80}")
|
||
print(f"批量推理完成,共处理 {len(subdirs)} 个模型文件夹")
|
||
print(f"{'='*80}")
|
||
|
||
# 打印汇总信息
|
||
print("\n汇总结果:")
|
||
for folder_name, result in all_results.items():
|
||
if result['status'] == 'success':
|
||
print(f" ✓ {folder_name}: {result['sample_count']} 个预测值,"
|
||
f"均值={result['prediction_stats']['mean']:.4f}")
|
||
else:
|
||
print(f" ✗ {folder_name}: 失败 - {result['error']}")
|
||
|
||
return all_results
|
||
|
||
def batch_inference_multi_data(self, artifacts_dir: str, input_dir: str,
                               output_dir: str, metric: str = 'test_r2',
                               prediction_column: str = 'prediction',
                               output_format: str = 'csv'):
    """
    Run a single model over every sampling CSV found in a directory.

    The best model is loaded once from ``artifacts_dir`` and then applied to
    each ``*.csv`` file directly inside ``input_dir``. Each input produces one
    output file named after the input's stem, with ``output_format`` as the
    extension. A failure on one file is recorded and does not stop the batch.

    Args:
        artifacts_dir: Model directory; assigned to ``self.artifacts_dir``
            before ``load_best_model`` is called.
        input_dir: Directory holding the sampling CSV files (not searched
            recursively).
        output_dir: Directory for the result files; created if missing.
        metric: Metric name forwarded to ``load_best_model``.
        prediction_column: Column name forwarded to ``save_predictions``.
        output_format: Output file extension without the dot
            ('csv', 'xls', 'xlsx'). The actual writing is delegated to
            ``save_predictions``.

    Returns:
        ``None`` when ``input_dir`` contains no CSV file. Otherwise a dict
        keyed by input file stem, in sorted file order. A successful entry
        holds ``status='success'``, ``input_file``, ``output_file``,
        ``sample_count`` and ``prediction_stats`` (mean/std/min/max); a failed
        entry holds ``status='error'`` and ``error`` (the exception message).
        The ``status`` values mirror those of ``batch_inference_multi_models``.

    Raises:
        Whatever ``load_best_model`` raises; only per-file errors are caught.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Path.glob yields entries in arbitrary order; sorting makes the
    # processing order and the key order of the returned dict reproducible.
    csv_files = sorted(input_path.glob("*.csv"))

    if not csv_files:
        print(f"在目录 {input_dir} 中未找到CSV文件")
        # Kept as None (not {}) so existing callers see the same value.
        return None

    print(f"找到 {len(csv_files)} 个CSV文件进行批量推理")
    print(f"输出格式: {output_format.upper()}")

    # The model only needs to be loaded once for the whole batch.
    self.artifacts_dir = Path(artifacts_dir)
    self.load_best_model(metric=metric)

    # The extension is the same for every file, so build it once.
    file_ext = f".{output_format}"
    results = {}

    for csv_file in csv_files:
        file_stem = csv_file.stem
        try:
            print(f"\n处理文件: {csv_file.name}")
            output_file = output_path / f"{file_stem}{file_ext}"

            # Load -> preprocess -> predict -> save.
            coords, spectra, wqi_df = self.load_sampling_data(str(csv_file))
            spectra_processed = self.preprocess_spectra(spectra)
            predictions = self.predict(spectra_processed)
            self.save_predictions(coords, predictions, str(output_file),
                                  prediction_column, wqi_df)

            results[file_stem] = {
                'status': 'success',
                'input_file': str(csv_file),
                'output_file': str(output_file),
                'sample_count': len(predictions),
                'prediction_stats': {
                    'mean': np.mean(predictions),
                    'std': np.std(predictions),
                    'min': np.min(predictions),
                    'max': np.max(predictions),
                },
            }

        except Exception as e:
            # Best-effort batch: record the failure and move on to the next file.
            print(f"处理文件 {csv_file.name} 失败: {e}")
            results[file_stem] = {'status': 'error', 'error': str(e)}

    print(f"\n批量推理完成,共处理 {len(csv_files)} 个文件")
    return results
|
||
|
||
def evaluate_with_split(self, data_csv_path: str, split_method: str = "random",
                        test_size: float = 0.2, random_state: int = 42,
                        target_column: int = 11, feature_start_column: int = 13,
                        metric: str = 'test_r2', prediction_column: str = 'prediction'):
    """
    Score the best model on a train/test split of a labelled dataset.

    The CSV is read, rows whose target is missing are dropped, the best model
    is loaded, the feature columns go through ``preprocess_spectra`` and the
    result is partitioned by ``split_data``. The loaded model then predicts
    both partitions and MSE / MAE / RMSE / R² are computed for each. To
    reproduce the training-time evaluation the caller has to pass the same
    split settings that were used for training.

    Args:
        data_csv_path: CSV holding both the target column and the features.
        split_method: Split strategy forwarded to ``split_data``
            ("random", "spxy", "ks").
        test_size: Test-set fraction forwarded to ``split_data``.
        random_state: Random seed forwarded to ``split_data``.
        target_column: Positional index of the target column.
        feature_start_column: Positional index of the first feature column;
            every column from there to the end is used as a feature.
        metric: Metric name forwarded to ``load_best_model``.
        prediction_column: Accepted for interface parity; not used here.

    Returns:
        dict with ``split_method``, ``test_size``, ``train_size``,
        ``test_size_actual``, ``train_metrics`` and ``test_metrics`` (each
        with ``mse``/``mae``/``rmse``/``r2``) and ``predictions`` (true and
        predicted values for both partitions).

    Raises:
        Any exception from a step is printed and re-raised unchanged.
    """
    light_rule = "-" * 40

    def announce(step_title):
        # Every step prints its title followed by a thin separator line.
        print(step_title)
        print(light_rule)

    print("=" * 80)
    print("开始数据分割评估流程")
    print("=" * 80)

    try:
        # Step 1: read the dataset and keep only rows that have a target value.
        announce("\n步骤1: 加载完整数据集")
        dataset = pd.read_csv(data_csv_path)

        has_target = ~dataset.iloc[:, target_column].isna()
        labelled = dataset[has_target]
        targets = labelled.iloc[:, target_column]
        features = labelled.iloc[:, feature_start_column:]

        print(f"数据加载完成:")
        print(f" 原始样本数: {len(dataset)}")
        print(f" 清理后样本数: {len(features)}")
        print(f" 特征数量: {features.shape[1]}")
        print(f" 目标值范围: {targets.min():.4f} ~ {targets.max():.4f}")

        # Step 2: load the best model according to the requested metric.
        announce("\n步骤2: 加载最佳模型")
        self.load_best_model(metric=metric)

        # Step 3: preprocess the features before they are split.
        announce("\n步骤3: 数据预处理")
        features_ready = self.preprocess_spectra(features)

        # Step 4: partition into train and test sets.
        announce("\n步骤4: 数据分割")
        X_train, X_test, y_train, y_test = self.split_data(
            features_ready,
            targets,
            method=split_method,
            test_size=test_size,
            random_state=random_state,
        )

        print(f"数据分割完成:")
        print(f" 训练集样本数: {X_train.shape[0]}")
        print(f" 测试集样本数: {X_test.shape[0]}")

        # Step 5: predict the training partition first, then the test partition.
        announce("\n步骤5: 模型预测")
        fitted_model = self.loaded_model_data['model']
        y_train_pred = fitted_model.predict(X_train)
        y_test_pred = fitted_model.predict(X_test)

        # Step 6: regression metrics for both partitions.
        announce("\n步骤6: 计算评估指标")

        from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

        def score(y_true, y_pred):
            # RMSE is derived from MSE; key order matches the returned layout.
            mse = mean_squared_error(y_true, y_pred)
            mae = mean_absolute_error(y_true, y_pred)
            r2 = r2_score(y_true, y_pred)
            return {'mse': mse, 'mae': mae, 'rmse': np.sqrt(mse), 'r2': r2}

        train_scores = score(y_train, y_train_pred)
        test_scores = score(y_test, y_test_pred)

        results = {
            'split_method': split_method,
            'test_size': test_size,
            'train_size': len(y_train),
            'test_size_actual': len(y_test),
            'train_metrics': train_scores,
            'test_metrics': test_scores,
            'predictions': {
                'y_train_true': y_train,
                'y_train_pred': y_train_pred,
                'y_test_true': y_test,
                'y_test_pred': y_test_pred,
            },
        }

        print(f"评估完成:")
        for heading, scores in ((" 训练集指标:", train_scores),
                                (" 测试集指标:", test_scores)):
            print(heading)
            print(f" R²: {scores['r2']:.4f}")
            print(f" RMSE: {scores['rmse']:.4f}")
            print(f" MAE: {scores['mae']:.4f}")

        print("\n" + "=" * 80)
        print("数据分割评估流程完成!")
        print("=" * 80)

        return results

    except Exception as e:
        # Report the failure, then let the caller handle the original exception.
        print(f"\n数据分割评估失败: {e}")
        raise
|
||
|
||
|
||
def main():
    """Example driver: run every model under one root against one sampling CSV.

    Paths are hard-coded for the author's machine. Any exception raised by the
    batch run is reported with a full traceback instead of propagating.
    """
    # Locations used by the active example (machine-specific).
    model_root = r"E:\code\WQ\yaobao925\qvchuyaoban"
    sampling_file = r"E:\code\WQ\xiaogujia\使用腰堡模型\spectral_sampling_results.csv"
    prediction_dir = r"E:\code\WQ\xiaogujia\使用腰堡模型\predict"

    # Built outside the try block so a construction failure is not swallowed.
    runner = WaterQualityInference(model_root)

    try:
        # Disabled example 1 - one model, one data file:
        #     output_csv = r"E:\code\WQ\laodao\output"
        #     predictions, result_df = runner.inference_pipeline(
        #         sampling_csv_path=sampling_file,
        #         output_csv_path=output_csv,
        #         metric='test_r2',  # pick the best model by test-set R²
        #         prediction_column='water_quality_prediction',
        #     )
        #     model_info = runner.get_model_info()
        #     # The example then printed len(predictions) and the model_info
        #     # entries 'combination', 'preprocess_method' and 'model_name'.

        # Active example 2 - batch inference, each sub-folder of the root
        # being treated as a separate artifacts directory.
        print(f"\n{'='*80}")
        print("示例2: 批量推理多个模型")

        runner.batch_inference_multi_models(
            models_root_dir=model_root,
            sampling_csv_path=sampling_file,
            output_dir=prediction_dir,
            metric='test_r2',
            prediction_column='water_quality_prediction',
        )

        # Disabled example 3 - evaluate on a labelled dataset with a chosen split:
        #     complete_data_csv = r"E:\code\WQ\laodao\data\捞刀河-浏阳河-圭塘河.csv"
        #     eval_results = runner.evaluate_with_split(
        #         data_csv_path=complete_data_csv,
        #         split_method="spxy",      # or "random", "ks"
        #         test_size=0.2,
        #         random_state=42,
        #         target_column=11,         # index of the target column
        #         feature_start_column=13,  # index of the first feature column
        #         metric='test_r2',
        #     )
        #     # The example then printed eval_results['split_method'] plus the
        #     # 'r2' and 'rmse' entries of 'train_metrics' and 'test_metrics'.

    except Exception as e:
        print(f"推理失败: {e}")
        import traceback
        traceback.print_exc()


# Run the example only when the file is executed as a script.
if __name__ == "__main__":
    main()
|