Initial commit of WQ_GUI
This commit is contained in:
1
src/core/prediction/__init__.py
Normal file
1
src/core/prediction/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
1144
src/core/prediction/inference_batch.py
Normal file
1144
src/core/prediction/inference_batch.py
Normal file
File diff suppressed because it is too large
Load Diff
894
src/core/prediction/sctter_batch.py
Normal file
894
src/core/prediction/sctter_batch.py
Normal file
@ -0,0 +1,894 @@
|
||||
import numpy as np
import pandas as pd
import joblib
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple, Optional
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import scipy.stats as stats

warnings.filterwarnings('ignore')

# Font configuration so Chinese text in plot labels/titles renders; the list
# is a fallback chain across Windows/Linux/macOS.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs renderable with CJK fonts
plt.rcParams['font.size'] = 12

# Machine-learning model imports -- regression models.
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cross_decomposition import PLSRegression

# Optional third-party boosters are deliberately disabled; the original
# conditional imports are kept below for reference.
# try:
#     import lightgbm as lgb
#     LGB_AVAILABLE = True
# except ImportError:
#     LGB_AVAILABLE = False
LGB_AVAILABLE = False  # lightgbm disabled (import commented out above)

# try:
#     import catboost as cb
#     CB_AVAILABLE = True
# except ImportError:
#     CB_AVAILABLE = False
CB_AVAILABLE = False  # catboost disabled (import commented out above)

# Import of the project-local spectral preprocessing module.
import sys
import os

from src.preprocessing.spectral_Preprocessing import Preprocessing
|
||||
class WaterQualityScatterBatch:
    """Batch scatter-plot drawing for water-quality parameter inversion models."""

    def __init__(self):
        """Initialize supported regressors, preprocessing and split methods.

        ``model_configs`` maps each model name to its estimator class, a
        hyper-parameter grid for grid search, and an ``available`` flag.
        XGBoost / LightGBM / CatBoost are not installed in this build, so
        their entries keep ``model=None`` and ``available=False`` (entries
        are retained so result files that mention them still resolve).
        The previous code used ``lgb.LGBMRegressor if LGB_AVAILABLE else
        None`` — a NameError trap if the flag were ever flipped without
        restoring the import; the explicit ``None`` removes it.
        """
        self.model_configs = {
            'SVR': {
                'model': SVR,
                'params': {
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                    'kernel': ['rbf', 'poly', 'sigmoid'],
                    'epsilon': [0.01, 0.1, 0.2]
                },
                'available': True
            },
            'RF': {
                'model': RandomForestRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
                'available': True
            },
            'KNN': {
                'model': KNeighborsRegressor,
                'params': {
                    'n_neighbors': [3, 5, 7, 9, 11],
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean', 'manhattan', 'minkowski']
                },
                'available': True
            },
            'LinearRegression': {
                'model': LinearRegression,
                'params': {
                    'fit_intercept': [True, False]
                },
                'available': True
            },
            'Ridge': {
                'model': Ridge,
                'params': {
                    'alpha': [0.01, 0.1, 1, 10, 100],
                    'fit_intercept': [True, False]
                },
                'available': True
            },
            'Lasso': {
                'model': Lasso,
                'params': {
                    'alpha': [0.01, 0.1, 1, 10, 100],
                    'fit_intercept': [True, False],
                    'max_iter': [1000, 2000]
                },
                'available': True
            },
            'ElasticNet': {
                'model': ElasticNet,
                'params': {
                    'alpha': [0.01, 0.1, 1, 10],
                    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                    'fit_intercept': [True, False],
                    'max_iter': [1000, 2000]
                },
                'available': True
            },
            'XGBoost': {
                'model': None,  # xgboost removed from this build
                'params': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'subsample': [0.8, 0.9, 1.0]
                },
                'available': False
            },
            'LightGBM': {
                'model': None,  # lightgbm disabled; see LGB_AVAILABLE at module top
                'params': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'num_leaves': [31, 50, 100]
                },
                'available': LGB_AVAILABLE
            },
            'CatBoost': {
                'model': None,  # catboost disabled; see CB_AVAILABLE at module top
                'params': {
                    'iterations': [50, 100, 200],
                    'depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'l2_leaf_reg': [1, 3, 5]
                },
                'available': CB_AVAILABLE
            },
            'PLS': {
                'model': PLSRegression,
                'params': {
                    'n_components': [2, 3, 5, 7, 10]
                },
                'available': True
            }
        }

        # Supported spectral preprocessing method codes.
        self.preprocessing_methods = [
            "None", "MMS", "SS", "CT", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "WVAE"
        ]

        # Supported sample split methods.
        self.split_methods = ["random", "spxy", "ks"]
||||
def load_data(self, csv_path: str, target_column_name: str = None, target_column: int = None, feature_start_column: int = 13) -> Tuple[pd.DataFrame, pd.Series]:
|
||||
"""
|
||||
加载CSV数据
|
||||
|
||||
Args:
|
||||
csv_path: CSV文件路径
|
||||
target_column_name: 目标值列名(优先使用)
|
||||
target_column: 目标值列索引(当列名不存在时使用)
|
||||
feature_start_column: 特征开始列索引
|
||||
|
||||
Returns:
|
||||
X: 特征数据
|
||||
y: 目标值数据
|
||||
"""
|
||||
data = pd.read_csv(csv_path)
|
||||
|
||||
# 根据列名或列索引提取目标值
|
||||
if target_column_name and target_column_name in data.columns:
|
||||
print(f"使用列名 '{target_column_name}' 作为目标值")
|
||||
y = data[target_column_name]
|
||||
target_col_index = data.columns.get_loc(target_column_name)
|
||||
elif target_column is not None:
|
||||
print(f"使用列索引 {target_column} 作为目标值")
|
||||
y = data.iloc[:, target_column]
|
||||
target_col_index = target_column
|
||||
else:
|
||||
raise ValueError("必须指定 target_column_name 或 target_column")
|
||||
|
||||
# 提取特征数据
|
||||
X = data.iloc[:, feature_start_column:]
|
||||
|
||||
# 去除y值为空的行
|
||||
mask = ~y.isna()
|
||||
data_cleaned = data[mask]
|
||||
|
||||
if target_column_name and target_column_name in data.columns:
|
||||
y = data_cleaned[target_column_name]
|
||||
else:
|
||||
y = data_cleaned.iloc[:, target_col_index]
|
||||
X = data_cleaned.iloc[:, feature_start_column:]
|
||||
|
||||
print(f"数据加载完成:")
|
||||
print(f" 目标列: {target_column_name if target_column_name else f'索引{target_col_index}'}")
|
||||
print(f" 样本数量: {X.shape[0]}")
|
||||
print(f" 特征数量: {X.shape[1]}")
|
||||
print(f" 目标值范围: {y.min():.4f} ~ {y.max():.4f}")
|
||||
print(f" 目标值均值: {y.mean():.4f}")
|
||||
|
||||
return X, y
|
||||
|
||||
def preprocess_data(self, X: pd.DataFrame, method: str) -> np.ndarray:
|
||||
"""
|
||||
数据预处理
|
||||
|
||||
Args:
|
||||
X: 原始特征数据
|
||||
method: 预处理方法
|
||||
|
||||
Returns:
|
||||
预处理后的数据
|
||||
"""
|
||||
print(f"应用预处理方法: {method}")
|
||||
|
||||
# 如果方法为None,直接返回原始数据
|
||||
if method == "None" or method is None:
|
||||
print("跳过预处理,使用原始数据")
|
||||
return X.values
|
||||
|
||||
try:
|
||||
X_processed = Preprocessing(method, X)
|
||||
|
||||
# 确保返回的是numpy数组
|
||||
if isinstance(X_processed, pd.DataFrame):
|
||||
X_processed = X_processed.values
|
||||
|
||||
print(f"预处理完成,数据形状: {X_processed.shape}")
|
||||
return X_processed
|
||||
|
||||
except Exception as e:
|
||||
print(f"预处理失败: {e}")
|
||||
print("使用原始数据")
|
||||
return X.values
|
||||
|
||||
def random(self, data, label, test_ratio=0.2, random_state=123):
|
||||
"""随机划分数据集"""
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
data, label, test_size=test_ratio, random_state=random_state
|
||||
)
|
||||
return X_train, X_test, y_train, y_test
|
||||
|
||||
    def spxy(self, data, label, test_size=0.2):
        """Split the dataset with the SPXY algorithm.

        SPXY (sample set partitioning based on joint x-y distances) greedily
        selects training samples that jointly cover the feature space and the
        target range; the remaining samples become the test set.

        Args:
            data: feature matrix (DataFrame or 2-D ndarray), one row per sample.
            label: target values (Series or 1-D ndarray).
            test_size: fraction of samples assigned to the test set.

        Returns:
            X_train, X_test, y_train, y_test as NumPy arrays.
        """
        # Work on NumPy arrays regardless of the input container type.
        data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
        label = label.to_numpy() if isinstance(label, pd.Series) else label

        # Keep originals: `label` is standardized below, but the returned y
        # values must be the raw ones.  (x_backup is currently unused since
        # `data` is never modified.)
        x_backup = data
        y_backup = label

        M = data.shape[0]                # total number of samples
        N = round((1 - test_size) * M)   # number of training samples to pick
        samples = np.arange(M)

        # Standardize labels so x-distances and y-distances are comparable.
        # NOTE(review): divides by np.std(label) — a constant label vector
        # would yield NaNs; confirm callers never pass one.
        label = (label - np.mean(label)) / np.std(label)
        D = np.zeros((M, M))    # pairwise x-distances (upper triangle only)
        Dy = np.zeros((M, M))   # pairwise y-distances (upper triangle only)

        # Pairwise Euclidean distances in feature space and label space.
        for i in range(M - 1):
            xa = data[i, :]
            ya = label[i]
            for j in range((i + 1), M):
                xb = data[j, :]
                yb = label[j]
                D[i, j] = np.linalg.norm(xa - xb)
                Dy[i, j] = np.linalg.norm(ya - yb)

        # Combine the two distance matrices after normalizing each by its max.
        Dmax = np.max(D)
        Dymax = np.max(Dy)
        D = D / Dmax + Dy / Dymax

        # Seed the training set with the pair of most distant samples.
        maxD = D.max(axis=0)
        index_row = D.argmax(axis=0)
        index_column = maxD.argmax()

        m = np.zeros(N, dtype=int)   # indices of selected training samples
        m[0] = index_row[index_column]
        m[1] = index_column

        dminmax = np.zeros(N)
        dminmax[1] = D[m[0], m[1]]

        # Greedily add, at each step, the candidate farthest (in min-distance
        # sense) from the already-selected training set.
        for i in range(2, N):
            pool = np.delete(samples, m[:i])   # candidates not yet selected
            dmin = np.zeros(M - i)
            for j in range(M - i):
                indexa = pool[j]
                d = np.zeros(i)
                for k in range(i):
                    indexb = m[k]
                    # Only the upper triangle of D is populated, so order the
                    # indices accordingly.
                    if indexa < indexb:
                        d[k] = D[indexa, indexb]
                    else:
                        d[k] = D[indexb, indexa]
                dmin[j] = np.min(d)
            dminmax[i] = np.max(dmin)
            index = np.argmax(dmin)
            m[i] = pool[index]

        # Everything not selected for training forms the test set.
        m_complement = np.delete(samples, m)

        # Assemble the split from the untouched backups.
        X_train = data[m, :]
        y_train = y_backup[m]
        X_test = data[m_complement, :]
        y_test = y_backup[m_complement]

        return X_train, X_test, y_train, y_test
|
||||
|
||||
    def ks(self, data, label, test_size=0.2):
        """Split the dataset with the Kennard-Stone algorithm.

        Like SPXY but considers feature-space distances only: training
        samples are selected to maximize coverage of the feature space.

        Args:
            data: feature matrix (DataFrame or 2-D ndarray), one row per sample.
            label: target values (Series or 1-D ndarray).
            test_size: fraction of samples assigned to the test set.

        Returns:
            X_train, X_test, y_train, y_test as NumPy arrays.
        """
        # Work on NumPy arrays regardless of the input container type.
        data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
        label = label.to_numpy() if isinstance(label, pd.Series) else label

        M = data.shape[0]                # total number of samples
        N = round((1 - test_size) * M)   # number of training samples to pick
        samples = np.arange(M)

        D = np.zeros((M, M))   # pairwise x-distances (upper triangle only)

        # Pairwise Euclidean distances in feature space.
        for i in range((M - 1)):
            xa = data[i, :]
            for j in range((i + 1), M):
                xb = data[j, :]
                D[i, j] = np.linalg.norm(xa - xb)

        # Seed the training set with the pair of most distant samples.
        maxD = np.max(D, axis=0)
        index_row = np.argmax(D, axis=0)
        index_column = np.argmax(maxD)

        m = np.zeros(N)
        m[0] = np.array(index_row[index_column])
        m[1] = np.array(index_column)
        m = m.astype(int)   # indices must be ints to index D/data
        dminmax = np.zeros(N)
        dminmax[1] = D[m[0], m[1]]

        # Greedily add, at each step, the candidate farthest (in min-distance
        # sense) from the already-selected training set.
        for i in range(2, N):
            pool = np.delete(samples, m[:i])   # candidates not yet selected
            dmin = np.zeros((M - i))
            for j in range((M - i)):
                indexa = pool[j]
                d = np.zeros(i)
                for k in range(i):
                    indexb = m[k]
                    # Only the upper triangle of D is populated, so order the
                    # indices accordingly.
                    if indexa < indexb:
                        d[k] = D[indexa, indexb]
                    else:
                        d[k] = D[indexb, indexa]
                dmin[j] = np.min(d)
            dminmax[i] = np.max(dmin)
            index = np.argmax(dmin)
            m[i] = pool[index]

        # Everything not selected for training forms the test set.
        m_complement = np.delete(np.arange(data.shape[0]), m)

        X_train = data[m, :]
        y_train = label[m]
        X_test = data[m_complement, :]
        y_test = label[m_complement]

        return X_train, X_test, y_train, y_test
|
||||
|
||||
def split_data(self, X: np.ndarray, y: pd.Series, method: str = "random",
|
||||
test_size: float = 0.2, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
|
||||
"""
|
||||
根据指定方法划分数据集
|
||||
"""
|
||||
print(f"使用 {method} 方法划分数据集")
|
||||
|
||||
if method == "random":
|
||||
return self.random(X, y, test_ratio=test_size, random_state=random_state)
|
||||
elif method == "spxy":
|
||||
return self.spxy(X, y, test_size=test_size)
|
||||
elif method == "ks":
|
||||
return self.ks(X, y, test_size=test_size)
|
||||
else:
|
||||
raise ValueError(f"不支持的划分方法: {method}. 支持的方法: {self.split_methods}")
|
||||
|
||||
    def plot_scatter_with_confidence(self, y_train, y_pred_train, y_test, y_pred_test,
                                     r2_train, mae_train, r2_test, mae_test,
                                     folder_name, split_method, preprocess_method, model_name,
                                     save_path):
        """Draw an observed-vs-predicted scatter plot with confidence bands.

        The figure has a main scatter panel (train and test sets, each with a
        linear fit line and a widened 95% confidence band), a histogram of
        observed values on top and a histogram of predicted values on the
        right.  The figure is saved to ``save_path`` as PNG.

        Args:
            y_train, y_pred_train: training-set observed and predicted values.
            y_test, y_pred_test: test-set observed and predicted values.
            r2_train, mae_train: training-set R² and MAE (shown in the legend).
            r2_test, mae_test: test-set R² and MAE (shown in the legend).
            folder_name: name used in the figure title.
            split_method: data split method name (title).
            preprocess_method: preprocessing method name (title).
            model_name: model name (title).
            save_path: output file path for the PNG.
        """

        # scale_factor widens the confidence band; 1.0 would be the textbook
        # confidence-interval width.
        scale_factor = 1.5
        confidence = 0.95  # 95% confidence level

        # Fit a straight line to the training set and build its confidence band.
        z_train = np.polyfit(y_train, y_pred_train, 1)
        p_train = np.poly1d(z_train)
        predicted_values_train = p_train(y_train)
        residuals_train = y_pred_train - predicted_values_train
        mean_error_train = np.mean(residuals_train**2)
        t_value_train = stats.t.ppf((1 + confidence) / 2., len(y_train) - 1)
        # NOTE(review): ci_train is computed but never used below (only the
        # extended-grid version ci_extended_train is plotted).
        ci_train = t_value_train * scale_factor * np.sqrt(mean_error_train) * np.sqrt(1 / len(y_train) + (y_train - np.mean(y_train))**2 / np.sum((y_train - np.mean(y_train))**2))
        # Evaluate the fit and band on a 100-point grid spanning the data range.
        x_extended_train = np.linspace(min(y_train), max(y_train), 100)
        predicted_extended_train = p_train(x_extended_train)
        ci_extended_train = t_value_train * scale_factor * np.sqrt(mean_error_train) * np.sqrt(1 / len(y_train) + (x_extended_train - np.mean(y_train))**2 / np.sum((y_train - np.mean(y_train))**2))

        # Same fit + band construction for the test set.
        z_test = np.polyfit(y_test, y_pred_test, 1)
        p_test = np.poly1d(z_test)
        predicted_values_test = p_test(y_test)
        residuals_test = y_pred_test - predicted_values_test
        mean_error_test = np.mean(residuals_test**2)
        t_value_test = stats.t.ppf((1 + confidence) / 2., len(y_test) - 1)
        # NOTE(review): ci_test is likewise unused.
        ci_test = t_value_test * scale_factor * np.sqrt(mean_error_test) * np.sqrt(1 / len(y_test) + (y_test - np.mean(y_test))**2 / np.sum((y_test - np.mean(y_test))**2))
        x_extended_test = np.linspace(min(y_test), max(y_test), 100)
        predicted_extended_test = p_test(x_extended_test)
        ci_extended_test = t_value_test * scale_factor * np.sqrt(mean_error_test) * np.sqrt(1 / len(y_test) + (x_extended_test - np.mean(y_test))**2 / np.sum((y_test - np.mean(y_test))**2))

        # Color scheme: blue family for train, orange family for test.
        train_color = '#1f77b4'              # train main color
        test_color = '#ff7f0e'               # test main color
        confidence_train_color = '#aec7e8'   # train confidence-band light blue
        confidence_test_color = '#ffbb78'    # test confidence-band light orange

        # Figure layout: 4x4 grid — main panel bottom-left, marginal
        # histograms on top and right sharing the main axes.
        fig = plt.figure(figsize=(10, 8), dpi=300)  # moderate dpi for compatibility
        gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)
        ax_main = fig.add_subplot(gs[1:, :-1])                    # main scatter panel
        ax_hist_x = fig.add_subplot(gs[0, :-1], sharex=ax_main)   # top histogram
        ax_hist_y = fig.add_subplot(gs[1:, -1], sharey=ax_main)   # right histogram

        # Training set: scatter, fit line, confidence band.
        ax_main.scatter(y_train, y_pred_train, color=train_color, label="训练集预测值", alpha=0.6)
        ax_main.plot(y_train, p_train(y_train), color=train_color, alpha=0.9,
                     label=f"训练集拟合线\n$R^2$ = {r2_train:.2f}, MAE = {mae_train:.2f}")
        ax_main.fill_between(x_extended_train, predicted_extended_train - ci_extended_train,
                             predicted_extended_train + ci_extended_train,
                             color=confidence_train_color, alpha=0.5, label="训练集95%置信区间")

        # Test set: scatter, fit line, confidence band.
        ax_main.scatter(y_test, y_pred_test, color=test_color, label="测试集预测值", alpha=0.6)
        ax_main.plot(y_test, p_test(y_test), color=test_color, alpha=0.9,
                     label=f"测试集拟合线\n$R^2$ = {r2_test:.2f}, MAE = {mae_test:.2f}")
        ax_main.fill_between(x_extended_test, predicted_extended_test - ci_extended_test,
                             predicted_extended_test + ci_extended_test,
                             color=confidence_test_color, alpha=0.5, label="测试集95%置信区间")

        # 1:1 reference line spanning the combined data range.
        ax_main.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
                     [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
                     color='grey', linestyle='--', alpha=0.6, label="1:1 参考线")

        # Main panel cosmetics.
        ax_main.set_xlabel("观测值", fontsize=12)
        ax_main.set_ylabel("预测值", fontsize=12)
        ax_main.legend(loc="upper left", fontsize=10)
        ax_main.grid(True, alpha=0.3)

        # Top histogram: distribution of observed values.
        ax_hist_x.hist(y_train, bins=20, color=train_color, alpha=0.7, edgecolor='black', label="训练集观测值分布")
        ax_hist_x.hist(y_test, bins=20, color=test_color, alpha=0.7, edgecolor='black', label="测试集观测值分布")
        ax_hist_x.tick_params(labelbottom=False)  # hide shared x tick labels
        ax_hist_x.set_ylabel("频次", fontsize=10)
        ax_hist_x.legend(fontsize=8)

        # Right histogram: distribution of predicted values.
        ax_hist_y.hist(y_pred_train, bins=20, orientation='horizontal', color=train_color, alpha=0.7, edgecolor='black')
        ax_hist_y.hist(y_pred_test, bins=20, orientation='horizontal', color=test_color, alpha=0.7, edgecolor='black')
        ax_hist_y.set_xlabel("频次", fontsize=10)
        ax_hist_y.tick_params(labelleft=False)  # hide shared y tick labels

        # Figure title identifying the model combination.
        title = f'{folder_name} - 最佳模型预测效果对比图\n'
        title += f'{split_method}_{preprocess_method}_{model_name}'
        fig.suptitle(title, fontsize=14, fontweight='bold')

        # Save the figure (caller is responsible for plt.close()).
        plt.tight_layout()
        plt.savefig(save_path, format='png', bbox_inches='tight', dpi=300)
        print(f"散点图已保存至: {save_path}")
|
||||
|
||||
def get_best_model_from_summary(self, artifacts_dir: Path, metric: str = 'test_r2', target_column_name: str = None) -> Tuple[str, str, Dict]:
|
||||
"""
|
||||
从训练摘要中获取最佳模型信息
|
||||
|
||||
Args:
|
||||
artifacts_dir: 模型目录
|
||||
metric: 评估指标
|
||||
target_column_name: 目标列名(用于构建文件路径)
|
||||
|
||||
Returns:
|
||||
preprocess_method: 预处理方法
|
||||
model_name: 模型名称
|
||||
best_result: 最佳模型结果信息
|
||||
"""
|
||||
# 清理目标列名,移除可能的特殊字符
|
||||
if target_column_name:
|
||||
safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
|
||||
# 尝试加载以目标列名为前缀的详细结果文件
|
||||
detailed_path = artifacts_dir / f"{safe_target_name}_detailed_results.csv"
|
||||
summary_path = artifacts_dir / f"{safe_target_name}_training_summary.csv"
|
||||
else:
|
||||
# 兼容旧版本,使用固定文件名
|
||||
detailed_path = artifacts_dir / "detailed_results.csv"
|
||||
summary_path = artifacts_dir / "training_summary.csv"
|
||||
|
||||
summary_df = None
|
||||
|
||||
# 优先使用详细结果文件
|
||||
if detailed_path.exists():
|
||||
print(f"使用详细结果文件: {detailed_path}")
|
||||
summary_df = pd.read_csv(detailed_path)
|
||||
# 将中文列名映射到英文
|
||||
metric_mapping = {
|
||||
'test_r2': '测试集R²',
|
||||
'train_r2': '训练集R²',
|
||||
'test_rmse': '测试集RMSE',
|
||||
'train_rmse': '训练集RMSE',
|
||||
'cv_mean': 'CV均值'
|
||||
}
|
||||
if metric in metric_mapping and metric_mapping[metric] in summary_df.columns:
|
||||
metric_col = metric_mapping[metric]
|
||||
else:
|
||||
metric_col = metric
|
||||
elif summary_path.exists():
|
||||
print(f"使用训练摘要文件: {summary_path}")
|
||||
summary_df = pd.read_csv(summary_path)
|
||||
metric_col = metric
|
||||
else:
|
||||
# 如果使用了目标列名前缀的文件不存在,尝试查找旧版本的文件
|
||||
if target_column_name:
|
||||
old_detailed_path = artifacts_dir / "detailed_results.csv"
|
||||
old_summary_path = artifacts_dir / "training_summary.csv"
|
||||
|
||||
if old_detailed_path.exists():
|
||||
print(f"使用旧版本详细结果文件: {old_detailed_path}")
|
||||
summary_df = pd.read_csv(old_detailed_path)
|
||||
# 将中文列名映射到英文
|
||||
metric_mapping = {
|
||||
'test_r2': '测试集R²',
|
||||
'train_r2': '训练集R²',
|
||||
'test_rmse': '测试集RMSE',
|
||||
'train_rmse': '训练集RMSE',
|
||||
'cv_mean': 'CV均值'
|
||||
}
|
||||
if metric in metric_mapping and metric_mapping[metric] in summary_df.columns:
|
||||
metric_col = metric_mapping[metric]
|
||||
else:
|
||||
metric_col = metric
|
||||
elif old_summary_path.exists():
|
||||
print(f"使用旧版本训练摘要文件: {old_summary_path}")
|
||||
summary_df = pd.read_csv(old_summary_path)
|
||||
metric_col = metric
|
||||
else:
|
||||
raise FileNotFoundError(f"训练摘要文件不存在: {summary_path} 或 {detailed_path} 或 {old_summary_path} 或 {old_detailed_path}")
|
||||
else:
|
||||
raise FileNotFoundError(f"训练摘要文件不存在: {summary_path} 或 {detailed_path}")
|
||||
|
||||
if summary_df.empty:
|
||||
raise ValueError("训练摘要为空")
|
||||
|
||||
# 检查指标列是否存在
|
||||
if metric_col not in summary_df.columns:
|
||||
available_cols = list(summary_df.columns)
|
||||
raise ValueError(f"指标 '{metric_col}' 不存在。可用列: {available_cols}")
|
||||
|
||||
# 获取最佳模型(对于R²等指标,值越大越好)
|
||||
if 'r2' in metric.lower() or 'score' in metric.lower():
|
||||
best_idx = summary_df[metric_col].idxmax()
|
||||
else: # 对于RMSE、MAE等,值越小越好
|
||||
best_idx = summary_df[metric_col].idxmin()
|
||||
|
||||
best_row = summary_df.loc[best_idx]
|
||||
|
||||
# 根据文件类型解析模型信息
|
||||
if '划分方法' in summary_df.columns:
|
||||
# 详细结果文件格式(中文列名)
|
||||
split_method = best_row['划分方法']
|
||||
preprocess_method = best_row['预处理方法']
|
||||
model_name = best_row['建模方法']
|
||||
best_combination = f"{split_method}_{preprocess_method}_{model_name}"
|
||||
else:
|
||||
# 简化结果文件格式(英文列名)
|
||||
best_combination = best_row['combination']
|
||||
# 解析组合名称(格式: split_method_preprocess_method_model_name)
|
||||
parts = best_combination.split('_')
|
||||
if len(parts) < 3:
|
||||
raise ValueError(f"无效的模型组合名称格式: {best_combination}")
|
||||
|
||||
split_method = parts[0]
|
||||
preprocess_method = parts[1]
|
||||
model_name = '_'.join(parts[2:])
|
||||
|
||||
print(f"最佳模型组合: {best_combination}")
|
||||
print(f" 划分方法: {split_method}")
|
||||
print(f" 预处理方法: {preprocess_method}")
|
||||
print(f" 模型名称: {model_name}")
|
||||
print(f" {metric_col}: {best_row[metric_col]:.4f}")
|
||||
|
||||
# 构建模型文件前缀
|
||||
model_file_prefix = f"{split_method}_{preprocess_method}"
|
||||
|
||||
# 构建结果信息
|
||||
best_result = {
|
||||
'combination': best_combination,
|
||||
'split_method': split_method,
|
||||
'preprocess_method': preprocess_method,
|
||||
'model_name': model_name,
|
||||
'metric_value': best_row[metric_col],
|
||||
'model_file_prefix': model_file_prefix
|
||||
}
|
||||
|
||||
# 尝试获取更多指标信息
|
||||
for col in summary_df.columns:
|
||||
if col not in ['combination', '划分方法', '预处理方法', '建模方法', '最佳参数']:
|
||||
try:
|
||||
best_result[col] = best_row[col]
|
||||
except:
|
||||
pass
|
||||
|
||||
return model_file_prefix, model_name, best_result
|
||||
|
||||
def load_model(self, artifacts_dir: Path, preprocess_method: str, model_name: str, target_column_name: str = None):
|
||||
"""
|
||||
加载保存的模型
|
||||
|
||||
Args:
|
||||
artifacts_dir: 模型目录
|
||||
preprocess_method: 预处理方法名称
|
||||
model_name: 模型名称
|
||||
target_column_name: 目标列名(用于构建文件路径)
|
||||
|
||||
Returns:
|
||||
加载的模型数据
|
||||
"""
|
||||
if target_column_name:
|
||||
# 清理目标列名,移除可能的特殊字符
|
||||
safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
|
||||
# 尝试加载以目标列名为前缀的模型文件
|
||||
filename = f"{safe_target_name}_{preprocess_method}_{model_name}.joblib"
|
||||
filepath = artifacts_dir / filename
|
||||
|
||||
if filepath.exists():
|
||||
print(f"加载模型文件: {filepath}")
|
||||
return joblib.load(filepath)
|
||||
|
||||
# 如果带前缀的文件不存在,尝试加载旧版本的文件
|
||||
old_filename = f"{preprocess_method}_{model_name}.joblib"
|
||||
old_filepath = artifacts_dir / old_filename
|
||||
|
||||
if old_filepath.exists():
|
||||
print(f"加载旧版本模型文件: {old_filepath}")
|
||||
return joblib.load(old_filepath)
|
||||
|
||||
raise FileNotFoundError(f"模型文件不存在: {filepath} 或 {old_filepath}")
|
||||
else:
|
||||
# 兼容旧版本,使用固定文件名
|
||||
filename = f"{preprocess_method}_{model_name}.joblib"
|
||||
filepath = artifacts_dir / filename
|
||||
|
||||
if not filepath.exists():
|
||||
raise FileNotFoundError(f"模型文件不存在: {filepath}")
|
||||
|
||||
return joblib.load(filepath)
|
||||
|
||||
    def plot_best_model_scatter(self, artifacts_dir: str, csv_path: str, output_dir: str,
                                folder_name: str, metric: str = 'test_r2',
                                target_column: int = None, feature_start_column: int = 13,
                                test_size: float = 0.2, random_state: int = 42):
        """Draw the scatter plot for the best model found in one folder.

        Reads the training summary in ``artifacts_dir``, reloads the raw
        data, re-applies the winning preprocessing and split, predicts with
        the persisted best model and saves a confidence-band scatter plot.

        Args:
            artifacts_dir: directory with the saved models and result CSVs.
            csv_path: path of the original CSV data file.
            output_dir: directory for the generated PNG.
            folder_name: folder name, used both as image name and as the
                target column name.
            metric: metric used to rank model combinations.
            target_column: target column index (fallback when folder_name is
                not a column).
            feature_start_column: index of the first feature column.
            test_size: test-set fraction for the re-split.
            random_state: seed for the random split.

        Returns:
            On success, a dict with 'status', 'save_path', 'best_result' and
            recomputed 'metrics'; on any failure, {'status': 'error', 'error': ...}.
        """
        artifacts_path = Path(artifacts_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        try:
            print(f"\n{'='*60}")
            print(f"处理文件夹: {folder_name}")
            print(f"{'='*60}")

            # Pick the best (split, preprocess, model) combination from the
            # saved training summary.
            model_file_prefix, model_name, best_result = self.get_best_model_from_summary(
                artifacts_path, metric, folder_name
            )

            # Load the data, preferring the folder name as the target column name.
            X_raw, y_true = self.load_data(csv_path, target_column_name=folder_name, target_column=target_column, feature_start_column=feature_start_column)

            # Preprocessing and split method the winning model was trained with.
            actual_preprocess_method = best_result['preprocess_method']
            split_method = best_result['split_method']

            # Load the persisted best model.
            # NOTE(review): model_file_prefix ("<split>_<preprocess>") is
            # passed as load_model's preprocess_method argument, producing a
            # "<target>_<split>_<preprocess>_<model>.joblib" file name —
            # presumably matching how training saved it; confirm.
            best_model_data = self.load_model(artifacts_path, model_file_prefix, model_name, folder_name)
            best_model = best_model_data['model']

            # Re-apply the same preprocessing.
            X_processed = self.preprocess_data(X_raw, actual_preprocess_method)

            # Re-split with the same method (deterministic for spxy/ks; the
            # random split relies on the same seed).
            X_train, X_test, y_train, y_test = self.split_data(
                X_processed, y_true, method=split_method,
                test_size=test_size, random_state=random_state
            )

            # Predict both subsets with the loaded model.
            y_pred_train = best_model.predict(X_train)
            y_pred_test = best_model.predict(X_test)

            # Recompute evaluation metrics for the plot legend and the result.
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)
            train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
            train_mae = mean_absolute_error(y_train, y_pred_train)
            test_mae = mean_absolute_error(y_test, y_pred_test)

            # Draw and save the confidence-band scatter plot.
            self.plot_scatter_with_confidence(
                y_train, y_pred_train, y_test, y_pred_test,
                train_r2, train_mae, test_r2, test_mae,
                folder_name, split_method, actual_preprocess_method, model_name,
                output_path / f"{folder_name}_scatter_with_confidence.png"
            )

            plt.close()  # release the figure's memory

            return {
                'status': 'success',
                'save_path': str(output_path / f"{folder_name}_scatter_with_confidence.png"),
                'best_result': best_result,
                'metrics': {
                    'train_r2': train_r2,
                    'test_r2': test_r2,
                    'train_rmse': train_rmse,
                    'test_rmse': test_rmse,
                    'train_mae': train_mae,
                    'test_mae': test_mae
                }
            }

        except Exception as e:
            # Best-effort batch processing: report the failure to the caller
            # instead of aborting the whole batch.
            print(f"处理文件夹 {folder_name} 失败: {e}")
            return {
                'status': 'error',
                'error': str(e)
            }
|
||||
|
||||
def batch_plot_scatter(self, models_root_dir: str, csv_path: str, output_dir: str,
|
||||
metric: str = 'test_r2', target_column: int = None,
|
||||
feature_start_column: int = 13, test_size: float = 0.2,
|
||||
random_state: int = 42):
|
||||
"""
|
||||
批量处理多个子文件夹中的模型并绘制散点图
|
||||
|
||||
Args:
|
||||
models_root_dir: 包含多个子文件夹的根目录
|
||||
csv_path: 原始CSV数据文件路径
|
||||
output_dir: 输出目录
|
||||
metric: 评估指标
|
||||
target_column: 目标值列索引(如果为None,则使用文件夹名称作为列名)
|
||||
feature_start_column: 特征开始列索引
|
||||
test_size: 测试集比例
|
||||
random_state: 随机种子
|
||||
"""
|
||||
models_root = Path(models_root_dir)
|
||||
|
||||
# 查找所有子文件夹
|
||||
subdirs = [d for d in models_root.iterdir() if d.is_dir()]
|
||||
|
||||
if not subdirs:
|
||||
print(f"在目录 {models_root_dir} 中未找到子文件夹")
|
||||
return {}
|
||||
|
||||
print("=" * 80)
|
||||
print("批量散点图绘制任务")
|
||||
print("=" * 80)
|
||||
print(f"模型根目录: {models_root_dir}")
|
||||
print(f"数据文件: {csv_path}")
|
||||
print(f"输出目录: {output_dir}")
|
||||
print(f"评估指标: {metric}")
|
||||
print(f"找到 {len(subdirs)} 个模型子文件夹")
|
||||
print("=" * 80)
|
||||
|
||||
all_results = {}
|
||||
|
||||
for subdir in subdirs:
|
||||
folder_name = subdir.name
|
||||
result = self.plot_best_model_scatter(
|
||||
artifacts_dir=str(subdir),
|
||||
csv_path=csv_path,
|
||||
output_dir=output_dir,
|
||||
folder_name=folder_name,
|
||||
metric=metric,
|
||||
target_column=target_column,
|
||||
feature_start_column=feature_start_column,
|
||||
test_size=test_size,
|
||||
random_state=random_state
|
||||
)
|
||||
|
||||
all_results[folder_name] = result
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"批量散点图绘制完成,共处理 {len(subdirs)} 个模型文件夹")
|
||||
print(f"{'='*80}")
|
||||
|
||||
# 打印汇总信息
|
||||
print("\n汇总结果:")
|
||||
success_count = 0
|
||||
for folder_name, result in all_results.items():
|
||||
if result['status'] == 'success':
|
||||
metrics = result['metrics']
|
||||
print(f" ✓ {folder_name}: 测试集R²={metrics['test_r2']:.4f}, "
|
||||
f"RMSE={metrics['test_rmse']:.4f}")
|
||||
success_count += 1
|
||||
else:
|
||||
print(f" ✗ {folder_name}: 失败 - {result['error']}")
|
||||
|
||||
print(f"\n成功处理: {success_count}/{len(subdirs)} 个文件夹")
|
||||
print(f"输出目录: {output_dir}")
|
||||
|
||||
return all_results
|
||||
|
||||
|
||||
def main():
    """Example entry point: batch-plot scatter charts for all model folders."""
    plotter = WaterQualityScatterBatch()

    # Input/output locations for this example run.
    models_root_dir = r"E:\code\WQ\yaobao925\qvchuyaoban"        # root with model subfolders
    csv_path = r"E:\code\WQ\yaobao925\data\qvyaoban\data.csv"    # raw data file
    output_dir = r"E:\code\WQ\yaobao925\plot\qvyaoban_sctter"    # scatter-plot output dir

    plotter.batch_plot_scatter(
        models_root_dir=models_root_dir,
        csv_path=csv_path,
        output_dir=output_dir,
        metric='test_r2',          # ranking metric
        target_column=None,        # use folder names as target column names
        feature_start_column=13,   # first feature column index
        test_size=0.2,             # test-set fraction
        random_state=42,           # random-split seed
    )

    print("\n任务完成!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user