Initial commit of WQ_GUI

2026-04-08 15:25:08 +08:00
commit 91e36407ae
302 changed files with 40872 additions and 0 deletions


@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

File diff suppressed because it is too large


@@ -0,0 +1,894 @@
import numpy as np
import pandas as pd
import joblib
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple, Optional
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import scipy.stats as stats
warnings.filterwarnings('ignore')
# Font setup: CJK-capable fallbacks so Chinese target names render in figures
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 12
# Machine-learning model imports (regression models)
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cross_decomposition import PLSRegression
# Optional third-party boosters: import when available, otherwise disable them
try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    LGB_AVAILABLE = False
try:
    import catboost as cb
    CB_AVAILABLE = True
except ImportError:
    CB_AVAILABLE = False
# Spectral preprocessing module
from src.preprocessing.spectral_Preprocessing import Preprocessing
class WaterQualityScatterBatch:
"""水质参数反演批量散点图绘制类"""
def __init__(self):
"""初始化批量散点图绘制类"""
# 定义支持的回归模型及其参数网格
self.model_configs = {
'SVR': {
'model': SVR,
'params': {
'C': [0.1, 1, 10, 100],
'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
'kernel': ['rbf', 'poly', 'sigmoid'],
'epsilon': [0.01, 0.1, 0.2]
},
'available': True
},
'RF': {
'model': RandomForestRegressor,
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
},
'available': True
},
'KNN': {
'model': KNeighborsRegressor,
'params': {
'n_neighbors': [3, 5, 7, 9, 11],
'weights': ['uniform', 'distance'],
'metric': ['euclidean', 'manhattan', 'minkowski']
},
'available': True
},
'LinearRegression': {
'model': LinearRegression,
'params': {
'fit_intercept': [True, False]
},
'available': True
},
'Ridge': {
'model': Ridge,
'params': {
'alpha': [0.01, 0.1, 1, 10, 100],
'fit_intercept': [True, False]
},
'available': True
},
'Lasso': {
'model': Lasso,
'params': {
'alpha': [0.01, 0.1, 1, 10, 100],
'fit_intercept': [True, False],
'max_iter': [1000, 2000]
},
'available': True
},
'ElasticNet': {
'model': ElasticNet,
'params': {
'alpha': [0.01, 0.1, 1, 10],
'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
'fit_intercept': [True, False],
'max_iter': [1000, 2000]
},
'available': True
},
'XGBoost': {
                'model': None,  # xgboost was removed from the dependencies; entry kept but disabled
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.8, 0.9, 1.0]
},
'available': False
},
'LightGBM': {
'model': lgb.LGBMRegressor if LGB_AVAILABLE else None,
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'num_leaves': [31, 50, 100]
},
'available': LGB_AVAILABLE
},
'CatBoost': {
'model': cb.CatBoostRegressor if CB_AVAILABLE else None,
'params': {
'iterations': [50, 100, 200],
'depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'l2_leaf_reg': [1, 3, 5]
},
'available': CB_AVAILABLE
},
'PLS': {
'model': PLSRegression,
'params': {
'n_components': [2, 3, 5, 7, 10]
},
'available': True
}
}
        # Available preprocessing methods
self.preprocessing_methods = [
"None", "MMS", "SS", "CT", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "WVAE"
]
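        # Likely expansions (assumption; the authoritative list lives in spectral_Preprocessing):
        # MMS=min-max scaling, SS=standard scaling, CT=centering, SNV=standard normal variate,
        # MA=moving average, SG=Savitzky-Golay, MSC=multiplicative scatter correction,
        # D1/D2=first/second derivative, DT=detrending, WVAE=wavelet-based denoising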
        # Sample-splitting methods
self.split_methods = ["random", "spxy", "ks"]
    def load_data(self, csv_path: str, target_column_name: str = None, target_column: int = None, feature_start_column: int = 13) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Load data from a CSV file.
        Args:
            csv_path: path to the CSV file
            target_column_name: name of the target column (takes precedence)
            target_column: index of the target column (used when no name is given)
            feature_start_column: index of the first feature column
        Returns:
            X: feature data
            y: target values
        """
        data = pd.read_csv(csv_path)
        # Select the target column by name or by index
        if target_column_name and target_column_name in data.columns:
            print(f"Using column name '{target_column_name}' as the target")
            y = data[target_column_name]
            target_col_index = data.columns.get_loc(target_column_name)
        elif target_column is not None:
            print(f"Using column index {target_column} as the target")
            y = data.iloc[:, target_column]
            target_col_index = target_column
        else:
            raise ValueError("Either target_column_name or target_column must be given")
        # Drop rows with missing target values, then extract the features
        mask = ~y.isna()
        data_cleaned = data[mask]
        if target_column_name and target_column_name in data.columns:
            y = data_cleaned[target_column_name]
        else:
            y = data_cleaned.iloc[:, target_col_index]
        X = data_cleaned.iloc[:, feature_start_column:]
        print("Data loaded:")
        print(f"  target column: {target_column_name if target_column_name else f'index {target_col_index}'}")
        print(f"  samples: {X.shape[0]}")
        print(f"  features: {X.shape[1]}")
        print(f"  target range: {y.min():.4f} ~ {y.max():.4f}")
        print(f"  target mean: {y.mean():.4f}")
        return X, y
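    # Assumed CSV layout: columns [0, feature_start_column) hold sample metadata and
    # measured targets; spectral features start at feature_start_column (default 13).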
    def preprocess_data(self, X: pd.DataFrame, method: str) -> np.ndarray:
        """
        Preprocess the data.
        Args:
            X: raw feature data
            method: preprocessing method
        Returns:
            the preprocessed data
        """
        print(f"Applying preprocessing method: {method}")
        # "None" means no preprocessing: return the raw data
        if method == "None" or method is None:
            print("Skipping preprocessing; using raw data")
            return X.values
        try:
            X_processed = Preprocessing(method, X)
            # Make sure a NumPy array is returned
            if isinstance(X_processed, pd.DataFrame):
                X_processed = X_processed.values
            print(f"Preprocessing done; data shape: {X_processed.shape}")
            return X_processed
        except Exception as e:
            print(f"Preprocessing failed: {e}")
            print("Falling back to raw data")
            return X.values
def random(self, data, label, test_ratio=0.2, random_state=123):
"""随机划分数据集"""
X_train, X_test, y_train, y_test = train_test_split(
data, label, test_size=test_ratio, random_state=random_state
)
return X_train, X_test, y_train, y_test
def spxy(self, data, label, test_size=0.2):
"""SPXY算法划分数据集"""
# 确保 data 和 label 是 NumPy 数组
data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
label = label.to_numpy() if isinstance(label, pd.Series) else label
        # Keep the original labels; the standardized copy below is only used for distances
        y_backup = label
M = data.shape[0]
N = round((1 - test_size) * M)
samples = np.arange(M)
        # Standardize the labels
label = (label - np.mean(label)) / np.std(label)
D = np.zeros((M, M))
Dy = np.zeros((M, M))
        # Pairwise distances between samples, in X and in y
for i in range(M - 1):
xa = data[i, :]
ya = label[i]
for j in range((i + 1), M):
xb = data[j, :]
yb = label[j]
D[i, j] = np.linalg.norm(xa - xb)
Dy[i, j] = np.linalg.norm(ya - yb)
        # Normalize and combine the two distance matrices
Dmax = np.max(D)
Dymax = np.max(Dy)
D = D / Dmax + Dy / Dymax
        # Seed the selection with the two most distant samples
maxD = D.max(axis=0)
index_row = D.argmax(axis=0)
index_column = maxD.argmax()
m = np.zeros(N, dtype=int)
m[0] = index_row[index_column]
m[1] = index_column
dminmax = np.zeros(N)
dminmax[1] = D[m[0], m[1]]
        # Greedily add the sample with the largest minimum distance to the selected set
for i in range(2, N):
pool = np.delete(samples, m[:i])
dmin = np.zeros(M - i)
for j in range(M - i):
indexa = pool[j]
d = np.zeros(i)
for k in range(i):
indexb = m[k]
if indexa < indexb:
d[k] = D[indexa, indexb]
else:
d[k] = D[indexb, indexa]
dmin[j] = np.min(d)
dminmax[i] = np.max(dmin)
index = np.argmax(dmin)
m[i] = pool[index]
m_complement = np.delete(samples, m)
        # Split into training and test sets (with the original labels)
X_train = data[m, :]
y_train = y_backup[m]
X_test = data[m_complement, :]
y_test = y_backup[m_complement]
return X_train, X_test, y_train, y_test
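    # Minimal SPXY usage sketch (shapes assumed): for spectra X of shape (n, p) and
    # labels y of shape (n,),
    #   X_tr, X_te, y_tr, y_te = WaterQualityScatterBatch().spxy(X, y, test_size=0.2)
    # Extreme samples land in the training set, and the O(n^2) distance matrix
    # limits n in practice.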
def ks(self, data, label, test_size=0.2):
"""Kennard-Stone算法划分数据集"""
# 确保 data 和 label 是 NumPy 数组
data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
label = label.to_numpy() if isinstance(label, pd.Series) else label
M = data.shape[0]
N = round((1 - test_size) * M)
samples = np.arange(M)
D = np.zeros((M, M))
for i in range((M - 1)):
xa = data[i, :]
for j in range((i + 1), M):
xb = data[j, :]
D[i, j] = np.linalg.norm(xa - xb)
maxD = np.max(D, axis=0)
index_row = np.argmax(D, axis=0)
index_column = np.argmax(maxD)
        m = np.zeros(N, dtype=int)
        m[0] = index_row[index_column]
        m[1] = index_column
dminmax = np.zeros(N)
dminmax[1] = D[m[0], m[1]]
for i in range(2, N):
pool = np.delete(samples, m[:i])
dmin = np.zeros((M - i))
for j in range((M - i)):
indexa = pool[j]
d = np.zeros(i)
for k in range(i):
indexb = m[k]
if indexa < indexb:
d[k] = D[indexa, indexb]
else:
d[k] = D[indexb, indexa]
dmin[j] = np.min(d)
dminmax[i] = np.max(dmin)
index = np.argmax(dmin)
m[i] = pool[index]
m_complement = np.delete(np.arange(data.shape[0]), m)
X_train = data[m, :]
y_train = label[m]
X_test = data[m_complement, :]
y_test = label[m_complement]
return X_train, X_test, y_train, y_test
def split_data(self, X: np.ndarray, y: pd.Series, method: str = "random",
test_size: float = 0.2, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""
根据指定方法划分数据集
"""
print(f"使用 {method} 方法划分数据集")
if method == "random":
return self.random(X, y, test_ratio=test_size, random_state=random_state)
elif method == "spxy":
return self.spxy(X, y, test_size=test_size)
elif method == "ks":
return self.ks(X, y, test_size=test_size)
else:
raise ValueError(f"不支持的划分方法: {method}. 支持的方法: {self.split_methods}")
def plot_scatter_with_confidence(self, y_train, y_pred_train, y_test, y_pred_test,
r2_train, mae_train, r2_test, mae_test,
folder_name, split_method, preprocess_method, model_name,
save_path):
"""
绘制带置信区间的散点图,模仿提供的代码样式
参数:
- y_train, y_pred_train: 训练集的真实值和预测值
- y_test, y_pred_test: 测试集的真实值和预测值
- r2_train, mae_train: 训练集的R²和MAE指标
- r2_test, mae_test: 测试集的R²和MAE指标
- folder_name: 文件夹名称
- split_method: 数据划分方法
- preprocess_method: 预处理方法
- model_name: 模型名称
- save_path: 保存路径
"""
# scale_factor 用于放大置信区间
scale_factor = 1.5 # 调整这个值,越大置信区间越宽 scale_factor = 1 是理论上的标准置信区间宽度
confidence = 0.95 # 95% 的置信水平
        # Fit a line to the training set
z_train = np.polyfit(y_train, y_pred_train, 1)
p_train = np.poly1d(z_train)
predicted_values_train = p_train(y_train)
residuals_train = y_pred_train - predicted_values_train
mean_error_train = np.mean(residuals_train**2)
t_value_train = stats.t.ppf((1 + confidence) / 2., len(y_train) - 1)
x_extended_train = np.linspace(min(y_train), max(y_train), 100)
predicted_extended_train = p_train(x_extended_train)
ci_extended_train = t_value_train * scale_factor * np.sqrt(mean_error_train) * np.sqrt(1 / len(y_train) + (x_extended_train - np.mean(y_train))**2 / np.sum((y_train - np.mean(y_train))**2))
        # Fit a line to the test set
z_test = np.polyfit(y_test, y_pred_test, 1)
p_test = np.poly1d(z_test)
predicted_values_test = p_test(y_test)
residuals_test = y_pred_test - predicted_values_test
mean_error_test = np.mean(residuals_test**2)
t_value_test = stats.t.ppf((1 + confidence) / 2., len(y_test) - 1)
x_extended_test = np.linspace(min(y_test), max(y_test), 100)
predicted_extended_test = p_test(x_extended_test)
ci_extended_test = t_value_test * scale_factor * np.sqrt(mean_error_test) * np.sqrt(1 / len(y_test) + (x_extended_test - np.mean(y_test))**2 / np.sum((y_test - np.mean(y_test))**2))
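        # Band shape: the nominal 95% CI of the fitted mean response at x is
        #   t * sqrt(MSE) * sqrt(1/n + (x - x_mean)^2 / sum((x_i - x_mean)^2)),
        # evaluated on a 100-point grid and widened by scale_factor.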
        # Color scheme
        train_color = '#1f77b4'   # train: blue
        test_color = '#ff7f0e'    # test: orange
        confidence_train_color = '#aec7e8'  # train confidence band: light blue
        confidence_test_color = '#ffbb78'   # test confidence band: light orange
        # Figure layout: main scatter panel plus marginal histograms
        fig = plt.figure(figsize=(10, 8), dpi=300)  # moderate dpi for compatibility
        gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)
        ax_main = fig.add_subplot(gs[1:, :-1])                   # main panel
        ax_hist_x = fig.add_subplot(gs[0, :-1], sharex=ax_main)  # top histogram
        ax_hist_y = fig.add_subplot(gs[1:, -1], sharey=ax_main)  # right histogram
        # Training set: scatter, fit line, confidence band
        ax_main.scatter(y_train, y_pred_train, color=train_color, label="Train predictions", alpha=0.6)
        ax_main.plot(y_train, p_train(y_train), color=train_color, alpha=0.9,
                     label=f"Train fit\n$R^2$ = {r2_train:.2f}, MAE = {mae_train:.2f}")
        ax_main.fill_between(x_extended_train, predicted_extended_train - ci_extended_train,
                             predicted_extended_train + ci_extended_train,
                             color=confidence_train_color, alpha=0.5, label="Train 95% CI")
        # Test set: scatter, fit line, confidence band
        ax_main.scatter(y_test, y_pred_test, color=test_color, label="Test predictions", alpha=0.6)
        ax_main.plot(y_test, p_test(y_test), color=test_color, alpha=0.9,
                     label=f"Test fit\n$R^2$ = {r2_test:.2f}, MAE = {mae_test:.2f}")
        ax_main.fill_between(x_extended_test, predicted_extended_test - ci_extended_test,
                             predicted_extended_test + ci_extended_test,
                             color=confidence_test_color, alpha=0.5, label="Test 95% CI")
        # 1:1 reference line
        ax_main.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
                     [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
                     color='grey', linestyle='--', alpha=0.6, label="1:1 line")
        # Main-panel cosmetics
        ax_main.set_xlabel("Observed", fontsize=12)
        ax_main.set_ylabel("Predicted", fontsize=12)
        ax_main.legend(loc="upper left", fontsize=10)
        ax_main.grid(True, alpha=0.3)
        # Top histogram: distribution of observed values
        ax_hist_x.hist(y_train, bins=20, color=train_color, alpha=0.7, edgecolor='black', label="Train observed")
        ax_hist_x.hist(y_test, bins=20, color=test_color, alpha=0.7, edgecolor='black', label="Test observed")
        ax_hist_x.tick_params(labelbottom=False)  # hide x tick labels
        ax_hist_x.set_ylabel("Count", fontsize=10)
        ax_hist_x.legend(fontsize=8)
        # Right histogram: distribution of predicted values
        ax_hist_y.hist(y_pred_train, bins=20, orientation='horizontal', color=train_color, alpha=0.7, edgecolor='black')
        ax_hist_y.hist(y_pred_test, bins=20, orientation='horizontal', color=test_color, alpha=0.7, edgecolor='black')
        ax_hist_y.set_xlabel("Count", fontsize=10)
        ax_hist_y.tick_params(labelleft=False)  # hide y tick labels
        # Title
        title = f'{folder_name} - best-model prediction comparison\n'
        title += f'{split_method}_{preprocess_method}_{model_name}'
        fig.suptitle(title, fontsize=14, fontweight='bold')
        # Save the figure
        plt.tight_layout()
        plt.savefig(save_path, format='png', bbox_inches='tight', dpi=300)
        print(f"Scatter plot saved to: {save_path}")
    def get_best_model_from_summary(self, artifacts_dir: Path, metric: str = 'test_r2', target_column_name: str = None) -> Tuple[str, str, Dict]:
        """
        Pick the best model from the training summary.
        Args:
            artifacts_dir: model directory
            metric: evaluation metric
            target_column_name: target column name (used to build file paths)
        Returns:
            model_file_prefix: "<split_method>_<preprocess_method>" file prefix
            model_name: model name
            best_result: details of the best model
        """
        if target_column_name:
            # Sanitize the target column name for use in file names
            safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
            # Prefer result files prefixed with the target column name
            detailed_path = artifacts_dir / f"{safe_target_name}_detailed_results.csv"
            summary_path = artifacts_dir / f"{safe_target_name}_training_summary.csv"
        else:
            # Legacy layout with fixed file names
            detailed_path = artifacts_dir / "detailed_results.csv"
            summary_path = artifacts_dir / "training_summary.csv"
summary_df = None
        # Prefer the detailed results file
        if detailed_path.exists():
            print(f"Using detailed results file: {detailed_path}")
            summary_df = pd.read_csv(detailed_path)
            # Map English metric keys to the Chinese column headers used in the CSV
            metric_mapping = {
                'test_r2': '测试集R²',
                'train_r2': '训练集R²',
                'test_rmse': '测试集RMSE',
                'train_rmse': '训练集RMSE',
                'cv_mean': 'CV均值'
            }
            if metric in metric_mapping and metric_mapping[metric] in summary_df.columns:
                metric_col = metric_mapping[metric]
            else:
                metric_col = metric
        elif summary_path.exists():
            print(f"Using training summary file: {summary_path}")
            summary_df = pd.read_csv(summary_path)
            metric_col = metric
        else:
            # Fall back to the legacy (unprefixed) file names
            if target_column_name:
                old_detailed_path = artifacts_dir / "detailed_results.csv"
                old_summary_path = artifacts_dir / "training_summary.csv"
                if old_detailed_path.exists():
                    print(f"Using legacy detailed results file: {old_detailed_path}")
                    summary_df = pd.read_csv(old_detailed_path)
                    # Map English metric keys to the Chinese column headers used in the CSV
                    metric_mapping = {
                        'test_r2': '测试集R²',
                        'train_r2': '训练集R²',
                        'test_rmse': '测试集RMSE',
                        'train_rmse': '训练集RMSE',
                        'cv_mean': 'CV均值'
                    }
                    if metric in metric_mapping and metric_mapping[metric] in summary_df.columns:
                        metric_col = metric_mapping[metric]
                    else:
                        metric_col = metric
                elif old_summary_path.exists():
                    print(f"Using legacy training summary file: {old_summary_path}")
                    summary_df = pd.read_csv(old_summary_path)
                    metric_col = metric
                else:
                    raise FileNotFoundError(
                        f"No training summary found; looked for {summary_path}, {detailed_path}, "
                        f"{old_summary_path} and {old_detailed_path}"
                    )
            else:
                raise FileNotFoundError(f"No training summary found; looked for {summary_path} and {detailed_path}")
        if summary_df.empty:
            raise ValueError("Training summary is empty")
        # Check that the metric column exists
        if metric_col not in summary_df.columns:
            available_cols = list(summary_df.columns)
            raise ValueError(f"Metric '{metric_col}' not found. Available columns: {available_cols}")
        # Higher is better for R²/score metrics; lower is better for RMSE/MAE
        if 'r2' in metric.lower() or 'score' in metric.lower():
            best_idx = summary_df[metric_col].idxmax()
        else:
            best_idx = summary_df[metric_col].idxmin()
        best_row = summary_df.loc[best_idx]
        # Parse the model information; the layout depends on the file type
        if '划分方法' in summary_df.columns:
            # Detailed results file (Chinese column headers)
            split_method = best_row['划分方法']          # split method
            preprocess_method = best_row['预处理方法']   # preprocessing method
            model_name = best_row['建模方法']            # modeling method
            best_combination = f"{split_method}_{preprocess_method}_{model_name}"
        else:
            # Simplified results file (English column headers)
            best_combination = best_row['combination']
            # Combination format: split_method_preprocess_method_model_name
            parts = best_combination.split('_')
            if len(parts) < 3:
                raise ValueError(f"Invalid model combination name: {best_combination}")
            split_method = parts[0]
            preprocess_method = parts[1]
            model_name = '_'.join(parts[2:])
        print(f"Best model combination: {best_combination}")
        print(f"  split method: {split_method}")
        print(f"  preprocessing method: {preprocess_method}")
        print(f"  model name: {model_name}")
        print(f"  {metric_col}: {best_row[metric_col]:.4f}")
        # File prefix used when loading the saved model
        model_file_prefix = f"{split_method}_{preprocess_method}"
        # Assemble the result record
        best_result = {
            'combination': best_combination,
            'split_method': split_method,
            'preprocess_method': preprocess_method,
            'model_name': model_name,
            'metric_value': best_row[metric_col],
            'model_file_prefix': model_file_prefix
        }
        # Copy any additional metric columns (skip the descriptive Chinese headers)
        for col in summary_df.columns:
            if col not in ['combination', '划分方法', '预处理方法', '建模方法', '最佳参数']:
                try:
                    best_result[col] = best_row[col]
                except Exception:
                    pass
        return model_file_prefix, model_name, best_result
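    # Hypothetical call (the path and target name are illustrative only):
    #   prefix, name, info = self.get_best_model_from_summary(Path("models/COD"), "test_r2", "COD")
    # would return something like ("spxy_SNV", "SVR", {...}) for the best-scoring row.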
    def load_model(self, artifacts_dir: Path, preprocess_method: str, model_name: str, target_column_name: str = None):
        """
        Load a saved model.
        Args:
            artifacts_dir: model directory
            preprocess_method: preprocessing method name (or a "<split>_<preprocess>" prefix)
            model_name: model name
            target_column_name: target column name (used to build file paths)
        Returns:
            the loaded model payload
        """
        if target_column_name:
            # Sanitize the target column name for use in file names
            safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
            # Prefer the model file prefixed with the target column name
            filename = f"{safe_target_name}_{preprocess_method}_{model_name}.joblib"
            filepath = artifacts_dir / filename
            if filepath.exists():
                print(f"Loading model file: {filepath}")
                return joblib.load(filepath)
            # Fall back to the legacy (unprefixed) file name
            old_filename = f"{preprocess_method}_{model_name}.joblib"
            old_filepath = artifacts_dir / old_filename
            if old_filepath.exists():
                print(f"Loading legacy model file: {old_filepath}")
                return joblib.load(old_filepath)
            raise FileNotFoundError(f"Model file not found: {filepath} or {old_filepath}")
        else:
            # Legacy layout with fixed file names
            filename = f"{preprocess_method}_{model_name}.joblib"
            filepath = artifacts_dir / filename
            if not filepath.exists():
                raise FileNotFoundError(f"Model file not found: {filepath}")
            return joblib.load(filepath)
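    # The loaded .joblib payload is assumed to be a dict with at least a 'model' key
    # holding the fitted estimator; plot_best_model_scatter below relies on that.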
def plot_best_model_scatter(self, artifacts_dir: str, csv_path: str, output_dir: str,
folder_name: str, metric: str = 'test_r2',
target_column: int = None, feature_start_column: int = 13,
test_size: float = 0.2, random_state: int = 42):
"""
绘制最佳模型的散点图
Args:
artifacts_dir: 模型目录
csv_path: 原始CSV数据文件路径
output_dir: 输出目录
folder_name: 文件夹名称(用作图片名称和目标列名)
metric: 评估指标
target_column: 目标值列索引如果为None则使用folder_name作为列名
feature_start_column: 特征开始列索引
test_size: 测试集比例
random_state: 随机种子
"""
artifacts_path = Path(artifacts_dir)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
        try:
            print(f"\n{'='*60}")
            print(f"Processing folder: {folder_name}")
            print(f"{'='*60}")
            # Look up the best model
            model_file_prefix, model_name, best_result = self.get_best_model_from_summary(
                artifacts_path, metric, folder_name
            )
            # Load the data, preferring the folder name as the target column name
            X_raw, y_true = self.load_data(csv_path, target_column_name=folder_name, target_column=target_column, feature_start_column=feature_start_column)
            # Preprocessing and split settings of the best model
            actual_preprocess_method = best_result['preprocess_method']
            split_method = best_result['split_method']
            # Load the best model
            best_model_data = self.load_model(artifacts_path, model_file_prefix, model_name, folder_name)
            best_model = best_model_data['model']
            # Apply the same preprocessing as during training
            X_processed = self.preprocess_data(X_raw, actual_preprocess_method)
            # Apply the same data split as during training
            X_train, X_test, y_train, y_test = self.split_data(
                X_processed, y_true, method=split_method,
                test_size=test_size, random_state=random_state
            )
            # Predict on the training and test sets
            y_pred_train = best_model.predict(X_train)
            y_pred_test = best_model.predict(X_test)
            # Evaluation metrics
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)
            train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
            train_mae = mean_absolute_error(y_train, y_pred_train)
            test_mae = mean_absolute_error(y_test, y_pred_test)
            # Draw the scatter plot with confidence bands
            self.plot_scatter_with_confidence(
                y_train, y_pred_train, y_test, y_pred_test,
                train_r2, train_mae, test_r2, test_mae,
                folder_name, split_method, actual_preprocess_method, model_name,
                output_path / f"{folder_name}_scatter_with_confidence.png"
            )
            plt.close()  # release figure memory
return {
'status': 'success',
'save_path': str(output_path / f"{folder_name}_scatter_with_confidence.png"),
'best_result': best_result,
'metrics': {
'train_r2': train_r2,
'test_r2': test_r2,
'train_rmse': train_rmse,
'test_rmse': test_rmse,
'train_mae': train_mae,
'test_mae': test_mae
}
}
        except Exception as e:
            print(f"Failed to process folder {folder_name}: {e}")
return {
'status': 'error',
'error': str(e)
}
def batch_plot_scatter(self, models_root_dir: str, csv_path: str, output_dir: str,
metric: str = 'test_r2', target_column: int = None,
feature_start_column: int = 13, test_size: float = 0.2,
random_state: int = 42):
"""
批量处理多个子文件夹中的模型并绘制散点图
Args:
models_root_dir: 包含多个子文件夹的根目录
csv_path: 原始CSV数据文件路径
output_dir: 输出目录
metric: 评估指标
target_column: 目标值列索引如果为None则使用文件夹名称作为列名
feature_start_column: 特征开始列索引
test_size: 测试集比例
random_state: 随机种子
"""
models_root = Path(models_root_dir)
        # Collect all sub-folders
        subdirs = [d for d in models_root.iterdir() if d.is_dir()]
        if not subdirs:
            print(f"No sub-folders found in {models_root_dir}")
            return {}
        print("=" * 80)
        print("Batch scatter-plot task")
        print("=" * 80)
        print(f"Model root directory: {models_root_dir}")
        print(f"Data file: {csv_path}")
        print(f"Output directory: {output_dir}")
        print(f"Evaluation metric: {metric}")
        print(f"Found {len(subdirs)} model sub-folders")
        print("=" * 80)
all_results = {}
for subdir in subdirs:
folder_name = subdir.name
result = self.plot_best_model_scatter(
artifacts_dir=str(subdir),
csv_path=csv_path,
output_dir=output_dir,
folder_name=folder_name,
metric=metric,
target_column=target_column,
feature_start_column=feature_start_column,
test_size=test_size,
random_state=random_state
)
all_results[folder_name] = result
print(f"\n{'='*80}")
print(f"批量散点图绘制完成,共处理 {len(subdirs)} 个模型文件夹")
print(f"{'='*80}")
# 打印汇总信息
print("\n汇总结果:")
success_count = 0
for folder_name, result in all_results.items():
if result['status'] == 'success':
metrics = result['metrics']
print(f"{folder_name}: 测试集R²={metrics['test_r2']:.4f}, "
f"RMSE={metrics['test_rmse']:.4f}")
success_count += 1
else:
print(f"{folder_name}: 失败 - {result['error']}")
print(f"\n成功处理: {success_count}/{len(subdirs)} 个文件夹")
print(f"输出目录: {output_dir}")
return all_results
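    # Expected artifact layout (assumption, inferred from the loaders above):
    #   models_root_dir/<target>/<target>_detailed_results.csv
    #   models_root_dir/<target>/<target>_<split>_<preprocess>_<model>.joblib
    # where each sub-folder is named after the target column it models.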
def main():
    """Example entry point"""
    # Create the batch scatter plotter
    scatter_batch = WaterQualityScatterBatch()
    # Configure paths
    models_root_dir = r"E:\code\WQ\yaobao925\qvchuyaoban"  # root directory with model sub-folders
    csv_path = r"E:\code\WQ\yaobao925\data\qvyaoban\data.csv"  # original data file
    output_dir = r"E:\code\WQ\yaobao925\plot\qvyaoban_sctter"  # scatter-plot output directory
    # Draw the scatter plots in batch
    results = scatter_batch.batch_plot_scatter(
        models_root_dir=models_root_dir,
        csv_path=csv_path,
        output_dir=output_dir,
        metric='test_r2',          # evaluation metric
        target_column=None,        # use folder names as target column names
        feature_start_column=13,   # index of the first feature column
        test_size=0.2,             # test-set fraction
        random_state=42            # random seed
    )
    print("\nDone!")
if __name__ == "__main__":
main()
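# Note: the paths in main() are machine-specific; point models_root_dir, csv_path and
# output_dir at your own training artifacts before running this script directly.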