2504 lines
97 KiB
Python
2504 lines
97 KiB
Python
"""
|
||
回归分析工具包
|
||
支持多种回归算法:线性回归、LASSO、岭回归、Boosting、神经网络等
|
||
包含超参数调优、模型评价和保存功能
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
|
||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||
from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
||
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet
|
||
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
|
||
from sklearn.tree import DecisionTreeRegressor
|
||
from sklearn.svm import SVR
|
||
from sklearn.gaussian_process import GaussianProcessRegressor
|
||
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
|
||
from sklearn.neural_network import MLPRegressor
|
||
from sklearn.neighbors import KNeighborsRegressor
|
||
from sklearn.base import BaseEstimator, RegressorMixin
|
||
import xgboost as xgb
|
||
import lightgbm as lgb
|
||
from statsmodels.api import OLS, GLM
|
||
from statsmodels.genmod.families import Gaussian
|
||
import warnings
|
||
import joblib
|
||
import os
|
||
from datetime import datetime
|
||
import json
|
||
from scipy.linalg import pinv
|
||
from scipy import stats
|
||
from typing import Optional, List, Dict, Any, Union
|
||
from dataclasses import dataclass, field
|
||
import time
|
||
|
||
warnings.filterwarnings('ignore')
|
||
|
||
|
||
@dataclass
class DataConfig:
    """Data-loading configuration."""
    csv_path: str = ""  # path to the input CSV file (required, validated later)
    label_column: Union[str, int] = ""  # label column name or positional index
    spectrum_columns: Optional[Union[str, List[Union[str, int]]]] = None  # feature columns; None = all non-label columns
    test_size: float = 0.2  # fraction of samples held out for the test set (0 < x < 1)
    random_state: int = 42  # RNG seed for the train/test split
    scale_method: str = 'standard'  # feature scaling: 'standard' or 'minmax'
|
||
|
||
|
||
@dataclass
class ModelConfig:
    """Model-selection and hyper-parameter tuning configuration."""
    model_names: Optional[Union[str, List[str]]] = None  # model key(s), the string 'all', or None for a default subset
    tune_hyperparams: bool = True  # whether to run a hyper-parameter search
    tuning_method: str = 'grid'  # search strategy: 'grid' or 'random'
    cv_folds: int = 5  # cross-validation folds (must be >= 2)
    random_search_iter: int = 20  # iterations used by randomized search
|
||
|
||
|
||
@dataclass
class TrainingConfig:
    """Neural-network training configuration (LSTM/GRU and similar models)."""
    epochs: int = 100  # number of training epochs (> 0)
    batch_size: int = 32  # mini-batch size (> 0)
    learning_rate: float = 0.001  # optimizer learning rate (> 0)
|
||
|
||
|
||
@dataclass
class OutputConfig:
    """Output configuration: model persistence and plotting."""
    save_models: bool = True  # persist trained models to disk
    plot_results: bool = True  # generate evaluation plots
    save_dir: str = 'models'  # directory for saved models
    plot_dir: str = 'plots'  # directory for generated plots
|
||
|
||
|
||
@dataclass
class RegressionConfig:
    """Complete regression-analysis configuration — standardized interface for GUI integration.

    Aggregates the four configuration sections and validates them eagerly in
    ``__post_init__``, so an invalid configuration fails at construction time.
    """
    data: DataConfig = field(default_factory=DataConfig)
    models: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
    output: OutputConfig = field(default_factory=OutputConfig)

    def __post_init__(self):
        """Validate parameters and normalize defaults."""
        self._validate_parameters()

    def _validate_parameters(self):
        """Validate every configuration section.

        Raises:
            ValueError: on the first invalid parameter found.
        """
        # --- data section ---
        if not self.data.csv_path:
            raise ValueError("CSV file path must be specified")
        # Compare against None/"" explicitly: a plain truthiness test would
        # wrongly reject column index 0, which is a valid label column.
        if self.data.label_column is None or self.data.label_column == "":
            raise ValueError("Label column must be specified")
        if not (0 < self.data.test_size < 1):
            raise ValueError("Test size must be between 0 and 1")
        if self.data.scale_method not in ['standard', 'minmax']:
            raise ValueError("Scale method must be 'standard' or 'minmax'")

        # --- model section ---
        if self.models.tuning_method not in ['grid', 'random']:
            raise ValueError("Tuning method must be 'grid' or 'random'")
        if self.models.cv_folds < 2:
            raise ValueError("CV folds must be at least 2")

        # Normalize model_names ('all' / single name / list / None).
        self._process_model_names()

        # --- training section ---
        if self.training.epochs <= 0:
            raise ValueError("Epochs must be positive")
        if self.training.batch_size <= 0:
            raise ValueError("Batch size must be positive")
        if self.training.learning_rate <= 0:
            raise ValueError("Learning rate must be positive")

    def _process_model_names(self):
        """Normalize ``models.model_names``; supports the special string 'all'.

        After this call ``models.model_names`` is always a list of validated
        model keys.

        Raises:
            ValueError: when a requested model key is not supported.
        """
        # _get_supported_models is a pure lookup, so hoisting the call out of
        # the branches is safe and avoids repetition.
        supported_models = self._get_supported_models()

        if isinstance(self.models.model_names, str):
            if self.models.model_names.lower() == 'all':
                # Expand 'all' to every supported model key.
                self.models.model_names = list(supported_models.keys())
                print(f"选择所有可用模型: {len(self.models.model_names)} 个")
            else:
                # Single model name.
                if self.models.model_names not in supported_models:
                    raise ValueError(f"不支持的模型类型: {self.models.model_names}")
                self.models.model_names = [self.models.model_names]
        elif isinstance(self.models.model_names, list):
            # Validate every entry in the list.
            for model in self.models.model_names:
                if model not in supported_models:
                    raise ValueError(f"不支持的模型类型: {model}")
        elif self.models.model_names is None:
            # Sensible default subset of commonly used models.
            self.models.model_names = ['linear', 'ridge', 'lasso', 'randomforest', 'svm']

    def _get_supported_models(self) -> Dict[str, str]:
        """Return a mapping of model key -> human-readable name.

        Mirrors the models registered by RegressionAnalyzer.  Returns a fresh
        dict so callers may mutate the result safely.
        """
        return {
            # linear models
            'linear': '多元线性回归',
            'lasso': 'LASSO回归',
            'ridge': '岭回归',
            'elasticnet': '弹性网络回归',
            'bayesianridge': '贝叶斯岭回归',

            # boosting models
            'lsboost': '最小二乘提升',
            'xgboost': 'XGBoost回归',
            'lightgbm': 'LightGBM回归',

            # kernel methods
            'gaussian': '高斯过程回归',
            'gaussiansvm': '高斯SVM回归',
            'svm': '支持向量回归',

            # neural networks
            'elm': '极限学习机',
            'mlp': '多层感知机',
            'lstm': 'LSTM网络',
            'gru': 'GRU网络',

            # other models
            'gam': '广义加性模型',
            'decisiontree': '决策树回归',
            'randomforest': '随机森林回归',
            'extratrees': '极端随机树回归',
            'adaboost': 'AdaBoost回归'
        }

    @classmethod
    def _create_unvalidated(cls) -> 'RegressionConfig':
        """Build a config skeleton without triggering __post_init__ validation.

        Shared by the convenience constructors below, which need to set
        required fields before validation can succeed.
        """
        config = cls.__new__(cls)
        config.data = DataConfig()
        config.models = ModelConfig()
        config.training = TrainingConfig()
        config.output = OutputConfig()
        return config

    @classmethod
    def create_default(cls, csv_path: str, label_column: Union[str, int]) -> 'RegressionConfig':
        """Convenience constructor: default settings plus the two required fields."""
        config = cls._create_unvalidated()
        config.data.csv_path = csv_path
        config.data.label_column = label_column
        # Validate once the required fields are in place.
        config._validate_parameters()
        return config

    @classmethod
    def create_quick_analysis(cls, csv_path: str, label_column: Union[str, int],
                              model_names: Optional[List[str]] = None) -> 'RegressionConfig':
        """Convenience constructor for a quick analysis: no tuning, no model saving."""
        config = cls._create_unvalidated()
        config.data.csv_path = csv_path
        config.data.label_column = label_column
        config.models.model_names = model_names
        config.models.tune_hyperparams = False  # quick analysis skips tuning
        config.output.save_models = False  # and does not persist models
        # Validate once the required fields are in place.
        config._validate_parameters()
        return config
|
||
|
||
|
||
class ExtremeLearningMachine(BaseEstimator, RegressorMixin):
    """
    Extreme Learning Machine (ELM) regressor.

    A single-hidden-layer feed-forward network whose input weights are drawn
    at random and whose output weights are solved analytically, giving very
    fast training.

    Robustness fix: ``fit``/``predict`` now coerce their inputs with
    ``np.asarray`` so plain Python lists are accepted (the previous version
    crashed on ``X.shape`` / ``X @ ...`` for non-ndarray input).
    """

    def __init__(self, n_hidden=100, activation='sigmoid', random_state=42):
        self.n_hidden = n_hidden          # number of hidden units
        self.activation = activation      # nonlinearity name, see _activation_function
        self.random_state = random_state  # seed for the random input layer
        self.input_weights_ = None
        self.biases_ = None
        self.output_weights_ = None

    def _activation_function(self, X):
        """Apply the configured activation elementwise.

        Raises:
            ValueError: for an unknown activation name.
        """
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-X))
        elif self.activation == 'tanh':
            return np.tanh(X)
        elif self.activation == 'relu':
            return np.maximum(0, X)
        elif self.activation == 'linear':
            return X
        else:
            raise ValueError(f"Unsupported activation function: {self.activation}")

    def fit(self, X, y):
        """Fit the ELM: random hidden layer, closed-form output layer.

        Parameters:
            X: array-like of shape (n_samples, n_features)
            y: array-like of shape (n_samples,)

        Returns:
            self
        """
        # Accept any array-like input (robustness fix).
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        np.random.seed(self.random_state)
        n_samples, n_features = X.shape

        # Random (untrained) input weights and biases.
        self.input_weights_ = np.random.randn(n_features, self.n_hidden)
        self.biases_ = np.random.randn(self.n_hidden)

        # Hidden-layer activations.
        H = self._activation_function(X @ self.input_weights_ + self.biases_)

        # Append a constant column so the output layer has a bias term.
        H = np.column_stack([H, np.ones(n_samples)])

        # Closed-form least-squares solution via the pseudo-inverse.
        self.output_weights_ = pinv(H) @ y

        return self

    def predict(self, X):
        """Predict targets for X using the fitted network.

        Raises:
            ValueError: when called before ``fit``.
        """
        if self.input_weights_ is None:
            raise ValueError("模型还未训练")

        X = np.asarray(X, dtype=float)

        # Same hidden-layer mapping as in fit, including the bias column.
        H = self._activation_function(X @ self.input_weights_ + self.biases_)
        H = np.column_stack([H, np.ones(X.shape[0])])

        return H @ self.output_weights_

    def _more_tags(self):
        # Tell scikit-learn to skip its standard input validation for this estimator.
        return {'no_validation': True}
|
||
|
||
|
||
class GeneralizedAdditiveModel(BaseEstimator, RegressorMixin):
    """
    Generalized Additive Model (GAM) regressor using a truncated-power spline basis.

    Bug fix: the spline knots and per-feature boundaries are now computed once
    during ``fit`` and reused in ``predict``.  Previously ``self.knots_`` was
    never stored, so ``predict`` silently recomputed knots and boundaries from
    the *test* data's min/max, projecting train and test samples onto
    different bases.
    """

    def __init__(self, n_splines=10, degree=3, lambda_=0.1):
        self.n_splines = n_splines  # number of interior spline segments
        self.degree = degree        # polynomial degree of the basis
        self.lambda_ = lambda_      # ridge penalty on the basis coefficients
        self.coefficients_ = None
        self.knots_ = None          # per-feature knot arrays, set in fit()
        self.bounds_ = None         # per-feature (x_min, x_max), set in fit()

    def _create_spline_basis(self, X):
        """Build the truncated-power basis for X.

        On the first call after ``fit`` resets them, knots and boundaries are
        derived from X (the training data) and stored; subsequent calls
        (prediction) reuse the stored values so the basis is consistent.
        """
        n_samples, n_features = X.shape
        n_basis = self.n_splines + self.degree + 1

        first_call = self.knots_ is None
        if first_call:
            self.knots_ = []
            self.bounds_ = []

        basis_matrices = []
        for feature_idx in range(n_features):
            x = X[:, feature_idx]

            if first_call:
                # Derive knots/bounds from the (training) data and remember them.
                x_min, x_max = np.min(x), np.max(x)
                knots = np.linspace(x_min, x_max, self.n_splines + 2)[1:-1]
                self.knots_.append(knots)
                self.bounds_.append((x_min, x_max))
            else:
                knots = self.knots_[feature_idx]
                x_min, x_max = self.bounds_[feature_idx]

            # Truncated power basis (simplified stand-in for B-splines).
            basis = np.zeros((n_samples, n_basis))
            for i in range(n_basis):
                if i < self.degree + 1:
                    # Polynomial terms anchored at the left boundary.
                    basis[:, i] = np.power(np.maximum(0, x - x_min), i)
                elif i > n_basis - self.degree - 2:
                    # Polynomial terms anchored at the right boundary.
                    power = n_basis - 1 - i
                    basis[:, i] = np.power(np.maximum(0, x_max - x), power)
                else:
                    # Interior truncated-power terms.
                    basis[:, i] = np.power(np.maximum(0, x - knots[i - self.degree - 1]), self.degree)

            basis_matrices.append(basis)

        # Concatenate all per-feature bases column-wise.
        return np.concatenate(basis_matrices, axis=1)

    def fit(self, X, y):
        """Fit the GAM by ridge regression on the spline basis expansion."""
        from sklearn.linear_model import Ridge

        # Re-fitting must rebuild knots/bounds from the new training data.
        self.knots_ = None
        self.bounds_ = None

        X_basis = self._create_spline_basis(X)

        # Ridge regression supplies the regularized coefficient estimate.
        ridge = Ridge(alpha=self.lambda_, fit_intercept=True)
        ridge.fit(X_basis, y)

        self.coefficients_ = ridge.coef_
        self.intercept_ = ridge.intercept_

        return self

    def predict(self, X):
        """Predict using the basis defined by the training data.

        Raises:
            ValueError: when called before ``fit``.
        """
        if self.coefficients_ is None:
            raise ValueError("模型还未训练")

        X_basis = self._create_spline_basis(X)
        return X_basis @ self.coefficients_ + self.intercept_
|
||
|
||
|
||
class LSTMRegressor(BaseEstimator, RegressorMixin):
    """
    LSTM regressor — treats spectral data as a sequence.

    Each wavelength band of a sample is used as one time step (one feature per
    step).  Implemented with PyTorch; falls back to an MLPRegressor
    approximation when PyTorch is unavailable.

    Bug fix: ``_create_model`` previously referenced ``self.nn`` inside the
    inner model class's ``__init__``, where ``self`` is the nn.Module instance
    (which has no ``nn`` attribute), raising AttributeError as soon as a model
    was built.  The nn module is now bound to a local name first.
    """

    def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None,
                 batch_size=None, learning_rate=None, random_state=42, device=None,
                 config: Optional['TrainingConfig'] = None):
        """
        LSTM regressor constructor.

        Parameters:
            units (int, optional): LSTM hidden units; defaults to 64 when a
                config is supplied, otherwise 50
            dropout (float): dropout rate applied after the LSTM
            recurrent_dropout (float): recurrent dropout rate (note: PyTorch
                ignores it for single-layer LSTMs)
            epochs (int, optional): training epochs; falls back to config or 100
            batch_size (int, optional): batch size; falls back to config or 32
            learning_rate (float, optional): learning rate; falls back to config or 0.001
            random_state (int): random seed
            device (str, optional): compute device; auto-detected when None
            config (TrainingConfig, optional): training configuration object
        """
        if config is not None:
            # Pull unspecified parameters from the training config.
            self.units = units if units is not None else 64  # LSTM default: more units
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else config.epochs
            self.batch_size = batch_size if batch_size is not None else config.batch_size
            self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate
        else:
            # Plain keyword-argument defaults.
            self.units = units if units is not None else 50
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else 100
            self.batch_size = batch_size if batch_size is not None else 32
            self.learning_rate = learning_rate if learning_rate is not None else 0.001

        self.random_state = random_state
        self.device = device
        self.model_ = None
        self.input_size_ = None

        # Import PyTorch lazily so the class degrades gracefully without it.
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
            self.torch = torch
            self.nn = nn
            self.optim = optim
            self.pytorch_available = True

            # Resolve the compute device.
            if self.device is None:
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            else:
                self.device = torch.device(self.device)

        except ImportError:
            self.pytorch_available = False
            print("Warning: PyTorch not installed, LSTM model will use MLPRegressor approximation")

    def _create_model(self, input_size):
        """Create the PyTorch LSTM model."""
        # Bind torch.nn to a local name: inside LSTMModel.__init__, ``self``
        # is the nn.Module instance, so ``self.nn`` would raise AttributeError
        # (this was a bug in the previous version).
        nn = self.nn

        class LSTMModel(nn.Module):
            def __init__(self, input_size, hidden_size, dropout, recurrent_dropout):
                super(LSTMModel, self).__init__()
                self.hidden_size = hidden_size
                # NOTE: PyTorch ignores the ``dropout`` arg for num_layers=1.
                self.lstm = nn.LSTM(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=1,
                    batch_first=True,
                    dropout=recurrent_dropout if recurrent_dropout > 0 else 0,
                    bidirectional=False
                )
                self.dropout_layer = nn.Dropout(dropout)
                self.fc = nn.Linear(hidden_size, 1)

            def forward(self, x):
                # x: (batch, timesteps, 1)
                lstm_out, _ = self.lstm(x)
                # Keep only the last time step's hidden state.
                lstm_out = lstm_out[:, -1, :]
                lstm_out = self.dropout_layer(lstm_out)
                # Final linear head -> scalar prediction per sample.
                output = self.fc(lstm_out)
                return output

        return LSTMModel(input_size, self.units, self.dropout, self.recurrent_dropout)

    def fit(self, X, y):
        """Train the LSTM (or the MLP fallback) on X, y."""
        if not self.pytorch_available:
            # Fallback: approximate with a two-layer MLP.
            from sklearn.neural_network import MLPRegressor
            self.model_ = MLPRegressor(
                hidden_layer_sizes=(self.units, self.units//2),
                activation='relu',
                solver='adam',
                max_iter=self.epochs,
                random_state=self.random_state,
                early_stopping=True
            )
            self.model_.fit(X, y)
            return self

        # Seed all RNGs for reproducibility.
        self.torch.manual_seed(self.random_state)
        if self.torch.cuda.is_available():
            self.torch.cuda.manual_seed(self.random_state)
            self.torch.cuda.manual_seed_all(self.random_state)
        np.random.seed(self.random_state)

        # Reshape to (samples, timesteps, features): each spectral band is a
        # time step carrying a single feature.
        n_samples, n_features = X.shape
        self.input_size_ = 1  # features per time step

        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)
        y_tensor = self.torch.FloatTensor(y.reshape(-1, 1)).to(self.device)

        # Build the network.
        self.model_ = self._create_model(self.input_size_).to(self.device)

        # Loss and optimizer.
        criterion = self.nn.MSELoss()
        optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate)

        # Mini-batch training loop.
        self.model_.train()
        for epoch in range(self.epochs):
            # Shuffle once per epoch.
            indices = np.random.permutation(n_samples)
            X_shuffled = X_tensor[indices]
            y_shuffled = y_tensor[indices]

            for i in range(0, n_samples, self.batch_size):
                batch_X = X_shuffled[i:i+self.batch_size]
                batch_y = y_shuffled[i:i+self.batch_size]

                # Forward pass.
                optimizer.zero_grad()
                outputs = self.model_(batch_X)
                loss = criterion(outputs, batch_y)

                # Backward pass.
                loss.backward()
                optimizer.step()

        return self

    def predict(self, X):
        """Predict targets for X; returns a 1-D numpy array.

        Raises:
            ValueError: when called before ``fit``.
        """
        if self.model_ is None:
            raise ValueError("模型还未训练")

        if not self.pytorch_available:
            return self.model_.predict(X)

        # Switch to evaluation mode (disables dropout).
        self.model_.eval()

        # Reshape input the same way as in fit.
        n_samples, n_features = X.shape
        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)

        with self.torch.no_grad():
            predictions = self.model_(X_tensor)

        return predictions.cpu().numpy().flatten()
|
||
|
||
|
||
class GRURegressor(BaseEstimator, RegressorMixin):
    """
    GRU regressor — treats spectral data as a sequence.

    Each wavelength band of a sample is used as one time step (one feature per
    step).  Implemented with PyTorch; falls back to an MLPRegressor
    approximation when PyTorch is unavailable.

    Bug fix: ``_create_model`` previously referenced ``self.nn`` inside the
    inner model class's ``__init__``, where ``self`` is the nn.Module instance
    (which has no ``nn`` attribute), raising AttributeError as soon as a model
    was built.  The nn module is now bound to a local name first.
    """

    def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None,
                 batch_size=None, learning_rate=None, random_state=42, device=None,
                 config: Optional['TrainingConfig'] = None):
        """
        GRU regressor constructor.

        Parameters:
            units (int, optional): GRU hidden units; defaults to 64 when a
                config is supplied, otherwise 50
            dropout (float): dropout rate applied after the GRU
            recurrent_dropout (float): recurrent dropout rate (note: PyTorch
                ignores it for single-layer GRUs)
            epochs (int, optional): training epochs; falls back to config or 100
            batch_size (int, optional): batch size; falls back to config or 32
            learning_rate (float, optional): learning rate; falls back to config or 0.001
            random_state (int): random seed
            device (str, optional): compute device; auto-detected when None
            config (TrainingConfig, optional): training configuration object
        """
        if config is not None:
            # Pull unspecified parameters from the training config.
            self.units = units if units is not None else 64  # GRU default: more units
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else config.epochs
            self.batch_size = batch_size if batch_size is not None else config.batch_size
            self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate
        else:
            # Plain keyword-argument defaults.
            self.units = units if units is not None else 50
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else 100
            self.batch_size = batch_size if batch_size is not None else 32
            self.learning_rate = learning_rate if learning_rate is not None else 0.001

        self.random_state = random_state
        self.device = device
        self.model_ = None
        self.input_size_ = None

        # Import PyTorch lazily so the class degrades gracefully without it.
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
            self.torch = torch
            self.nn = nn
            self.optim = optim
            self.pytorch_available = True

            # Resolve the compute device.
            if self.device is None:
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            else:
                self.device = torch.device(self.device)

        except ImportError:
            self.pytorch_available = False
            print("Warning: PyTorch not installed, GRU model will use MLPRegressor approximation")

    def _create_model(self, input_size):
        """Create the PyTorch GRU model."""
        # Bind torch.nn to a local name: inside GRUModel.__init__, ``self``
        # is the nn.Module instance, so ``self.nn`` would raise AttributeError
        # (this was a bug in the previous version).
        nn = self.nn

        class GRUModel(nn.Module):
            def __init__(self, input_size, hidden_size, dropout, recurrent_dropout):
                super(GRUModel, self).__init__()
                self.hidden_size = hidden_size
                # NOTE: PyTorch ignores the ``dropout`` arg for num_layers=1.
                self.gru = nn.GRU(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=1,
                    batch_first=True,
                    dropout=recurrent_dropout if recurrent_dropout > 0 else 0,
                    bidirectional=False
                )
                self.dropout_layer = nn.Dropout(dropout)
                self.fc = nn.Linear(hidden_size, 1)

            def forward(self, x):
                # x: (batch, timesteps, 1)
                gru_out, _ = self.gru(x)
                # Keep only the last time step's hidden state.
                gru_out = gru_out[:, -1, :]
                gru_out = self.dropout_layer(gru_out)
                # Final linear head -> scalar prediction per sample.
                output = self.fc(gru_out)
                return output

        return GRUModel(input_size, self.units, self.dropout, self.recurrent_dropout)

    def fit(self, X, y):
        """Train the GRU (or the MLP fallback) on X, y."""
        if not self.pytorch_available:
            # Fallback: approximate with a two-layer MLP.
            from sklearn.neural_network import MLPRegressor
            self.model_ = MLPRegressor(
                hidden_layer_sizes=(self.units, self.units//2),
                activation='relu',
                solver='adam',
                max_iter=self.epochs,
                random_state=self.random_state,
                early_stopping=True
            )
            self.model_.fit(X, y)
            return self

        # Seed all RNGs for reproducibility.
        self.torch.manual_seed(self.random_state)
        if self.torch.cuda.is_available():
            self.torch.cuda.manual_seed(self.random_state)
            self.torch.cuda.manual_seed_all(self.random_state)
        np.random.seed(self.random_state)

        # Reshape to (samples, timesteps, features): each spectral band is a
        # time step carrying a single feature.
        n_samples, n_features = X.shape
        self.input_size_ = 1  # features per time step

        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)
        y_tensor = self.torch.FloatTensor(y.reshape(-1, 1)).to(self.device)

        # Build the network.
        self.model_ = self._create_model(self.input_size_).to(self.device)

        # Loss and optimizer.
        criterion = self.nn.MSELoss()
        optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate)

        # Mini-batch training loop.
        self.model_.train()
        for epoch in range(self.epochs):
            # Shuffle once per epoch.
            indices = np.random.permutation(n_samples)
            X_shuffled = X_tensor[indices]
            y_shuffled = y_tensor[indices]

            for i in range(0, n_samples, self.batch_size):
                batch_X = X_shuffled[i:i+self.batch_size]
                batch_y = y_shuffled[i:i+self.batch_size]

                # Forward pass.
                optimizer.zero_grad()
                outputs = self.model_(batch_X)
                loss = criterion(outputs, batch_y)

                # Backward pass.
                loss.backward()
                optimizer.step()

        return self

    def predict(self, X):
        """Predict targets for X; returns a 1-D numpy array.

        Raises:
            ValueError: when called before ``fit``.
        """
        if self.model_ is None:
            raise ValueError("模型还未训练")

        if not self.pytorch_available:
            return self.model_.predict(X)

        # Switch to evaluation mode (disables dropout).
        self.model_.eval()

        # Reshape input the same way as in fit.
        n_samples, n_features = X.shape
        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)

        with self.torch.no_grad():
            predictions = self.model_(X_tensor)

        return predictions.cpu().numpy().flatten()
|
||
|
||
|
||
class RegressionAnalyzer:
|
||
"""
|
||
回归分析器类 - 支持GUI对接的标准化接口
|
||
支持多种回归算法和完整的分析流程
|
||
"""
|
||
|
||
    def __init__(self, config: Optional[RegressionConfig] = None):
        """
        Initialize the regression analyzer.

        Parameters:
            config (RegressionConfig, optional): configuration object; a
                default configuration is built when None.
        """
        # NOTE(review): RegressionConfig() validates eagerly and raises when
        # csv_path is empty, so calling this with config=None will fail here —
        # confirm this is intended.
        self.config = config or RegressionConfig()
        self._validate_config()  # validate in the constructor so bad configs fail fast
        self.models = {}       # registry: key -> {'model': estimator, 'name': ..., optional 'params': ...}
        self.scalers = {}      # fitted scalers keyed by what they scale (e.g. 'X')
        self.best_params = {}  # best hyper-parameters found per model
        self.results = {}      # evaluation results per model
        self.data = None       # raw DataFrame loaded from CSV
        self.X = None          # feature matrix
        self.y = None          # label vector
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Companion visualizer object (defined elsewhere in this file).
        self.visualizer = RegressionVisualizer(self)
|
||
|
||
def update_config(self, config: RegressionConfig):
|
||
"""
|
||
更新配置 - 为GUI动态配置预留接口
|
||
|
||
Parameters:
|
||
config (RegressionConfig): 新的配置对象
|
||
"""
|
||
self.config = config
|
||
self._validate_config()
|
||
|
||
def _validate_config(self):
|
||
"""配置校验"""
|
||
try:
|
||
self.config._validate_parameters()
|
||
except ValueError as e:
|
||
raise ValueError(f"Configuration validation failed: {e}")
|
||
|
||
def _parse_column_range(self, column_range, total_columns):
|
||
"""
|
||
解析列范围字符串,返回列索引列表
|
||
|
||
Parameters:
|
||
column_range (str or int or list): 列范围,如 "0:5", "2,4,6-8", [0,1,2] 或单个索引
|
||
total_columns (int): 总列数
|
||
|
||
Returns:
|
||
list: 列索引列表
|
||
"""
|
||
if isinstance(column_range, (int, np.integer)):
|
||
# 单个列索引
|
||
if column_range >= total_columns or column_range < 0:
|
||
raise ValueError(f"Column index {column_range} out of range [0, {total_columns-1}]")
|
||
return [column_range]
|
||
|
||
elif isinstance(column_range, str):
|
||
# 解析范围字符串
|
||
columns = []
|
||
# 分割多个范围(用逗号分隔)
|
||
for part in column_range.split(','):
|
||
part = part.strip()
|
||
if ':' in part:
|
||
# 范围选择,如 "0:5"
|
||
start, end = part.split(':')
|
||
start = int(start.strip()) if start.strip() else 0
|
||
end = int(end.strip()) if end.strip() else total_columns
|
||
if start < 0:
|
||
start = total_columns + start
|
||
if end < 0:
|
||
end = total_columns + end
|
||
if start >= total_columns or end > total_columns:
|
||
raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
|
||
columns.extend(range(start, end))
|
||
else:
|
||
# 单个索引
|
||
idx = int(part.strip())
|
||
if idx < 0:
|
||
idx = total_columns + idx
|
||
if idx >= total_columns or idx < 0:
|
||
raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
|
||
columns.append(idx)
|
||
return list(set(columns)) # 去重
|
||
|
||
elif isinstance(column_range, (list, tuple)):
|
||
# 直接的列索引列表
|
||
columns = []
|
||
for idx in column_range:
|
||
if isinstance(idx, str):
|
||
if ':' in idx:
|
||
# 处理列表中的范围字符串
|
||
start, end = idx.split(':')
|
||
start = int(start.strip()) if start.strip() else 0
|
||
end = int(end.strip()) if end.strip() else total_columns
|
||
if start < 0:
|
||
start = total_columns + start
|
||
if end < 0:
|
||
end = total_columns + end
|
||
if start >= total_columns or end > total_columns:
|
||
raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
|
||
columns.extend(range(start, end))
|
||
else:
|
||
idx_int = int(idx.strip())
|
||
if idx_int < 0:
|
||
idx_int = total_columns + idx_int
|
||
if idx_int >= total_columns or idx_int < 0:
|
||
raise ValueError(f"Column index {idx_int} out of range [0, {total_columns-1}]")
|
||
columns.append(idx_int)
|
||
else:
|
||
if idx < 0:
|
||
idx = total_columns + idx
|
||
if idx >= total_columns or idx < 0:
|
||
raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
|
||
columns.append(idx)
|
||
return list(set(columns)) # 去重
|
||
|
||
else:
|
||
raise ValueError(f"Unsupported column range format: {type(column_range)}")
|
||
|
||
    def load_csv(self, file_path, label_column, spectrum_columns=None, delimiter=',', header=0):
        """
        Load a CSV file and select the label and spectrum columns.

        Parameters:
            file_path (str): CSV file path
            label_column (str or int or range-like): label column; a column
                name, a positional index, or a numeric string
            spectrum_columns (str or list or None): spectrum (feature) columns;
                supports range selection such as "1:10" or a list of indices;
                when None, every column except the label column is used
            delimiter (str): field delimiter, ',' by default
            header (int): header row number, 0 by default

        Returns:
            bool: True on success, False on any failure (the error is printed,
                not raised).
        """
        try:
            # Read the CSV file.
            self.data = pd.read_csv(file_path, delimiter=delimiter, header=header)
            total_columns = len(self.data.columns)

            # Resolve the label column: try it as a column name first, then as an index.
            if isinstance(label_column, str) and label_column in self.data.columns:
                # Valid column name.
                label_idx = self.data.columns.get_loc(label_column)
                self.y = self.data[label_column].values
            else:
                # Treat it as a column index.
                try:
                    if isinstance(label_column, str):
                        # Possibly a numeric string; convert to int.
                        label_column = int(label_column)
                    label_idx = label_column
                    if label_idx < 0:
                        label_idx = total_columns + label_idx
                    if label_idx < 0 or label_idx >= total_columns:
                        raise ValueError(f"Column index {label_column} out of range [0, {total_columns-1}]")
                    self.y = self.data.iloc[:, label_idx].values
                # NOTE(review): the out-of-range ValueError raised just above is
                # itself caught here and re-worded — confirm this is intended.
                except (ValueError, TypeError):
                    raise ValueError(f"Invalid label column specification: {label_column}. Must be a valid column name or index.")

            # Determine the spectrum columns.
            if spectrum_columns is None:
                # Use every column except the label column.
                spectrum_indices = [i for i in range(total_columns) if i != label_idx]
            else:
                # Parse the requested range specification.
                spectrum_indices = self._parse_column_range(spectrum_columns, total_columns)
                # Drop the label column if it falls inside the requested range.
                spectrum_indices = [i for i in spectrum_indices if i != label_idx]

            if not spectrum_indices:
                raise ValueError("No valid spectrum columns found")

            # Extract the spectrum (feature) matrix.
            self.X = self.data.iloc[:, spectrum_indices].values

            # Skip rows whose label is missing.
            valid_mask = ~pd.isna(self.y)
            original_samples = len(self.y)
            self.X = self.X[valid_mask]
            self.y = self.y[valid_mask]

            self.feature_names = [self.data.columns[i] for i in spectrum_indices]

            skipped_samples = original_samples - len(self.y)
            print(f"Successfully loaded data: {self.X.shape[0]} samples, {self.X.shape[1]} features")
            print(f"Label column: {label_idx} ({self.data.columns[label_idx]})")
            print(f"Spectrum column range: {min(spectrum_indices)}-{max(spectrum_indices)}")
            if skipped_samples > 0:
                print(f"Rows skipped due to missing labels: {skipped_samples}")
            print(f"Label range: {self.y.min():.4f} - {self.y.max():.4f}")
            print(f"Data type check: X type {self.X.dtype}, y type {self.y.dtype}")

            # Check and process data types.
            if self.X.dtype != np.float64:
                self.X = self.X.astype(np.float64)
            if self.y.dtype != np.float64:
                self.y = self.y.astype(np.float64)

            return True

        except Exception as e:
            print(f"Failed to load data: {str(e)}")
            return False
|
||
|
||
def preprocess_data(self, test_size=None, random_state=None, scale_method=None):
|
||
"""
|
||
数据预处理:分割训练集和测试集,标准化
|
||
|
||
Parameters:
|
||
test_size (float, optional): 测试集比例,如果为None则使用配置中的值
|
||
random_state (int, optional): 随机种子,如果为None则使用配置中的值
|
||
scale_method (str, optional): 标准化方法,如果为None则使用配置中的值
|
||
"""
|
||
# 使用配置中的默认值
|
||
test_size = test_size if test_size is not None else self.config.data.test_size
|
||
random_state = random_state if random_state is not None else self.config.data.random_state
|
||
scale_method = scale_method if scale_method is not None else self.config.data.scale_method
|
||
|
||
try:
|
||
# 分割数据集
|
||
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
|
||
self.X, self.y, test_size=test_size, random_state=random_state
|
||
)
|
||
|
||
# 标准化
|
||
if scale_method == 'standard':
|
||
self.scalers['X'] = StandardScaler()
|
||
elif scale_method == 'minmax':
|
||
self.scalers['X'] = MinMaxScaler()
|
||
else:
|
||
raise ValueError("scale_method must be 'standard' or 'minmax'")
|
||
|
||
self.X_train_scaled = self.scalers['X'].fit_transform(self.X_train)
|
||
self.X_test_scaled = self.scalers['X'].transform(self.X_test)
|
||
|
||
print(f"Data preprocessing completed:")
|
||
print(f"Training set: {self.X_train.shape[0]} samples")
|
||
print(f"Test set: {self.X_test.shape[0]} samples")
|
||
|
||
return True
|
||
|
||
except Exception as e:
|
||
print(f"Data preprocessing failed: {str(e)}")
|
||
return False
|
||
|
||
def add_linear_models(self):
|
||
"""添加线性回归模型"""
|
||
self.models['linear'] = {
|
||
'model': LinearRegression(),
|
||
'name': '多元线性回归'
|
||
}
|
||
|
||
self.models['lasso'] = {
|
||
'model': Lasso(random_state=42),
|
||
'name': 'LASSO回归',
|
||
'params': {
|
||
'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
|
||
}
|
||
}
|
||
|
||
self.models['ridge'] = {
|
||
'model': Ridge(random_state=42),
|
||
'name': '岭回归',
|
||
'params': {
|
||
'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
|
||
}
|
||
}
|
||
|
||
self.models['elasticnet'] = {
|
||
'model': ElasticNet(random_state=42),
|
||
'name': '弹性网络回归',
|
||
'params': {
|
||
'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
|
||
'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
|
||
}
|
||
}
|
||
|
||
self.models['bayesianridge'] = {
|
||
'model': BayesianRidge(),
|
||
'name': '贝叶斯岭回归'
|
||
}
|
||
|
||
def add_boosting_models(self):
|
||
"""添加Boosting模型"""
|
||
self.models['lsboost'] = {
|
||
'model': GradientBoostingRegressor(random_state=42),
|
||
'name': 'LSBoost回归',
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'max_depth': [3, 5, 7],
|
||
'subsample': [0.8, 0.9, 1.0]
|
||
}
|
||
}
|
||
|
||
self.models['xgboost'] = {
|
||
'model': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
|
||
'name': 'XGBoost回归',
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'max_depth': [3, 5, 7],
|
||
'subsample': [0.8, 0.9, 1.0],
|
||
'colsample_bytree': [0.8, 0.9, 1.0]
|
||
}
|
||
}
|
||
|
||
self.models['lightgbm'] = {
|
||
'model': lgb.LGBMRegressor(random_state=42),
|
||
'name': 'LightGBM回归',
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'max_depth': [3, 5, 7],
|
||
'subsample': [0.8, 0.9, 1.0],
|
||
'colsample_bytree': [0.8, 0.9, 1.0]
|
||
}
|
||
}
|
||
|
||
def add_kernel_models(self):
    """Register kernel-based regressors (Gaussian process, SVMs)."""
    # Gaussian process with a scaled RBF kernel; bounds let the
    # marginal-likelihood optimization adapt both amplitude and length scale.
    gp_kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
    self.models['gaussian'] = {
        'model': GaussianProcessRegressor(kernel=gp_kernel, random_state=42),
        'name': '高斯过程回归',
    }

    # RBF-kernel SVM with C/gamma tuning grid.
    self.models['gaussiansvm'] = {
        'model': SVR(kernel='rbf'),
        'name': '高斯核SVM回归',
        'params': {
            'C': [0.1, 1.0, 10.0, 100.0],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1.0],
        },
    }

    # Generic SVM where the kernel itself is part of the search space.
    self.models['svm'] = {
        'model': SVR(),
        'name': 'SVM回归',
        'params': {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [0.1, 1.0, 10.0, 100.0],
            'gamma': ['scale', 'auto'],
        },
    }
|
||
|
||
def add_neural_networks(self, training_config: Optional[TrainingConfig] = None):
    """Register neural-network style regressors (ELM, MLP, LSTM, GRU).

    Parameters:
        training_config (TrainingConfig, optional): forwarded to the
            recurrent models so they pick up training settings.
    """
    # LSTM and GRU share the same tuning grid.
    recurrent_grid = {
        'units': [32, 64, 128],
        'dropout': [0.1, 0.2, 0.3],
        'epochs': [50, 100, 200],
    }

    # Extreme learning machine: random hidden layer, analytic output weights.
    self.models['elm'] = {
        'model': ExtremeLearningMachine(random_state=42),
        'name': 'ELM回归',
        'params': {
            'n_hidden': [50, 100, 200, 500],
            'activation': ['sigmoid', 'tanh', 'relu'],
        },
    }

    # Classic back-propagation multi-layer perceptron.
    self.models['mlp'] = {
        'model': MLPRegressor(random_state=42, max_iter=1000),
        'name': 'BP/MLP回归',
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
            'activation': ['relu', 'tanh'],
            'learning_rate_init': [0.001, 0.01, 0.1],
            'alpha': [0.0001, 0.001, 0.01],
        },
    }

    # Recurrent models treat the spectrum as a sequence (bands = time steps).
    self.models['lstm'] = {
        'model': LSTMRegressor(random_state=42, config=training_config),
        'name': 'LSTM回归',
        'params': dict(recurrent_grid),
    }

    self.models['gru'] = {
        'model': GRURegressor(random_state=42, config=training_config),
        'name': 'GRU回归',
        'params': dict(recurrent_grid),
    }
|
||
|
||
def add_specialized_models(self):
    """Register tree-based and additive specialty regressors.

    Fix vs. original: the tree models' ``max_features`` grids contained
    ``'auto'``, which was deprecated in scikit-learn 1.1 and removed in
    1.3 — grid search over it raises on modern scikit-learn.  ``None``
    (use all features) is the equivalent replacement for regressors.
    """
    # GAM regression (a true generalized additive model).
    self.models['gam'] = {
        'model': GeneralizedAdditiveModel(),
        'name': 'GAM回归',
        'params': {
            'n_splines': [5, 10, 15, 20],
            'degree': [3, 4],
            'lambda_': [0.001, 0.01, 0.1, 1.0]
        }
    }

    # Decision tree regression.
    self.models['decisiontree'] = {
        'model': DecisionTreeRegressor(random_state=42),
        'name': '决策树回归',
        'params': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # None == all features; replaces the removed 'auto'.
            'max_features': [None, 'sqrt', 'log2']
        }
    }

    # Random forest regression.
    self.models['randomforest'] = {
        'model': RandomForestRegressor(random_state=42),
        'name': '随机森林回归',
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    }

    # Extremely randomized trees regression.
    self.models['extratrees'] = {
        'model': ExtraTreesRegressor(random_state=42),
        'name': '极端随机树回归',
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    }

    # AdaBoost regression.
    self.models['adaboost'] = {
        'model': AdaBoostRegressor(random_state=42),
        'name': 'AdaBoost回归',
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0],
            'loss': ['linear', 'square', 'exponential']
        }
    }
|
||
|
||
def initialize_all_models(self, use_config: bool = True):
    """Populate ``self.models`` with every supported regressor family.

    Parameters:
        use_config (bool): when True and ``self.config`` exists, the
            neural-network family receives the training section of the
            configuration object.
    """
    self.add_linear_models()
    self.add_boosting_models()
    self.add_kernel_models()

    # Recurrent/NN models optionally pick up the configured training settings.
    training_cfg = self.config.training if (use_config and hasattr(self, 'config')) else None
    if training_cfg is not None:
        self.add_neural_networks(training_cfg)
    else:
        self.add_neural_networks()

    self.add_specialized_models()

    print(f"Initialized {len(self.models)} regression models")
|
||
|
||
def get_available_models(self):
    """Return a mapping of model key -> human-readable model name."""
    catalogue = {}
    for key, info in self.models.items():
        catalogue[key] = info['name']
    return catalogue
|
||
|
||
def hyperparameter_tuning(self, model_name, method=None, cv=None, n_iter=None):
    """Tune one registered model's hyperparameters.

    Parameters:
        model_name (str): key of the model in ``self.models``.
        method (str, optional): 'grid' or 'random'; defaults to the
            configured tuning method.
        cv (int, optional): cross-validation folds; defaults to config.
        n_iter (int, optional): iterations for randomized search;
            defaults to config.

    Returns:
        bool: True when tuning succeeded and the best estimator was
        stored back into ``self.models``.
    """
    # Resolve unspecified arguments from the configuration object.
    if method is None:
        method = self.config.models.tuning_method
    if cv is None:
        cv = self.config.models.cv_folds
    if n_iter is None:
        n_iter = self.config.models.random_search_iter

    if model_name not in self.models:
        print(f"Model '{model_name}' does not exist")
        return False

    model_info = self.models[model_name]
    if 'params' not in model_info:
        print(f"Model '{model_name}' has no tunable parameters")
        return False

    print(f"Starting hyperparameter tuning for model: {model_info['name']}")

    # Options shared by both search strategies.
    shared_kwargs = dict(cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    if method == 'grid':
        search = GridSearchCV(model_info['model'], model_info['params'], **shared_kwargs)
    elif method == 'random':
        search = RandomizedSearchCV(model_info['model'], model_info['params'],
                                    n_iter=n_iter, random_state=42, **shared_kwargs)
    else:
        print("Tuning method must be 'grid' or 'random'")
        return False

    try:
        search.fit(self.X_train_scaled, self.y_train)
    except Exception as e:
        print(f"Tuning failed: {str(e)}")
        return False

    # Persist the winning configuration and swap in the refit estimator.
    self.best_params[model_name] = search.best_params_
    self.models[model_name]['model'] = search.best_estimator_

    print(f"Best parameters: {search.best_params_}")
    print(f"Best score: {-search.best_score_:.4f}")

    return True
|
||
|
||
def train_model(self, model_name):
    """Fit one registered model and record predictions and metrics.

    Stores the fitted estimator, its metrics dict and the train/test
    predictions under ``self.results[model_name]``.

    Returns:
        bool: True on success, False for unknown models or fit errors.
    """
    if model_name not in self.models:
        print(f"Model '{model_name}' does not exist")
        return False

    try:
        entry = self.models[model_name]
        estimator = entry['model']

        print(f"Training model: {entry['name']}")

        # Fit on the scaled training split.
        estimator.fit(self.X_train_scaled, self.y_train)

        # Predict on both splits for train/test metric comparison.
        pred_train = estimator.predict(self.X_train_scaled)
        pred_test = estimator.predict(self.X_test_scaled)

        metrics = self.calculate_metrics(self.y_train, pred_train, self.y_test, pred_test)

        self.results[model_name] = {
            'model': estimator,
            'metrics': metrics,
            'y_pred_train': pred_train,
            'y_pred_test': pred_test,
        }

        print(f"{entry['name']} training completed")
        print(f"Training R²: {metrics['train_r2']:.4f}, Test R²: {metrics['test_r2']:.4f}")

        return True

    except Exception as e:
        print(f"Training failed: {str(e)}")
        return False
|
||
|
||
def train_all_models(self, tune_hyperparams=False, tuning_method='grid'):
    """Train every currently registered model.

    Parameters:
        tune_hyperparams (bool): when True, models that expose a
            ``'params'`` grid are tuned before training.
        tuning_method (str): 'grid' or 'random' search.
    """
    names = list(self.models.keys())
    print(f"Training {len(names)} models: {', '.join(names)}")

    for name in names:
        if tune_hyperparams and 'params' in self.models[name]:
            self.hyperparameter_tuning(name, method=tuning_method)
        self.train_model(name)
|
||
|
||
def calculate_metrics(self, y_train, y_pred_train, y_test, y_pred_test):
    """Compute MSE / RMSE / MAE / R² for the train and test splits.

    Returns:
        dict: keys '<split>_mse', '<split>_rmse', '<split>_mae',
        '<split>_r2' for split in ('train', 'test').
    """
    splits = (
        ('train', y_train, y_pred_train),
        ('test', y_test, y_pred_test),
    )
    metrics = {}
    for split, y_true, y_pred in splits:
        mse = mean_squared_error(y_true, y_pred)
        metrics[f'{split}_mse'] = mse
        metrics[f'{split}_rmse'] = np.sqrt(mse)
        metrics[f'{split}_mae'] = mean_absolute_error(y_true, y_pred)
        metrics[f'{split}_r2'] = r2_score(y_true, y_pred)
    return metrics
|
||
|
||
def plot_results(self, save_path=None, plot_type='comprehensive'):
    """Render comparison charts for the trained models.

    Parameters:
        save_path (str, optional): output path for single-figure plots.
        plot_type (str): one of 'basic', 'comprehensive', 'prediction',
            'residual', 'metrics', 'error_dist', 'ranking'.
    """
    if not self.results:
        print("No training results to plot")
        return

    if plot_type == 'comprehensive':
        # Full multi-figure report written to the configured plot directory.
        self.visualizer.generate_comprehensive_report(
            save_dir=self.config.output.plot_dir,
            prefix='regression_analysis'
        )
        return

    if plot_type == 'basic':
        # Legacy four-panel chart kept for backward compatibility.
        self._plot_basic_comparison(save_path)
        return

    # Remaining plot types map one-to-one onto visualizer methods.
    single_figure_plots = {
        'prediction': self.visualizer.plot_prediction_scatter,
        'residual': self.visualizer.plot_residual_analysis,
        'metrics': self.visualizer.plot_metrics_comparison,
        'error_dist': self.visualizer.plot_error_distribution,
        'ranking': self.visualizer.plot_model_ranking_matrix,
    }
    plotter = single_figure_plots.get(plot_type)
    if plotter is None:
        print(f"Unknown plot type: {plot_type}")
        return
    plotter(save_path=save_path)
|
||
|
||
def _plot_basic_comparison(self, save_path=None):
    """Draw the legacy four-panel comparison chart (backward compatible).

    Panels: R² bars, RMSE bars, predicted-vs-actual scatter and residual
    scatter, all built from ``self.results``.

    Parameters:
        save_path (str, optional): when given, the figure is written
            there as a 300-dpi image.
    """
    # Collect per-model display names and train/test metrics.
    model_names = []
    train_r2 = []
    test_r2 = []
    train_rmse = []
    test_rmse = []

    for model_name, result in self.results.items():
        model_names.append(self.models[model_name]['name'])
        train_r2.append(result['metrics']['train_r2'])
        test_r2.append(result['metrics']['test_r2'])
        train_rmse.append(result['metrics']['train_rmse'])
        test_rmse.append(result['metrics']['test_rmse'])

    # 2x2 panel layout.
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # R² Score Comparison (grouped bars: train vs test).
    x = np.arange(len(model_names))
    width = 0.35

    ax1.bar(x - width/2, train_r2, width, label='Training Set', alpha=0.8)
    ax1.bar(x + width/2, test_r2, width, label='Test Set', alpha=0.8)
    ax1.set_xlabel('Model')
    ax1.set_ylabel('R² Score')
    ax1.set_title('R² Score Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(model_names, rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # RMSE Comparison (grouped bars: train vs test).
    ax2.bar(x - width/2, train_rmse, width, label='Training Set', alpha=0.8)
    ax2.bar(x + width/2, test_rmse, width, label='Test Set', alpha=0.8)
    ax2.set_xlabel('Model')
    ax2.set_ylabel('RMSE')
    ax2.set_title('RMSE Comparison')
    ax2.set_xticks(x)
    ax2.set_xticklabels(model_names, rotation=45, ha='right')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Predicted vs Actual Values Scatter Plot (Test Set) - one color per model.
    colors = plt.cm.tab10(np.linspace(0, 1, len(self.results)))
    for i, (model_name, result) in enumerate(self.results.items()):
        ax3.scatter(self.y_test, result['y_pred_test'], alpha=0.6, color=colors[i],
                    label=self.models[model_name]['name'], s=20)

    # 45-degree line: where a perfect model's points would fall.
    ax3.plot([self.y_test.min(), self.y_test.max()], [self.y_test.min(), self.y_test.max()],
             'k--', linewidth=2, label='Perfect Prediction')
    ax3.set_xlabel('Actual Values')
    ax3.set_ylabel('Predicted Values')
    ax3.set_title('Predicted vs Actual Values (Test Set)')
    ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax3.grid(True, alpha=0.3)

    # Residual Plot: residual = actual - predicted, per model.
    for i, (model_name, result) in enumerate(self.results.items()):
        residuals = self.y_test - result['y_pred_test']
        ax4.scatter(result['y_pred_test'], residuals, alpha=0.6, color=colors[i],
                    label=self.models[model_name]['name'], s=20)

    ax4.axhline(y=0, color='k', linestyle='--', linewidth=2)
    ax4.set_xlabel('Predicted Values')
    ax4.set_ylabel('Residuals')
    ax4.set_title('Residual Plot (Test Set)')
    ax4.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Chart saved to: {save_path}")

    # plt.show()
|
||
|
||
def plot_prediction_scatter(self, save_path=None, **kwargs):
    """Forward to the visualizer's predicted-vs-true scatter plot."""
    viz = self.visualizer
    viz.plot_prediction_scatter(save_path=save_path, **kwargs)
|
||
|
||
def plot_residual_analysis(self, save_path=None, **kwargs):
    """Forward to the visualizer's residual-analysis plot."""
    viz = self.visualizer
    viz.plot_residual_analysis(save_path=save_path, **kwargs)
|
||
|
||
def plot_metrics_comparison(self, save_path=None, **kwargs):
    """Forward to the visualizer's metric-comparison plot."""
    viz = self.visualizer
    viz.plot_metrics_comparison(save_path=save_path, **kwargs)
|
||
|
||
def plot_error_distribution(self, save_path=None, **kwargs):
    """Forward to the visualizer's error-distribution plot."""
    viz = self.visualizer
    viz.plot_error_distribution(save_path=save_path, **kwargs)
|
||
|
||
def plot_model_ranking(self, save_path=None, **kwargs):
    """Forward to the visualizer's model-ranking matrix plot."""
    viz = self.visualizer
    viz.plot_model_ranking_matrix(save_path=save_path, **kwargs)
|
||
|
||
def generate_visualization_report(self, save_dir=None, prefix=None):
    """Produce the full visualization report.

    Falsy ``save_dir``/``prefix`` fall back to the configured plot
    directory and the default 'regression_analysis' prefix.
    """
    target_dir = save_dir or self.config.output.plot_dir
    report_prefix = prefix or 'regression_analysis'
    return self.visualizer.generate_comprehensive_report(save_dir=target_dir, prefix=report_prefix)
|
||
|
||
def save_model(self, model_name, save_dir='models'):
    """Persist one trained model, its scaler and a JSON metadata file.

    File names are suffixed with a timestamp so repeated saves never
    overwrite each other.

    Returns:
        bool: True when all three artifacts were written.
    """
    if model_name not in self.results:
        print(f"Model '{model_name}' has no training results")
        return False

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    model_path = os.path.join(save_dir, f'{model_name}_{stamp}.pkl')
    scaler_path = os.path.join(save_dir, f'scaler_{stamp}.pkl')
    info_path = os.path.join(save_dir, f'info_{model_name}_{stamp}.json')

    try:
        # Fitted estimator.
        joblib.dump(self.results[model_name]['model'], model_path)

        # Feature scaler so future inputs can be transformed identically.
        joblib.dump(self.scalers['X'], scaler_path)

        # Human-readable metadata about the run.
        metadata = {
            'model_name': model_name,
            'full_name': self.models[model_name]['name'],
            'timestamp': stamp,
            'metrics': self.results[model_name]['metrics'],
            'best_params': self.best_params.get(model_name, {}),
            'feature_names': self.feature_names
        }
        with open(info_path, 'w', encoding='utf-8') as fh:
            json.dump(metadata, fh, indent=4, ensure_ascii=False)

        print(f"Model saved:")
        print(f" Model file: {model_path}")
        print(f" Scaler: {scaler_path}")
        print(f" Info file: {info_path}")

        return True

    except Exception as e:
        print(f"Save failed: {str(e)}")
        return False
|
||
|
||
def save_all_models(self, save_dir='models'):
    """Persist every model that currently has training results."""
    for name in self.results:
        self.save_model(name, save_dir)
|
||
|
||
def load_model(self, model_path, scaler_path=None):
    """Load a persisted model (and optionally its scaler) from disk.

    Returns:
        tuple: ``(model, scaler)``; ``scaler`` is None when no scaler
        path was given, and ``(None, None)`` on any load failure.
    """
    try:
        model = joblib.load(model_path)
        scaler = joblib.load(scaler_path) if scaler_path else None
        return model, scaler
    except Exception as e:
        print(f"Load failed: {str(e)}")
        return None, None
|
||
|
||
def print_summary(self):
    """Print a fixed-width table of model performance, best test R² first.

    Fix vs. original: the header and row lines were the literal string
    ``"|30"`` — the remains of a broken format spec — so the summary
    printed no data at all.  Rebuilt with real column formatting.
    """
    if not self.results:
        print("No training results")
        return

    print("\n" + "="*80)
    print("Regression Model Performance Summary")
    print("="*80)

    # Header: fixed-width columns so the rows line up.
    print(f"{'Model':<30}{'Train R²':>12}{'Test R²':>12}{'Test RMSE':>13}{'Test MAE':>12}")

    # Rows, sorted so the best-generalizing model comes first.
    for model_name, result in sorted(self.results.items(),
                                     key=lambda x: x[1]['metrics']['test_r2'], reverse=True):
        metrics = result['metrics']
        model_full_name = self.models[model_name]['name']
        print(f"{model_full_name:<30}{metrics['train_r2']:>12.4f}{metrics['test_r2']:>12.4f}"
              f"{metrics['test_rmse']:>13.4f}{metrics['test_mae']:>12.4f}")

    print("-"*80)
    print("Note: R² closer to 1 is better, RMSE/MAE smaller is better")
    print("="*80)
|
||
|
||
def run_analysis_from_config(self) -> bool:
    """
    Run the complete analysis pipeline driven by ``self.config`` -
    the recommended entry point for GUI integration.

    Returns:
        bool: True when the pipeline finished; False when data loading
        or preprocessing failed.
    """
    print("Starting regression analysis from configuration...")

    # 1. Load the data
    if not self.load_csv(self.config.data.csv_path, self.config.data.label_column,
                         self.config.data.spectrum_columns):
        return False

    # 2. Preprocess (split and scale)
    if not self.preprocess_data():
        return False

    # 3. Register all models
    self.initialize_all_models(use_config=True)

    # 4. Filter down to the explicitly requested models, if any
    if self.config.models.model_names is not None:
        # Validate the requested names against the registry
        invalid_models = [name for name in self.config.models.model_names if name not in self.models]
        if invalid_models:
            print(f"Warning: The following models do not exist: {invalid_models}")
            valid_model_names = [name for name in self.config.models.model_names if name in self.models]
        else:
            valid_model_names = self.config.models.model_names

        # Keep only the requested models
        models_to_keep = {}
        for model_name in valid_model_names:
            if model_name in self.models:
                models_to_keep[model_name] = self.models[model_name]
        self.models = models_to_keep
        print(f"Filtered to {len(self.models)} specified models")

    # 5. Train (optionally tuning hyperparameters first)
    self.train_all_models(tune_hyperparams=self.config.models.tune_hyperparams,
                          tuning_method=self.config.models.tuning_method)

    # 6. Print the performance summary table
    self.print_summary()

    # 7. Persist models if requested
    if self.config.output.save_models:
        self.save_all_models(save_dir=self.config.output.save_dir)

    # 8. Render plots if requested
    if self.config.output.plot_results:
        os.makedirs(self.config.output.plot_dir, exist_ok=True)
        self.generate_visualization_report(
            save_dir=self.config.output.plot_dir,
            prefix='regression_analysis'
        )

    print("Analysis completed!")
    return True
|
||
|
||
def run_complete_analysis(self, csv_path=None, label_column=None, spectrum_columns=None,
                          test_size=None, scale_method=None, tune_hyperparams=None,
                          tuning_method=None, save_models=None, plot_results=None,
                          model_names=None):
    """
    Run the full analysis pipeline - backward-compatible keyword API.

    Every argument left as ``None`` keeps the current value in
    ``self.config``; anything explicitly supplied overrides the matching
    config field before delegating to :meth:`run_analysis_from_config`.

    Parameters:
        csv_path (str, optional): CSV file path.
        label_column (str or int, optional): label column.
        spectrum_columns (str or list or None, optional): spectrum columns.
        test_size (float, optional): test split fraction.
        scale_method (str, optional): feature scaling method.
        tune_hyperparams (bool, optional): whether to tune hyperparameters.
        tuning_method (str, optional): 'grid' or 'random'.
        save_models (bool, optional): whether to persist trained models.
        plot_results (bool, optional): whether to render result plots.
        model_names (list or None, optional): models to train.
    """
    # Map each override onto its (config section, attribute) slot.
    overrides = (
        (self.config.data, 'csv_path', csv_path),
        (self.config.data, 'label_column', label_column),
        (self.config.data, 'spectrum_columns', spectrum_columns),
        (self.config.data, 'test_size', test_size),
        (self.config.data, 'scale_method', scale_method),
        (self.config.models, 'tune_hyperparams', tune_hyperparams),
        (self.config.models, 'tuning_method', tuning_method),
        (self.config.output, 'save_models', save_models),
        (self.config.output, 'plot_results', plot_results),
        (self.config.models, 'model_names', model_names),
    )
    for section, attr, value in overrides:
        if value is not None:
            setattr(section, attr, value)

    # Delegate to the config-driven pipeline.
    return self.run_analysis_from_config()
|
||
|
||
|
||
class RegressionVisualizer:
|
||
"""
|
||
回归分析可视化器 - 提供丰富的可视化功能
|
||
支持预测值vs真实值散点图、残差图、性能指标对比等
|
||
"""
|
||
|
||
def __init__(self, analyzer: Optional['RegressionAnalyzer'] = None):
    """
    Initialize the visualizer.

    Parameters:
        analyzer (RegressionAnalyzer, optional): analyzer whose
            ``results``/``models``/``y_test`` the plots will be built from.
    """
    self.analyzer = analyzer
    # Color-blind friendly palette (hex colors) reused by every plot.
    self.colorblind_friendly_palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
        '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5'
    ]
    # NOTE(review): the 'seaborn-v0_8' style name exists only in
    # matplotlib >= 3.6; older versions raise OSError here — confirm
    # the project's matplotlib pin.
    plt.style.use('seaborn-v0_8')

    # Configure matplotlib so Chinese labels render correctly.
    self._configure_chinese_font()
|
||
|
||
def _configure_chinese_font(self):
    """Configure matplotlib so CJK (Chinese) text renders correctly.

    Fixes vs. original:
    * the old probe rendered a test string inside a bare ``except:`` -
      but matplotlib silently substitutes missing fonts (it only emits a
      warning), so the probe never failed and the first candidate always
      "won" whether installed or not; the bare except also swallowed
      ``KeyboardInterrupt``/``SystemExit``.
    * availability is now checked against matplotlib's font registry,
      which is the reliable test.
    """
    import matplotlib as mpl
    from matplotlib import font_manager

    # Candidate font families that can render Chinese glyphs, best first.
    chinese_fonts = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans', 'Arial Unicode MS',
                     'WenQuanYi Micro Hei', 'AR PL UMing CN', 'Liberation Serif']

    # Names of every font matplotlib can actually load on this system.
    installed = {f.name for f in font_manager.fontManager.ttflist}

    for font in chinese_fonts:
        if font in installed:
            # Put the chosen family first so it takes precedence.
            mpl.rcParams['font.sans-serif'] = [font] + mpl.rcParams['font.sans-serif']
            mpl.rcParams['axes.unicode_minus'] = False  # render minus signs with CJK fonts
            print(f"Successfully set Chinese font to: {font}")
            return

    # No suitable font found: fall back to usable defaults and warn.
    mpl.rcParams['font.sans-serif'] = ['DejaVu Sans', 'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    print("Warning: Could not find suitable Chinese font. Using default fonts.")
|
||
|
||
def _ensure_chinese_text(self, text):
|
||
"""确保文本正确显示中文"""
|
||
if isinstance(text, str):
|
||
try:
|
||
# 尝试编码和解码以确保UTF-8格式
|
||
return text.encode('utf-8').decode('utf-8')
|
||
except:
|
||
return text
|
||
return text
|
||
|
||
def set_colorblind_palette(self):
    """Install the color-blind friendly palette as matplotlib's default color cycle."""
    import matplotlib as mpl
    cycle = mpl.cycler(color=self.colorblind_friendly_palette)
    mpl.rcParams['axes.prop_cycle'] = cycle
|
||
|
||
def plot_prediction_scatter(self, figsize=(16, 12), save_path=None, show_individual=True, show_overlay=True):
    """
    Draw predicted-vs-true scatter plots for every model.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): save path for the figure.
        show_individual (bool): draw one small subplot per model.
        show_overlay (bool): draw a combined overlay plot.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return

    self.set_colorblind_palette()
    n_models = len(self.analyzer.results)

    if show_individual and show_overlay:
        # Composite figure: per-model subplots on top, overlay across the bottom row.
        fig = plt.figure(figsize=figsize)

        # Top: one subplot per model, up to 4 per row.
        n_cols = min(4, n_models)
        n_rows = (n_models + n_cols - 1) // n_cols  # ceil division

        gs = fig.add_gridspec(n_rows + 1, n_cols, hspace=0.3, wspace=0.3)
        axes_scatter = []
        for i in range(n_rows):
            for j in range(n_cols):
                if i * n_cols + j < n_models:
                    axes_scatter.append(fig.add_subplot(gs[i, j]))

        # Bottom: the overlay spans the full width.
        ax_overlay = fig.add_subplot(gs[n_rows, :])

    elif show_individual:
        # Per-model subplots only.
        n_cols = min(4, n_models)
        n_rows = (n_models + n_cols - 1) // n_cols
        fig, axes_scatter = plt.subplots(n_rows, n_cols, figsize=figsize)
        if n_models == 1:
            axes_scatter = [axes_scatter]
        else:
            axes_scatter = axes_scatter.flatten()
        ax_overlay = None

    elif show_overlay:
        # Overlay only.
        fig, ax_overlay = plt.subplots(1, 1, figsize=(10, 8))
        axes_scatter = []

    else:
        print("At least one of show_individual or show_overlay must be True")
        return

    # Per-model subplots.
    if show_individual:
        for idx, (model_name, result) in enumerate(self.analyzer.results.items()):
            if idx < len(axes_scatter):
                ax = axes_scatter[idx]
                y_true = self.analyzer.y_test
                y_pred = result['y_pred_test']

                # Scatter of predictions against ground truth.
                ax.scatter(y_true, y_pred, alpha=0.6, s=30, color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)])

                # 45-degree reference line (perfect prediction).
                min_val = min(y_true.min(), y_pred.min())
                max_val = max(y_true.max(), y_pred.max())
                ax.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7)

                # Annotate with this model's test R² and RMSE.
                r2 = result['metrics']['test_r2']
                rmse = result['metrics']['test_rmse']
                ax.text(0.05, 0.95, f'R² = {r2:.3f}\nRMSE = {rmse:.3f}',
                        transform=ax.transAxes, fontsize=10,
                        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

                ax.set_xlabel(self._ensure_chinese_text('True Values'))
                ax.set_ylabel(self._ensure_chinese_text('Predicted Values'))
                ax.set_title(self._ensure_chinese_text(f'{self.analyzer.models[model_name]["name"]}'))
                ax.grid(True, alpha=0.3)
                ax.axis('equal')

    # Overlay: all models on one axes.
    if show_overlay:
        for idx, (model_name, result) in enumerate(self.analyzer.results.items()):
            y_true = self.analyzer.y_test
            y_pred = result['y_pred_test']

            ax_overlay.scatter(y_true, y_pred, alpha=0.6, s=30,
                               color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)],
                               label=f'{self.analyzer.models[model_name]["name"]} (R²={result["metrics"]["test_r2"]:.3f})')

        # 45-degree reference line covering the global value range.
        min_val = min(self.analyzer.y_test.min(), min([r['y_pred_test'].min() for r in self.analyzer.results.values()]))
        max_val = max(self.analyzer.y_test.max(), max([r['y_pred_test'].max() for r in self.analyzer.results.values()]))
        ax_overlay.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7, label='Perfect Prediction')

        ax_overlay.set_xlabel(self._ensure_chinese_text('True Values'))
        ax_overlay.set_ylabel(self._ensure_chinese_text('Predicted Values'))
        ax_overlay.set_title(self._ensure_chinese_text('Predicted vs True Values - All Models Overlay'))
        ax_overlay.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_overlay.grid(True, alpha=0.3)
        ax_overlay.axis('equal')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Prediction scatter plot saved to: {save_path}")

    # plt.show()
|
||
|
||
def plot_residual_analysis(self, figsize=(16, 8), save_path=None, n_feature_plots=3):
    """
    Draw residual diagnostics for the top-performing models.

    For up to the three best models (by test R²): residual-vs-predicted
    scatter, a normal Q-Q plot, and (when feature importances exist)
    residuals against the two most important features.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): save path for the figure.
        n_feature_plots (int): how many models get a residual-vs-feature panel.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return

    self.set_colorblind_palette()

    # Pick the best-performing models for detailed analysis.
    sorted_models = sorted(self.analyzer.results.items(),
                           key=lambda x: x[1]['metrics']['test_r2'], reverse=True)
    top_models = sorted_models[:min(3, len(sorted_models))]

    fig, axes = plt.subplots(2, 3, figsize=figsize)

    for idx, (model_name, result) in enumerate(top_models):
        y_true = self.analyzer.y_test
        y_pred = result['y_pred_test']
        residuals = y_true - y_pred

        # Residuals vs predicted values.
        # NOTE(review): both branches reduce to axes[0, idx] — the
        # conditional is redundant but harmless.
        ax1 = axes[0, 0] if idx == 0 else axes[0, idx]
        ax1.scatter(y_pred, residuals, alpha=0.6, s=20,
                    color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)])
        ax1.axhline(y=0, color='k', linestyle='--', linewidth=2)
        ax1.set_xlabel('Predicted Values')
        ax1.set_ylabel('Residuals')
        ax1.set_title(f'Residuals vs Predicted\n{self.analyzer.models[model_name]["name"]}')
        ax1.grid(True, alpha=0.3)

        # Normal Q-Q plot of the residuals.
        ax2 = axes[1, 0] if idx == 0 else axes[1, idx]
        stats.probplot(residuals, dist="norm", plot=ax2)
        ax2.set_title(f'Normal Q-Q Plot\n{self.analyzer.models[model_name]["name"]}')

        # Residuals vs the most important features (tree-based models only).
        if idx < n_feature_plots - 2 and hasattr(result['model'], 'feature_importances_'):
            ax3 = axes[idx // 3 + 1, idx % 3 + 1] if idx > 0 else axes[0, 2]
            if idx < 2:  # only the first two models get a feature panel
                # NOTE(review): this bare except silences all errors;
                # kept as-is for behavior parity, but worth narrowing.
                try:
                    importances = result['model'].feature_importances_
                    top_features_idx = np.argsort(importances)[-2:]  # two most important features

                    for i, feat_idx in enumerate(top_features_idx):
                        feat_name = self.analyzer.feature_names[feat_idx] if hasattr(self.analyzer, 'feature_names') else f'Feature {feat_idx}'
                        ax3.scatter(self.analyzer.X_test[:, feat_idx], residuals,
                                    alpha=0.6, s=20, label=f'{feat_name}',
                                    color=self.colorblind_friendly_palette[(idx*2 + i) % len(self.colorblind_friendly_palette)])
                    ax3.axhline(y=0, color='k', linestyle='--', linewidth=2)
                    ax3.set_xlabel('Feature Values')
                    ax3.set_ylabel('Residuals')
                    ax3.set_title(f'Residuals vs Top Features\n{self.analyzer.models[model_name]["name"]}')
                    ax3.legend()
                    ax3.grid(True, alpha=0.3)
                except:
                    ax3.text(0.5, 0.5, 'Feature importance\nnot available',
                             transform=ax3.transAxes, ha='center', va='center')
                    ax3.set_title(f'Feature Analysis\n{self.analyzer.models[model_name]["name"]}')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Residual analysis plot saved to: {save_path}")

    # plt.show()
|
||
|
||
def plot_metrics_comparison(self, figsize=(16, 10), save_path=None):
    """
    Draw a four-panel comparison of model performance metrics.

    Panels: normalized radar-style chart, grouped bars (R², 1/RMSE,
    1/MAE), bias/variance stacked bars, and an R²-vs-1/RMSE bubble
    chart sized by 1/MAE.

    Fixes vs. original:
    * removed the fabricated ``training_times`` / ``memory_usage`` lists
      - they were filled with ``np.random`` values but never plotted
      (dead code that also perturbed the global RNG state);
    * min-max normalization no longer divides by zero (NaN) when every
      model has the same score.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): save path for the figure.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return

    self.set_colorblind_palette()

    # Gather per-model test metrics.
    model_names = []
    model_full_names = []
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    for model_name, result in self.analyzer.results.items():
        model_names.append(model_name)
        model_full_names.append(self.analyzer.models[model_name]['name'])
        r2_scores.append(result['metrics']['test_r2'])
        rmse_scores.append(result['metrics']['test_rmse'])
        mae_scores.append(result['metrics']['test_mae'])

    def _normalize(values, invert=False):
        """Min-max scale to [0, 1]; when all values tie, return 1.0 for
        every model (all equally good) instead of dividing by zero."""
        arr = np.asarray(values, dtype=float)
        span = arr.max() - arr.min()
        if span == 0:
            return np.ones_like(arr)
        norm = (arr - arr.min()) / span
        return 1.0 - norm if invert else norm

    fig, axes = plt.subplots(2, 2, figsize=figsize)

    # --- Radar-style chart: R², inverted RMSE, inverted MAE ---
    ax_radar = axes[0, 0]
    r2_norm = _normalize(r2_scores)
    rmse_norm = _normalize(rmse_scores, invert=True)  # smaller RMSE is better
    mae_norm = _normalize(mae_scores, invert=True)    # smaller MAE is better

    categories = ['R² Score', 'RMSE (inv)', 'MAE (inv)']
    n_models = len(model_names)

    # NOTE(review): this axes is rectangular, not a polar projection, so
    # the "radar" renders as a closed line plot; pass
    # subplot_kw={'projection': 'polar'} for a true radar chart.
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    angles += angles[:1]  # close the polygon

    for i in range(n_models):
        values = [r2_norm[i], rmse_norm[i], mae_norm[i]]
        values += values[:1]  # close the polygon
        color = self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)]
        ax_radar.plot(angles, values, 'o-', linewidth=2, color=color,
                      label=model_full_names[i])
        ax_radar.fill(angles, values, alpha=0.25, color=color)

    ax_radar.set_xticks(angles[:-1])
    ax_radar.set_xticklabels(categories)
    ax_radar.set_title('Performance Metrics Radar Chart')
    ax_radar.legend(bbox_to_anchor=(1.1, 1), loc='upper left')
    ax_radar.grid(True, alpha=0.3)

    # --- Grouped bars: R², 1/RMSE, 1/MAE side by side ---
    ax_bar = axes[0, 1]
    x = np.arange(len(model_names))
    width = 0.25

    # NOTE(review): 1/RMSE and 1/MAE diverge if a model scores exactly 0;
    # not expected on real-world data but worth knowing.
    ax_bar.bar(x - width, r2_scores, width, label='R²', alpha=0.8,
               color=self.colorblind_friendly_palette[0])
    ax_bar.bar(x, [1 / s for s in rmse_scores], width, label='1/RMSE', alpha=0.8,
               color=self.colorblind_friendly_palette[1])
    ax_bar.bar(x + width, [1 / s for s in mae_scores], width, label='1/MAE', alpha=0.8,
               color=self.colorblind_friendly_palette[2])

    ax_bar.set_xlabel('Models')
    ax_bar.set_ylabel('Normalized Scores')
    ax_bar.set_title('Normalized Performance Comparison')
    ax_bar.set_xticks(x)
    ax_bar.set_xticklabels(model_full_names, rotation=45, ha='right')
    ax_bar.legend()
    ax_bar.grid(True, alpha=0.3, axis='y')

    # --- Stacked bars: |mean residual| (bias) + residual variance ---
    ax_stack = axes[1, 0]
    residuals_per_model = [self.analyzer.y_test - result['y_pred_test']
                           for result in self.analyzer.results.values()]
    bias_errors = [abs(np.mean(res)) for res in residuals_per_model]
    variance_errors = [np.var(res) for res in residuals_per_model]

    ax_stack.bar(model_names, bias_errors, label='Bias (Mean Abs Error)', alpha=0.8,
                 color=self.colorblind_friendly_palette[0])
    ax_stack.bar(model_names, variance_errors, bottom=bias_errors,
                 label='Variance (Residual Var)', alpha=0.8,
                 color=self.colorblind_friendly_palette[1])

    ax_stack.set_xlabel('Models')
    ax_stack.set_ylabel('Error Components')
    ax_stack.set_title('Bias-Variance Decomposition')
    ax_stack.set_xticklabels(model_full_names, rotation=45, ha='right')
    ax_stack.legend()
    ax_stack.grid(True, alpha=0.3, axis='y')

    # --- Bubble chart: R² vs 1/RMSE, bubble size proportional to 1/MAE ---
    ax_bubble = axes[1, 1]
    bubble_sizes = [100 * (1 / s) for s in mae_scores]  # bigger MAE -> smaller bubble

    scatter = ax_bubble.scatter(r2_scores, [1 / s for s in rmse_scores], s=bubble_sizes,
                                c=range(len(model_names)), cmap='viridis',
                                alpha=0.6, edgecolors='black')

    # Label each bubble with its model name.
    for i, name in enumerate(model_full_names):
        ax_bubble.annotate(name, (r2_scores[i], 1 / rmse_scores[i]),
                           xytext=(5, 5), textcoords='offset points', fontsize=8)

    ax_bubble.set_xlabel('R² Score')
    ax_bubble.set_ylabel('1/RMSE')
    ax_bubble.set_title('Comprehensive Performance Assessment\n(Bubble size ∝ 1/MAE)')
    ax_bubble.grid(True, alpha=0.3)

    cbar = plt.colorbar(scatter, ax=ax_bubble)
    cbar.set_label('Model Index')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Metrics comparison plot saved to: {save_path}")

    # plt.show()
|
||
|
||
def plot_error_distribution(self, figsize=(16, 8), save_path=None):
|
||
"""
|
||
绘制误差分布图
|
||
|
||
Parameters:
|
||
figsize (tuple): 图形尺寸
|
||
save_path (str, optional): 保存路径
|
||
"""
|
||
if not self.analyzer or not self.analyzer.results:
|
||
print("No analyzer results available for plotting")
|
||
return
|
||
|
||
self.set_colorblind_palette()
|
||
|
||
# 计算所有模型的误差
|
||
model_errors = {}
|
||
for model_name, result in self.analyzer.results.items():
|
||
errors = self.analyzer.y_test - result['y_pred_test']
|
||
model_errors[model_name] = errors
|
||
|
||
fig, axes = plt.subplots(2, 2, figsize=figsize)
|
||
|
||
# 误差分布直方图 - 所有模型并排比较
|
||
ax_hist = axes[0, 0]
|
||
bins = np.linspace(min([min(errors) for errors in model_errors.values()]),
|
||
max([max(errors) for errors in model_errors.values()]), 30)
|
||
|
||
for i, (model_name, errors) in enumerate(model_errors.items()):
|
||
ax_hist.hist(errors, bins=bins, alpha=0.7, label=self.analyzer.models[model_name]['name'],
|
||
color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
|
||
density=True)
|
||
|
||
ax_hist.set_xlabel('Prediction Error')
|
||
ax_hist.set_ylabel('Density')
|
||
ax_hist.set_title('Error Distribution Histogram')
|
||
ax_hist.legend()
|
||
ax_hist.grid(True, alpha=0.3)
|
||
|
||
# 核密度估计曲线
|
||
ax_kde = axes[0, 1]
|
||
for i, (model_name, errors) in enumerate(model_errors.items()):
|
||
try:
|
||
sns.kdeplot(data=errors, ax=ax_kde, label=self.analyzer.models[model_name]['name'],
|
||
color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
|
||
fill=True, alpha=0.3)
|
||
except:
|
||
# 如果seaborn不可用,使用matplotlib
|
||
ax_kde.hist(errors, bins=30, alpha=0.3, density=True,
|
||
label=self.analyzer.models[model_name]['name'],
|
||
color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)])
|
||
|
||
ax_kde.set_xlabel('Prediction Error')
|
||
ax_kde.set_ylabel('Density')
|
||
ax_kde.set_title('Error Distribution KDE')
|
||
ax_kde.legend()
|
||
ax_kde.grid(True, alpha=0.3)
|
||
|
||
# 累积分布函数
|
||
ax_cdf = axes[1, 0]
|
||
error_range = np.linspace(min([min(errors) for errors in model_errors.values()]),
|
||
max([max(errors) for errors in model_errors.values()]), 100)
|
||
|
||
for i, (model_name, errors) in enumerate(model_errors.items()):
|
||
sorted_errors = np.sort(errors)
|
||
y_vals = np.arange(len(sorted_errors)) / float(len(sorted_errors))
|
||
ax_cdf.plot(sorted_errors, y_vals,
|
||
label=self.analyzer.models[model_name]['name'],
|
||
color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
|
||
linewidth=2)
|
||
|
||
ax_cdf.set_xlabel('Prediction Error')
|
||
ax_cdf.set_ylabel('Cumulative Probability')
|
||
ax_cdf.set_title('Cumulative Distribution Function')
|
||
ax_cdf.legend()
|
||
ax_cdf.grid(True, alpha=0.3)
|
||
|
||
# 箱线图
|
||
ax_box = axes[1, 1]
|
||
error_data = [errors for errors in model_errors.values()]
|
||
model_labels = [self.analyzer.models[name]['name'] for name in model_errors.keys()]
|
||
|
||
bp = ax_box.boxplot(error_data, labels=model_labels, patch_artist=True)
|
||
for patch, color in zip(bp['boxes'], self.colorblind_friendly_palette):
|
||
patch.set_facecolor(color)
|
||
patch.set_alpha(0.7)
|
||
|
||
# 添加均值点
|
||
for i, errors in enumerate(error_data):
|
||
ax_box.plot(i+1, np.mean(errors), 'ro', markersize=8, label='Mean' if i == 0 else "")
|
||
|
||
ax_box.set_xlabel('Models')
|
||
ax_box.set_ylabel('Prediction Error')
|
||
ax_box.set_title('Error Distribution Box Plot')
|
||
ax_box.legend()
|
||
ax_box.grid(True, alpha=0.3, axis='y')
|
||
|
||
plt.tight_layout()
|
||
|
||
if save_path:
|
||
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
||
print(f"Error distribution plot saved to: {save_path}")
|
||
|
||
# plt.show()
|
||
|
||
def plot_model_ranking_matrix(self, figsize=(14, 10), save_path=None):
|
||
"""
|
||
绘制模型排名矩阵
|
||
|
||
Parameters:
|
||
figsize (tuple): 图形尺寸
|
||
save_path (str, optional): 保存路径
|
||
"""
|
||
if not self.analyzer or not self.analyzer.results:
|
||
print("No analyzer results available for plotting")
|
||
return
|
||
|
||
self.set_colorblind_palette()
|
||
|
||
# 准备指标数据
|
||
metrics_data = []
|
||
model_names = []
|
||
metric_names = ['R²', 'RMSE', 'MAE', 'Training_R²', 'Training_RMSE', 'Training_MAE']
|
||
|
||
for model_name, result in self.analyzer.results.items():
|
||
model_names.append(self.analyzer.models[model_name]['name'])
|
||
metrics = result['metrics']
|
||
metrics_data.append([
|
||
metrics['test_r2'],
|
||
metrics['test_rmse'],
|
||
metrics['test_mae'],
|
||
metrics['train_r2'],
|
||
metrics['train_rmse'],
|
||
metrics['train_mae']
|
||
])
|
||
|
||
metrics_array = np.array(metrics_data)
|
||
|
||
# 计算排名(对于R²,越高越好;对于RMSE/MAE,越低越好)
|
||
rankings = np.zeros_like(metrics_array)
|
||
rankings[:, 0] = len(model_names) - stats.rankdata(metrics_array[:, 0]) + 1 # R²排名(反转)
|
||
rankings[:, 1] = stats.rankdata(metrics_array[:, 1]) # RMSE排名
|
||
rankings[:, 2] = stats.rankdata(metrics_array[:, 2]) # MAE排名
|
||
rankings[:, 3] = len(model_names) - stats.rankdata(metrics_array[:, 3]) + 1 # Training R²排名(反转)
|
||
rankings[:, 4] = stats.rankdata(metrics_array[:, 4]) # Training RMSE排名
|
||
rankings[:, 5] = stats.rankdata(metrics_array[:, 5]) # Training MAE排名
|
||
|
||
fig, axes = plt.subplots(2, 2, figsize=figsize)
|
||
|
||
# 热力图 - 模型vs指标的排名
|
||
ax_heatmap = axes[0, 0]
|
||
im = ax_heatmap.imshow(rankings, cmap='RdYlGn_r', aspect='auto', alpha=0.8)
|
||
|
||
# 设置标签
|
||
ax_heatmap.set_xticks(np.arange(len(metric_names)))
|
||
ax_heatmap.set_yticks(np.arange(len(model_names)))
|
||
ax_heatmap.set_xticklabels(metric_names, rotation=45, ha='right')
|
||
ax_heatmap.set_yticklabels(model_names)
|
||
|
||
# 添加数值标签
|
||
for i in range(len(model_names)):
|
||
for j in range(len(metric_names)):
|
||
text = ax_heatmap.text(j, i, f'{rankings[i, j]:.0f}',
|
||
ha="center", va="center", color="black", fontsize=10)
|
||
|
||
ax_heatmap.set_title('Model Ranking Matrix\n(Lower rank = Better performance)')
|
||
plt.colorbar(im, ax=ax_heatmap, label='Rank')
|
||
|
||
# 平行坐标图
|
||
ax_parallel = axes[0, 1]
|
||
|
||
# 标准化数据到0-1范围
|
||
normalized_data = np.zeros_like(metrics_array)
|
||
for j in range(metrics_array.shape[1]):
|
||
if j in [0, 3]: # R²指标,越高越好
|
||
normalized_data[:, j] = (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min())
|
||
else: # RMSE/MAE指标,越低越好,反转标准化
|
||
normalized_data[:, j] = 1 - (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min())
|
||
|
||
for i in range(len(model_names)):
|
||
ax_parallel.plot(range(len(metric_names)), normalized_data[i],
|
||
marker='o', linewidth=2, markersize=6,
|
||
color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
|
||
label=model_names[i], alpha=0.8)
|
||
|
||
ax_parallel.set_xticks(range(len(metric_names)))
|
||
ax_parallel.set_xticklabels(metric_names, rotation=45, ha='right')
|
||
ax_parallel.set_ylabel('Normalized Score (Higher = Better)')
|
||
ax_parallel.set_title('Parallel Coordinates Plot')
|
||
ax_parallel.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
|
||
ax_parallel.grid(True, alpha=0.3)
|
||
|
||
# 气泡图 - R² vs RMSE,气泡大小表示MAE
|
||
ax_bubble = axes[1, 0]
|
||
r2_scores = metrics_array[:, 0]
|
||
rmse_scores = metrics_array[:, 1]
|
||
mae_scores = metrics_array[:, 2]
|
||
|
||
# 气泡大小(MAE越小气泡越大)
|
||
bubble_sizes = 1000 / (mae_scores + 0.01) # 避免除零
|
||
|
||
scatter = ax_bubble.scatter(r2_scores, rmse_scores, s=bubble_sizes,
|
||
c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black')
|
||
|
||
# 添加模型名称标签
|
||
for i, name in enumerate(model_names):
|
||
ax_bubble.annotate(name, (r2_scores[i], rmse_scores[i]),
|
||
xytext=(5, 5), textcoords='offset points', fontsize=8)
|
||
|
||
ax_bubble.set_xlabel('R² Score')
|
||
ax_bubble.set_ylabel('RMSE')
|
||
ax_bubble.set_title('Performance Bubble Chart\n(Bubble size ∝ 1/MAE)')
|
||
ax_bubble.grid(True, alpha=0.3)
|
||
|
||
# 添加颜色条
|
||
cbar = plt.colorbar(scatter, ax=ax_bubble)
|
||
cbar.set_label('Model Index')
|
||
|
||
# 综合排名条形图
|
||
ax_ranking = axes[1, 1]
|
||
avg_rankings = np.mean(rankings, axis=1)
|
||
sorted_indices = np.argsort(avg_rankings)
|
||
|
||
bars = ax_ranking.bar(range(len(model_names)),
|
||
avg_rankings[sorted_indices],
|
||
color=[self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)]
|
||
for i in range(len(model_names))], alpha=0.7)
|
||
|
||
ax_ranking.set_xlabel('Models (Sorted by Average Rank)')
|
||
ax_ranking.set_ylabel('Average Rank')
|
||
ax_ranking.set_title('Overall Model Ranking')
|
||
ax_ranking.set_xticks(range(len(model_names)))
|
||
ax_ranking.set_xticklabels([model_names[i] for i in sorted_indices], rotation=45, ha='right')
|
||
ax_ranking.grid(True, alpha=0.3, axis='y')
|
||
|
||
# 添加数值标签
|
||
for i, bar in enumerate(bars):
|
||
height = bar.get_height()
|
||
ax_ranking.text(bar.get_x() + bar.get_width()/2., height,
|
||
'.2f', ha='center', va='bottom')
|
||
|
||
plt.tight_layout()
|
||
|
||
if save_path:
|
||
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
||
print(f"Model ranking matrix plot saved to: {save_path}")
|
||
|
||
# plt.show()
|
||
|
||
def generate_comprehensive_report(self, save_dir='plots', prefix='regression_analysis'):
|
||
"""
|
||
生成综合可视化报告
|
||
|
||
Parameters:
|
||
save_dir (str): 保存目录
|
||
prefix (str): 文件名前缀
|
||
"""
|
||
if not self.analyzer:
|
||
print("No analyzer available for report generation")
|
||
return
|
||
|
||
os.makedirs(save_dir, exist_ok=True)
|
||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||
|
||
# 生成各种图表
|
||
plot_configs = [
|
||
('prediction_scatter', self.plot_prediction_scatter),
|
||
('residual_analysis', self.plot_residual_analysis),
|
||
('metrics_comparison', self.plot_metrics_comparison),
|
||
('error_distribution', self.plot_error_distribution),
|
||
('model_ranking', self.plot_model_ranking_matrix)
|
||
]
|
||
|
||
saved_files = []
|
||
for plot_name, plot_func in plot_configs:
|
||
try:
|
||
save_path = f'{save_dir}/{prefix}_{plot_name}_{timestamp}.png'
|
||
plot_func(save_path=save_path)
|
||
saved_files.append(save_path)
|
||
plt.close('all') # 关闭所有图形以释放内存
|
||
except Exception as e:
|
||
print(f"Failed to generate {plot_name} plot: {str(e)}")
|
||
|
||
if saved_files:
|
||
print("Comprehensive visualization report generated:")
|
||
for file in saved_files:
|
||
print(f" - {file}")
|
||
else:
|
||
print("No plots were successfully generated")
|
||
|
||
return saved_files
|
||
|
||
|
||
def main():
    """Demo entry point.

    Shows the configuration-driven interface (recommended for GUI
    integration) and, in the commented block at the end, the legacy
    backward-compatible parameter-passing interface.
    """
    print("="*60)
    print("Regression Analysis Tool - Configuration-Driven Interface")
    print("="*60)

    # Method 1: configuration-driven (recommended for GUI integration)
    print("\n--- Method 1: Configuration-Driven (Recommended for GUI) ---")

    # Build the configuration object
    csv_file_path = r"E:\code\content\change\6.csv"

    config = RegressionConfig.create_default(
        csv_path=csv_file_path,
        label_column="0"
    )
    # Optional customization
    config.data.spectrum_columns = "8:"      # spectral column range
    config.models.model_names = 'all'        # train every available model
    config.models.tune_hyperparams = False   # quick run, skip hyperparameter tuning
    config.output.save_models = True         # persist trained model files
    config.output.plot_results = True        # enable visualization
    # Raw string: '\c', '\p', '\y' were invalid escape sequences in the
    # original plain literal (same value, no DeprecationWarning).
    config.output.plot_dir = r'E:\code\content\change\plot\yellow'

    # Create the analyzer with this configuration
    analyzer = RegressionAnalyzer(config)

    # List the available models
    analyzer.initialize_all_models()
    print("Available models:")
    for model_key, model_name in analyzer.get_available_models().items():
        print(f" {model_key}: {model_name}")

    # Run the configuration-driven analysis
    success = analyzer.run_analysis_from_config()
    if success:
        print("Configuration-driven analysis completed successfully!")

        # Demonstrate each visualization helper
        print("\n--- Visualization Demo ---")

        viz_dir = 'visualization_demo'
        os.makedirs(viz_dir, exist_ok=True)

        print("Generating various visualization plots...")

        # 1. Predicted vs. true values scatter plot
        print("1. Prediction vs True Values Scatter Plot...")
        analyzer.plot_prediction_scatter(
            save_path=f'{viz_dir}/prediction_scatter.png',
            show_individual=True,
            show_overlay=True
        )

        # 2. Residual analysis
        print("2. Residual Analysis Plot...")
        analyzer.plot_residual_analysis(
            save_path=f'{viz_dir}/residual_analysis.png'
        )

        # 3. Performance metrics comparison
        print("3. Performance Metrics Comparison...")
        analyzer.plot_metrics_comparison(
            save_path=f'{viz_dir}/metrics_comparison.png'
        )

        # 4. Error distribution
        print("4. Error Distribution Analysis...")
        analyzer.plot_error_distribution(
            save_path=f'{viz_dir}/error_distribution.png'
        )

        # 5. Model ranking matrix
        # NOTE(review): confirm RegressionAnalyzer exposes plot_model_ranking;
        # the visualizer method defined in this file is plot_model_ranking_matrix.
        print("5. Model Ranking Matrix...")
        analyzer.plot_model_ranking(
            save_path=f'{viz_dir}/model_ranking.png'
        )

        # 6. Full visualization report
        print("6. Generating Comprehensive Visualization Report...")
        saved_plots = analyzer.generate_visualization_report(
            save_dir=viz_dir,
            prefix='demo_report'
        )

        print(f"\nVisualization completed! Generated {len(saved_plots)} plot files in '{viz_dir}' directory:")
        for plot_file in saved_plots:
            print(f" - {plot_file}")

        print("\nAvailable visualization methods:")
        print(" - analyzer.plot_prediction_scatter() # 预测值vs真实值散点图")
        print(" - analyzer.plot_residual_analysis() # 残差分析图")
        print(" - analyzer.plot_metrics_comparison() # 性能指标对比图")
        print(" - analyzer.plot_error_distribution() # 误差分布图")
        print(" - analyzer.plot_model_ranking() # 模型排名矩阵")
        print(" - analyzer.generate_visualization_report() # 生成完整报告")

    else:
        print("Configuration-driven analysis failed!")

    # # Method 2: backward compatible (legacy parameter passing)
    # print("\n--- Method 2: Backward Compatible (Legacy Parameter Passing) ---")
    #
    # analyzer2 = RegressionAnalyzer()  # use the default configuration
    #
    # # Pass parameters the traditional way
    # success2 = analyzer2.run_complete_analysis(
    #     csv_path=r"E:\code\WQ\pipeline_result\work_dir\5_training_spectra\training_spectra.csv",
    #     label_column="0",
    #     spectrum_columns="13:",
    #     test_size=0.2,
    #     scale_method='standard',
    #     tune_hyperparams=False,
    #     save_models=False,
    #     plot_results=True,
    #     model_names=['xgboost', 'lightgbm']  # train only these two models
    # )
    #
    # if success2:
    #     print("Backward-compatible analysis completed successfully!")
    # else:
    #     print("Backward-compatible analysis failed!")
    #
    # print("\n" + "="*60)
    # print("Both methods are supported. Configuration-driven is recommended for GUI integration.")
    # print("="*60)
# Run the demo analysis only when executed as a script (not on import).
if __name__ == "__main__":
    main()