Files
HSI/rgression_method/regression.py

2504 lines
97 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
回归分析工具包
支持多种回归算法线性回归、LASSO、岭回归、Boosting、神经网络等
包含超参数调优、模型评价和保存功能
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb
import lightgbm as lgb
from statsmodels.api import OLS, GLM
from statsmodels.genmod.families import Gaussian
import warnings
import joblib
import os
from datetime import datetime
import json
from scipy.linalg import pinv
from scipy import stats
from typing import Optional, List, Dict, Any, Union
from dataclasses import dataclass, field
import time
warnings.filterwarnings('ignore')
@dataclass
class DataConfig:
    """Data-loading configuration."""
    csv_path: str = ""                  # path to the input CSV file (required)
    label_column: Union[str, int] = ""  # label column: name or index (index 0 is valid)
    spectrum_columns: Optional[Union[str, List[Union[str, int]]]] = None  # feature columns; None = all except label
    test_size: float = 0.2              # fraction of samples held out for testing
    random_state: int = 42              # seed for the train/test split
    scale_method: str = 'standard'      # 'standard' (z-score) or 'minmax'
@dataclass
class ModelConfig:
    """Model-selection and hyperparameter-tuning configuration."""
    model_names: Optional[Union[str, List[str]]] = None  # model key(s): 'all', one key, a list, or None for defaults
    tune_hyperparams: bool = True   # run a hyperparameter search before the final fit
    tuning_method: str = 'grid'     # 'grid' or 'random'
    cv_folds: int = 5               # cross-validation folds (must be >= 2)
    random_search_iter: int = 20    # iteration count for randomized search
@dataclass
class TrainingConfig:
    """Neural-network training configuration (used by the LSTM/GRU regressors)."""
    epochs: int = 100           # training epochs
    batch_size: int = 32        # mini-batch size
    learning_rate: float = 0.001  # optimizer learning rate
@dataclass
class OutputConfig:
    """Output and persistence configuration."""
    save_models: bool = True    # persist fitted models under save_dir
    plot_results: bool = True   # generate comparison plots
    save_dir: str = 'models'    # directory for saved models
    plot_dir: str = 'plots'     # directory for saved plots
@dataclass
class RegressionConfig:
    """Complete regression-analysis configuration — standardized interface for GUI integration."""
    data: DataConfig = field(default_factory=DataConfig)
    models: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
    output: OutputConfig = field(default_factory=OutputConfig)

    def __post_init__(self):
        """Validate all parameters immediately after dataclass construction."""
        self._validate_parameters()

    def _validate_parameters(self):
        """Validate every sub-config; raises ValueError on the first problem found."""
        # --- data checks ---
        if not self.data.csv_path:
            raise ValueError("CSV file path must be specified")
        # BUG FIX: label_column may legitimately be the integer 0, which is
        # falsy — test explicitly for "missing" instead of truthiness.
        if self.data.label_column is None or self.data.label_column == "":
            raise ValueError("Label column must be specified")
        if not (0 < self.data.test_size < 1):
            raise ValueError("Test size must be between 0 and 1")
        if self.data.scale_method not in ('standard', 'minmax'):
            raise ValueError("Scale method must be 'standard' or 'minmax'")
        # --- model checks ---
        if self.models.tuning_method not in ('grid', 'random'):
            raise ValueError("Tuning method must be 'grid' or 'random'")
        if self.models.cv_folds < 2:
            raise ValueError("CV folds must be at least 2")
        # Normalize model_names ('all' / single name / list / None)
        self._process_model_names()
        # --- training checks ---
        if self.training.epochs <= 0:
            raise ValueError("Epochs must be positive")
        if self.training.batch_size <= 0:
            raise ValueError("Batch size must be positive")
        if self.training.learning_rate <= 0:
            raise ValueError("Learning rate must be positive")

    def _process_model_names(self):
        """Normalize model_names: 'all' -> every model, str -> [str], None -> common defaults."""
        supported = self._get_supported_models()
        names = self.models.model_names
        if isinstance(names, str):
            if names.lower() == 'all':
                self.models.model_names = list(supported.keys())
                print(f"选择所有可用模型: {len(self.models.model_names)}")
            else:
                if names not in supported:
                    raise ValueError(f"不支持的模型类型: {names}")
                self.models.model_names = [names]
        elif isinstance(names, list):
            for model in names:
                if model not in supported:
                    raise ValueError(f"不支持的模型类型: {model}")
        elif names is None:
            # Sensible default subset of common models
            self.models.model_names = ['linear', 'ridge', 'lasso', 'randomforest', 'svm']

    def _get_supported_models(self) -> Dict[str, str]:
        """Return {model key: display name} for every model registered in RegressionAnalyzer.

        (The previous per-entry try/except could never fail and was removed.)
        """
        return {
            # linear models
            'linear': '多元线性回归',
            'lasso': 'LASSO回归',
            'ridge': '岭回归',
            'elasticnet': '弹性网络回归',
            'bayesianridge': '贝叶斯岭回归',
            # boosting models
            'lsboost': '最小二乘提升',
            'xgboost': 'XGBoost回归',
            'lightgbm': 'LightGBM回归',
            # kernel methods
            'gaussian': '高斯过程回归',
            'gaussiansvm': '高斯SVM回归',
            'svm': '支持向量回归',
            # neural networks
            'elm': '极限学习机',
            'mlp': '多层感知机',
            'lstm': 'LSTM网络',
            'gru': 'GRU网络',
            # other models
            'gam': '广义加性模型',
            'decisiontree': '决策树回归',
            'randomforest': '随机森林回归',
            'extratrees': '极端随机树回归',
            'adaboost': 'AdaBoost回归'
        }

    @classmethod
    def _create_unvalidated(cls) -> 'RegressionConfig':
        """Build a default config while bypassing __post_init__ validation."""
        config = cls.__new__(cls)
        config.data = DataConfig()
        config.models = ModelConfig()
        config.training = TrainingConfig()
        config.output = OutputConfig()
        return config

    @classmethod
    def create_default(cls, csv_path: str, label_column: Union[str, int]) -> 'RegressionConfig':
        """Convenience constructor: defaults plus the two required data fields."""
        config = cls._create_unvalidated()
        config.data.csv_path = csv_path
        config.data.label_column = label_column
        config._validate_parameters()
        return config

    @classmethod
    def create_quick_analysis(cls, csv_path: str, label_column: Union[str, int],
                              model_names: Optional[List[str]] = None) -> 'RegressionConfig':
        """Convenience constructor for a quick run: no tuning, no model persistence."""
        config = cls._create_unvalidated()
        config.data.csv_path = csv_path
        config.data.label_column = label_column
        config.models.model_names = model_names
        config.models.tune_hyperparams = False  # quick analysis: skip tuning
        config.output.save_models = False       # quick analysis: nothing persisted
        config._validate_parameters()
        return config
class ExtremeLearningMachine(BaseEstimator, RegressorMixin):
    """
    Extreme Learning Machine (ELM) regressor.

    A single-hidden-layer feed-forward network: input weights and biases are
    drawn randomly and frozen, and only the output weights are solved
    analytically with a pseudo-inverse, making training very fast.
    """
    def __init__(self, n_hidden=100, activation='sigmoid', random_state=42):
        self.n_hidden = n_hidden        # number of hidden units
        self.activation = activation    # 'sigmoid' | 'tanh' | 'relu' | 'linear'
        self.random_state = random_state
        self.input_weights_ = None      # (n_features, n_hidden), set by fit()
        self.biases_ = None             # (n_hidden,), set by fit()
        self.output_weights_ = None     # (n_hidden + 1,), set by fit()

    def _activation_function(self, X):
        """Apply the configured activation element-wise; raises on unknown names."""
        if self.activation == 'sigmoid':
            # Numerically stable sigmoid: the naive 1/(1+exp(-X)) overflows
            # (RuntimeWarning) for large negative inputs; split on sign so
            # exp() only ever sees non-positive arguments.
            out = np.empty(np.shape(X), dtype=np.float64)
            X = np.asarray(X, dtype=np.float64)
            pos = X >= 0
            out[pos] = 1.0 / (1.0 + np.exp(-X[pos]))
            ex = np.exp(X[~pos])
            out[~pos] = ex / (1.0 + ex)
            return out
        elif self.activation == 'tanh':
            return np.tanh(X)
        elif self.activation == 'relu':
            return np.maximum(0, X)
        elif self.activation == 'linear':
            return X
        else:
            raise ValueError(f"Unsupported activation function: {self.activation}")

    def fit(self, X, y):
        """Fit: draw the random hidden layer, then solve output weights analytically."""
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)
        # Global seeding kept for reproducibility with existing runs.
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape
        # Random, untrained input weights and biases
        self.input_weights_ = np.random.randn(n_features, self.n_hidden)
        self.biases_ = np.random.randn(self.n_hidden)
        # Hidden-layer activations
        H = self._activation_function(X @ self.input_weights_ + self.biases_)
        # Constant column so the output layer learns a bias term
        H = np.column_stack([H, np.ones(n_samples)])
        # Closed-form least-squares solution via pseudo-inverse
        self.output_weights_ = pinv(H) @ y
        return self

    def predict(self, X):
        """Predict targets; raises ValueError if fit() has not been called."""
        if self.input_weights_ is None:
            raise ValueError("模型还未训练")
        X = np.asarray(X, dtype=np.float64)
        H = self._activation_function(X @ self.input_weights_ + self.biases_)
        H = np.column_stack([H, np.ones(X.shape[0])])
        return H @ self.output_weights_

    def _more_tags(self):
        # Tell scikit-learn's estimator checks to skip input validation.
        return {'no_validation': True}
class GeneralizedAdditiveModel(BaseEstimator, RegressorMixin):
    """
    Generalized Additive Model (GAM) regressor.

    Builds a truncated-power spline basis per feature and fits the stacked
    basis with ridge-penalized least squares (lambda_ is the penalty).
    """
    def __init__(self, n_splines=10, degree=3, lambda_=0.1):
        self.n_splines = n_splines  # interior knots per feature
        self.degree = degree        # spline degree
        self.lambda_ = lambda_      # ridge penalty on the basis coefficients
        self.coefficients_ = None   # set by fit()
        self.knots_ = None          # per-feature knot arrays, set by fit()

    def _create_spline_basis(self, X):
        """Build each feature's spline design matrix and concatenate them.

        Uses the knots/ranges stored by fit() when available so training and
        prediction evaluate the SAME basis functions.
        """
        n_samples, n_features = X.shape
        n_basis = self.n_splines + self.degree + 1
        basis_matrices = []
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            if self.knots_ is None:
                # Not fitted yet: derive knots and range from this X.
                x_min, x_max = np.min(x), np.max(x)
                knots = np.linspace(x_min, x_max, self.n_splines + 2)[1:-1]
            else:
                # BUG FIX: previously knots_ was never assigned, so predict()
                # silently rebuilt a different basis from the *test* data's
                # min/max — train and predict bases disagreed.
                x_min, x_max = self.feature_ranges_[feature_idx]
                knots = self.knots_[feature_idx]
            basis = np.zeros((n_samples, n_basis))
            # Truncated-power basis (simplified implementation)
            for i in range(n_basis):
                if i < self.degree + 1:
                    # left-end polynomial terms
                    basis[:, i] = np.power(np.maximum(0, x - x_min), i)
                elif i > n_basis - self.degree - 2:
                    # right-end polynomial terms
                    power = n_basis - 1 - i
                    basis[:, i] = np.power(np.maximum(0, x_max - x), power)
                else:
                    # interior truncated-power terms
                    basis[:, i] = np.power(np.maximum(0, x - knots[i - self.degree - 1]), self.degree)
            basis_matrices.append(basis)
        return np.concatenate(basis_matrices, axis=1)

    def fit(self, X, y):
        """Fit: freeze per-feature knots/ranges, then ridge-fit the spline basis."""
        from sklearn.linear_model import Ridge
        X = np.asarray(X, dtype=np.float64)
        n_features = X.shape[1]
        # Store the training ranges and knots so predict() reuses them.
        self.feature_ranges_ = [(np.min(X[:, j]), np.max(X[:, j])) for j in range(n_features)]
        self.knots_ = [np.linspace(lo, hi, self.n_splines + 2)[1:-1]
                       for lo, hi in self.feature_ranges_]
        X_basis = self._create_spline_basis(X)
        # Ridge regression provides the smoothing penalty
        ridge = Ridge(alpha=self.lambda_, fit_intercept=True)
        ridge.fit(X_basis, y)
        self.coefficients_ = ridge.coef_
        self.intercept_ = ridge.intercept_
        return self

    def predict(self, X):
        """Predict using the basis fixed at fit time; raises if not fitted."""
        if self.coefficients_ is None:
            raise ValueError("模型还未训练")
        X = np.asarray(X, dtype=np.float64)
        return self._create_spline_basis(X) @ self.coefficients_ + self.intercept_
class LSTMRegressor(BaseEstimator, RegressorMixin):
    """
    LSTM regressor — treats each spectrum as a sequence, one band per time step.

    Implemented with PyTorch; falls back to an MLPRegressor approximation when
    PyTorch is not installed.
    """
    def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None,
                 batch_size=None, learning_rate=None, random_state=42, device=None,
                 config: Optional['TrainingConfig'] = None):
        """
        Parameters:
            units (int, optional): LSTM hidden units; None -> 64 when a config
                is supplied, otherwise 50.
            dropout (float): dropout applied after the final LSTM output.
            recurrent_dropout (float): kept for API compatibility; a
                single-layer nn.LSTM cannot apply inter-layer dropout, so this
                value has no effect on the network.
            epochs / batch_size / learning_rate: None -> taken from config if
                given, otherwise 100 / 32 / 0.001.
            random_state (int): RNG seed for torch and numpy.
            device (str, optional): torch device; None -> auto cuda/cpu.
            config (TrainingConfig, optional): training configuration object.
        """
        if config is not None:
            self.units = units if units is not None else 64  # richer default under config
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else config.epochs
            self.batch_size = batch_size if batch_size is not None else config.batch_size
            self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate
        else:
            self.units = units if units is not None else 50
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else 100
            self.batch_size = batch_size if batch_size is not None else 32
            self.learning_rate = learning_rate if learning_rate is not None else 0.001
        self.random_state = random_state
        self.device = device
        self.model_ = None
        self.input_size_ = None
        # Import PyTorch lazily so the rest of the toolbox works without it.
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
            self.torch = torch
            self.nn = nn
            self.optim = optim
            self.pytorch_available = True
            if self.device is None:
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            else:
                self.device = torch.device(self.device)
        except ImportError:
            self.pytorch_available = False
            print("Warning: PyTorch not installed, LSTM model will use MLPRegressor approximation")

    def _create_model(self, input_size):
        """Build the PyTorch LSTM network.

        BUG FIX: the nested class previously called `self.nn.LSTM(...)` inside
        its methods, but there `self` is the nn.Module instance (which has no
        `.nn` attribute), so model construction raised AttributeError. The
        torch.nn module is captured in a local before the class definition.
        """
        nn = self.nn

        class LSTMModel(nn.Module):
            def __init__(self, input_size, hidden_size, dropout):
                super().__init__()
                self.hidden_size = hidden_size
                # num_layers=1: inter-layer dropout would be a no-op (and
                # triggers a PyTorch warning), so it is not passed here.
                self.lstm = nn.LSTM(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=False
                )
                self.dropout_layer = nn.Dropout(dropout)
                self.fc = nn.Linear(hidden_size, 1)

            def forward(self, x):
                lstm_out, _ = self.lstm(x)
                # Keep only the last time step's hidden state
                lstm_out = lstm_out[:, -1, :]
                lstm_out = self.dropout_layer(lstm_out)
                return self.fc(lstm_out)

        return LSTMModel(input_size, self.units, self.dropout)

    def fit(self, X, y):
        """Train the network (or the MLP fallback) on (X, y)."""
        X = np.asarray(X)
        y = np.asarray(y)
        if not self.pytorch_available:
            # PyTorch missing: approximate with a feed-forward network.
            from sklearn.neural_network import MLPRegressor
            self.model_ = MLPRegressor(
                hidden_layer_sizes=(self.units, self.units // 2),
                activation='relu',
                solver='adam',
                max_iter=self.epochs,
                random_state=self.random_state,
                early_stopping=True
            )
            self.model_.fit(X, y)
            return self
        # Seed every RNG involved for reproducible training
        self.torch.manual_seed(self.random_state)
        if self.torch.cuda.is_available():
            self.torch.cuda.manual_seed(self.random_state)
            self.torch.cuda.manual_seed_all(self.random_state)
        np.random.seed(self.random_state)
        # Reshape to (samples, timesteps, features): one band per time step,
        # one feature per step.
        n_samples, n_features = X.shape
        self.input_size_ = 1
        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)
        y_tensor = self.torch.FloatTensor(y.reshape(-1, 1)).to(self.device)
        self.model_ = self._create_model(self.input_size_).to(self.device)
        criterion = self.nn.MSELoss()
        optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate)
        self.model_.train()
        for epoch in range(self.epochs):
            # Reshuffle every epoch for stochastic mini-batches
            indices = np.random.permutation(n_samples)
            X_shuffled = X_tensor[indices]
            y_shuffled = y_tensor[indices]
            for i in range(0, n_samples, self.batch_size):
                batch_X = X_shuffled[i:i + self.batch_size]
                batch_y = y_shuffled[i:i + self.batch_size]
                optimizer.zero_grad()
                loss = criterion(self.model_(batch_X), batch_y)
                loss.backward()
                optimizer.step()
        return self

    def predict(self, X):
        """Predict targets; raises ValueError if the model has not been fitted."""
        if self.model_ is None:
            raise ValueError("模型还未训练")
        if not self.pytorch_available:
            return self.model_.predict(X)
        X = np.asarray(X)
        self.model_.eval()
        n_samples, n_features = X.shape
        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)
        with self.torch.no_grad():
            predictions = self.model_(X_tensor)
        return predictions.cpu().numpy().flatten()
class GRURegressor(BaseEstimator, RegressorMixin):
    """
    GRU regressor — treats each spectrum as a sequence, one band per time step.

    Implemented with PyTorch; falls back to an MLPRegressor approximation when
    PyTorch is not installed.
    """
    def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None,
                 batch_size=None, learning_rate=None, random_state=42, device=None,
                 config: Optional['TrainingConfig'] = None):
        """
        Parameters:
            units (int, optional): GRU hidden units; None -> 64 when a config
                is supplied, otherwise 50.
            dropout (float): dropout applied after the final GRU output.
            recurrent_dropout (float): kept for API compatibility; a
                single-layer nn.GRU cannot apply inter-layer dropout, so this
                value has no effect on the network.
            epochs / batch_size / learning_rate: None -> taken from config if
                given, otherwise 100 / 32 / 0.001.
            random_state (int): RNG seed for torch and numpy.
            device (str, optional): torch device; None -> auto cuda/cpu.
            config (TrainingConfig, optional): training configuration object.
        """
        if config is not None:
            self.units = units if units is not None else 64  # richer default under config
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else config.epochs
            self.batch_size = batch_size if batch_size is not None else config.batch_size
            self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate
        else:
            self.units = units if units is not None else 50
            self.dropout = dropout
            self.recurrent_dropout = recurrent_dropout
            self.epochs = epochs if epochs is not None else 100
            self.batch_size = batch_size if batch_size is not None else 32
            self.learning_rate = learning_rate if learning_rate is not None else 0.001
        self.random_state = random_state
        self.device = device
        self.model_ = None
        self.input_size_ = None
        # Import PyTorch lazily so the rest of the toolbox works without it.
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
            self.torch = torch
            self.nn = nn
            self.optim = optim
            self.pytorch_available = True
            if self.device is None:
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            else:
                self.device = torch.device(self.device)
        except ImportError:
            self.pytorch_available = False
            print("Warning: PyTorch not installed, GRU model will use MLPRegressor approximation")

    def _create_model(self, input_size):
        """Build the PyTorch GRU network.

        BUG FIX: the nested class previously called `self.nn.GRU(...)` inside
        its methods, but there `self` is the nn.Module instance (which has no
        `.nn` attribute), so model construction raised AttributeError. The
        torch.nn module is captured in a local before the class definition.
        """
        nn = self.nn

        class GRUModel(nn.Module):
            def __init__(self, input_size, hidden_size, dropout):
                super().__init__()
                self.hidden_size = hidden_size
                # num_layers=1: inter-layer dropout would be a no-op (and
                # triggers a PyTorch warning), so it is not passed here.
                self.gru = nn.GRU(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=False
                )
                self.dropout_layer = nn.Dropout(dropout)
                self.fc = nn.Linear(hidden_size, 1)

            def forward(self, x):
                gru_out, _ = self.gru(x)
                # Keep only the last time step's hidden state
                gru_out = gru_out[:, -1, :]
                gru_out = self.dropout_layer(gru_out)
                return self.fc(gru_out)

        return GRUModel(input_size, self.units, self.dropout)

    def fit(self, X, y):
        """Train the network (or the MLP fallback) on (X, y)."""
        X = np.asarray(X)
        y = np.asarray(y)
        if not self.pytorch_available:
            # PyTorch missing: approximate with a feed-forward network.
            from sklearn.neural_network import MLPRegressor
            self.model_ = MLPRegressor(
                hidden_layer_sizes=(self.units, self.units // 2),
                activation='relu',
                solver='adam',
                max_iter=self.epochs,
                random_state=self.random_state,
                early_stopping=True
            )
            self.model_.fit(X, y)
            return self
        # Seed every RNG involved for reproducible training
        self.torch.manual_seed(self.random_state)
        if self.torch.cuda.is_available():
            self.torch.cuda.manual_seed(self.random_state)
            self.torch.cuda.manual_seed_all(self.random_state)
        np.random.seed(self.random_state)
        # Reshape to (samples, timesteps, features): one band per time step.
        n_samples, n_features = X.shape
        self.input_size_ = 1
        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)
        y_tensor = self.torch.FloatTensor(y.reshape(-1, 1)).to(self.device)
        self.model_ = self._create_model(self.input_size_).to(self.device)
        criterion = self.nn.MSELoss()
        optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate)
        self.model_.train()
        for epoch in range(self.epochs):
            # Reshuffle every epoch for stochastic mini-batches
            indices = np.random.permutation(n_samples)
            X_shuffled = X_tensor[indices]
            y_shuffled = y_tensor[indices]
            for i in range(0, n_samples, self.batch_size):
                batch_X = X_shuffled[i:i + self.batch_size]
                batch_y = y_shuffled[i:i + self.batch_size]
                optimizer.zero_grad()
                loss = criterion(self.model_(batch_X), batch_y)
                loss.backward()
                optimizer.step()
        return self

    def predict(self, X):
        """Predict targets; raises ValueError if the model has not been fitted."""
        if self.model_ is None:
            raise ValueError("模型还未训练")
        if not self.pytorch_available:
            return self.model_.predict(X)
        X = np.asarray(X)
        self.model_.eval()
        n_samples, n_features = X.shape
        X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device)
        with self.torch.no_grad():
            predictions = self.model_(X_tensor)
        return predictions.cpu().numpy().flatten()
class RegressionAnalyzer:
"""
回归分析器类 - 支持GUI对接的标准化接口
支持多种回归算法和完整的分析流程
"""
def __init__(self, config: Optional[RegressionConfig] = None):
    """
    Initialize the regression analyzer.

    Parameters:
        config (RegressionConfig, optional): configuration object. When None,
            an *unvalidated* default configuration is built.

    BUG FIX: the previous `config or RegressionConfig()` always raised for a
    no-arg analyzer, because RegressionConfig() validates in __post_init__ and
    rejects the empty default csv_path. A default is now constructed without
    validation (the same technique RegressionConfig.create_default uses);
    a caller-supplied config is still validated immediately.
    """
    if config is None:
        default = RegressionConfig.__new__(RegressionConfig)
        default.data = DataConfig()
        default.models = ModelConfig()
        default.training = TrainingConfig()
        default.output = OutputConfig()
        self.config = default
    else:
        self.config = config
        self._validate_config()
    self.models = {}        # registered model specs: key -> {'model', 'name', 'params'?}
    self.scalers = {}       # fitted scalers, keyed by what they scale ('X')
    self.best_params = {}   # best hyperparameters per tuned model
    self.results = {}       # per-model training results and metrics
    self.data = None        # raw loaded DataFrame
    self.X = None
    self.y = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    # Attach the plotting helper (defined elsewhere in this module)
    self.visualizer = RegressionVisualizer(self)
def update_config(self, config: RegressionConfig):
    """
    Swap in a new configuration (hook for dynamic GUI reconfiguration).

    Parameters:
        config (RegressionConfig): the replacement configuration; it is
            validated immediately and a ValueError is raised if invalid.
    """
    self.config = config
    self._validate_config()
def _validate_config(self):
    """Delegate to the config's own checks, re-raising with added context."""
    try:
        self.config._validate_parameters()
    except ValueError as err:
        raise ValueError(f"Configuration validation failed: {err}")
def _parse_column_range(self, column_range, total_columns):
    """
    Parse a column-range specification into a list of column indices.

    Parameters:
        column_range (str | int | list | tuple): e.g. "0:5", "2,4,6", [0, 1, "3:5"],
            or a single (possibly negative) index.
        total_columns (int): total number of columns available.

    Returns:
        list: unique column indices in ascending order (previously the order
            of `list(set(...))` was unspecified; sorting makes it deterministic).

    Raises:
        ValueError: for out-of-range indices or an unsupported spec type.
    """
    if isinstance(column_range, (int, np.integer)):
        # Single index. Negative values now wrap like everywhere else
        # (previously only the string/list branches supported them).
        return [self._normalize_column_index(int(column_range), total_columns)]
    if isinstance(column_range, str):
        # Comma-separated mix of "a:b" ranges and single indices.
        columns = []
        for part in column_range.split(','):
            columns.extend(self._expand_column_part(part.strip(), total_columns))
        return sorted(set(columns))
    if isinstance(column_range, (list, tuple)):
        # List of indices and/or range strings; shares the string parser.
        columns = []
        for item in column_range:
            if isinstance(item, str):
                columns.extend(self._expand_column_part(item.strip(), total_columns))
            else:
                columns.append(self._normalize_column_index(int(item), total_columns))
        return sorted(set(columns))
    raise ValueError(f"Unsupported column range format: {type(column_range)}")

def _expand_column_part(self, part, total_columns):
    """Expand one spec token: 'a:b' -> half-open range, 'n' -> single index."""
    if ':' in part:
        start_str, end_str = part.split(':')
        start = int(start_str.strip()) if start_str.strip() else 0
        end = int(end_str.strip()) if end_str.strip() else total_columns
        # Negative endpoints count from the end
        if start < 0:
            start = total_columns + start
        if end < 0:
            end = total_columns + end
        if start >= total_columns or end > total_columns:
            raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
        return list(range(start, end))
    return [self._normalize_column_index(int(part), total_columns)]

def _normalize_column_index(self, idx, total_columns):
    """Wrap a possibly-negative index and bounds-check it."""
    if idx < 0:
        idx = total_columns + idx
    if idx < 0 or idx >= total_columns:
        raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
    return idx
def load_csv(self, file_path, label_column, spectrum_columns=None, delimiter=',', header=0):
    """
    Load a CSV file and select the label column and spectrum (feature) columns.

    Parameters:
        file_path (str): path to the CSV file.
        label_column (str or int): label column; a column name, or a (possibly
            negative) column index. Numeric strings are accepted as indices.
        spectrum_columns (str or list or None): spectrum columns as a range
            spec such as "1:10" or "2,4,6-8", or a list of indices; None means
            every column except the label column.
        delimiter (str): field delimiter, ',' by default.
        header (int): header row number, 0 by default.

    Returns:
        bool: True on success, False on any failure (the error is printed,
            not raised).

    Side effects: sets self.data, self.X, self.y and self.feature_names;
    rows with a missing label are dropped.
    """
    try:
        # Read the CSV file
        self.data = pd.read_csv(file_path, delimiter=delimiter, header=header)
        total_columns = len(self.data.columns)
        # Resolve the label column: try it as a column *name* first, then as an index
        if isinstance(label_column, str) and label_column in self.data.columns:
            # Valid column name
            label_idx = self.data.columns.get_loc(label_column)
            self.y = self.data[label_column].values
        else:
            # Fall back to treating it as a column index
            try:
                if isinstance(label_column, str):
                    # Possibly a numeric string; convert to int
                    label_column = int(label_column)
                label_idx = label_column
                if label_idx < 0:
                    # Negative index counts from the end
                    label_idx = total_columns + label_idx
                if label_idx < 0 or label_idx >= total_columns:
                    # NOTE(review): this ValueError is caught by the except
                    # below and re-reported as the generic "Invalid label
                    # column" message, so the range detail is lost.
                    raise ValueError(f"Column index {label_column} out of range [0, {total_columns-1}]")
                self.y = self.data.iloc[:, label_idx].values
            except (ValueError, TypeError):
                raise ValueError(f"Invalid label column specification: {label_column}. Must be a valid column name or index.")
        # Determine the spectrum columns
        if spectrum_columns is None:
            # Default: every column except the label column
            spectrum_indices = [i for i in range(total_columns) if i != label_idx]
        else:
            # Parse the user-supplied range spec
            spectrum_indices = self._parse_column_range(spectrum_columns, total_columns)
            # Drop the label column if the range happened to include it
            spectrum_indices = [i for i in spectrum_indices if i != label_idx]
        if not spectrum_indices:
            raise ValueError("No valid spectrum columns found")
        # Extract the spectrum matrix
        self.X = self.data.iloc[:, spectrum_indices].values
        # Drop rows whose label is missing
        valid_mask = ~pd.isna(self.y)
        original_samples = len(self.y)
        self.X = self.X[valid_mask]
        self.y = self.y[valid_mask]
        self.feature_names = [self.data.columns[i] for i in spectrum_indices]
        skipped_samples = original_samples - len(self.y)
        print(f"Successfully loaded data: {self.X.shape[0]} samples, {self.X.shape[1]} features")
        print(f"Label column: {label_idx} ({self.data.columns[label_idx]})")
        print(f"Spectrum column range: {min(spectrum_indices)}-{max(spectrum_indices)}")
        if skipped_samples > 0:
            print(f"Rows skipped due to missing labels: {skipped_samples}")
        print(f"Label range: {self.y.min():.4f} - {self.y.max():.4f}")
        print(f"Data type check: X type {self.X.dtype}, y type {self.y.dtype}")
        # Check and process data types
        if self.X.dtype != np.float64:
            self.X = self.X.astype(np.float64)
        if self.y.dtype != np.float64:
            self.y = self.y.astype(np.float64)
        return True
    except Exception as e:
        print(f"Failed to load data: {str(e)}")
        return False
def preprocess_data(self, test_size=None, random_state=None, scale_method=None):
    """
    Preprocess the loaded data: train/test split, then feature scaling.

    Parameters:
        test_size (float, optional): test fraction; None -> config value.
        random_state (int, optional): split seed; None -> config value.
        scale_method (str, optional): 'standard' or 'minmax'; None -> config value.

    Returns:
        bool: True on success, False on failure (error printed).
    """
    cfg = self.config.data
    test_size = cfg.test_size if test_size is None else test_size
    random_state = cfg.random_state if random_state is None else random_state
    scale_method = cfg.scale_method if scale_method is None else scale_method
    scaler_factories = {'standard': StandardScaler, 'minmax': MinMaxScaler}
    try:
        # Split, then scale (fit on train only to avoid leakage)
        split = train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = split
        if scale_method not in scaler_factories:
            raise ValueError("scale_method must be 'standard' or 'minmax'")
        self.scalers['X'] = scaler_factories[scale_method]()
        self.X_train_scaled = self.scalers['X'].fit_transform(self.X_train)
        self.X_test_scaled = self.scalers['X'].transform(self.X_test)
        print("Data preprocessing completed:")
        print(f"Training set: {self.X_train.shape[0]} samples")
        print(f"Test set: {self.X_test.shape[0]} samples")
        return True
    except Exception as e:
        print(f"Data preprocessing failed: {str(e)}")
        return False
def add_linear_models(self):
    """Register the linear-family regressors: plain, LASSO, ridge, elastic net, Bayesian ridge."""
    alpha_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    self.models.update({
        'linear': {
            'model': LinearRegression(),
            'name': '多元线性回归',
        },
        'lasso': {
            'model': Lasso(random_state=42),
            'name': 'LASSO回归',
            'params': {'alpha': list(alpha_grid)},
        },
        'ridge': {
            'model': Ridge(random_state=42),
            'name': '岭回归',
            'params': {'alpha': list(alpha_grid)},
        },
        'elasticnet': {
            'model': ElasticNet(random_state=42),
            'name': '弹性网络回归',
            'params': {
                'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
                'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            },
        },
        'bayesianridge': {
            'model': BayesianRidge(),
            'name': '贝叶斯岭回归',
        },
    })
def add_boosting_models(self):
    """Register the boosting regressors: gradient boosting (LSBoost), XGBoost, LightGBM."""
    estimators_grid = [50, 100, 200]
    lr_grid = [0.01, 0.1, 0.2]
    depth_grid = [3, 5, 7]
    subsample_grid = [0.8, 0.9, 1.0]
    self.models.update({
        'lsboost': {
            'model': GradientBoostingRegressor(random_state=42),
            'name': 'LSBoost回归',
            'params': {
                'n_estimators': list(estimators_grid),
                'learning_rate': list(lr_grid),
                'max_depth': list(depth_grid),
                'subsample': list(subsample_grid),
            },
        },
        'xgboost': {
            'model': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
            'name': 'XGBoost回归',
            'params': {
                'n_estimators': list(estimators_grid),
                'learning_rate': list(lr_grid),
                'max_depth': list(depth_grid),
                'subsample': list(subsample_grid),
                'colsample_bytree': [0.8, 0.9, 1.0],
            },
        },
        'lightgbm': {
            'model': lgb.LGBMRegressor(random_state=42),
            'name': 'LightGBM回归',
            'params': {
                'n_estimators': list(estimators_grid),
                'learning_rate': list(lr_grid),
                'max_depth': list(depth_grid),
                'subsample': list(subsample_grid),
                'colsample_bytree': [0.8, 0.9, 1.0],
            },
        },
    })
def add_kernel_models(self):
    """Register the kernel-based regressors: Gaussian process, RBF-SVM, generic SVM."""
    # Constant * RBF kernel with broad length-scale / amplitude bounds
    gp_kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
    self.models.update({
        'gaussian': {
            'model': GaussianProcessRegressor(kernel=gp_kernel, random_state=42),
            'name': '高斯过程回归',
        },
        'gaussiansvm': {
            'model': SVR(kernel='rbf'),
            'name': '高斯核SVM回归',
            'params': {
                'C': [0.1, 1.0, 10.0, 100.0],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1.0],
            },
        },
        'svm': {
            'model': SVR(),
            'name': 'SVM回归',
            'params': {
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [0.1, 1.0, 10.0, 100.0],
                'gamma': ['scale', 'auto'],
            },
        },
    })
def add_neural_networks(self, training_config: Optional[TrainingConfig] = None):
    """Register the neural regressors (ELM, MLP, LSTM, GRU); LSTM/GRU pick up training_config."""
    recurrent_grid = {
        'units': [32, 64, 128],
        'dropout': [0.1, 0.2, 0.3],
        'epochs': [50, 100, 200],
    }
    self.models.update({
        # True extreme-learning-machine implementation
        'elm': {
            'model': ExtremeLearningMachine(random_state=42),
            'name': 'ELM回归',
            'params': {
                'n_hidden': [50, 100, 200, 500],
                'activation': ['sigmoid', 'tanh', 'relu'],
            },
        },
        'mlp': {
            'model': MLPRegressor(random_state=42, max_iter=1000),
            'name': 'BP/MLP回归',
            'params': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'activation': ['relu', 'tanh'],
                'learning_rate_init': [0.001, 0.01, 0.1],
                'alpha': [0.0001, 0.001, 0.01],
            },
        },
        # Sequence models: spectra as sequences, one band per time step
        'lstm': {
            'model': LSTMRegressor(random_state=42, config=training_config),
            'name': 'LSTM回归',
            'params': dict(recurrent_grid),
        },
        'gru': {
            'model': GRURegressor(random_state=42, config=training_config),
            'name': 'GRU回归',
            'params': dict(recurrent_grid),
        },
    })
def add_specialized_models(self):
    """Register GAM, decision tree, random forest, extra trees and AdaBoost regressors.

    BUG FIX: the tree grids used max_features='auto', which was deprecated and
    then removed for tree regressors in scikit-learn 1.3, making every grid
    search over these models fail. None (use all features) is the equivalent
    of the old 'auto' for regressors.
    """
    # GAM regression (true generalized additive model)
    self.models['gam'] = {
        'model': GeneralizedAdditiveModel(),
        'name': 'GAM回归',
        'params': {
            'n_splines': [5, 10, 15, 20],
            'degree': [3, 4],
            'lambda_': [0.001, 0.01, 0.1, 1.0]
        }
    }
    # Decision tree regression
    self.models['decisiontree'] = {
        'model': DecisionTreeRegressor(random_state=42),
        'name': '决策树回归',
        'params': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    }
    # Random forest regression
    self.models['randomforest'] = {
        'model': RandomForestRegressor(random_state=42),
        'name': '随机森林回归',
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    }
    # Extremely randomized trees regression
    self.models['extratrees'] = {
        'model': ExtraTreesRegressor(random_state=42),
        'name': '极端随机树回归',
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    }
    # AdaBoost regression
    self.models['adaboost'] = {
        'model': AdaBoostRegressor(random_state=42),
        'name': 'AdaBoost回归',
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0],
            'loss': ['linear', 'square', 'exponential']
        }
    }
def initialize_all_models(self, use_config: bool = True):
    """Register every model family; when use_config is True the neural nets
    receive the analyzer's TrainingConfig."""
    self.add_linear_models()
    self.add_boosting_models()
    self.add_kernel_models()
    training_cfg = self.config.training if (use_config and hasattr(self, 'config')) else None
    self.add_neural_networks(training_cfg)
    self.add_specialized_models()
    print(f"Initialized {len(self.models)} regression models")
def get_available_models(self):
    """Return a mapping of each registered model key to its display name."""
    available = {}
    for key, info in self.models.items():
        available[key] = info['name']
    return available
def hyperparameter_tuning(self, model_name, method=None, cv=None, n_iter=None):
    """
    Tune one registered model's hyperparameters over its declared grid.

    Parameters:
        model_name (str): registry key of the model to tune.
        method (str, optional): 'grid' or 'random'; None -> config value.
        cv (int, optional): CV folds; None -> config value.
        n_iter (int, optional): random-search iterations; None -> config value.

    Returns:
        bool: True when tuning succeeded (best estimator stored back into the
            registry and self.best_params), False otherwise.
    """
    model_cfg = self.config.models
    method = model_cfg.tuning_method if method is None else method
    cv = model_cfg.cv_folds if cv is None else cv
    n_iter = model_cfg.random_search_iter if n_iter is None else n_iter
    if model_name not in self.models:
        print(f"Model '{model_name}' does not exist")
        return False
    entry = self.models[model_name]
    if 'params' not in entry:
        print(f"Model '{model_name}' has no tunable parameters")
        return False
    print(f"Starting hyperparameter tuning for model: {entry['name']}")
    shared_kwargs = dict(cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    if method == 'grid':
        search = GridSearchCV(entry['model'], entry['params'], **shared_kwargs)
    elif method == 'random':
        search = RandomizedSearchCV(entry['model'], entry['params'],
                                    n_iter=n_iter, random_state=42, **shared_kwargs)
    else:
        print("Tuning method must be 'grid' or 'random'")
        return False
    try:
        search.fit(self.X_train_scaled, self.y_train)
    except Exception as e:
        print(f"Tuning failed: {str(e)}")
        return False
    # Keep the winning estimator and its parameters
    self.best_params[model_name] = search.best_params_
    self.models[model_name]['model'] = search.best_estimator_
    print(f"Best parameters: {search.best_params_}")
    print(f"Best score: {-search.best_score_:.4f}")
    return True
def train_model(self, model_name):
    """Fit one registered model and record its predictions and metrics.

    Returns:
        bool: True on success; False when the model is unknown or fitting fails.
    """
    entry = self.models.get(model_name)
    if entry is None:
        print(f"Model '{model_name}' does not exist")
        return False
    try:
        estimator = entry['model']
        print(f"Training model: {entry['name']}")
        # Fit on the scaled training split.
        estimator.fit(self.X_train_scaled, self.y_train)
        # Predict both splits for train/test metric comparison.
        train_pred = estimator.predict(self.X_train_scaled)
        test_pred = estimator.predict(self.X_test_scaled)
        metrics = self.calculate_metrics(self.y_train, train_pred, self.y_test, test_pred)
        self.results[model_name] = {
            'model': estimator,
            'metrics': metrics,
            'y_pred_train': train_pred,
            'y_pred_test': test_pred
        }
        print(f"{entry['name']} training completed")
        print(f"Training R²: {metrics['train_r2']:.4f}, Test R²: {metrics['test_r2']:.4f}")
        return True
    except Exception as e:
        print(f"Training failed: {str(e)}")
        return False
def train_all_models(self, tune_hyperparams=False, tuning_method='grid'):
    """Train every currently registered model, optionally tuning each first."""
    names = list(self.models.keys())
    print(f"Training {len(names)} models: {', '.join(names)}")
    for name in names:
        # Only models exposing a parameter grid can be tuned.
        if tune_hyperparams and 'params' in self.models[name]:
            self.hyperparameter_tuning(name, method=tuning_method)
        self.train_model(name)
def calculate_metrics(self, y_train, y_pred_train, y_test, y_pred_test):
    """Compute MSE/RMSE/MAE/R² for both the training and the test split.

    Returns:
        dict: keys '{train,test}_{mse,rmse,mae,r2}'.
    """
    def _split_metrics(prefix, y_true, y_pred):
        # RMSE is derived from MSE rather than recomputed.
        mse = mean_squared_error(y_true, y_pred)
        return {
            f'{prefix}_mse': mse,
            f'{prefix}_rmse': np.sqrt(mse),
            f'{prefix}_mae': mean_absolute_error(y_true, y_pred),
            f'{prefix}_r2': r2_score(y_true, y_pred),
        }

    metrics = {}
    metrics.update(_split_metrics('train', y_train, y_pred_train))
    metrics.update(_split_metrics('test', y_test, y_pred_test))
    return metrics
def plot_results(self, save_path=None, plot_type='comprehensive'):
    """Dispatch result plotting to the visualizer.

    Parameters:
        save_path (str, optional): output path forwarded to single-chart plots.
        plot_type (str): one of 'basic', 'comprehensive', 'prediction',
            'residual', 'metrics', 'error_dist', 'ranking'.
    """
    if not self.results:
        print("No training results to plot")
        return
    if plot_type == 'comprehensive':
        # Full multi-chart report written to the configured plot directory.
        self.visualizer.generate_comprehensive_report(
            save_dir=self.config.output.plot_dir,
            prefix='regression_analysis'
        )
        return
    if plot_type == 'basic':
        # Legacy four-panel comparison chart.
        self._plot_basic_comparison(save_path)
        return
    dispatch = {
        'prediction': self.visualizer.plot_prediction_scatter,
        'residual': self.visualizer.plot_residual_analysis,
        'metrics': self.visualizer.plot_metrics_comparison,
        'error_dist': self.visualizer.plot_error_distribution,
        'ranking': self.visualizer.plot_model_ranking_matrix,
    }
    plotter = dispatch.get(plot_type)
    if plotter is None:
        print(f"Unknown plot type: {plot_type}")
        return
    plotter(save_path=save_path)
def _plot_basic_comparison(self, save_path=None):
    """Draw the legacy four-panel comparison figure (kept for backward compatibility).

    Panels: grouped R² bars, grouped RMSE bars, predicted-vs-actual
    scatter (test set), and residual scatter (test set).

    Parameters:
        save_path (str, optional): if given, the figure is saved there as PNG.
    """
    # Collect display names and train/test metrics per trained model.
    model_names = []
    train_r2 = []
    test_r2 = []
    train_rmse = []
    test_rmse = []
    for model_name, result in self.results.items():
        model_names.append(self.models[model_name]['name'])
        train_r2.append(result['metrics']['train_r2'])
        test_r2.append(result['metrics']['test_r2'])
        train_rmse.append(result['metrics']['train_rmse'])
        test_rmse.append(result['metrics']['test_rmse'])
    # 2x2 panel layout.
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
    # R² Score Comparison: train vs test bars side by side.
    x = np.arange(len(model_names))
    width = 0.35
    ax1.bar(x - width/2, train_r2, width, label='Training Set', alpha=0.8)
    ax1.bar(x + width/2, test_r2, width, label='Test Set', alpha=0.8)
    ax1.set_xlabel('Model')
    ax1.set_ylabel('R² Score')
    ax1.set_title('R² Score Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(model_names, rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    # RMSE Comparison: train vs test bars side by side.
    ax2.bar(x - width/2, train_rmse, width, label='Training Set', alpha=0.8)
    ax2.bar(x + width/2, test_rmse, width, label='Test Set', alpha=0.8)
    ax2.set_xlabel('Model')
    ax2.set_ylabel('RMSE')
    ax2.set_title('RMSE Comparison')
    ax2.set_xticks(x)
    ax2.set_xticklabels(model_names, rotation=45, ha='right')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    # Predicted vs Actual Values Scatter Plot (Test Set), one color per model.
    colors = plt.cm.tab10(np.linspace(0, 1, len(self.results)))
    for i, (model_name, result) in enumerate(self.results.items()):
        ax3.scatter(self.y_test, result['y_pred_test'], alpha=0.6, color=colors[i],
                    label=self.models[model_name]['name'], s=20)
    # Diagonal marks perfect prediction.
    ax3.plot([self.y_test.min(), self.y_test.max()], [self.y_test.min(), self.y_test.max()],
             'k--', linewidth=2, label='Perfect Prediction')
    ax3.set_xlabel('Actual Values')
    ax3.set_ylabel('Predicted Values')
    ax3.set_title('Predicted vs Actual Values (Test Set)')
    ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax3.grid(True, alpha=0.3)
    # Residual Plot: residuals against predictions for each model.
    for i, (model_name, result) in enumerate(self.results.items()):
        residuals = self.y_test - result['y_pred_test']
        ax4.scatter(result['y_pred_test'], residuals, alpha=0.6, color=colors[i],
                    label=self.models[model_name]['name'], s=20)
    ax4.axhline(y=0, color='k', linestyle='--', linewidth=2)
    ax4.set_xlabel('Predicted Values')
    ax4.set_ylabel('Residuals')
    ax4.set_title('Residual Plot (Test Set)')
    ax4.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax4.grid(True, alpha=0.3)
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Chart saved to: {save_path}")
    # plt.show()
def plot_prediction_scatter(self, save_path=None, **kwargs):
    """Forward to the visualizer's predicted-vs-true scatter plot."""
    viz = self.visualizer
    viz.plot_prediction_scatter(save_path=save_path, **kwargs)
def plot_residual_analysis(self, save_path=None, **kwargs):
    """Forward to the visualizer's residual analysis plot."""
    viz = self.visualizer
    viz.plot_residual_analysis(save_path=save_path, **kwargs)
def plot_metrics_comparison(self, save_path=None, **kwargs):
    """Forward to the visualizer's metrics comparison plot."""
    viz = self.visualizer
    viz.plot_metrics_comparison(save_path=save_path, **kwargs)
def plot_error_distribution(self, save_path=None, **kwargs):
    """Forward to the visualizer's error distribution plot."""
    viz = self.visualizer
    viz.plot_error_distribution(save_path=save_path, **kwargs)
def plot_model_ranking(self, save_path=None, **kwargs):
    """Forward to the visualizer's model-ranking matrix plot."""
    viz = self.visualizer
    viz.plot_model_ranking_matrix(save_path=save_path, **kwargs)
def generate_visualization_report(self, save_dir=None, prefix=None):
    """Generate the full visualization report; falsy args fall back to defaults."""
    target_dir = save_dir or self.config.output.plot_dir
    name_prefix = prefix or 'regression_analysis'
    return self.visualizer.generate_comprehensive_report(save_dir=target_dir, prefix=name_prefix)
def save_model(self, model_name, save_dir='models'):
    """Persist a trained model, its scaler and a JSON metadata file.

    Parameters:
        model_name (str): key of a model with training results.
        save_dir (str): directory for the output files (created if missing).

    Returns:
        bool: True on success, False when there are no results or saving fails.
    """
    if model_name not in self.results:
        print(f"Model '{model_name}' has no training results")
        return False
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Timestamp groups the three files produced by one save call.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    model_path = os.path.join(save_dir, f'{model_name}_{stamp}.pkl')
    scaler_path = os.path.join(save_dir, f'scaler_{stamp}.pkl')
    info_path = os.path.join(save_dir, f'info_{model_name}_{stamp}.json')
    try:
        # Serialized estimator + feature scaler.
        joblib.dump(self.results[model_name]['model'], model_path)
        joblib.dump(self.scalers['X'], scaler_path)
        # Human-readable metadata alongside the binary artifacts.
        metadata = {
            'model_name': model_name,
            'full_name': self.models[model_name]['name'],
            'timestamp': stamp,
            'metrics': self.results[model_name]['metrics'],
            'best_params': self.best_params.get(model_name, {}),
            'feature_names': self.feature_names
        }
        with open(info_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4, ensure_ascii=False)
        print(f"Model saved:")
        print(f" Model file: {model_path}")
        print(f" Scaler: {scaler_path}")
        print(f" Info file: {info_path}")
        return True
    except Exception as e:
        print(f"Save failed: {str(e)}")
        return False
def save_all_models(self, save_dir='models'):
    """Persist every model that has training results."""
    for name in self.results:
        self.save_model(name, save_dir)
def load_model(self, model_path, scaler_path=None):
    """Load a serialized model (and optional scaler) from disk.

    Returns:
        tuple: (model, scaler) on success; (None, None) on failure.
        scaler is None when no scaler path was given.
    """
    try:
        model = joblib.load(model_path)
        scaler = joblib.load(scaler_path) if scaler_path else None
        return model, scaler
    except Exception as e:
        print(f"Load failed: {str(e)}")
        return None, None
def print_summary(self):
    """Print a formatted performance table, ordered by test R² (best first).

    Fix: the header and row format strings had been corrupted to the
    literal "|30", so no metric values were ever printed; they are
    reconstructed here as aligned fixed-width columns.
    """
    if not self.results:
        print("No training results")
        return
    print("\n" + "="*80)
    print("Regression Model Performance Summary")
    print("="*80)
    # Header row: model name plus the four headline metrics.
    header = (f"{'Model':<30}{'Train R²':>12}{'Test R²':>12}"
              f"{'Train RMSE':>13}{'Test RMSE':>13}")
    print(header)
    print("-"*80)
    # One row per model, best test-set R² first.
    for model_name, result in sorted(self.results.items(),
                                     key=lambda x: x[1]['metrics']['test_r2'], reverse=True):
        metrics = result['metrics']
        model_full_name = self.models[model_name]['name']
        print(f"{model_full_name:<30}{metrics['train_r2']:>12.4f}{metrics['test_r2']:>12.4f}"
              f"{metrics['train_rmse']:>13.4f}{metrics['test_rmse']:>13.4f}")
    print("-"*80)
    print("Note: R² closer to 1 is better, RMSE/MAE smaller is better")
    print("="*80)
def run_analysis_from_config(self) -> bool:
    """Run the complete configuration-driven analysis pipeline.

    Recommended entry point for GUI integration.

    Returns:
        bool: True when the whole pipeline completed; False when data
        loading or preprocessing failed.
    """
    print("Starting regression analysis from configuration...")
    data_cfg = self.config.data
    # 1. Load data
    if not self.load_csv(data_cfg.csv_path, data_cfg.label_column,
                         data_cfg.spectrum_columns):
        return False
    # 2. Preprocess
    if not self.preprocess_data():
        return False
    # 3. Build the model registry
    self.initialize_all_models(use_config=True)
    # 4. Restrict to explicitly requested models, if any were named
    requested = self.config.models.model_names
    if requested is not None:
        missing = [name for name in requested if name not in self.models]
        if missing:
            print(f"Warning: The following models do not exist: {missing}")
        keep = [name for name in requested if name in self.models]
        self.models = {name: self.models[name] for name in keep}
        print(f"Filtered to {len(self.models)} specified models")
    # 5. Train (with optional hyperparameter tuning)
    self.train_all_models(tune_hyperparams=self.config.models.tune_hyperparams,
                          tuning_method=self.config.models.tuning_method)
    # 6. Summary table
    self.print_summary()
    # 7. Optionally persist the trained models
    if self.config.output.save_models:
        self.save_all_models(save_dir=self.config.output.save_dir)
    # 8. Optionally generate the visualization report
    if self.config.output.plot_results:
        os.makedirs(self.config.output.plot_dir, exist_ok=True)
        self.generate_visualization_report(
            save_dir=self.config.output.plot_dir,
            prefix='regression_analysis'
        )
    print("Analysis completed!")
    return True
def run_complete_analysis(self, csv_path=None, label_column=None, spectrum_columns=None,
                          test_size=None, scale_method=None, tune_hyperparams=None,
                          tuning_method=None, save_models=None, plot_results=None,
                          model_names=None):
    """Backward-compatible wrapper around the configuration-driven pipeline.

    Every non-None keyword argument overrides the corresponding field of
    the stored configuration before running the analysis.

    Parameters mirror the legacy interface: csv_path, label_column,
    spectrum_columns, test_size, scale_method (data section);
    tune_hyperparams, tuning_method, model_names (models section);
    save_models, plot_results (output section).

    Returns:
        The result of run_analysis_from_config().
    """
    # (config section, attribute name, candidate value) per legacy argument.
    overrides = (
        (self.config.data, 'csv_path', csv_path),
        (self.config.data, 'label_column', label_column),
        (self.config.data, 'spectrum_columns', spectrum_columns),
        (self.config.data, 'test_size', test_size),
        (self.config.data, 'scale_method', scale_method),
        (self.config.models, 'tune_hyperparams', tune_hyperparams),
        (self.config.models, 'tuning_method', tuning_method),
        (self.config.output, 'save_models', save_models),
        (self.config.output, 'plot_results', plot_results),
        (self.config.models, 'model_names', model_names),
    )
    for section, attr, value in overrides:
        if value is not None:
            setattr(section, attr, value)
    # Delegate to the configuration-driven implementation.
    return self.run_analysis_from_config()
class RegressionVisualizer:
    """
    Visualization toolkit for regression results.

    Produces predicted-vs-true scatter plots, residual diagnostics,
    metric comparisons, error-distribution views and model-ranking
    charts from a RegressionAnalyzer's results.
    """
    def __init__(self, analyzer: Optional['RegressionAnalyzer'] = None):
        """
        Initialize the visualizer.

        Parameters:
            analyzer (RegressionAnalyzer, optional): analyzer whose results
                will be plotted; may also be attached later.
        """
        self.analyzer = analyzer
        # Colorblind-friendly categorical palette (hex colors), cycled
        # by index across all chart types.
        self.colorblind_friendly_palette = [
            '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
            '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
            '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5'
        ]
        plt.style.use('seaborn-v0_8')
        # Configure matplotlib so Chinese labels render correctly.
        self._configure_chinese_font()
def _configure_chinese_font(self):
    """Configure matplotlib so Chinese text renders correctly.

    Probes a list of candidate fonts by rendering a short Chinese sample
    and prepends the first usable one to ``font.sans-serif``. Falls back
    to default fonts when none succeeds.

    Fix: the probe's bare ``except:`` is narrowed to ``Exception`` so
    KeyboardInterrupt/SystemExit are no longer swallowed.

    NOTE(review): ``ax.text`` with an unknown font name typically only
    warns rather than raises, so this probe may accept fonts that are
    not actually installed — verify on the target system.
    """
    import matplotlib as mpl
    candidate_fonts = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans', 'Arial Unicode MS',
                       'WenQuanYi Micro Hei', 'AR PL UMing CN', 'Liberation Serif']
    for font in candidate_fonts:
        try:
            # Probe the font by rendering a short Chinese sample.
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "测试中文", fontname=font, fontsize=12)
            plt.close(fig)
        except Exception:
            continue
        # Probe succeeded: put this font first in the fallback chain.
        mpl.rcParams['font.sans-serif'] = [font] + mpl.rcParams['font.sans-serif']
        mpl.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
        print(f"Successfully set Chinese font to: {font}")
        return
    # No candidate worked: use defaults and still fix minus-sign rendering.
    mpl.rcParams['font.sans-serif'] = ['DejaVu Sans', 'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    print("Warning: Could not find suitable Chinese font. Using default fonts.")
def _ensure_chinese_text(self, text):
"""确保文本正确显示中文"""
if isinstance(text, str):
try:
# 尝试编码和解码以确保UTF-8格式
return text.encode('utf-8').decode('utf-8')
except:
return text
return text
def set_colorblind_palette(self):
    """Install the colorblind-friendly palette as matplotlib's color cycle."""
    import matplotlib as mpl
    palette = self.colorblind_friendly_palette
    mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=palette)
def plot_prediction_scatter(self, figsize=(16, 12), save_path=None, show_individual=True, show_overlay=True):
    """Plot predicted vs. true test-set values for every trained model.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): if given, the figure is saved there as PNG.
        show_individual (bool): draw one small subplot per model.
        show_overlay (bool): draw one combined scatter with all models.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return
    self.set_colorblind_palette()
    n_models = len(self.analyzer.results)
    if show_individual and show_overlay:
        # Composite layout: grid of per-model subplots on top,
        # one full-width overlay axes on the bottom row.
        fig = plt.figure(figsize=figsize)
        # Top: one subplot per model, up to 4 per row.
        n_cols = min(4, n_models)
        n_rows = (n_models + n_cols - 1) // n_cols
        gs = fig.add_gridspec(n_rows + 1, n_cols, hspace=0.3, wspace=0.3)
        axes_scatter = []
        for i in range(n_rows):
            for j in range(n_cols):
                if i * n_cols + j < n_models:
                    axes_scatter.append(fig.add_subplot(gs[i, j]))
        # Bottom: overlay axes spanning all columns.
        ax_overlay = fig.add_subplot(gs[n_rows, :])
    elif show_individual:
        # Per-model subplots only.
        n_cols = min(4, n_models)
        n_rows = (n_models + n_cols - 1) // n_cols
        fig, axes_scatter = plt.subplots(n_rows, n_cols, figsize=figsize)
        if n_models == 1:
            axes_scatter = [axes_scatter]
        else:
            axes_scatter = axes_scatter.flatten()
        ax_overlay = None
    elif show_overlay:
        # Overlay axes only.
        fig, ax_overlay = plt.subplots(1, 1, figsize=(10, 8))
        axes_scatter = []
    else:
        print("At least one of show_individual or show_overlay must be True")
        return
    # Per-model subplots.
    if show_individual:
        for idx, (model_name, result) in enumerate(self.analyzer.results.items()):
            if idx < len(axes_scatter):
                ax = axes_scatter[idx]
                y_true = self.analyzer.y_test
                y_pred = result['y_pred_test']
                # Prediction scatter, one palette color per model.
                ax.scatter(y_true, y_pred, alpha=0.6, s=30, color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)])
                # 45-degree reference line (perfect prediction).
                min_val = min(y_true.min(), y_pred.min())
                max_val = max(y_true.max(), y_pred.max())
                ax.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7)
                # Annotate with test-set R² and RMSE.
                r2 = result['metrics']['test_r2']
                rmse = result['metrics']['test_rmse']
                ax.text(0.05, 0.95, f'R² = {r2:.3f}\nRMSE = {rmse:.3f}',
                        transform=ax.transAxes, fontsize=10,
                        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
                ax.set_xlabel(self._ensure_chinese_text('True Values'))
                ax.set_ylabel(self._ensure_chinese_text('Predicted Values'))
                ax.set_title(self._ensure_chinese_text(f'{self.analyzer.models[model_name]["name"]}'))
                ax.grid(True, alpha=0.3)
                ax.axis('equal')
    # Overlay: all models on one axes.
    if show_overlay:
        for idx, (model_name, result) in enumerate(self.analyzer.results.items()):
            y_true = self.analyzer.y_test
            y_pred = result['y_pred_test']
            ax_overlay.scatter(y_true, y_pred, alpha=0.6, s=30,
                               color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)],
                               label=f'{self.analyzer.models[model_name]["name"]} (R²={result["metrics"]["test_r2"]:.3f})')
        # 45-degree reference line spanning all models' prediction ranges.
        min_val = min(self.analyzer.y_test.min(), min([r['y_pred_test'].min() for r in self.analyzer.results.values()]))
        max_val = max(self.analyzer.y_test.max(), max([r['y_pred_test'].max() for r in self.analyzer.results.values()]))
        ax_overlay.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7, label='Perfect Prediction')
        ax_overlay.set_xlabel(self._ensure_chinese_text('True Values'))
        ax_overlay.set_ylabel(self._ensure_chinese_text('Predicted Values'))
        ax_overlay.set_title(self._ensure_chinese_text('Predicted vs True Values - All Models Overlay'))
        ax_overlay.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_overlay.grid(True, alpha=0.3)
        ax_overlay.axis('equal')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Prediction scatter plot saved to: {save_path}")
    # plt.show()
def plot_residual_analysis(self, figsize=(16, 8), save_path=None, n_feature_plots=3):
    """Draw residual diagnostics for the top models by test R².

    For each selected model: residuals-vs-predicted scatter and a normal
    Q-Q plot; models exposing ``feature_importances_`` may also get a
    residuals-vs-top-features panel.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): if given, the figure is saved as PNG.
        n_feature_plots (int): controls how many models get a
            residuals-vs-feature panel.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return
    self.set_colorblind_palette()
    # Select up to the three best models (by test R²) for detailed analysis.
    sorted_models = sorted(self.analyzer.results.items(),
                           key=lambda x: x[1]['metrics']['test_r2'], reverse=True)
    top_models = sorted_models[:min(3, len(sorted_models))]
    fig, axes = plt.subplots(2, 3, figsize=figsize)
    for idx, (model_name, result) in enumerate(top_models):
        y_true = self.analyzer.y_test
        y_pred = result['y_pred_test']
        residuals = y_true - y_pred
        # Residuals vs predicted values.
        # NOTE(review): both branches reduce to axes[0, idx] — the
        # conditional is redundant but harmless.
        ax1 = axes[0, 0] if idx == 0 else axes[0, idx]
        ax1.scatter(y_pred, residuals, alpha=0.6, s=20,
                    color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)])
        ax1.axhline(y=0, color='k', linestyle='--', linewidth=2)
        ax1.set_xlabel('Predicted Values')
        ax1.set_ylabel('Residuals')
        ax1.set_title(f'Residuals vs Predicted\n{self.analyzer.models[model_name]["name"]}')
        ax1.grid(True, alpha=0.3)
        # Normal Q-Q plot of the residuals.
        ax2 = axes[1, 0] if idx == 0 else axes[1, idx]
        stats.probplot(residuals, dist="norm", plot=ax2)
        ax2.set_title(f'Normal Q-Q Plot\n{self.analyzer.models[model_name]["name"]}')
        # Residuals vs most important features (feature-importance models only).
        if idx < n_feature_plots - 2 and hasattr(result['model'], 'feature_importances_'):
            # NOTE(review): with the guard above this expression only yields
            # axes[0, 2] (idx == 0) — confirm the intended layout before reuse.
            ax3 = axes[idx // 3 + 1, idx % 3 + 1] if idx > 0 else axes[0, 2]
            if idx < 2:  # only the first two models get a feature-residual panel
                try:
                    importances = result['model'].feature_importances_
                    top_features_idx = np.argsort(importances)[-2:]  # two most important features
                    for i, feat_idx in enumerate(top_features_idx):
                        feat_name = self.analyzer.feature_names[feat_idx] if hasattr(self.analyzer, 'feature_names') else f'Feature {feat_idx}'
                        ax3.scatter(self.analyzer.X_test[:, feat_idx], residuals,
                                    alpha=0.6, s=20, label=f'{feat_name}',
                                    color=self.colorblind_friendly_palette[(idx*2 + i) % len(self.colorblind_friendly_palette)])
                    ax3.axhline(y=0, color='k', linestyle='--', linewidth=2)
                    ax3.set_xlabel('Feature Values')
                    ax3.set_ylabel('Residuals')
                    ax3.set_title(f'Residuals vs Top Features\n{self.analyzer.models[model_name]["name"]}')
                    ax3.legend()
                    ax3.grid(True, alpha=0.3)
                except:  # NOTE(review): bare except silences all errors here — consider narrowing to Exception
                    ax3.text(0.5, 0.5, 'Feature importance\nnot available',
                             transform=ax3.transAxes, ha='center', va='center')
                    ax3.set_title(f'Feature Analysis\n{self.analyzer.models[model_name]["name"]}')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Residual analysis plot saved to: {save_path}")
    # plt.show()
def plot_metrics_comparison(self, figsize=(16, 10), save_path=None):
    """Draw a four-panel comparison of test-set metrics across models.

    Panels: normalized metric "radar" plot (drawn on a cartesian axes),
    grouped bars of R² / 1/RMSE / 1/MAE, a bias-variance stacked bar
    chart, and a bubble chart (R² vs 1/RMSE, bubble size ∝ 1/MAE).

    Fixes vs previous revision: the R² bar series had a garbled empty
    legend label (now 'R²'), and two unused arrays of simulated
    training-time/memory values (np.random based, never plotted) were removed.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): if given, the figure is saved as PNG.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return
    self.set_colorblind_palette()
    # Gather per-model test metrics.
    model_names = []
    model_full_names = []
    r2_scores = []
    rmse_scores = []
    mae_scores = []
    for model_name, result in self.analyzer.results.items():
        model_names.append(model_name)
        model_full_names.append(self.analyzer.models[model_name]['name'])
        r2_scores.append(result['metrics']['test_r2'])
        rmse_scores.append(result['metrics']['test_rmse'])
        mae_scores.append(result['metrics']['test_mae'])
    fig, axes = plt.subplots(2, 2, figsize=figsize)
    # --- Radar-style plot of normalized R², RMSE, MAE ---
    ax_radar = axes[0, 0]
    # Min-max normalize each metric to [0, 1]; RMSE/MAE are inverted so
    # that "higher is better" holds on all axes.
    # NOTE(review): this divides by zero when all models share the same
    # metric value — confirm inputs before relying on the radar panel.
    r2_norm = (r2_scores - np.min(r2_scores)) / (np.max(r2_scores) - np.min(r2_scores))
    rmse_norm = 1 - (rmse_scores - np.min(rmse_scores)) / (np.max(rmse_scores) - np.min(rmse_scores))
    mae_norm = 1 - (mae_scores - np.min(mae_scores)) / (np.max(mae_scores) - np.min(mae_scores))
    categories = ['R² Score', 'RMSE (inv)', 'MAE (inv)']
    n_models = len(model_names)
    # Evenly spaced angles, repeated first point closes the polygon.
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    angles += angles[:1]
    for i in range(n_models):
        values = [r2_norm[i], rmse_norm[i], mae_norm[i]]
        values += values[:1]  # close the polygon
        ax_radar.plot(angles, values, 'o-', linewidth=2,
                      color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                      label=model_full_names[i])
        ax_radar.fill(angles, values, alpha=0.25,
                      color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)])
    ax_radar.set_xticks(angles[:-1])
    ax_radar.set_xticklabels(categories)
    ax_radar.set_title('Performance Metrics Radar Chart')
    ax_radar.legend(bbox_to_anchor=(1.1, 1), loc='upper left')
    ax_radar.grid(True, alpha=0.3)
    # --- Grouped bars: R², 1/RMSE, 1/MAE ---
    ax_bar = axes[0, 1]
    x = np.arange(len(model_names))
    width = 0.25
    ax_bar.bar(x - width, r2_scores, width, label='R²', alpha=0.8,
               color=self.colorblind_friendly_palette[0])
    ax_bar.bar(x, [1/s for s in rmse_scores], width, label='1/RMSE', alpha=0.8,
               color=self.colorblind_friendly_palette[1])
    ax_bar.bar(x + width, [1/s for s in mae_scores], width, label='1/MAE', alpha=0.8,
               color=self.colorblind_friendly_palette[2])
    ax_bar.set_xlabel('Models')
    ax_bar.set_ylabel('Normalized Scores')
    ax_bar.set_title('Normalized Performance Comparison')
    ax_bar.set_xticks(x)
    ax_bar.set_xticklabels(model_full_names, rotation=45, ha='right')
    ax_bar.legend()
    ax_bar.grid(True, alpha=0.3, axis='y')
    # --- Stacked bars: bias vs variance of the test residuals ---
    ax_stack = axes[1, 0]
    residual_sets = [self.analyzer.y_test - result['y_pred_test']
                     for result in self.analyzer.results.values()]
    bias_errors = [abs(np.mean(res)) for res in residual_sets]
    variance_errors = [np.var(res) for res in residual_sets]
    ax_stack.bar(model_names, bias_errors, label='Bias (Mean Abs Error)', alpha=0.8,
                 color=self.colorblind_friendly_palette[0])
    ax_stack.bar(model_names, variance_errors, bottom=bias_errors,
                 label='Variance (Residual Var)', alpha=0.8,
                 color=self.colorblind_friendly_palette[1])
    ax_stack.set_xlabel('Models')
    ax_stack.set_ylabel('Error Components')
    ax_stack.set_title('Bias-Variance Decomposition')
    ax_stack.set_xticklabels(model_full_names, rotation=45, ha='right')
    ax_stack.legend()
    ax_stack.grid(True, alpha=0.3, axis='y')
    # --- Bubble chart: R² vs 1/RMSE, bubble size ∝ 1/MAE ---
    ax_bubble = axes[1, 1]
    bubble_sizes = [100 * (1/s) for s in mae_scores]  # larger MAE -> smaller bubble
    scatter = ax_bubble.scatter(r2_scores, [1/s for s in rmse_scores], s=bubble_sizes,
                                c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black')
    # Label each bubble with the model's display name.
    for i, name in enumerate(model_full_names):
        ax_bubble.annotate(name, (r2_scores[i], 1/rmse_scores[i]),
                           xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax_bubble.set_xlabel('R² Score')
    ax_bubble.set_ylabel('1/RMSE')
    ax_bubble.set_title('Comprehensive Performance Assessment\n(Bubble size ∝ 1/MAE)')
    ax_bubble.grid(True, alpha=0.3)
    cbar = plt.colorbar(scatter, ax=ax_bubble)
    cbar.set_label('Model Index')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Metrics comparison plot saved to: {save_path}")
    # plt.show()
def plot_error_distribution(self, figsize=(16, 8), save_path=None):
    """Draw four views of each model's test-set error distribution.

    Panels: shared-bin histogram, KDE (falls back to a density histogram
    when seaborn's KDE fails), empirical CDF, and a box plot with mean markers.

    Fixes vs previous revision: the KDE fallback used a bare ``except:``
    (now ``except Exception:``), and an unused ``error_range`` array was removed.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): if given, the figure is saved as PNG.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return
    self.set_colorblind_palette()
    # Per-model prediction errors on the test set.
    model_errors = {}
    for model_name, result in self.analyzer.results.items():
        errors = self.analyzer.y_test - result['y_pred_test']
        model_errors[model_name] = errors
    fig, axes = plt.subplots(2, 2, figsize=figsize)
    # Histogram with bins shared across models for comparability.
    ax_hist = axes[0, 0]
    bins = np.linspace(min([min(errors) for errors in model_errors.values()]),
                       max([max(errors) for errors in model_errors.values()]), 30)
    for i, (model_name, errors) in enumerate(model_errors.items()):
        ax_hist.hist(errors, bins=bins, alpha=0.7, label=self.analyzer.models[model_name]['name'],
                     color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                     density=True)
    ax_hist.set_xlabel('Prediction Error')
    ax_hist.set_ylabel('Density')
    ax_hist.set_title('Error Distribution Histogram')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)
    # Kernel density estimate per model.
    ax_kde = axes[0, 1]
    for i, (model_name, errors) in enumerate(model_errors.items()):
        try:
            sns.kdeplot(data=errors, ax=ax_kde, label=self.analyzer.models[model_name]['name'],
                        color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                        fill=True, alpha=0.3)
        except Exception:
            # Fall back to a density histogram when seaborn's KDE fails.
            ax_kde.hist(errors, bins=30, alpha=0.3, density=True,
                        label=self.analyzer.models[model_name]['name'],
                        color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)])
    ax_kde.set_xlabel('Prediction Error')
    ax_kde.set_ylabel('Density')
    ax_kde.set_title('Error Distribution KDE')
    ax_kde.legend()
    ax_kde.grid(True, alpha=0.3)
    # Empirical cumulative distribution of the errors.
    ax_cdf = axes[1, 0]
    for i, (model_name, errors) in enumerate(model_errors.items()):
        sorted_errors = np.sort(errors)
        y_vals = np.arange(len(sorted_errors)) / float(len(sorted_errors))
        ax_cdf.plot(sorted_errors, y_vals,
                    label=self.analyzer.models[model_name]['name'],
                    color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                    linewidth=2)
    ax_cdf.set_xlabel('Prediction Error')
    ax_cdf.set_ylabel('Cumulative Probability')
    ax_cdf.set_title('Cumulative Distribution Function')
    ax_cdf.legend()
    ax_cdf.grid(True, alpha=0.3)
    # Box plot with mean markers.
    ax_box = axes[1, 1]
    error_data = [errors for errors in model_errors.values()]
    model_labels = [self.analyzer.models[name]['name'] for name in model_errors.keys()]
    bp = ax_box.boxplot(error_data, labels=model_labels, patch_artist=True)
    for patch, color in zip(bp['boxes'], self.colorblind_friendly_palette):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    # Red dot marks the mean of each model's errors.
    for i, errors in enumerate(error_data):
        ax_box.plot(i+1, np.mean(errors), 'ro', markersize=8, label='Mean' if i == 0 else "")
    ax_box.set_xlabel('Models')
    ax_box.set_ylabel('Prediction Error')
    ax_box.set_title('Error Distribution Box Plot')
    ax_box.legend()
    ax_box.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Error distribution plot saved to: {save_path}")
    # plt.show()
def plot_model_ranking_matrix(self, figsize=(14, 10), save_path=None):
    """Rank models across six metrics and visualize the rankings.

    Panels: rank heatmap (lower = better), parallel-coordinates plot of
    min-max normalized metrics, bubble chart (R² vs RMSE, bubble size ∝ 1/MAE)
    and a bar chart of each model's average rank.

    Fixes vs previous revision: the first metric label was a garbled
    empty string (now 'R²', matching the test-set R² column), and the
    bar annotation printed the literal '.2f' instead of the formatted height.

    Parameters:
        figsize (tuple): figure size.
        save_path (str, optional): if given, the figure is saved as PNG.
    """
    if not self.analyzer or not self.analyzer.results:
        print("No analyzer results available for plotting")
        return
    self.set_colorblind_palette()
    # Metric matrix: rows = models; columns = test R²/RMSE/MAE then
    # training R²/RMSE/MAE (matching metric_names below).
    metrics_data = []
    model_names = []
    metric_names = ['R²', 'RMSE', 'MAE', 'Training_R²', 'Training_RMSE', 'Training_MAE']
    for model_name, result in self.analyzer.results.items():
        model_names.append(self.analyzer.models[model_name]['name'])
        metrics = result['metrics']
        metrics_data.append([
            metrics['test_r2'],
            metrics['test_rmse'],
            metrics['test_mae'],
            metrics['train_r2'],
            metrics['train_rmse'],
            metrics['train_mae']
        ])
    metrics_array = np.array(metrics_data)
    # Ranks: R² columns are reversed (higher is better); error columns
    # rank ascending (lower is better). Rank 1 = best.
    rankings = np.zeros_like(metrics_array)
    rankings[:, 0] = len(model_names) - stats.rankdata(metrics_array[:, 0]) + 1
    rankings[:, 1] = stats.rankdata(metrics_array[:, 1])
    rankings[:, 2] = stats.rankdata(metrics_array[:, 2])
    rankings[:, 3] = len(model_names) - stats.rankdata(metrics_array[:, 3]) + 1
    rankings[:, 4] = stats.rankdata(metrics_array[:, 4])
    rankings[:, 5] = stats.rankdata(metrics_array[:, 5])
    fig, axes = plt.subplots(2, 2, figsize=figsize)
    # --- Heatmap of ranks (model x metric) ---
    ax_heatmap = axes[0, 0]
    im = ax_heatmap.imshow(rankings, cmap='RdYlGn_r', aspect='auto', alpha=0.8)
    ax_heatmap.set_xticks(np.arange(len(metric_names)))
    ax_heatmap.set_yticks(np.arange(len(model_names)))
    ax_heatmap.set_xticklabels(metric_names, rotation=45, ha='right')
    ax_heatmap.set_yticklabels(model_names)
    # Print each rank value in its cell.
    for i in range(len(model_names)):
        for j in range(len(metric_names)):
            ax_heatmap.text(j, i, f'{rankings[i, j]:.0f}',
                            ha="center", va="center", color="black", fontsize=10)
    ax_heatmap.set_title('Model Ranking Matrix\n(Lower rank = Better performance)')
    plt.colorbar(im, ax=ax_heatmap, label='Rank')
    # --- Parallel coordinates over min-max normalized metrics ---
    ax_parallel = axes[0, 1]
    normalized_data = np.zeros_like(metrics_array)
    for j in range(metrics_array.shape[1]):
        if j in [0, 3]:  # R² columns: higher is better
            normalized_data[:, j] = (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min())
        else:  # error columns: lower is better, so invert after normalizing
            normalized_data[:, j] = 1 - (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min())
    for i in range(len(model_names)):
        ax_parallel.plot(range(len(metric_names)), normalized_data[i],
                         marker='o', linewidth=2, markersize=6,
                         color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                         label=model_names[i], alpha=0.8)
    ax_parallel.set_xticks(range(len(metric_names)))
    ax_parallel.set_xticklabels(metric_names, rotation=45, ha='right')
    ax_parallel.set_ylabel('Normalized Score (Higher = Better)')
    ax_parallel.set_title('Parallel Coordinates Plot')
    ax_parallel.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax_parallel.grid(True, alpha=0.3)
    # --- Bubble chart: R² vs RMSE, bubble size ∝ 1/MAE ---
    ax_bubble = axes[1, 0]
    r2_scores = metrics_array[:, 0]
    rmse_scores = metrics_array[:, 1]
    mae_scores = metrics_array[:, 2]
    bubble_sizes = 1000 / (mae_scores + 0.01)  # small offset avoids division by zero
    scatter = ax_bubble.scatter(r2_scores, rmse_scores, s=bubble_sizes,
                                c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black')
    for i, name in enumerate(model_names):
        ax_bubble.annotate(name, (r2_scores[i], rmse_scores[i]),
                           xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax_bubble.set_xlabel('R² Score')
    ax_bubble.set_ylabel('RMSE')
    ax_bubble.set_title('Performance Bubble Chart\n(Bubble size ∝ 1/MAE)')
    ax_bubble.grid(True, alpha=0.3)
    cbar = plt.colorbar(scatter, ax=ax_bubble)
    cbar.set_label('Model Index')
    # --- Bar chart of average rank, best (lowest) first ---
    ax_ranking = axes[1, 1]
    avg_rankings = np.mean(rankings, axis=1)
    sorted_indices = np.argsort(avg_rankings)
    bars = ax_ranking.bar(range(len(model_names)),
                          avg_rankings[sorted_indices],
                          color=[self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)]
                                 for i in range(len(model_names))], alpha=0.7)
    ax_ranking.set_xlabel('Models (Sorted by Average Rank)')
    ax_ranking.set_ylabel('Average Rank')
    ax_ranking.set_title('Overall Model Ranking')
    ax_ranking.set_xticks(range(len(model_names)))
    ax_ranking.set_xticklabels([model_names[i] for i in sorted_indices], rotation=45, ha='right')
    ax_ranking.grid(True, alpha=0.3, axis='y')
    # Annotate each bar with its average rank value.
    for bar in bars:
        height = bar.get_height()
        ax_ranking.text(bar.get_x() + bar.get_width()/2., height,
                        f'{height:.2f}', ha='center', va='bottom')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Model ranking matrix plot saved to: {save_path}")
    # plt.show()
def generate_comprehensive_report(self, save_dir='plots', prefix='regression_analysis'):
"""
生成综合可视化报告
Parameters:
save_dir (str): 保存目录
prefix (str): 文件名前缀
"""
if not self.analyzer:
print("No analyzer available for report generation")
return
os.makedirs(save_dir, exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# 生成各种图表
plot_configs = [
('prediction_scatter', self.plot_prediction_scatter),
('residual_analysis', self.plot_residual_analysis),
('metrics_comparison', self.plot_metrics_comparison),
('error_distribution', self.plot_error_distribution),
('model_ranking', self.plot_model_ranking_matrix)
]
saved_files = []
for plot_name, plot_func in plot_configs:
try:
save_path = f'{save_dir}/{prefix}_{plot_name}_{timestamp}.png'
plot_func(save_path=save_path)
saved_files.append(save_path)
plt.close('all') # 关闭所有图形以释放内存
except Exception as e:
print(f"Failed to generate {plot_name} plot: {str(e)}")
if saved_files:
print("Comprehensive visualization report generated:")
for file in saved_files:
print(f" - {file}")
else:
print("No plots were successfully generated")
return saved_files
def main():
    """Entry point demonstrating the configuration-driven interface.

    Builds a ``RegressionConfig`` for a sample CSV, runs the complete
    analysis through ``RegressionAnalyzer``, then exercises each
    visualization helper. A legacy parameter-passing example is kept
    below (commented out) for reference.
    """
    print("="*60)
    print("Regression Analysis Tool - Configuration-Driven Interface")
    print("="*60)
    # Method 1: configuration-driven (recommended for GUI integration)
    print("\n--- Method 1: Configuration-Driven (Recommended for GUI) ---")
    # Build the configuration object.
    csv_file_path = r"E:\code\content\change\6.csv"
    config = RegressionConfig.create_default(
        csv_path=csv_file_path,
        label_column="0"
    )
    # Optional customization
    config.data.spectrum_columns = "8:"      # spectral column range
    config.models.model_names = 'all'        # train every registered model
    config.models.tune_hyperparams = False   # quick run: skip hyperparameter tuning
    config.output.save_models = True         # persist trained model files to disk
    config.output.plot_results = True        # enable visualization
    # Raw string: the original non-raw literal relied on '\c', '\p', '\y'
    # being invalid escapes that Python keeps verbatim (DeprecationWarning).
    config.output.plot_dir = r'E:\code\content\change\plot\yellow'
    # Create the analyzer with the prepared configuration.
    analyzer = RegressionAnalyzer(config)
    # List the models available for training.
    analyzer.initialize_all_models()
    print("Available models:")
    for model_key, model_name in analyzer.get_available_models().items():
        print(f" {model_key}: {model_name}")
    # Run the configuration-driven analysis.
    success = analyzer.run_analysis_from_config()
    if success:
        print("Configuration-driven analysis completed successfully!")
        # Demonstrate each visualization helper.
        print("\n--- Visualization Demo ---")
        viz_dir = 'visualization_demo'
        os.makedirs(viz_dir, exist_ok=True)
        print("Generating various visualization plots...")
        # 1. Predicted vs. true values scatter plot
        print("1. Prediction vs True Values Scatter Plot...")
        analyzer.plot_prediction_scatter(
            save_path=f'{viz_dir}/prediction_scatter.png',
            show_individual=True,
            show_overlay=True
        )
        # 2. Residual analysis
        print("2. Residual Analysis Plot...")
        analyzer.plot_residual_analysis(
            save_path=f'{viz_dir}/residual_analysis.png'
        )
        # 3. Performance metrics comparison
        print("3. Performance Metrics Comparison...")
        analyzer.plot_metrics_comparison(
            save_path=f'{viz_dir}/metrics_comparison.png'
        )
        # 4. Error distribution
        print("4. Error Distribution Analysis...")
        analyzer.plot_error_distribution(
            save_path=f'{viz_dir}/error_distribution.png'
        )
        # 5. Model ranking matrix
        # NOTE(review): the sibling plotting method visible in this file is
        # named plot_model_ranking_matrix — confirm the analyzer exposes
        # plot_model_ranking (e.g. as a delegate/alias) before shipping.
        print("5. Model Ranking Matrix...")
        analyzer.plot_model_ranking(
            save_path=f'{viz_dir}/model_ranking.png'
        )
        # 6. Full visualization report
        # NOTE(review): the sibling method here is generate_comprehensive_report;
        # confirm generate_visualization_report exists on the analyzer.
        print("6. Generating Comprehensive Visualization Report...")
        saved_plots = analyzer.generate_visualization_report(
            save_dir=viz_dir,
            prefix='demo_report'
        ) or []  # guard: tolerate a None return so len() below cannot crash
        print(f"\nVisualization completed! Generated {len(saved_plots)} plot files in '{viz_dir}' directory:")
        for plot_file in saved_plots:
            print(f" - {plot_file}")
        print("\nAvailable visualization methods:")
        print(" - analyzer.plot_prediction_scatter() # 预测值vs真实值散点图")
        print(" - analyzer.plot_residual_analysis() # 残差分析图")
        print(" - analyzer.plot_metrics_comparison() # 性能指标对比图")
        print(" - analyzer.plot_error_distribution() # 误差分布图")
        print(" - analyzer.plot_model_ranking() # 模型排名矩阵")
        print(" - analyzer.generate_visualization_report() # 生成完整报告")
    else:
        print("Configuration-driven analysis failed!")
    # # Method 2: backward-compatible (legacy parameter passing)
    # print("\n--- Method 2: Backward Compatible (Legacy Parameter Passing) ---")
    #
    # analyzer2 = RegressionAnalyzer()  # use the default configuration
    #
    # # Traditional keyword-argument invocation:
    # success2 = analyzer2.run_complete_analysis(
    #     csv_path=r"E:\code\WQ\pipeline_result\work_dir\5_training_spectra\training_spectra.csv",
    #     label_column="0",
    #     spectrum_columns="13:",
    #     test_size=0.2,
    #     scale_method='standard',
    #     tune_hyperparams=False,
    #     save_models=False,
    #     plot_results=True,
    #     model_names=['xgboost', 'lightgbm']  # train only these two models
    # )
    #
    # if success2:
    #     print("Backward-compatible analysis completed successfully!")
    # else:
    #     print("Backward-compatible analysis failed!")
    #
    # print("\n" + "="*60)
    # print("Both methods are supported. Configuration-driven is recommended for GUI integration.")
    # print("="*60)
# Script entry point: run the demo analysis only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()