""" 回归分析工具包 支持多种回归算法:线性回归、LASSO、岭回归、Boosting、神经网络等 包含超参数调优、模型评价和保存功能 """ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.svm import SVR from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C from sklearn.neural_network import MLPRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.base import BaseEstimator, RegressorMixin import xgboost as xgb import lightgbm as lgb from statsmodels.api import OLS, GLM from statsmodels.genmod.families import Gaussian import warnings import joblib import os from datetime import datetime import json from scipy.linalg import pinv from scipy import stats from typing import Optional, List, Dict, Any, Union from dataclasses import dataclass, field import time warnings.filterwarnings('ignore') @dataclass class DataConfig: """数据配置类""" csv_path: str = "" label_column: Union[str, int] = "" spectrum_columns: Optional[Union[str, List[Union[str, int]]]] = None test_size: float = 0.2 random_state: int = 42 scale_method: str = 'standard' @dataclass class ModelConfig: """模型配置类""" model_names: Optional[Union[str, List[str]]] = None tune_hyperparams: bool = True tuning_method: str = 'grid' cv_folds: int = 5 random_search_iter: int = 20 @dataclass class TrainingConfig: """训练配置类""" epochs: int = 100 batch_size: int = 32 learning_rate: float = 0.001 @dataclass class OutputConfig: """输出配置类""" save_models: bool = True plot_results: bool = True save_dir: str = 'models' plot_dir: str = 'plots' @dataclass class RegressionConfig: """回归分析完整配置类 - 为GUI对接设计的标准化接口""" data: DataConfig = field(default_factory=DataConfig) models: ModelConfig = field(default_factory=ModelConfig) training: TrainingConfig = field(default_factory=TrainingConfig) output: OutputConfig = field(default_factory=OutputConfig) def __post_init__(self): """参数校验和默认值设置""" self._validate_parameters() def _validate_parameters(self): """参数校验""" # 数据参数校验 if not self.data.csv_path: raise ValueError("CSV file path must be specified") if not self.data.label_column: raise ValueError("Label column must be specified") if not (0 < self.data.test_size < 1): raise ValueError("Test size must be between 0 and 1") if self.data.scale_method not in ['standard', 'minmax']: raise ValueError("Scale method must be 'standard' or 'minmax'") # 模型参数校验 if self.models.tuning_method not in ['grid', 'random']: raise ValueError("Tuning method must be 'grid' or 'random'") if self.models.cv_folds < 2: raise ValueError("CV folds must be at least 2") # 处理模型名称 self._process_model_names() # 训练参数校验 if self.training.epochs <= 0: raise ValueError("Epochs must be positive") if self.training.batch_size <= 0: raise ValueError("Batch size must be positive") if self.training.learning_rate <= 0: raise ValueError("Learning rate must be positive") def _process_model_names(self): """处理模型名称,支持'all'参数""" if isinstance(self.models.model_names, str): if self.models.model_names.lower() == 'all': # 获取所有支持的模型名称 supported_models = self._get_supported_models() self.models.model_names = list(supported_models.keys()) print(f"选择所有可用模型: {len(self.models.model_names)} 个") else: # 单个模型名称 supported_models = self._get_supported_models() if self.models.model_names not in supported_models: raise ValueError(f"不支持的模型类型: {self.models.model_names}") self.models.model_names = [self.models.model_names] elif isinstance(self.models.model_names, list): # 验证列表中的每个模型 supported_models = self._get_supported_models() for model in self.models.model_names: if model not in supported_models: raise ValueError(f"不支持的模型类型: {model}") elif self.models.model_names is None: # 默认使用一些常用模型 self.models.model_names = ['linear', 'ridge', 'lasso', 'randomforest', 'svm'] def _get_supported_models(self) -> Dict[str, str]: """获取支持的模型列表""" # 基于RegressionAnalyzer中注册的模型 models = { # 线性模型 'linear': '多元线性回归', 'lasso': 'LASSO回归', 'ridge': '岭回归', 'elasticnet': '弹性网络回归', 'bayesianridge': '贝叶斯岭回归', # 提升模型 'lsboost': '最小二乘提升', 'xgboost': 'XGBoost回归', 'lightgbm': 'LightGBM回归', # 核方法 'gaussian': '高斯过程回归', 'gaussiansvm': '高斯SVM回归', 'svm': '支持向量回归', # 神经网络 'elm': '极限学习机', 'mlp': '多层感知机', 'lstm': 'LSTM网络', 'gru': 'GRU网络', # 其他模型 'gam': '广义加性模型', 'decisiontree': '决策树回归', 'randomforest': '随机森林回归', 'extratrees': '极端随机树回归', 'adaboost': 'AdaBoost回归' } # 只返回当前环境可用的模型 available_models = {} for key, name in models.items(): try: # 这里可以添加更复杂的可用性检查 available_models[key] = name except: continue return available_models @classmethod def create_default(cls, csv_path: str, label_column: Union[str, int]) -> 'RegressionConfig': """创建默认配置的便捷方法""" # 创建配置时暂时跳过验证 config = cls.__new__(cls) config.data = DataConfig() config.models = ModelConfig() config.training = TrainingConfig() config.output = OutputConfig() # 设置必要参数 config.data.csv_path = csv_path config.data.label_column = label_column # 手动调用验证 config._validate_parameters() return config @classmethod def create_quick_analysis(cls, csv_path: str, label_column: Union[str, int], model_names: Optional[List[str]] = None) -> 'RegressionConfig': """创建快速分析配置""" # 创建配置时暂时跳过验证 config = cls.__new__(cls) config.data = DataConfig() config.models = ModelConfig() config.training = TrainingConfig() config.output = OutputConfig() # 设置必要参数 config.data.csv_path = csv_path config.data.label_column = label_column config.models.model_names = model_names config.models.tune_hyperparams = False # 快速分析不调参 config.output.save_models = False # 不保存模型 # 手动调用验证 config._validate_parameters() return config class ExtremeLearningMachine(BaseEstimator, RegressorMixin): """ Extreme Learning Machine (ELM) 回归器 ELM是一种单隐层前馈神经网络,具有快速训练速度 """ def __init__(self, n_hidden=100, activation='sigmoid', random_state=42): self.n_hidden = n_hidden self.activation = activation self.random_state = random_state self.input_weights_ = None self.biases_ = None self.output_weights_ = None def _activation_function(self, X): """激活函数""" if self.activation == 'sigmoid': return 1 / (1 + np.exp(-X)) elif self.activation == 'tanh': return np.tanh(X) elif self.activation == 'relu': return np.maximum(0, X) elif self.activation == 'linear': return X else: raise ValueError(f"Unsupported activation function: {self.activation}") def fit(self, X, y): """训练ELM模型""" np.random.seed(self.random_state) n_samples, n_features = X.shape # Initialize input weights and biases (random) self.input_weights_ = np.random.randn(n_features, self.n_hidden) self.biases_ = np.random.randn(self.n_hidden) # 计算隐层输出 H = self._activation_function(X @ self.input_weights_ + self.biases_) # 添加偏置列到隐层输出(可选) H = np.column_stack([H, np.ones(n_samples)]) # 计算输出权重(解析解) self.output_weights_ = pinv(H) @ y return self def predict(self, X): """预测""" if self.input_weights_ is None: raise ValueError("模型还未训练") # 计算隐层输出 H = self._activation_function(X @ self.input_weights_ + self.biases_) H = np.column_stack([H, np.ones(X.shape[0])]) # 计算输出 return H @ self.output_weights_ def _more_tags(self): return {'no_validation': True} class GeneralizedAdditiveModel(BaseEstimator, RegressorMixin): """ Generalized Additive Model (GAM) 回归器 使用样条基函数实现GAM """ def __init__(self, n_splines=10, degree=3, lambda_=0.1): self.n_splines = n_splines self.degree = degree self.lambda_ = lambda_ self.coefficients_ = None self.knots_ = None def _create_spline_basis(self, X): """创建样条基函数""" n_samples, n_features = X.shape n_basis = self.n_splines + self.degree + 1 # 为每个特征创建基函数 basis_matrices = [] for feature_idx in range(n_features): x = X[:, feature_idx] x_min, x_max = np.min(x), np.max(x) # 创建节点 if self.knots_ is None: knots = np.linspace(x_min, x_max, self.n_splines + 2)[1:-1] else: knots = self.knots_[feature_idx] # 创建B样条基函数 basis = np.zeros((n_samples, n_basis)) # 截断幂基函数(简化实现) for i in range(n_basis): if i < self.degree + 1: # 左端多项式 basis[:, i] = np.power(np.maximum(0, x - x_min), i) elif i > n_basis - self.degree - 2: # 右端多项式 power = n_basis - 1 - i basis[:, i] = np.power(np.maximum(0, x_max - x), power) else: # 中间截断幂函数 basis[:, i] = np.power(np.maximum(0, x - knots[i - self.degree - 1]), self.degree) basis_matrices.append(basis) # 组合所有特征的基函数 return np.concatenate(basis_matrices, axis=1) def fit(self, X, y): """训练GAM模型""" from sklearn.linear_model import Ridge # 创建样条基函数 X_basis = self._create_spline_basis(X) # 使用岭回归拟合系数(带正则化) ridge = Ridge(alpha=self.lambda_, fit_intercept=True) ridge.fit(X_basis, y) self.coefficients_ = ridge.coef_ self.intercept_ = ridge.intercept_ return self def predict(self, X): """预测""" if self.coefficients_ is None: raise ValueError("模型还未训练") X_basis = self._create_spline_basis(X) return X_basis @ self.coefficients_ + self.intercept_ class LSTMRegressor(BaseEstimator, RegressorMixin): """ LSTM回归器 - 将光谱数据视为序列数据 每个光谱样本的波段作为时间步 使用PyTorch实现 """ def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None, batch_size=None, learning_rate=None, random_state=42, device=None, config: Optional['TrainingConfig'] = None): """ LSTM回归器构造函数 Parameters: units (int, optional): LSTM单元数,如果为None则使用默认值50 dropout (float): Dropout比例 recurrent_dropout (float): 循环Dropout比例 epochs (int, optional): 训练轮数,如果为None则使用默认值100 batch_size (int, optional): 批次大小,如果为None则使用默认值32 learning_rate (float, optional): 学习率,如果为None则使用默认值0.001 random_state (int): 随机种子 device (str, optional): 计算设备 config (TrainingConfig, optional): 训练配置对象 """ # 如果提供了配置对象,使用配置中的参数 if config is not None: self.units = units if units is not None else 64 # LSTM默认使用更多单元 self.dropout = dropout self.recurrent_dropout = recurrent_dropout self.epochs = epochs if epochs is not None else config.epochs self.batch_size = batch_size if batch_size is not None else config.batch_size self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate else: # 使用传统参数方式 self.units = units if units is not None else 50 self.dropout = dropout self.recurrent_dropout = recurrent_dropout self.epochs = epochs if epochs is not None else 100 self.batch_size = batch_size if batch_size is not None else 32 self.learning_rate = learning_rate if learning_rate is not None else 0.001 self.random_state = random_state self.device = device self.model_ = None self.input_size_ = None # 尝试导入PyTorch try: import torch import torch.nn as nn import torch.optim as optim self.torch = torch self.nn = nn self.optim = optim self.pytorch_available = True # 设置设备 if self.device is None: self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') else: self.device = torch.device(self.device) except ImportError: self.pytorch_available = False print("Warning: PyTorch not installed, LSTM model will use MLPRegressor approximation") def _create_model(self, input_size): """创建PyTorch LSTM模型""" class LSTMModel(self.nn.Module): def __init__(self, input_size, hidden_size, dropout, recurrent_dropout): super(LSTMModel, self).__init__() self.hidden_size = hidden_size self.lstm = self.nn.LSTM( input_size=input_size, hidden_size=hidden_size, num_layers=1, batch_first=True, dropout=recurrent_dropout if recurrent_dropout > 0 else 0, bidirectional=False ) self.dropout_layer = self.nn.Dropout(dropout) self.fc = self.nn.Linear(hidden_size, 1) def forward(self, x): # LSTM前向传播 lstm_out, _ = self.lstm(x) # 取最后一个时间步的输出 lstm_out = lstm_out[:, -1, :] # Dropout lstm_out = self.dropout_layer(lstm_out) # 全连接层 output = self.fc(lstm_out) return output return LSTMModel(input_size, self.units, self.dropout, self.recurrent_dropout) def fit(self, X, y): """训练LSTM模型""" if not self.pytorch_available: # 如果没有PyTorch,使用MLPRegressor作为近似 from sklearn.neural_network import MLPRegressor self.model_ = MLPRegressor( hidden_layer_sizes=(self.units, self.units//2), activation='relu', solver='adam', max_iter=self.epochs, random_state=self.random_state, early_stopping=True ) self.model_.fit(X, y) return self # 设置随机种子 self.torch.manual_seed(self.random_state) if self.torch.cuda.is_available(): self.torch.cuda.manual_seed(self.random_state) self.torch.cuda.manual_seed_all(self.random_state) np.random.seed(self.random_state) # 将特征数据重塑为序列格式 (samples, timesteps, features) # 对于光谱数据,每个波段作为时间步,特征数为1 n_samples, n_features = X.shape self.input_size_ = 1 # 每个时间步的特征数 # 转换为PyTorch张量 X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device) y_tensor = self.torch.FloatTensor(y.reshape(-1, 1)).to(self.device) # 创建模型 self.model_ = self._create_model(self.input_size_).to(self.device) # 定义损失函数和优化器 criterion = self.nn.MSELoss() optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate) # 训练模型 self.model_.train() for epoch in range(self.epochs): # 随机打乱数据 indices = np.random.permutation(n_samples) X_shuffled = X_tensor[indices] y_shuffled = y_tensor[indices] # 批量训练 for i in range(0, n_samples, self.batch_size): batch_X = X_shuffled[i:i+self.batch_size] batch_y = y_shuffled[i:i+self.batch_size] # 前向传播 optimizer.zero_grad() outputs = self.model_(batch_X) loss = criterion(outputs, batch_y) # 反向传播 loss.backward() optimizer.step() return self def predict(self, X): """预测""" if self.model_ is None: raise ValueError("模型还未训练") if not self.pytorch_available: return self.model_.predict(X) # 转换为评估模式 self.model_.eval() # 重塑输入数据 n_samples, n_features = X.shape X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device) # 预测 with self.torch.no_grad(): predictions = self.model_(X_tensor) return predictions.cpu().numpy().flatten() class GRURegressor(BaseEstimator, RegressorMixin): """ GRU回归器 - 将光谱数据视为序列数据 每个光谱样本的波段作为时间步 使用PyTorch实现 """ def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None, batch_size=None, learning_rate=None, random_state=42, device=None, config: Optional['TrainingConfig'] = None): """ GRU回归器构造函数 Parameters: units (int, optional): GRU单元数,如果为None则使用默认值50 dropout (float): Dropout比例 recurrent_dropout (float): 循环Dropout比例 epochs (int, optional): 训练轮数,如果为None则使用默认值100 batch_size (int, optional): 批次大小,如果为None则使用默认值32 learning_rate (float, optional): 学习率,如果为None则使用默认值0.001 random_state (int): 随机种子 device (str, optional): 计算设备 config (TrainingConfig, optional): 训练配置对象 """ # 如果提供了配置对象,使用配置中的参数 if config is not None: self.units = units if units is not None else 64 # GRU默认使用更多单元 self.dropout = dropout self.recurrent_dropout = recurrent_dropout self.epochs = epochs if epochs is not None else config.epochs self.batch_size = batch_size if batch_size is not None else config.batch_size self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate else: # 使用传统参数方式 self.units = units if units is not None else 50 self.dropout = dropout self.recurrent_dropout = recurrent_dropout self.epochs = epochs if epochs is not None else 100 self.batch_size = batch_size if batch_size is not None else 32 self.learning_rate = learning_rate if learning_rate is not None else 0.001 self.random_state = random_state self.device = device self.model_ = None self.input_size_ = None # 尝试导入PyTorch try: import torch import torch.nn as nn import torch.optim as optim self.torch = torch self.nn = nn self.optim = optim self.pytorch_available = True # 设置设备 if self.device is None: self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') else: self.device = torch.device(self.device) except ImportError: self.pytorch_available = False print("Warning: PyTorch not installed, GRU model will use MLPRegressor approximation") def _create_model(self, input_size): """创建PyTorch GRU模型""" class GRUModel(self.nn.Module): def __init__(self, input_size, hidden_size, dropout, recurrent_dropout): super(GRUModel, self).__init__() self.hidden_size = hidden_size self.gru = self.nn.GRU( input_size=input_size, hidden_size=hidden_size, num_layers=1, batch_first=True, dropout=recurrent_dropout if recurrent_dropout > 0 else 0, bidirectional=False ) self.dropout_layer = self.nn.Dropout(dropout) self.fc = self.nn.Linear(hidden_size, 1) def forward(self, x): # GRU前向传播 gru_out, _ = self.gru(x) # 取最后一个时间步的输出 gru_out = gru_out[:, -1, :] # Dropout gru_out = self.dropout_layer(gru_out) # 全连接层 output = self.fc(gru_out) return output return GRUModel(input_size, self.units, self.dropout, self.recurrent_dropout) def fit(self, X, y): """训练GRU模型""" if not self.pytorch_available: # 如果没有PyTorch,使用MLPRegressor作为近似 from sklearn.neural_network import MLPRegressor self.model_ = MLPRegressor( hidden_layer_sizes=(self.units, self.units//2), activation='relu', solver='adam', max_iter=self.epochs, random_state=self.random_state, early_stopping=True ) self.model_.fit(X, y) return self # 设置随机种子 self.torch.manual_seed(self.random_state) if self.torch.cuda.is_available(): self.torch.cuda.manual_seed(self.random_state) self.torch.cuda.manual_seed_all(self.random_state) np.random.seed(self.random_state) # 将特征数据重塑为序列格式 (samples, timesteps, features) n_samples, n_features = X.shape self.input_size_ = 1 # 每个时间步的特征数 # 转换为PyTorch张量 X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device) y_tensor = self.torch.FloatTensor(y.reshape(-1, 1)).to(self.device) # 创建模型 self.model_ = self._create_model(self.input_size_).to(self.device) # 定义损失函数和优化器 criterion = self.nn.MSELoss() optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate) # 训练模型 self.model_.train() for epoch in range(self.epochs): # 随机打乱数据 indices = np.random.permutation(n_samples) X_shuffled = X_tensor[indices] y_shuffled = y_tensor[indices] # 批量训练 for i in range(0, n_samples, self.batch_size): batch_X = X_shuffled[i:i+self.batch_size] batch_y = y_shuffled[i:i+self.batch_size] # 前向传播 optimizer.zero_grad() outputs = self.model_(batch_X) loss = criterion(outputs, batch_y) # 反向传播 loss.backward() optimizer.step() return self def predict(self, X): """预测""" if self.model_ is None: raise ValueError("模型还未训练") if not self.pytorch_available: return self.model_.predict(X) # 转换为评估模式 self.model_.eval() # 重塑输入数据 n_samples, n_features = X.shape X_tensor = self.torch.FloatTensor(X.reshape(n_samples, n_features, 1)).to(self.device) # 预测 with self.torch.no_grad(): predictions = self.model_(X_tensor) return predictions.cpu().numpy().flatten() class RegressionAnalyzer: """ 回归分析器类 - 支持GUI对接的标准化接口 支持多种回归算法和完整的分析流程 """ def __init__(self, config: Optional[RegressionConfig] = None): """ 初始化回归分析器 Parameters: config (RegressionConfig, optional): 配置对象,如果为None则使用默认配置 """ self.config = config or RegressionConfig() self._validate_config() # 在构造函数中进行校验 self.models = {} self.scalers = {} self.best_params = {} self.results = {} self.data = None self.X = None self.y = None self.X_train = None self.X_test = None self.y_train = None self.y_test = None # 初始化可视化器 self.visualizer = RegressionVisualizer(self) def update_config(self, config: RegressionConfig): """ 更新配置 - 为GUI动态配置预留接口 Parameters: config (RegressionConfig): 新的配置对象 """ self.config = config self._validate_config() def _validate_config(self): """配置校验""" try: self.config._validate_parameters() except ValueError as e: raise ValueError(f"Configuration validation failed: {e}") def _parse_column_range(self, column_range, total_columns): """ 解析列范围字符串,返回列索引列表 Parameters: column_range (str or int or list): 列范围,如 "0:5", "2,4,6-8", [0,1,2] 或单个索引 total_columns (int): 总列数 Returns: list: 列索引列表 """ if isinstance(column_range, (int, np.integer)): # 单个列索引 if column_range >= total_columns or column_range < 0: raise ValueError(f"Column index {column_range} out of range [0, {total_columns-1}]") return [column_range] elif isinstance(column_range, str): # 解析范围字符串 columns = [] # 分割多个范围(用逗号分隔) for part in column_range.split(','): part = part.strip() if ':' in part: # 范围选择,如 "0:5" start, end = part.split(':') start = int(start.strip()) if start.strip() else 0 end = int(end.strip()) if end.strip() else total_columns if start < 0: start = total_columns + start if end < 0: end = total_columns + end if start >= total_columns or end > total_columns: raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]") columns.extend(range(start, end)) else: # 单个索引 idx = int(part.strip()) if idx < 0: idx = total_columns + idx if idx >= total_columns or idx < 0: raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]") columns.append(idx) return list(set(columns)) # 去重 elif isinstance(column_range, (list, tuple)): # 直接的列索引列表 columns = [] for idx in column_range: if isinstance(idx, str): if ':' in idx: # 处理列表中的范围字符串 start, end = idx.split(':') start = int(start.strip()) if start.strip() else 0 end = int(end.strip()) if end.strip() else total_columns if start < 0: start = total_columns + start if end < 0: end = total_columns + end if start >= total_columns or end > total_columns: raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]") columns.extend(range(start, end)) else: idx_int = int(idx.strip()) if idx_int < 0: idx_int = total_columns + idx_int if idx_int >= total_columns or idx_int < 0: raise ValueError(f"Column index {idx_int} out of range [0, {total_columns-1}]") columns.append(idx_int) else: if idx < 0: idx = total_columns + idx if idx >= total_columns or idx < 0: raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]") columns.append(idx) return list(set(columns)) # 去重 else: raise ValueError(f"Unsupported column range format: {type(column_range)}") def load_csv(self, file_path, label_column, spectrum_columns=None, delimiter=',', header=0): """ 加载CSV文件并指定标签列和光谱列 Parameters: file_path (str): CSV文件路径 label_column (str or int or range-like): 标签列,支持范围选择,如 "0:5", "2,4,6-8" 或单个索引 spectrum_columns (str or list or None): 光谱列,支持范围选择,如 "1:10", "2,4,6-8" 或列索引列表,如果为None则使用除标签列外的所有列 delimiter (str): 分隔符,默认为',' header (int): 表头行号,默认为0 """ try: # 读取CSV文件 self.data = pd.read_csv(file_path, delimiter=delimiter, header=header) total_columns = len(self.data.columns) # 处理标签列:先检查是否是列名,然后再检查是否是索引 if isinstance(label_column, str) and label_column in self.data.columns: # 如果是有效的列名 label_idx = self.data.columns.get_loc(label_column) self.y = self.data[label_column].values else: # 尝试作为列索引处理 try: if isinstance(label_column, str): # 可能是数字字符串,转换为整数 label_column = int(label_column) label_idx = label_column if label_idx < 0: label_idx = total_columns + label_idx if label_idx < 0 or label_idx >= total_columns: raise ValueError(f"Column index {label_column} out of range [0, {total_columns-1}]") self.y = self.data.iloc[:, label_idx].values except (ValueError, TypeError): raise ValueError(f"Invalid label column specification: {label_column}. Must be a valid column name or index.") # 确定光谱列 if spectrum_columns is None: # 使用除标签列外的所有列作为光谱列 spectrum_indices = [i for i in range(total_columns) if i != label_idx] else: # 解析光谱列范围 spectrum_indices = self._parse_column_range(spectrum_columns, total_columns) # 排除标签列(如果在光谱列范围内) spectrum_indices = [i for i in spectrum_indices if i != label_idx] if not spectrum_indices: raise ValueError("No valid spectrum columns found") # 提取光谱数据 self.X = self.data.iloc[:, spectrum_indices].values # 跳过缺失标签的行 valid_mask = ~pd.isna(self.y) original_samples = len(self.y) self.X = self.X[valid_mask] self.y = self.y[valid_mask] self.feature_names = [self.data.columns[i] for i in spectrum_indices] skipped_samples = original_samples - len(self.y) print(f"Successfully loaded data: {self.X.shape[0]} samples, {self.X.shape[1]} features") print(f"Label column: {label_idx} ({self.data.columns[label_idx]})") print(f"Spectrum column range: {min(spectrum_indices)}-{max(spectrum_indices)}") if skipped_samples > 0: print(f"Rows skipped due to missing labels: {skipped_samples}") print(f"Label range: {self.y.min():.4f} - {self.y.max():.4f}") print(f"Data type check: X type {self.X.dtype}, y type {self.y.dtype}") # Check and process data types if self.X.dtype != np.float64: self.X = self.X.astype(np.float64) if self.y.dtype != np.float64: self.y = self.y.astype(np.float64) return True except Exception as e: print(f"Failed to load data: {str(e)}") return False def preprocess_data(self, test_size=None, random_state=None, scale_method=None): """ 数据预处理:分割训练集和测试集,标准化 Parameters: test_size (float, optional): 测试集比例,如果为None则使用配置中的值 random_state (int, optional): 随机种子,如果为None则使用配置中的值 scale_method (str, optional): 标准化方法,如果为None则使用配置中的值 """ # 使用配置中的默认值 test_size = test_size if test_size is not None else self.config.data.test_size random_state = random_state if random_state is not None else self.config.data.random_state scale_method = scale_method if scale_method is not None else self.config.data.scale_method try: # 分割数据集 self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( self.X, self.y, test_size=test_size, random_state=random_state ) # 标准化 if scale_method == 'standard': self.scalers['X'] = StandardScaler() elif scale_method == 'minmax': self.scalers['X'] = MinMaxScaler() else: raise ValueError("scale_method must be 'standard' or 'minmax'") self.X_train_scaled = self.scalers['X'].fit_transform(self.X_train) self.X_test_scaled = self.scalers['X'].transform(self.X_test) print(f"Data preprocessing completed:") print(f"Training set: {self.X_train.shape[0]} samples") print(f"Test set: {self.X_test.shape[0]} samples") return True except Exception as e: print(f"Data preprocessing failed: {str(e)}") return False def add_linear_models(self): """添加线性回归模型""" self.models['linear'] = { 'model': LinearRegression(), 'name': '多元线性回归' } self.models['lasso'] = { 'model': Lasso(random_state=42), 'name': 'LASSO回归', 'params': { 'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] } } self.models['ridge'] = { 'model': Ridge(random_state=42), 'name': '岭回归', 'params': { 'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] } } self.models['elasticnet'] = { 'model': ElasticNet(random_state=42), 'name': '弹性网络回归', 'params': { 'alpha': [0.001, 0.01, 0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9] } } self.models['bayesianridge'] = { 'model': BayesianRidge(), 'name': '贝叶斯岭回归' } def add_boosting_models(self): """添加Boosting模型""" self.models['lsboost'] = { 'model': GradientBoostingRegressor(random_state=42), 'name': 'LSBoost回归', 'params': { 'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], 'subsample': [0.8, 0.9, 1.0] } } self.models['xgboost'] = { 'model': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'), 'name': 'XGBoost回归', 'params': { 'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], 'subsample': [0.8, 0.9, 1.0], 'colsample_bytree': [0.8, 0.9, 1.0] } } self.models['lightgbm'] = { 'model': lgb.LGBMRegressor(random_state=42), 'name': 'LightGBM回归', 'params': { 'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], 'subsample': [0.8, 0.9, 1.0], 'colsample_bytree': [0.8, 0.9, 1.0] } } def add_kernel_models(self): """添加核模型""" # 高斯过程回归 kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) self.models['gaussian'] = { 'model': GaussianProcessRegressor(kernel=kernel, random_state=42), 'name': '高斯过程回归' } # 高斯核SVM self.models['gaussiansvm'] = { 'model': SVR(kernel='rbf'), 'name': '高斯核SVM回归', 'params': { 'C': [0.1, 1.0, 10.0, 100.0], 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1.0] } } # 标准SVM回归 self.models['svm'] = { 'model': SVR(), 'name': 'SVM回归', 'params': { 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.1, 1.0, 10.0, 100.0], 'gamma': ['scale', 'auto'] } } def add_neural_networks(self, training_config: Optional[TrainingConfig] = None): """添加神经网络模型""" # ELM回归 (真正的极限学习机实现) self.models['elm'] = { 'model': ExtremeLearningMachine(random_state=42), 'name': 'ELM回归', 'params': { 'n_hidden': [50, 100, 200, 500], 'activation': ['sigmoid', 'tanh', 'relu'] } } # BP/MLP回归 self.models['mlp'] = { 'model': MLPRegressor(random_state=42, max_iter=1000), 'name': 'BP/MLP回归', 'params': { 'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)], 'activation': ['relu', 'tanh'], 'learning_rate_init': [0.001, 0.01, 0.1], 'alpha': [0.0001, 0.001, 0.01] } } # LSTM回归 (将光谱数据视为序列,波段作为时间步) self.models['lstm'] = { 'model': LSTMRegressor(random_state=42, config=training_config), 'name': 'LSTM回归', 'params': { 'units': [32, 64, 128], 'dropout': [0.1, 0.2, 0.3], 'epochs': [50, 100, 200] } } # GRU回归 (将光谱数据视为序列,波段作为时间步) self.models['gru'] = { 'model': GRURegressor(random_state=42, config=training_config), 'name': 'GRU回归', 'params': { 'units': [32, 64, 128], 'dropout': [0.1, 0.2, 0.3], 'epochs': [50, 100, 200] } } def add_specialized_models(self): """添加专业模型""" # GAM回归 (真正的广义加性模型) self.models['gam'] = { 'model': GeneralizedAdditiveModel(), 'name': 'GAM回归', 'params': { 'n_splines': [5, 10, 15, 20], 'degree': [3, 4], 'lambda_': [0.001, 0.01, 0.1, 1.0] } } # 决策树回归 self.models['decisiontree'] = { 'model': DecisionTreeRegressor(random_state=42), 'name': '决策树回归', 'params': { 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2'] } } # 随机森林回归 self.models['randomforest'] = { 'model': RandomForestRegressor(random_state=42), 'name': '随机森林回归', 'params': { 'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2'] } } # 极端随机树回归 self.models['extratrees'] = { 'model': ExtraTreesRegressor(random_state=42), 'name': '极端随机树回归', 'params': { 'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2'] } } # AdaBoost回归 self.models['adaboost'] = { 'model': AdaBoostRegressor(random_state=42), 'name': 'AdaBoost回归', 'params': { 'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'loss': ['linear', 'square', 'exponential'] } } def initialize_all_models(self, use_config: bool = True): """Initialize all models""" self.add_linear_models() self.add_boosting_models() self.add_kernel_models() # 神经网络模型可以选择是否使用配置 if use_config and hasattr(self, 'config'): self.add_neural_networks(self.config.training) else: self.add_neural_networks() self.add_specialized_models() print(f"Initialized {len(self.models)} regression models") def get_available_models(self): """获取所有可用模型的名称和描述""" return {name: info['name'] for name, info in self.models.items()} def hyperparameter_tuning(self, model_name, method=None, cv=None, n_iter=None): """ 超参数调优 Parameters: model_name (str): 模型名称 method (str, optional): 调优方法,如果为None则使用配置中的值 cv (int, optional): 交叉验证折数,如果为None则使用配置中的值 n_iter (int, optional): 随机搜索的迭代次数,如果为None则使用配置中的值 """ # 使用配置中的默认值 method = method if method is not None else self.config.models.tuning_method cv = cv if cv is not None else self.config.models.cv_folds n_iter = n_iter if n_iter is not None else self.config.models.random_search_iter if model_name not in self.models: print(f"Model '{model_name}' does not exist") return False model_info = self.models[model_name] if 'params' not in model_info: print(f"Model '{model_name}' has no tunable parameters") return False print(f"Starting hyperparameter tuning for model: {model_info['name']}") if method == 'grid': search = GridSearchCV( model_info['model'], model_info['params'], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1 ) elif method == 'random': search = RandomizedSearchCV( model_info['model'], model_info['params'], n_iter=n_iter, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42, verbose=1 ) else: print("Tuning method must be 'grid' or 'random'") return False try: search.fit(self.X_train_scaled, self.y_train) self.best_params[model_name] = search.best_params_ self.models[model_name]['model'] = search.best_estimator_ print(f"Best parameters: {search.best_params_}") print(f"Best score: {-search.best_score_:.4f}") return True except Exception as e: print(f"Tuning failed: {str(e)}") return False def train_model(self, model_name): """训练单个模型""" if model_name not in self.models: print(f"Model '{model_name}' does not exist") return False try: model_info = self.models[model_name] model = model_info['model'] print(f"Training model: {model_info['name']}") # Train model model.fit(self.X_train_scaled, self.y_train) # Predict y_pred_train = model.predict(self.X_train_scaled) y_pred_test = model.predict(self.X_test_scaled) # Calculate evaluation metrics metrics = self.calculate_metrics(self.y_train, y_pred_train, self.y_test, y_pred_test) self.results[model_name] = { 'model': model, 'metrics': metrics, 'y_pred_train': y_pred_train, 'y_pred_test': y_pred_test } print(f"{model_info['name']} training completed") print(f"Training R²: {metrics['train_r2']:.4f}, Test R²: {metrics['test_r2']:.4f}") return True except Exception as e: print(f"Training failed: {str(e)}") return False def train_all_models(self, tune_hyperparams=False, tuning_method='grid'): """训练当前配置的所有模型""" models_to_train = list(self.models.keys()) print(f"Training {len(models_to_train)} models: {', '.join(models_to_train)}") for model_name in models_to_train: if tune_hyperparams and 'params' in self.models[model_name]: self.hyperparameter_tuning(model_name, method=tuning_method) self.train_model(model_name) def calculate_metrics(self, y_train, y_pred_train, y_test, y_pred_test): """计算评价指标""" metrics = {} # 训练集指标 metrics['train_mse'] = mean_squared_error(y_train, y_pred_train) metrics['train_rmse'] = np.sqrt(metrics['train_mse']) metrics['train_mae'] = mean_absolute_error(y_train, y_pred_train) metrics['train_r2'] = r2_score(y_train, y_pred_train) # 测试集指标 metrics['test_mse'] = mean_squared_error(y_test, y_pred_test) metrics['test_rmse'] = np.sqrt(metrics['test_mse']) metrics['test_mae'] = mean_absolute_error(y_test, y_pred_test) metrics['test_r2'] = r2_score(y_test, y_pred_test) return metrics def plot_results(self, save_path=None, plot_type='comprehensive'): """ 绘制结果比较图 Parameters: save_path (str, optional): 保存路径 plot_type (str): 绘图类型 ('basic', 'comprehensive', 'prediction', 'residual', 'metrics', 'error_dist', 'ranking') """ if not self.results: print("No training results to plot") return if plot_type == 'comprehensive': # 生成综合报告 self.visualizer.generate_comprehensive_report( save_dir=self.config.output.plot_dir, prefix='regression_analysis' ) elif plot_type == 'basic': # 基础图表(保持向后兼容) self._plot_basic_comparison(save_path) elif plot_type == 'prediction': # 预测值vs真实值散点图 self.visualizer.plot_prediction_scatter(save_path=save_path) elif plot_type == 'residual': # 残差分析图 self.visualizer.plot_residual_analysis(save_path=save_path) elif plot_type == 'metrics': # 性能指标对比图 self.visualizer.plot_metrics_comparison(save_path=save_path) elif plot_type == 'error_dist': # 误差分布图 self.visualizer.plot_error_distribution(save_path=save_path) elif plot_type == 'ranking': # 模型排名矩阵 self.visualizer.plot_model_ranking_matrix(save_path=save_path) else: print(f"Unknown plot type: {plot_type}") return def _plot_basic_comparison(self, save_path=None): """绘制基础比较图(向后兼容)""" # 准备数据 model_names = [] train_r2 = [] test_r2 = [] train_rmse = [] test_rmse = [] for model_name, result in self.results.items(): model_names.append(self.models[model_name]['name']) train_r2.append(result['metrics']['train_r2']) test_r2.append(result['metrics']['test_r2']) train_rmse.append(result['metrics']['train_rmse']) test_rmse.append(result['metrics']['test_rmse']) # 创建图表 fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # R² Score Comparison x = np.arange(len(model_names)) width = 0.35 ax1.bar(x - width/2, train_r2, width, label='Training Set', alpha=0.8) ax1.bar(x + width/2, test_r2, width, label='Test Set', alpha=0.8) ax1.set_xlabel('Model') ax1.set_ylabel('R² Score') ax1.set_title('R² Score Comparison') ax1.set_xticks(x) ax1.set_xticklabels(model_names, rotation=45, ha='right') ax1.legend() ax1.grid(True, alpha=0.3) # RMSE Comparison ax2.bar(x - width/2, train_rmse, width, label='Training Set', alpha=0.8) ax2.bar(x + width/2, test_rmse, width, label='Test Set', alpha=0.8) ax2.set_xlabel('Model') ax2.set_ylabel('RMSE') ax2.set_title('RMSE Comparison') ax2.set_xticks(x) ax2.set_xticklabels(model_names, rotation=45, ha='right') ax2.legend() ax2.grid(True, alpha=0.3) # Predicted vs Actual Values Scatter Plot (Test Set) colors = plt.cm.tab10(np.linspace(0, 1, len(self.results))) for i, (model_name, result) in enumerate(self.results.items()): ax3.scatter(self.y_test, result['y_pred_test'], alpha=0.6, color=colors[i], label=self.models[model_name]['name'], s=20) ax3.plot([self.y_test.min(), self.y_test.max()], [self.y_test.min(), self.y_test.max()], 'k--', linewidth=2, label='Perfect Prediction') ax3.set_xlabel('Actual Values') ax3.set_ylabel('Predicted Values') ax3.set_title('Predicted vs Actual Values (Test Set)') ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left') ax3.grid(True, alpha=0.3) # Residual Plot for i, (model_name, result) in enumerate(self.results.items()): residuals = self.y_test - result['y_pred_test'] ax4.scatter(result['y_pred_test'], residuals, alpha=0.6, color=colors[i], label=self.models[model_name]['name'], s=20) ax4.axhline(y=0, color='k', linestyle='--', linewidth=2) ax4.set_xlabel('Predicted Values') ax4.set_ylabel('Residuals') ax4.set_title('Residual Plot (Test Set)') ax4.legend(bbox_to_anchor=(1.05, 1), loc='upper left') ax4.grid(True, alpha=0.3) plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') print(f"Chart saved to: {save_path}") # plt.show() def plot_prediction_scatter(self, save_path=None, **kwargs): """绘制预测值vs真实值散点图""" self.visualizer.plot_prediction_scatter(save_path=save_path, **kwargs) def plot_residual_analysis(self, save_path=None, **kwargs): """绘制残差分析图""" self.visualizer.plot_residual_analysis(save_path=save_path, **kwargs) def plot_metrics_comparison(self, save_path=None, **kwargs): """绘制性能指标对比图""" self.visualizer.plot_metrics_comparison(save_path=save_path, **kwargs) def plot_error_distribution(self, save_path=None, **kwargs): """绘制误差分布图""" self.visualizer.plot_error_distribution(save_path=save_path, **kwargs) def plot_model_ranking(self, save_path=None, **kwargs): """绘制模型排名矩阵""" self.visualizer.plot_model_ranking_matrix(save_path=save_path, **kwargs) def generate_visualization_report(self, save_dir=None, prefix=None): """生成完整的可视化报告""" save_dir = save_dir or self.config.output.plot_dir prefix = prefix or 'regression_analysis' return self.visualizer.generate_comprehensive_report(save_dir=save_dir, prefix=prefix) def save_model(self, model_name, save_dir='models'): """Save model""" if model_name not in self.results: print(f"Model '{model_name}' has no training results") return False if not os.path.exists(save_dir): os.makedirs(save_dir) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') model_path = os.path.join(save_dir, f'{model_name}_{timestamp}.pkl') scaler_path = os.path.join(save_dir, f'scaler_{timestamp}.pkl') info_path = os.path.join(save_dir, f'info_{model_name}_{timestamp}.json') try: # 保存模型 joblib.dump(self.results[model_name]['model'], model_path) # 保存标准化器 joblib.dump(self.scalers['X'], scaler_path) # 保存模型信息 info = { 'model_name': model_name, 'full_name': self.models[model_name]['name'], 'timestamp': timestamp, 'metrics': self.results[model_name]['metrics'], 'best_params': self.best_params.get(model_name, {}), 'feature_names': self.feature_names } with open(info_path, 'w', encoding='utf-8') as f: json.dump(info, f, indent=4, ensure_ascii=False) print(f"Model saved:") print(f" Model file: {model_path}") print(f" Scaler: {scaler_path}") print(f" Info file: {info_path}") return True except Exception as e: print(f"Save failed: {str(e)}") return False def save_all_models(self, save_dir='models'): """保存所有模型""" for model_name in self.results.keys(): self.save_model(model_name, save_dir) def load_model(self, model_path, scaler_path=None): """加载模型""" try: model = joblib.load(model_path) if scaler_path: scaler = joblib.load(scaler_path) else: scaler = None return model, scaler except Exception as e: print(f"Load failed: {str(e)}") return None, None def print_summary(self): """Print results summary""" if not self.results: print("No training results") return print("\n" + "="*80) print("Regression Model Performance Summary") print("="*80) # Header header = "|30" print(header) # Result rows for model_name, result in sorted(self.results.items(), key=lambda x: x[1]['metrics']['test_r2'], reverse=True): metrics = result['metrics'] model_full_name = self.models[model_name]['name'] print("|30") print("-"*80) print("Note: R² closer to 1 is better, RMSE/MAE smaller is better") print("="*80) def run_analysis_from_config(self) -> bool: """ 基于配置对象运行完整分析流程 - 推荐用于GUI对接 Returns: bool: 分析是否成功完成 """ print("Starting regression analysis from configuration...") # 1. 加载数据 if not self.load_csv(self.config.data.csv_path, self.config.data.label_column, self.config.data.spectrum_columns): return False # 2. 数据预处理 if not self.preprocess_data(): return False # 3. 初始化模型 self.initialize_all_models(use_config=True) # 4. 过滤模型(如果指定了特定的模型) if self.config.models.model_names is not None: # 验证指定的模型名称 invalid_models = [name for name in self.config.models.model_names if name not in self.models] if invalid_models: print(f"Warning: The following models do not exist: {invalid_models}") valid_model_names = [name for name in self.config.models.model_names if name in self.models] else: valid_model_names = self.config.models.model_names # 只保留指定的模型 models_to_keep = {} for model_name in valid_model_names: if model_name in self.models: models_to_keep[model_name] = self.models[model_name] self.models = models_to_keep print(f"Filtered to {len(self.models)} specified models") # 5. 训练模型 self.train_all_models(tune_hyperparams=self.config.models.tune_hyperparams, tuning_method=self.config.models.tuning_method) # 6. 打印汇总 self.print_summary() # 7. 保存模型 if self.config.output.save_models: self.save_all_models(save_dir=self.config.output.save_dir) # 8. 绘制结果 if self.config.output.plot_results: os.makedirs(self.config.output.plot_dir, exist_ok=True) self.generate_visualization_report( save_dir=self.config.output.plot_dir, prefix='regression_analysis' ) print("Analysis completed!") return True def run_complete_analysis(self, csv_path=None, label_column=None, spectrum_columns=None, test_size=None, scale_method=None, tune_hyperparams=None, tuning_method=None, save_models=None, plot_results=None, model_names=None): """ 运行完整分析流程 - 保持向后兼容性 Parameters: csv_path (str, optional): CSV文件路径,如果为None则使用配置中的值 label_column (str or int, optional): 标签列,如果为None则使用配置中的值 spectrum_columns (str or list or None, optional): 光谱列,如果为None则使用配置中的值 test_size (float, optional): 测试集比例,如果为None则使用配置中的值 scale_method (str, optional): 标准化方法,如果为None则使用配置中的值 tune_hyperparams (bool, optional): 是否调优超参数,如果为None则使用配置中的值 tuning_method (str, optional): 调优方法,如果为None则使用配置中的值 save_models (bool, optional): 是否保存模型,如果为None则使用配置中的值 plot_results (bool, optional): 是否绘制结果图,如果为None则使用配置中的值 model_names (list or None, optional): 要训练的模型名称列表,如果为None则使用配置中的值 """ # 更新配置对象(向后兼容) if csv_path is not None: self.config.data.csv_path = csv_path if label_column is not None: self.config.data.label_column = label_column if spectrum_columns is not None: self.config.data.spectrum_columns = spectrum_columns if test_size is not None: self.config.data.test_size = test_size if scale_method is not None: self.config.data.scale_method = scale_method if tune_hyperparams is not None: self.config.models.tune_hyperparams = tune_hyperparams if tuning_method is not None: self.config.models.tuning_method = tuning_method if save_models is not None: self.config.output.save_models = save_models if plot_results is not None: self.config.output.plot_results = plot_results if model_names is not None: self.config.models.model_names = model_names # 使用配置驱动的方法 return self.run_analysis_from_config() class RegressionVisualizer: """ 回归分析可视化器 - 提供丰富的可视化功能 支持预测值vs真实值散点图、残差图、性能指标对比等 """ def __init__(self, analyzer: Optional['RegressionAnalyzer'] = None): """ 初始化可视化器 Parameters: analyzer (RegressionAnalyzer, optional): 回归分析器实例 """ self.analyzer = analyzer self.colorblind_friendly_palette = [ '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5' ] plt.style.use('seaborn-v0_8') # 配置matplotlib支持中文显示 self._configure_chinese_font() def _configure_chinese_font(self): """配置matplotlib以支持中文显示""" import matplotlib as mpl # 尝试设置中文字体 chinese_fonts = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans', 'Arial Unicode MS', 'WenQuanYi Micro Hei', 'AR PL UMing CN', 'Liberation Serif'] font_set = False for font in chinese_fonts: try: # 测试字体是否可用 test_text = "测试中文" fig, ax = plt.subplots() ax.text(0.5, 0.5, test_text, fontname=font, fontsize=12) plt.close(fig) # 如果没有报错,设置字体 mpl.rcParams['font.sans-serif'] = [font] + mpl.rcParams['font.sans-serif'] mpl.rcParams['axes.unicode_minus'] = False # 解决负号显示问题 font_set = True print(f"Successfully set Chinese font to: {font}") break except: continue if not font_set: # 如果没有找到合适的中文字体,使用系统默认并启用unicode mpl.rcParams['font.sans-serif'] = ['DejaVu Sans', 'SimHei'] mpl.rcParams['axes.unicode_minus'] = False print("Warning: Could not find suitable Chinese font. Using default fonts.") def _ensure_chinese_text(self, text): """确保文本正确显示中文""" if isinstance(text, str): try: # 尝试编码和解码以确保UTF-8格式 return text.encode('utf-8').decode('utf-8') except: return text return text def set_colorblind_palette(self): """设置色盲友好配色方案""" import matplotlib as mpl mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=self.colorblind_friendly_palette) def plot_prediction_scatter(self, figsize=(16, 12), save_path=None, show_individual=True, show_overlay=True): """ 绘制预测值vs真实值散点图 Parameters: figsize (tuple): 图形尺寸 save_path (str, optional): 保存路径 show_individual (bool): 是否显示多子图 show_overlay (bool): 是否显示叠加图 """ if not self.analyzer or not self.analyzer.results: print("No analyzer results available for plotting") return self.set_colorblind_palette() n_models = len(self.analyzer.results) if show_individual and show_overlay: # 创建复合图:上方多子图,下方叠加图 fig = plt.figure(figsize=figsize) # 上方:多子图(每个模型一个子图) n_cols = min(4, n_models) n_rows = (n_models + n_cols - 1) // n_cols gs = fig.add_gridspec(n_rows + 1, n_cols, hspace=0.3, wspace=0.3) axes_scatter = [] for i in range(n_rows): for j in range(n_cols): if i * n_cols + j < n_models: axes_scatter.append(fig.add_subplot(gs[i, j])) # 下方:叠加图 ax_overlay = fig.add_subplot(gs[n_rows, :]) elif show_individual: # 只有多子图 n_cols = min(4, n_models) n_rows = (n_models + n_cols - 1) // n_cols fig, axes_scatter = plt.subplots(n_rows, n_cols, figsize=figsize) if n_models == 1: axes_scatter = [axes_scatter] else: axes_scatter = axes_scatter.flatten() ax_overlay = None elif show_overlay: # 只有叠加图 fig, ax_overlay = plt.subplots(1, 1, figsize=(10, 8)) axes_scatter = [] else: print("At least one of show_individual or show_overlay must be True") return # 绘制多子图 if show_individual: for idx, (model_name, result) in enumerate(self.analyzer.results.items()): if idx < len(axes_scatter): ax = axes_scatter[idx] y_true = self.analyzer.y_test y_pred = result['y_pred_test'] # 散点图 ax.scatter(y_true, y_pred, alpha=0.6, s=30, color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)]) # 45度参考线 min_val = min(y_true.min(), y_pred.min()) max_val = max(y_true.max(), y_pred.max()) ax.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7) # 添加R²和RMSE文本 r2 = result['metrics']['test_r2'] rmse = result['metrics']['test_rmse'] ax.text(0.05, 0.95, f'R² = {r2:.3f}\nRMSE = {rmse:.3f}', transform=ax.transAxes, fontsize=10, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)) ax.set_xlabel(self._ensure_chinese_text('True Values')) ax.set_ylabel(self._ensure_chinese_text('Predicted Values')) ax.set_title(self._ensure_chinese_text(f'{self.analyzer.models[model_name]["name"]}')) ax.grid(True, alpha=0.3) ax.axis('equal') # 绘制叠加图 if show_overlay: for idx, (model_name, result) in enumerate(self.analyzer.results.items()): y_true = self.analyzer.y_test y_pred = result['y_pred_test'] ax_overlay.scatter(y_true, y_pred, alpha=0.6, s=30, color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)], label=f'{self.analyzer.models[model_name]["name"]} (R²={result["metrics"]["test_r2"]:.3f})') # 45度参考线 min_val = min(self.analyzer.y_test.min(), min([r['y_pred_test'].min() for r in self.analyzer.results.values()])) max_val = max(self.analyzer.y_test.max(), max([r['y_pred_test'].max() for r in self.analyzer.results.values()])) ax_overlay.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7, label='Perfect Prediction') ax_overlay.set_xlabel(self._ensure_chinese_text('True Values')) ax_overlay.set_ylabel(self._ensure_chinese_text('Predicted Values')) ax_overlay.set_title(self._ensure_chinese_text('Predicted vs True Values - All Models Overlay')) ax_overlay.legend(bbox_to_anchor=(1.05, 1), loc='upper left') ax_overlay.grid(True, alpha=0.3) ax_overlay.axis('equal') plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') print(f"Prediction scatter plot saved to: {save_path}") # plt.show() def plot_residual_analysis(self, figsize=(16, 8), save_path=None, n_feature_plots=3): """ 绘制残差分析图 Parameters: figsize (tuple): 图形尺寸 save_path (str, optional): 保存路径 n_feature_plots (int): 显示多少个特征的残差vs特征图 """ if not self.analyzer or not self.analyzer.results: print("No analyzer results available for plotting") return self.set_colorblind_palette() # 选择表现最好的几个模型进行详细分析 sorted_models = sorted(self.analyzer.results.items(), key=lambda x: x[1]['metrics']['test_r2'], reverse=True) top_models = sorted_models[:min(3, len(sorted_models))] fig, axes = plt.subplots(2, 3, figsize=figsize) for idx, (model_name, result) in enumerate(top_models): y_true = self.analyzer.y_test y_pred = result['y_pred_test'] residuals = y_true - y_pred # 残差vs预测值 ax1 = axes[0, 0] if idx == 0 else axes[0, idx] ax1.scatter(y_pred, residuals, alpha=0.6, s=20, color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)]) ax1.axhline(y=0, color='k', linestyle='--', linewidth=2) ax1.set_xlabel('Predicted Values') ax1.set_ylabel('Residuals') ax1.set_title(f'Residuals vs Predicted\n{self.analyzer.models[model_name]["name"]}') ax1.grid(True, alpha=0.3) # Q-Q图 ax2 = axes[1, 0] if idx == 0 else axes[1, idx] stats.probplot(residuals, dist="norm", plot=ax2) ax2.set_title(f'Normal Q-Q Plot\n{self.analyzer.models[model_name]["name"]}') # 残差vs重要特征(如果有特征重要性) if idx < n_feature_plots - 2 and hasattr(result['model'], 'feature_importances_'): ax3 = axes[idx // 3 + 1, idx % 3 + 1] if idx > 0 else axes[0, 2] if idx < 2: # 只显示前两个模型的特征残差图 try: importances = result['model'].feature_importances_ top_features_idx = np.argsort(importances)[-2:] # 最重要的两个特征 for i, feat_idx in enumerate(top_features_idx): feat_name = self.analyzer.feature_names[feat_idx] if hasattr(self.analyzer, 'feature_names') else f'Feature {feat_idx}' ax3.scatter(self.analyzer.X_test[:, feat_idx], residuals, alpha=0.6, s=20, label=f'{feat_name}', color=self.colorblind_friendly_palette[(idx*2 + i) % len(self.colorblind_friendly_palette)]) ax3.axhline(y=0, color='k', linestyle='--', linewidth=2) ax3.set_xlabel('Feature Values') ax3.set_ylabel('Residuals') ax3.set_title(f'Residuals vs Top Features\n{self.analyzer.models[model_name]["name"]}') ax3.legend() ax3.grid(True, alpha=0.3) except: ax3.text(0.5, 0.5, 'Feature importance\nnot available', transform=ax3.transAxes, ha='center', va='center') ax3.set_title(f'Feature Analysis\n{self.analyzer.models[model_name]["name"]}') plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') print(f"Residual analysis plot saved to: {save_path}") # plt.show() def plot_metrics_comparison(self, figsize=(16, 10), save_path=None): """ 绘制性能指标对比图 Parameters: figsize (tuple): 图形尺寸 save_path (str, optional): 保存路径 """ if not self.analyzer or not self.analyzer.results: print("No analyzer results available for plotting") return self.set_colorblind_palette() # 准备数据 model_names = [] model_full_names = [] r2_scores = [] rmse_scores = [] mae_scores = [] training_times = [] memory_usage = [] # 模拟训练时间和内存使用(实际应用中需要测量) for model_name, result in self.analyzer.results.items(): model_names.append(model_name) model_full_names.append(self.analyzer.models[model_name]['name']) r2_scores.append(result['metrics']['test_r2']) rmse_scores.append(result['metrics']['test_rmse']) mae_scores.append(result['metrics']['test_mae']) training_times.append(np.random.uniform(0.1, 5.0)) # 模拟时间 memory_usage.append(np.random.uniform(50, 500)) # 模拟内存 # 创建子图 fig, axes = plt.subplots(2, 2, figsize=figsize) # 雷达图 - R², RMSE, MAE ax_radar = axes[0, 0] # 标准化指标到0-1范围 r2_norm = (r2_scores - np.min(r2_scores)) / (np.max(r2_scores) - np.min(r2_scores)) rmse_norm = 1 - (rmse_scores - np.min(rmse_scores)) / (np.max(rmse_scores) - np.min(rmse_scores)) # RMSE越小越好,反转 mae_norm = 1 - (mae_scores - np.min(mae_scores)) / (np.max(mae_scores) - np.min(mae_scores)) # MAE越小越好,反转 # 雷达图数据 categories = ['R² Score', 'RMSE (inv)', 'MAE (inv)'] n_models = len(model_names) # 计算角度 angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist() angles += angles[:1] # 闭合图形 for i in range(n_models): values = [r2_norm[i], rmse_norm[i], mae_norm[i]] values += values[:1] # 闭合图形 ax_radar.plot(angles, values, 'o-', linewidth=2, color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)], label=model_full_names[i]) ax_radar.fill(angles, values, alpha=0.25, color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)]) ax_radar.set_xticks(angles[:-1]) ax_radar.set_xticklabels(categories) ax_radar.set_title('Performance Metrics Radar Chart') ax_radar.legend(bbox_to_anchor=(1.1, 1), loc='upper left') ax_radar.grid(True, alpha=0.3) # 分组柱状图 - 不同指标的比较 ax_bar = axes[0, 1] x = np.arange(len(model_names)) width = 0.25 bars1 = ax_bar.bar(x - width, r2_scores, width, label='R²', alpha=0.8, color=self.colorblind_friendly_palette[0]) bars2 = ax_bar.bar(x, [1/s for s in rmse_scores], width, label='1/RMSE', alpha=0.8, color=self.colorblind_friendly_palette[1]) bars3 = ax_bar.bar(x + width, [1/s for s in mae_scores], width, label='1/MAE', alpha=0.8, color=self.colorblind_friendly_palette[2]) ax_bar.set_xlabel('Models') ax_bar.set_ylabel('Normalized Scores') ax_bar.set_title('Normalized Performance Comparison') ax_bar.set_xticks(x) ax_bar.set_xticklabels(model_full_names, rotation=45, ha='right') ax_bar.legend() ax_bar.grid(True, alpha=0.3, axis='y') # 堆叠柱状图 - 误差分解(偏差vs方差) ax_stack = axes[1, 0] bias_errors = [abs(np.mean(residuals)) for residuals in [self.analyzer.y_test - result['y_pred_test'] for result in self.analyzer.results.values()]] variance_errors = [np.var(residuals) for residuals in [self.analyzer.y_test - result['y_pred_test'] for result in self.analyzer.results.values()]] bars_bias = ax_stack.bar(model_names, bias_errors, label='Bias (Mean Abs Error)', alpha=0.8, color=self.colorblind_friendly_palette[0]) bars_var = ax_stack.bar(model_names, variance_errors, bottom=bias_errors, label='Variance (Residual Var)', alpha=0.8, color=self.colorblind_friendly_palette[1]) ax_stack.set_xlabel('Models') ax_stack.set_ylabel('Error Components') ax_stack.set_title('Bias-Variance Decomposition') ax_stack.set_xticklabels(model_full_names, rotation=45, ha='right') ax_stack.legend() ax_stack.grid(True, alpha=0.3, axis='y') # 气泡图 - 综合评估(R² vs 1/RMSE,气泡大小表示1/MAE) ax_bubble = axes[1, 1] bubble_sizes = [100 * (1/s) for s in mae_scores] # MAE越大气泡越小 scatter = ax_bubble.scatter(r2_scores, [1/s for s in rmse_scores], s=bubble_sizes, c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black') # 添加模型名称标签 for i, name in enumerate(model_full_names): ax_bubble.annotate(name, (r2_scores[i], 1/rmse_scores[i]), xytext=(5, 5), textcoords='offset points', fontsize=8) ax_bubble.set_xlabel('R² Score') ax_bubble.set_ylabel('1/RMSE') ax_bubble.set_title('Comprehensive Performance Assessment\n(Bubble size ∝ 1/MAE)') ax_bubble.grid(True, alpha=0.3) # 添加颜色条 cbar = plt.colorbar(scatter, ax=ax_bubble) cbar.set_label('Model Index') plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') print(f"Metrics comparison plot saved to: {save_path}") # plt.show() def plot_error_distribution(self, figsize=(16, 8), save_path=None): """ 绘制误差分布图 Parameters: figsize (tuple): 图形尺寸 save_path (str, optional): 保存路径 """ if not self.analyzer or not self.analyzer.results: print("No analyzer results available for plotting") return self.set_colorblind_palette() # 计算所有模型的误差 model_errors = {} for model_name, result in self.analyzer.results.items(): errors = self.analyzer.y_test - result['y_pred_test'] model_errors[model_name] = errors fig, axes = plt.subplots(2, 2, figsize=figsize) # 误差分布直方图 - 所有模型并排比较 ax_hist = axes[0, 0] bins = np.linspace(min([min(errors) for errors in model_errors.values()]), max([max(errors) for errors in model_errors.values()]), 30) for i, (model_name, errors) in enumerate(model_errors.items()): ax_hist.hist(errors, bins=bins, alpha=0.7, label=self.analyzer.models[model_name]['name'], color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)], density=True) ax_hist.set_xlabel('Prediction Error') ax_hist.set_ylabel('Density') ax_hist.set_title('Error Distribution Histogram') ax_hist.legend() ax_hist.grid(True, alpha=0.3) # 核密度估计曲线 ax_kde = axes[0, 1] for i, (model_name, errors) in enumerate(model_errors.items()): try: sns.kdeplot(data=errors, ax=ax_kde, label=self.analyzer.models[model_name]['name'], color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)], fill=True, alpha=0.3) except: # 如果seaborn不可用,使用matplotlib ax_kde.hist(errors, bins=30, alpha=0.3, density=True, label=self.analyzer.models[model_name]['name'], color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)]) ax_kde.set_xlabel('Prediction Error') ax_kde.set_ylabel('Density') ax_kde.set_title('Error Distribution KDE') ax_kde.legend() ax_kde.grid(True, alpha=0.3) # 累积分布函数 ax_cdf = axes[1, 0] error_range = np.linspace(min([min(errors) for errors in model_errors.values()]), max([max(errors) for errors in model_errors.values()]), 100) for i, (model_name, errors) in enumerate(model_errors.items()): sorted_errors = np.sort(errors) y_vals = np.arange(len(sorted_errors)) / float(len(sorted_errors)) ax_cdf.plot(sorted_errors, y_vals, label=self.analyzer.models[model_name]['name'], color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)], linewidth=2) ax_cdf.set_xlabel('Prediction Error') ax_cdf.set_ylabel('Cumulative Probability') ax_cdf.set_title('Cumulative Distribution Function') ax_cdf.legend() ax_cdf.grid(True, alpha=0.3) # 箱线图 ax_box = axes[1, 1] error_data = [errors for errors in model_errors.values()] model_labels = [self.analyzer.models[name]['name'] for name in model_errors.keys()] bp = ax_box.boxplot(error_data, labels=model_labels, patch_artist=True) for patch, color in zip(bp['boxes'], self.colorblind_friendly_palette): patch.set_facecolor(color) patch.set_alpha(0.7) # 添加均值点 for i, errors in enumerate(error_data): ax_box.plot(i+1, np.mean(errors), 'ro', markersize=8, label='Mean' if i == 0 else "") ax_box.set_xlabel('Models') ax_box.set_ylabel('Prediction Error') ax_box.set_title('Error Distribution Box Plot') ax_box.legend() ax_box.grid(True, alpha=0.3, axis='y') plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') print(f"Error distribution plot saved to: {save_path}") # plt.show() def plot_model_ranking_matrix(self, figsize=(14, 10), save_path=None): """ 绘制模型排名矩阵 Parameters: figsize (tuple): 图形尺寸 save_path (str, optional): 保存路径 """ if not self.analyzer or not self.analyzer.results: print("No analyzer results available for plotting") return self.set_colorblind_palette() # 准备指标数据 metrics_data = [] model_names = [] metric_names = ['R²', 'RMSE', 'MAE', 'Training_R²', 'Training_RMSE', 'Training_MAE'] for model_name, result in self.analyzer.results.items(): model_names.append(self.analyzer.models[model_name]['name']) metrics = result['metrics'] metrics_data.append([ metrics['test_r2'], metrics['test_rmse'], metrics['test_mae'], metrics['train_r2'], metrics['train_rmse'], metrics['train_mae'] ]) metrics_array = np.array(metrics_data) # 计算排名(对于R²,越高越好;对于RMSE/MAE,越低越好) rankings = np.zeros_like(metrics_array) rankings[:, 0] = len(model_names) - stats.rankdata(metrics_array[:, 0]) + 1 # R²排名(反转) rankings[:, 1] = stats.rankdata(metrics_array[:, 1]) # RMSE排名 rankings[:, 2] = stats.rankdata(metrics_array[:, 2]) # MAE排名 rankings[:, 3] = len(model_names) - stats.rankdata(metrics_array[:, 3]) + 1 # Training R²排名(反转) rankings[:, 4] = stats.rankdata(metrics_array[:, 4]) # Training RMSE排名 rankings[:, 5] = stats.rankdata(metrics_array[:, 5]) # Training MAE排名 fig, axes = plt.subplots(2, 2, figsize=figsize) # 热力图 - 模型vs指标的排名 ax_heatmap = axes[0, 0] im = ax_heatmap.imshow(rankings, cmap='RdYlGn_r', aspect='auto', alpha=0.8) # 设置标签 ax_heatmap.set_xticks(np.arange(len(metric_names))) ax_heatmap.set_yticks(np.arange(len(model_names))) ax_heatmap.set_xticklabels(metric_names, rotation=45, ha='right') ax_heatmap.set_yticklabels(model_names) # 添加数值标签 for i in range(len(model_names)): for j in range(len(metric_names)): text = ax_heatmap.text(j, i, f'{rankings[i, j]:.0f}', ha="center", va="center", color="black", fontsize=10) ax_heatmap.set_title('Model Ranking Matrix\n(Lower rank = Better performance)') plt.colorbar(im, ax=ax_heatmap, label='Rank') # 平行坐标图 ax_parallel = axes[0, 1] # 标准化数据到0-1范围 normalized_data = np.zeros_like(metrics_array) for j in range(metrics_array.shape[1]): if j in [0, 3]: # R²指标,越高越好 normalized_data[:, j] = (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min()) else: # RMSE/MAE指标,越低越好,反转标准化 normalized_data[:, j] = 1 - (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min()) for i in range(len(model_names)): ax_parallel.plot(range(len(metric_names)), normalized_data[i], marker='o', linewidth=2, markersize=6, color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)], label=model_names[i], alpha=0.8) ax_parallel.set_xticks(range(len(metric_names))) ax_parallel.set_xticklabels(metric_names, rotation=45, ha='right') ax_parallel.set_ylabel('Normalized Score (Higher = Better)') ax_parallel.set_title('Parallel Coordinates Plot') ax_parallel.legend(bbox_to_anchor=(1.05, 1), loc='upper left') ax_parallel.grid(True, alpha=0.3) # 气泡图 - R² vs RMSE,气泡大小表示MAE ax_bubble = axes[1, 0] r2_scores = metrics_array[:, 0] rmse_scores = metrics_array[:, 1] mae_scores = metrics_array[:, 2] # 气泡大小(MAE越小气泡越大) bubble_sizes = 1000 / (mae_scores + 0.01) # 避免除零 scatter = ax_bubble.scatter(r2_scores, rmse_scores, s=bubble_sizes, c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black') # 添加模型名称标签 for i, name in enumerate(model_names): ax_bubble.annotate(name, (r2_scores[i], rmse_scores[i]), xytext=(5, 5), textcoords='offset points', fontsize=8) ax_bubble.set_xlabel('R² Score') ax_bubble.set_ylabel('RMSE') ax_bubble.set_title('Performance Bubble Chart\n(Bubble size ∝ 1/MAE)') ax_bubble.grid(True, alpha=0.3) # 添加颜色条 cbar = plt.colorbar(scatter, ax=ax_bubble) cbar.set_label('Model Index') # 综合排名条形图 ax_ranking = axes[1, 1] avg_rankings = np.mean(rankings, axis=1) sorted_indices = np.argsort(avg_rankings) bars = ax_ranking.bar(range(len(model_names)), avg_rankings[sorted_indices], color=[self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)] for i in range(len(model_names))], alpha=0.7) ax_ranking.set_xlabel('Models (Sorted by Average Rank)') ax_ranking.set_ylabel('Average Rank') ax_ranking.set_title('Overall Model Ranking') ax_ranking.set_xticks(range(len(model_names))) ax_ranking.set_xticklabels([model_names[i] for i in sorted_indices], rotation=45, ha='right') ax_ranking.grid(True, alpha=0.3, axis='y') # 添加数值标签 for i, bar in enumerate(bars): height = bar.get_height() ax_ranking.text(bar.get_x() + bar.get_width()/2., height, '.2f', ha='center', va='bottom') plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') print(f"Model ranking matrix plot saved to: {save_path}") # plt.show() def generate_comprehensive_report(self, save_dir='plots', prefix='regression_analysis'): """ 生成综合可视化报告 Parameters: save_dir (str): 保存目录 prefix (str): 文件名前缀 """ if not self.analyzer: print("No analyzer available for report generation") return os.makedirs(save_dir, exist_ok=True) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') # 生成各种图表 plot_configs = [ ('prediction_scatter', self.plot_prediction_scatter), ('residual_analysis', self.plot_residual_analysis), ('metrics_comparison', self.plot_metrics_comparison), ('error_distribution', self.plot_error_distribution), ('model_ranking', self.plot_model_ranking_matrix) ] saved_files = [] for plot_name, plot_func in plot_configs: try: save_path = f'{save_dir}/{prefix}_{plot_name}_{timestamp}.png' plot_func(save_path=save_path) saved_files.append(save_path) plt.close('all') # 关闭所有图形以释放内存 except Exception as e: print(f"Failed to generate {plot_name} plot: {str(e)}") if saved_files: print("Comprehensive visualization report generated:") for file in saved_files: print(f" - {file}") else: print("No plots were successfully generated") return saved_files def main(): """主函数 - 展示配置驱动和向后兼容两种使用方式""" print("="*60) print("Regression Analysis Tool - Configuration-Driven Interface") print("="*60) # 方法1:配置驱动方式(推荐用于GUI对接) print("\n--- Method 1: Configuration-Driven (Recommended for GUI) ---") # 创建配置对象 csv_file_path = r"E:\code\content\change\6.csv" config = RegressionConfig.create_default( csv_path=csv_file_path, label_column="0" ) # 可选:自定义配置 config.data.spectrum_columns = "8:" # 光谱列范围 config.models.model_names ='all'# 选择部分模型进行演示 config.models.tune_hyperparams = False # 快速分析,不进行超参数调优 config.output.save_models = True # 不保存模型文件 config.output.plot_results = True # 启用可视化 config.output.plot_dir = 'E:\code\content\change\plot\yellow' # 可视化输出目录 # 创建分析器并传入配置 analyzer = RegressionAnalyzer(config) # 查看可用模型 analyzer.initialize_all_models() print("Available models:") for model_key, model_name in analyzer.get_available_models().items(): print(f" {model_key}: {model_name}") # 运行配置驱动的分析 success = analyzer.run_analysis_from_config() if success: print("Configuration-driven analysis completed successfully!") # 演示各种可视化功能 print("\n--- Visualization Demo ---") # 创建可视化目录 viz_dir = 'visualization_demo' os.makedirs(viz_dir, exist_ok=True) print("Generating various visualization plots...") # 1. 预测值vs真实值散点图 print("1. Prediction vs True Values Scatter Plot...") analyzer.plot_prediction_scatter( save_path=f'{viz_dir}/prediction_scatter.png', show_individual=True, show_overlay=True ) # 2. 残差分析图 print("2. Residual Analysis Plot...") analyzer.plot_residual_analysis( save_path=f'{viz_dir}/residual_analysis.png' ) # 3. 性能指标对比图 print("3. Performance Metrics Comparison...") analyzer.plot_metrics_comparison( save_path=f'{viz_dir}/metrics_comparison.png' ) # 4. 误差分布图 print("4. Error Distribution Analysis...") analyzer.plot_error_distribution( save_path=f'{viz_dir}/error_distribution.png' ) # 5. 模型排名矩阵 print("5. Model Ranking Matrix...") analyzer.plot_model_ranking( save_path=f'{viz_dir}/model_ranking.png' ) # 6. 生成完整可视化报告 print("6. Generating Comprehensive Visualization Report...") saved_plots = analyzer.generate_visualization_report( save_dir=viz_dir, prefix='demo_report' ) print(f"\nVisualization completed! Generated {len(saved_plots)} plot files in '{viz_dir}' directory:") for plot_file in saved_plots: print(f" - {plot_file}") print("\nAvailable visualization methods:") print(" - analyzer.plot_prediction_scatter() # 预测值vs真实值散点图") print(" - analyzer.plot_residual_analysis() # 残差分析图") print(" - analyzer.plot_metrics_comparison() # 性能指标对比图") print(" - analyzer.plot_error_distribution() # 误差分布图") print(" - analyzer.plot_model_ranking() # 模型排名矩阵") print(" - analyzer.generate_visualization_report() # 生成完整报告") else: print("Configuration-driven analysis failed!") # # 方法2:向后兼容方式(传统参数传递) # print("\n--- Method 2: Backward Compatible (Legacy Parameter Passing) ---") # # analyzer2 = RegressionAnalyzer() # 使用默认配置 # # # 使用传统的参数传递方式 # success2 = analyzer2.run_complete_analysis( # csv_path=r"E:\code\WQ\pipeline_result\work_dir\5_training_spectra\training_spectra.csv", # label_column="0", # spectrum_columns="13:", # test_size=0.2, # scale_method='standard', # tune_hyperparams=False, # save_models=False, # plot_results=True, # model_names=['xgboost', 'lightgbm'] # 只训练这两个模型 # ) # # if success2: # print("Backward-compatible analysis completed successfully!") # else: # print("Backward-compatible analysis failed!") # # print("\n" + "="*60) # print("Both methods are supported. Configuration-driven is recommended for GUI integration.") # print("="*60) if __name__ == "__main__": main()