"""
回归分析工具包
支持多种回归算法：线性回归、LASSO、岭回归、Boosting、神经网络等
包含超参数调优、模型评价和保存功能
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb
import lightgbm as lgb
from statsmodels.api import OLS, GLM
from statsmodels.genmod.families import Gaussian
import warnings
import joblib
import os
from datetime import datetime
import json
from scipy.linalg import pinv
from scipy import stats
from typing import Optional, List, Dict, Any, Union
from dataclasses import dataclass, field
import time

# Silence every warning process-wide. No category or module filter is given,
# so this also hides warnings raised by the imported libraries.
warnings.filterwarnings('ignore')


@dataclass
class DataConfig:
    """Settings describing the input data and how it is split and scaled."""
    # Path of the CSV file; RegressionConfig validation rejects an empty value.
    csv_path: str = ""
    # Label (target) column, given as a column name or an integer index.
    label_column: Union[str, int] = ""
    # Feature ("spectrum") column selection. Presumably forwarded to
    # RegressionAnalyzer.load_csv, where None selects every non-label column
    # -- TODO confirm against the caller.
    spectrum_columns: Optional[Union[str, List[Union[str, int]]]] = None
    # Fraction of samples held out for testing; validated to lie strictly in (0, 1).
    test_size: float = 0.2
    # Seed used as the default random_state of the train/test split.
    random_state: int = 42
    # Feature scaling method: 'standard' or 'minmax'.
    scale_method: str = 'standard'


@dataclass
class ModelConfig:
    """Settings selecting which models are run and how they are tuned."""
    # Models to run: 'all', a single model key, a list of keys, or None for the
    # default set filled in by RegressionConfig._process_model_names.
    model_names: Optional[Union[str, List[str]]] = None
    # Whether hyper-parameter tuning is requested (the consumer of this flag is
    # not visible here -- TODO confirm how it is used).
    tune_hyperparams: bool = True
    # Search strategy: 'grid' or 'random' (enforced by RegressionConfig validation).
    tuning_method: str = 'grid'
    # Number of cross-validation folds; validation requires at least 2.
    cv_folds: int = 5
    # Presumably the number of iterations of the random search -- TODO confirm.
    random_search_iter: int = 20


@dataclass
class TrainingConfig:
    """Training settings; LSTMRegressor/GRURegressor read them as fallbacks
    when this object is passed as their ``config`` argument."""
    # Number of training epochs; validated to be positive.
    epochs: int = 100
    # Mini-batch size; validated to be positive.
    batch_size: int = 32
    # Optimizer learning rate; validated to be positive.
    learning_rate: float = 0.001


@dataclass
class OutputConfig:
    """Output settings (the code consuming them is not visible in this part of the file)."""
    # Whether trained models should be saved (create_quick_analysis turns this off).
    save_models: bool = True
    # Whether result plots should be produced -- presumably by the visualizer; TODO confirm.
    plot_results: bool = True
    # Directory name for saved models -- presumably relative to the working directory.
    save_dir: str = 'models'
    # Directory name for generated plots -- presumably relative to the working directory.
    plot_dir: str = 'plots'


@dataclass
class RegressionConfig:
    """Complete regression-analysis configuration (standardised interface for GUI integration).

    Groups the data, model, training and output settings. The configuration
    is validated as soon as it is constructed (``__post_init__``), so building
    it without a CSV path or a label column raises ``ValueError``.
    """
    data: DataConfig = field(default_factory=DataConfig)
    models: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
    output: OutputConfig = field(default_factory=OutputConfig)

    def __post_init__(self):
        """Validate the parameters and fill in defaults right after construction."""
        self._validate_parameters()

    def _validate_parameters(self):
        """Check every sub-configuration and normalise ``models.model_names``.

        Raises:
            ValueError: If a required value is missing or a value lies outside
                its allowed range.
        """
        # --- data settings ---
        if not self.data.csv_path:
            raise ValueError("CSV file path must be specified")

        # An integer index is always a valid specification: column 0 is a
        # legitimate label column, so a plain truthiness test would wrongly
        # reject it. Only non-index values are checked for emptiness.
        label = self.data.label_column
        label_is_index = isinstance(label, (int, np.integer)) and not isinstance(label, bool)
        if not label_is_index and not label:
            raise ValueError("Label column must be specified")

        if not (0 < self.data.test_size < 1):
            raise ValueError("Test size must be between 0 and 1")
        if self.data.scale_method not in ['standard', 'minmax']:
            raise ValueError("Scale method must be 'standard' or 'minmax'")

        # --- model settings ---
        if self.models.tuning_method not in ['grid', 'random']:
            raise ValueError("Tuning method must be 'grid' or 'random'")
        if self.models.cv_folds < 2:
            raise ValueError("CV folds must be at least 2")

        # Normalise the model selection into a validated list of keys.
        self._process_model_names()

        # --- training settings ---
        if self.training.epochs <= 0:
            raise ValueError("Epochs must be positive")
        if self.training.batch_size <= 0:
            raise ValueError("Batch size must be positive")
        if self.training.learning_rate <= 0:
            raise ValueError("Learning rate must be positive")

    def _process_model_names(self):
        """Normalise ``models.model_names`` into a list of supported model keys.

        * ``None``           -> a default set of common models
        * ``'all'``          -> every supported model (case-insensitive)
        * a single key       -> one-element list
        * a list or tuple    -> validated element by element (tuples become lists)

        Values of any other type are left untouched.

        Raises:
            ValueError: If a requested model key is not supported.
        """
        requested = self.models.model_names
        if requested is None:
            # Default to a handful of commonly used models.
            self.models.model_names = ['linear', 'ridge', 'lasso', 'randomforest', 'svm']
            return

        supported = self._get_supported_models()
        if isinstance(requested, str):
            if requested.lower() == 'all':
                self.models.model_names = list(supported.keys())
                print(f"选择所有可用模型: {len(self.models.model_names)} 个")
                return
            requested = [requested]
        elif isinstance(requested, tuple):
            requested = list(requested)
        elif not isinstance(requested, list):
            return

        for model in requested:
            if model not in supported:
                raise ValueError(f"不支持的模型类型: {model}")
        self.models.model_names = requested

    def _get_supported_models(self) -> Dict[str, str]:
        """Return the supported model keys mapped to their display names.

        A fresh dict is returned on every call, so callers may modify it.
        No per-environment availability check is performed.
        """
        return {
            # Linear models
            'linear': '多元线性回归',
            'lasso': 'LASSO回归',
            'ridge': '岭回归',
            'elasticnet': '弹性网络回归',
            'bayesianridge': '贝叶斯岭回归',

            # Boosting models
            'lsboost': '最小二乘提升',
            'xgboost': 'XGBoost回归',
            'lightgbm': 'LightGBM回归',

            # Kernel methods
            'gaussian': '高斯过程回归',
            'gaussiansvm': '高斯SVM回归',
            'svm': '支持向量回归',

            # Neural networks
            'elm': '极限学习机',
            'mlp': '多层感知机',
            'lstm': 'LSTM网络',
            'gru': 'GRU网络',

            # Other models
            'gam': '广义加性模型',
            'decisiontree': '决策树回归',
            'randomforest': '随机森林回归',
            'extratrees': '极端随机树回归',
            'adaboost': 'AdaBoost回归'
        }

    @classmethod
    def create_default(cls, csv_path: str, label_column: Union[str, int]) -> 'RegressionConfig':
        """Build a configuration with default settings for the given file and label column.

        Raises:
            ValueError: If the resulting configuration fails validation.
        """
        # The sub-configs are populated before the dataclass constructor runs,
        # so the regular __post_init__ validation can be used as-is.
        return cls(data=DataConfig(csv_path=csv_path, label_column=label_column))

    @classmethod
    def create_quick_analysis(cls, csv_path: str, label_column: Union[str, int],
                            model_names: Optional[List[str]] = None) -> 'RegressionConfig':
        """Build a quick-analysis configuration: no hyper-parameter tuning, no model saving.

        Raises:
            ValueError: If the resulting configuration fails validation.
        """
        return cls(
            data=DataConfig(csv_path=csv_path, label_column=label_column),
            models=ModelConfig(model_names=model_names, tune_hyperparams=False),
            output=OutputConfig(save_models=False),
        )


class ExtremeLearningMachine(BaseEstimator, RegressorMixin):
    """
    Extreme Learning Machine (ELM) regressor.

    A single-hidden-layer feed-forward network whose input weights and biases
    are drawn at random and never trained; only the output weights are
    computed, in closed form, with a pseudo-inverse.

    Parameters:
    n_hidden (int): Number of hidden units.
    activation (str): 'sigmoid', 'tanh', 'relu' or 'linear'.
    random_state (int): Seed of the private random generator used to draw the
        hidden-layer weights.
    """

    def __init__(self, n_hidden=100, activation='sigmoid', random_state=42):
        self.n_hidden = n_hidden
        self.activation = activation
        self.random_state = random_state
        self.input_weights_ = None
        self.biases_ = None
        self.output_weights_ = None

    def _activation_function(self, X):
        """Apply the configured activation element-wise.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-X))
        elif self.activation == 'tanh':
            return np.tanh(X)
        elif self.activation == 'relu':
            return np.maximum(0, X)
        elif self.activation == 'linear':
            return X
        else:
            raise ValueError(f"Unsupported activation function: {self.activation}")

    def _hidden_layer(self, X):
        """Return the hidden-layer activations for X with a trailing column of ones (output bias)."""
        activations = self._activation_function(X @ self.input_weights_ + self.biases_)
        return np.column_stack([activations, np.ones(X.shape[0])])

    def fit(self, X, y):
        """Fit the ELM: draw random hidden weights, then solve for the output weights.

        Parameters:
        X (array-like): Training features, two-dimensional (samples x features).
        y (array-like): Training targets.

        Returns:
        self
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # A private RandomState yields the same draws as seeding the global
        # NumPy generator would, without clobbering the global random state.
        rng = np.random.RandomState(self.random_state)
        n_features = X.shape[1]

        # Random (untrained) input weights and biases.
        self.input_weights_ = rng.randn(n_features, self.n_hidden)
        self.biases_ = rng.randn(self.n_hidden)

        # Closed-form least-squares solution for the output weights.
        self.output_weights_ = pinv(self._hidden_layer(X)) @ y

        return self

    def predict(self, X):
        """Predict targets for X.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.input_weights_ is None:
            raise ValueError("模型还未训练")

        X = np.asarray(X, dtype=float)
        return self._hidden_layer(X) @ self.output_weights_

    def _more_tags(self):
        """Return the extra scikit-learn estimator tags declared by this class."""
        return {'no_validation': True}


class GeneralizedAdditiveModel(BaseEstimator, RegressorMixin):
    """
    Generalized Additive Model (GAM) regressor.

    Each feature is expanded into a simplified truncated-power spline basis
    and the concatenated basis is fitted with ridge regression.

    Parameters:
    n_splines (int): Number of interior knots per feature.
    degree (int): Polynomial degree of the basis functions.
    lambda_ (float): Ridge regularisation strength.
    """

    def __init__(self, n_splines=10, degree=3, lambda_=0.1):
        self.n_splines = n_splines
        self.degree = degree
        self.lambda_ = lambda_
        self.coefficients_ = None
        self.knots_ = None
        self.feature_ranges_ = None

    def _create_spline_basis(self, X):
        """Build the basis matrix for X.

        After ``fit`` the per-feature ranges and knots learned from the
        training data are reused, so the same input row always maps to the
        same basis row regardless of which other rows accompany it. Before
        ``fit`` (nothing stored) they are derived from X itself.

        Returns:
        ndarray of shape (n_samples, n_features * (n_splines + degree + 1)).

        Raises:
            ValueError: If X has a different number of features than the
                stored knots.
        """
        X = np.asarray(X, dtype=float)
        n_samples, n_features = X.shape
        n_basis = self.n_splines + self.degree + 1

        stored_knots = self.knots_
        # getattr keeps instances created before this attribute existed usable.
        stored_ranges = getattr(self, 'feature_ranges_', None)
        if stored_knots is not None and len(stored_knots) != n_features:
            raise ValueError(
                f"X has {n_features} features, but the model was fitted with {len(stored_knots)}"
            )

        basis_matrices = []
        for feature_idx in range(n_features):
            x = X[:, feature_idx]

            if stored_ranges is not None:
                x_min, x_max = stored_ranges[feature_idx]
            else:
                x_min, x_max = np.min(x), np.max(x)

            if stored_knots is not None:
                knots = stored_knots[feature_idx]
            else:
                # Interior knots only: both end points of the range are dropped.
                knots = np.linspace(x_min, x_max, self.n_splines + 2)[1:-1]

            basis = np.zeros((n_samples, n_basis))

            # Simplified truncated-power basis.
            for i in range(n_basis):
                if i < self.degree + 1:
                    # Polynomial terms anchored at the left end of the range.
                    basis[:, i] = np.power(np.maximum(0, x - x_min), i)
                elif i > n_basis - self.degree - 2:
                    # Polynomial terms anchored at the right end of the range.
                    power = n_basis - 1 - i
                    basis[:, i] = np.power(np.maximum(0, x_max - x), power)
                else:
                    # Truncated power functions at the interior knots.
                    basis[:, i] = np.power(np.maximum(0, x - knots[i - self.degree - 1]), self.degree)

            basis_matrices.append(basis)

        # Concatenate the per-feature bases column-wise.
        return np.concatenate(basis_matrices, axis=1)

    def fit(self, X, y):
        """Fit the GAM.

        The range and knot positions of every feature are computed from the
        training data and stored, so that ``predict`` evaluates exactly the
        basis the coefficients were estimated for.

        Returns:
        self
        """
        from sklearn.linear_model import Ridge

        X = np.asarray(X, dtype=float)
        n_samples, n_features = X.shape

        # Freeze the basis definition on the training data.
        self.feature_ranges_ = [
            (np.min(X[:, j]), np.max(X[:, j])) for j in range(n_features)
        ]
        self.knots_ = [
            np.linspace(low, high, self.n_splines + 2)[1:-1]
            for low, high in self.feature_ranges_
        ]

        X_basis = self._create_spline_basis(X)

        # Ridge regression provides the regularised coefficient estimate.
        ridge = Ridge(alpha=self.lambda_, fit_intercept=True)
        ridge.fit(X_basis, y)

        self.coefficients_ = ridge.coef_
        self.intercept_ = ridge.intercept_

        return self

    def predict(self, X):
        """Predict targets for X.

        Raises:
            ValueError: If the model has not been fitted yet, or X has a
                different number of features than the training data.
        """
        if self.coefficients_ is None:
            raise ValueError("模型还未训练")

        X_basis = self._create_spline_basis(X)
        return X_basis @ self.coefficients_ + self.intercept_


class LSTMRegressor(BaseEstimator, RegressorMixin):
    """
    LSTM regressor that treats every spectrum as a sequence.

    Each feature (band) of a sample is fed to the network as one time step
    carrying a single value. Implemented with PyTorch; when PyTorch cannot be
    imported an sklearn ``MLPRegressor`` is trained instead.
    """

    def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None,
                 batch_size=None, learning_rate=None, random_state=42, device=None,
                 config: Optional['TrainingConfig'] = None):
        """
        Create the regressor.

        Parameters:
        units (int, optional): Hidden size of the LSTM. Defaults to 64 when
            ``config`` is given, otherwise to 50.
        dropout (float): Dropout probability applied to the last LSTM output
            before the linear output layer.
        recurrent_dropout (float): Accepted for API compatibility; it is not
            applied by the single-layer network (see ``_create_model``).
        epochs (int, optional): Training epochs. Falls back to
            ``config.epochs``, or to 100 without ``config``.
        batch_size (int, optional): Mini-batch size. Falls back to
            ``config.batch_size``, or to 32 without ``config``.
        learning_rate (float, optional): Adam learning rate. Falls back to
            ``config.learning_rate``, or to 0.001 without ``config``.
        random_state (int): Seed for PyTorch and for the epoch shuffling.
        device (str, optional): Compute device. Stored unchanged; it is
            resolved when ``fit`` runs (CUDA if available, else CPU, when None).
        config (TrainingConfig, optional): Source of fallback training values.
        """
        if config is not None:
            # Explicit arguments win; the config only supplies fallbacks.
            self.units = units if units is not None else 64
            self.epochs = epochs if epochs is not None else config.epochs
            self.batch_size = batch_size if batch_size is not None else config.batch_size
            self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate
        else:
            self.units = units if units is not None else 50
            self.epochs = epochs if epochs is not None else 100
            self.batch_size = batch_size if batch_size is not None else 32
            self.learning_rate = learning_rate if learning_rate is not None else 0.001

        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.random_state = random_state
        # `device` and `config` are kept exactly as passed: BaseEstimator's
        # get_params()/clone() read every constructor argument back from the
        # attribute of the same name and require it to be unmodified.
        self.device = device
        self.config = config
        self.model_ = None
        self.input_size_ = None
        self.device_ = None

        # PyTorch is optional; remember whether it could be imported.
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
        except ImportError:
            self.pytorch_available = False
            print("Warning: PyTorch not installed, LSTM model will use MLPRegressor approximation")
        else:
            self.torch = torch
            self.nn = nn
            self.optim = optim
            self.pytorch_available = True

    def _create_model(self, input_size):
        """Build the PyTorch network: one LSTM layer, dropout, linear output."""
        # Bind the module locally: inside LSTMModel, `self` is the nn.Module
        # and has no `nn` attribute, so `self.nn` cannot be used there.
        nn = self.nn

        class LSTMModel(nn.Module):
            def __init__(self, input_size, hidden_size, dropout):
                super().__init__()
                self.hidden_size = hidden_size
                # nn.LSTM's own `dropout` argument only acts between stacked
                # layers, so it has no effect on this single layer and is
                # left at its default of 0.
                self.lstm = nn.LSTM(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=False
                )
                self.dropout_layer = nn.Dropout(dropout)
                self.fc = nn.Linear(hidden_size, 1)

            def forward(self, x):
                sequence_out, _ = self.lstm(x)
                # Only the output of the last time step feeds the regression head.
                last_step = sequence_out[:, -1, :]
                return self.fc(self.dropout_layer(last_step))

        return LSTMModel(input_size, self.units, self.dropout)

    def fit(self, X, y):
        """Train the model.

        Parameters:
        X (array-like): Features, two-dimensional (samples x features).
        y (array-like): Targets, one value per sample.

        Returns:
        self
        """
        if not self.pytorch_available:
            # Without PyTorch an MLP stands in for the recurrent network.
            from sklearn.neural_network import MLPRegressor
            self.model_ = MLPRegressor(
                hidden_layer_sizes=(self.units, self.units//2),
                activation='relu',
                solver='adam',
                max_iter=self.epochs,
                random_state=self.random_state,
                early_stopping=True
            )
            self.model_.fit(X, y)
            return self

        torch = self.torch

        # Seed PyTorch; the shuffling uses a private NumPy generator so the
        # global NumPy random state is left untouched.
        torch.manual_seed(self.random_state)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(self.random_state)
            torch.cuda.manual_seed_all(self.random_state)
        shuffle_rng = np.random.RandomState(self.random_state)

        if self.device is None:
            self.device_ = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device_ = torch.device(self.device)

        # Reshape to (samples, timesteps, features): every band is one time
        # step holding a single value.
        X = np.asarray(X, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32)
        n_samples, n_features = X.shape
        self.input_size_ = 1

        X_tensor = torch.as_tensor(
            X.reshape(n_samples, n_features, 1), dtype=torch.float32
        ).to(self.device_)
        y_tensor = torch.as_tensor(y.reshape(-1, 1), dtype=torch.float32).to(self.device_)

        self.model_ = self._create_model(self.input_size_).to(self.device_)

        criterion = self.nn.MSELoss()
        optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate)

        self.model_.train()
        for epoch in range(self.epochs):
            # Fresh sample order every epoch.
            order = shuffle_rng.permutation(n_samples)
            X_shuffled = X_tensor[order]
            y_shuffled = y_tensor[order]

            for start in range(0, n_samples, self.batch_size):
                batch_X = X_shuffled[start:start + self.batch_size]
                batch_y = y_shuffled[start:start + self.batch_size]

                optimizer.zero_grad()
                loss = criterion(self.model_(batch_X), batch_y)
                loss.backward()
                optimizer.step()

        return self

    def predict(self, X):
        """Predict targets for X and return them as a flat NumPy array.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.model_ is None:
            raise ValueError("模型还未训练")

        if not self.pytorch_available:
            return self.model_.predict(X)

        torch = self.torch
        # Evaluation mode disables dropout.
        self.model_.eval()

        X = np.asarray(X, dtype=np.float32)
        n_samples, n_features = X.shape
        # Run on whatever device the trained parameters live on.
        model_device = next(self.model_.parameters()).device
        X_tensor = torch.as_tensor(
            X.reshape(n_samples, n_features, 1), dtype=torch.float32
        ).to(model_device)

        with torch.no_grad():
            predictions = self.model_(X_tensor)

        return predictions.cpu().numpy().flatten()


class GRURegressor(BaseEstimator, RegressorMixin):
    """
    GRU regressor that treats every spectrum as a sequence.

    Each feature (band) of a sample is fed to the network as one time step
    carrying a single value. Implemented with PyTorch; when PyTorch cannot be
    imported an sklearn ``MLPRegressor`` is trained instead.
    """

    def __init__(self, units=None, dropout=0.2, recurrent_dropout=0.2, epochs=None,
                 batch_size=None, learning_rate=None, random_state=42, device=None,
                 config: Optional['TrainingConfig'] = None):
        """
        Create the regressor.

        Parameters:
        units (int, optional): Hidden size of the GRU. Defaults to 64 when
            ``config`` is given, otherwise to 50.
        dropout (float): Dropout probability applied to the last GRU output
            before the linear output layer.
        recurrent_dropout (float): Accepted for API compatibility; it is not
            applied by the single-layer network (see ``_create_model``).
        epochs (int, optional): Training epochs. Falls back to
            ``config.epochs``, or to 100 without ``config``.
        batch_size (int, optional): Mini-batch size. Falls back to
            ``config.batch_size``, or to 32 without ``config``.
        learning_rate (float, optional): Adam learning rate. Falls back to
            ``config.learning_rate``, or to 0.001 without ``config``.
        random_state (int): Seed for PyTorch and for the epoch shuffling.
        device (str, optional): Compute device. Stored unchanged; it is
            resolved when ``fit`` runs (CUDA if available, else CPU, when None).
        config (TrainingConfig, optional): Source of fallback training values.
        """
        if config is not None:
            # Explicit arguments win; the config only supplies fallbacks.
            self.units = units if units is not None else 64
            self.epochs = epochs if epochs is not None else config.epochs
            self.batch_size = batch_size if batch_size is not None else config.batch_size
            self.learning_rate = learning_rate if learning_rate is not None else config.learning_rate
        else:
            self.units = units if units is not None else 50
            self.epochs = epochs if epochs is not None else 100
            self.batch_size = batch_size if batch_size is not None else 32
            self.learning_rate = learning_rate if learning_rate is not None else 0.001

        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.random_state = random_state
        # `device` and `config` are kept exactly as passed: BaseEstimator's
        # get_params()/clone() read every constructor argument back from the
        # attribute of the same name and require it to be unmodified.
        self.device = device
        self.config = config
        self.model_ = None
        self.input_size_ = None
        self.device_ = None

        # PyTorch is optional; remember whether it could be imported.
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
        except ImportError:
            self.pytorch_available = False
            print("Warning: PyTorch not installed, GRU model will use MLPRegressor approximation")
        else:
            self.torch = torch
            self.nn = nn
            self.optim = optim
            self.pytorch_available = True

    def _create_model(self, input_size):
        """Build the PyTorch network: one GRU layer, dropout, linear output."""
        # Bind the module locally: inside GRUModel, `self` is the nn.Module
        # and has no `nn` attribute, so `self.nn` cannot be used there.
        nn = self.nn

        class GRUModel(nn.Module):
            def __init__(self, input_size, hidden_size, dropout):
                super().__init__()
                self.hidden_size = hidden_size
                # nn.GRU's own `dropout` argument only acts between stacked
                # layers, so it has no effect on this single layer and is
                # left at its default of 0.
                self.gru = nn.GRU(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=False
                )
                self.dropout_layer = nn.Dropout(dropout)
                self.fc = nn.Linear(hidden_size, 1)

            def forward(self, x):
                sequence_out, _ = self.gru(x)
                # Only the output of the last time step feeds the regression head.
                last_step = sequence_out[:, -1, :]
                return self.fc(self.dropout_layer(last_step))

        return GRUModel(input_size, self.units, self.dropout)

    def fit(self, X, y):
        """Train the model.

        Parameters:
        X (array-like): Features, two-dimensional (samples x features).
        y (array-like): Targets, one value per sample.

        Returns:
        self
        """
        if not self.pytorch_available:
            # Without PyTorch an MLP stands in for the recurrent network.
            from sklearn.neural_network import MLPRegressor
            self.model_ = MLPRegressor(
                hidden_layer_sizes=(self.units, self.units//2),
                activation='relu',
                solver='adam',
                max_iter=self.epochs,
                random_state=self.random_state,
                early_stopping=True
            )
            self.model_.fit(X, y)
            return self

        torch = self.torch

        # Seed PyTorch; the shuffling uses a private NumPy generator so the
        # global NumPy random state is left untouched.
        torch.manual_seed(self.random_state)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(self.random_state)
            torch.cuda.manual_seed_all(self.random_state)
        shuffle_rng = np.random.RandomState(self.random_state)

        if self.device is None:
            self.device_ = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device_ = torch.device(self.device)

        # Reshape to (samples, timesteps, features): every band is one time
        # step holding a single value.
        X = np.asarray(X, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32)
        n_samples, n_features = X.shape
        self.input_size_ = 1

        X_tensor = torch.as_tensor(
            X.reshape(n_samples, n_features, 1), dtype=torch.float32
        ).to(self.device_)
        y_tensor = torch.as_tensor(y.reshape(-1, 1), dtype=torch.float32).to(self.device_)

        self.model_ = self._create_model(self.input_size_).to(self.device_)

        criterion = self.nn.MSELoss()
        optimizer = self.optim.Adam(self.model_.parameters(), lr=self.learning_rate)

        self.model_.train()
        for epoch in range(self.epochs):
            # Fresh sample order every epoch.
            order = shuffle_rng.permutation(n_samples)
            X_shuffled = X_tensor[order]
            y_shuffled = y_tensor[order]

            for start in range(0, n_samples, self.batch_size):
                batch_X = X_shuffled[start:start + self.batch_size]
                batch_y = y_shuffled[start:start + self.batch_size]

                optimizer.zero_grad()
                loss = criterion(self.model_(batch_X), batch_y)
                loss.backward()
                optimizer.step()

        return self

    def predict(self, X):
        """Predict targets for X and return them as a flat NumPy array.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.model_ is None:
            raise ValueError("模型还未训练")

        if not self.pytorch_available:
            return self.model_.predict(X)

        torch = self.torch
        # Evaluation mode disables dropout.
        self.model_.eval()

        X = np.asarray(X, dtype=np.float32)
        n_samples, n_features = X.shape
        # Run on whatever device the trained parameters live on.
        model_device = next(self.model_.parameters()).device
        X_tensor = torch.as_tensor(
            X.reshape(n_samples, n_features, 1), dtype=torch.float32
        ).to(model_device)

        with torch.no_grad():
            predictions = self.model_(X_tensor)

        return predictions.cpu().numpy().flatten()


class RegressionAnalyzer:
    """
    回归分析器类 - 支持GUI对接的标准化接口
    支持多种回归算法和完整的分析流程
    """

    def __init__(self, config: Optional[RegressionConfig] = None):
        """
        Set up an analyzer with empty state.

        Parameters:
        config (RegressionConfig, optional): Configuration to use. When it is
            None (or falsy) a ``RegressionConfig()`` is constructed instead;
            note that a default-constructed RegressionConfig has no CSV path
            and therefore fails its own validation with ValueError.

        Raises:
        ValueError: If the configuration does not pass validation.
        """
        self.config = config or RegressionConfig()
        # Validate up front so an invalid configuration never reaches the pipeline.
        self._validate_config()

        # Per-run registries, all starting empty.
        self.models, self.scalers = {}, {}
        self.best_params, self.results = {}, {}

        # Raw data and its train/test partitions are filled in by later steps.
        self.data = self.X = self.y = None
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None

        # The visualizer is handed the fully initialised analyzer.
        self.visualizer = RegressionVisualizer(self)

    def update_config(self, config: RegressionConfig):
        """
        更新配置 - 为GUI动态配置预留接口

        Parameters:
        config (RegressionConfig): 新的配置对象
        """
        self.config = config
        self._validate_config()

    def _validate_config(self):
        """配置校验"""
        try:
            self.config._validate_parameters()
        except ValueError as e:
            raise ValueError(f"Configuration validation failed: {e}")

    def _parse_column_range(self, column_range, total_columns):
        """
        解析列范围字符串，返回列索引列表

        Parameters:
        column_range (str or int or list): 列范围，如 "0:5", "2,4,6-8", [0,1,2] 或单个索引
        total_columns (int): 总列数

        Returns:
        list: 列索引列表
        """
        if isinstance(column_range, (int, np.integer)):
            # 单个列索引
            if column_range >= total_columns or column_range < 0:
                raise ValueError(f"Column index {column_range} out of range [0, {total_columns-1}]")
            return [column_range]

        elif isinstance(column_range, str):
            # 解析范围字符串
            columns = []
            # 分割多个范围（用逗号分隔）
            for part in column_range.split(','):
                part = part.strip()
                if ':' in part:
                    # 范围选择，如 "0:5"
                    start, end = part.split(':')
                    start = int(start.strip()) if start.strip() else 0
                    end = int(end.strip()) if end.strip() else total_columns
                    if start < 0:
                        start = total_columns + start
                    if end < 0:
                        end = total_columns + end
                    if start >= total_columns or end > total_columns:
                        raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
                    columns.extend(range(start, end))
                else:
                    # 单个索引
                    idx = int(part.strip())
                    if idx < 0:
                        idx = total_columns + idx
                    if idx >= total_columns or idx < 0:
                        raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
                    columns.append(idx)
            return list(set(columns))  # 去重

        elif isinstance(column_range, (list, tuple)):
            # 直接的列索引列表
            columns = []
            for idx in column_range:
                if isinstance(idx, str):
                    if ':' in idx:
                        # 处理列表中的范围字符串
                        start, end = idx.split(':')
                        start = int(start.strip()) if start.strip() else 0
                        end = int(end.strip()) if end.strip() else total_columns
                        if start < 0:
                            start = total_columns + start
                        if end < 0:
                            end = total_columns + end
                        if start >= total_columns or end > total_columns:
                            raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
                        columns.extend(range(start, end))
                    else:
                        idx_int = int(idx.strip())
                        if idx_int < 0:
                            idx_int = total_columns + idx_int
                        if idx_int >= total_columns or idx_int < 0:
                            raise ValueError(f"Column index {idx_int} out of range [0, {total_columns-1}]")
                        columns.append(idx_int)
                else:
                    if idx < 0:
                        idx = total_columns + idx
                    if idx >= total_columns or idx < 0:
                        raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
                    columns.append(idx)
            return list(set(columns))  # 去重

        else:
            raise ValueError(f"Unsupported column range format: {type(column_range)}")

    def load_csv(self, file_path, label_column, spectrum_columns=None, delimiter=',', header=0):
        """
        加载CSV文件并指定标签列和光谱列

        Parameters:
        file_path (str): CSV文件路径
        label_column (str or int or range-like): 标签列，支持范围选择，如 "0:5", "2,4,6-8" 或单个索引
        spectrum_columns (str or list or None): 光谱列，支持范围选择，如 "1:10", "2,4,6-8" 或列索引列表，如果为None则使用除标签列外的所有列
        delimiter (str): 分隔符，默认为','
        header (int): 表头行号，默认为0
        """
        try:
            # 读取CSV文件
            self.data = pd.read_csv(file_path, delimiter=delimiter, header=header)
            total_columns = len(self.data.columns)

            # 处理标签列：先检查是否是列名，然后再检查是否是索引
            if isinstance(label_column, str) and label_column in self.data.columns:
                # 如果是有效的列名
                label_idx = self.data.columns.get_loc(label_column)
                self.y = self.data[label_column].values
            else:
                # 尝试作为列索引处理
                try:
                    if isinstance(label_column, str):
                        # 可能是数字字符串，转换为整数
                        label_column = int(label_column)
                    label_idx = label_column
                    if label_idx < 0:
                        label_idx = total_columns + label_idx
                    if label_idx < 0 or label_idx >= total_columns:
                        raise ValueError(f"Column index {label_column} out of range [0, {total_columns-1}]")
                    self.y = self.data.iloc[:, label_idx].values
                except (ValueError, TypeError):
                    raise ValueError(f"Invalid label column specification: {label_column}. Must be a valid column name or index.")

            # 确定光谱列
            if spectrum_columns is None:
                # 使用除标签列外的所有列作为光谱列
                spectrum_indices = [i for i in range(total_columns) if i != label_idx]
            else:
                # 解析光谱列范围
                spectrum_indices = self._parse_column_range(spectrum_columns, total_columns)
                # 排除标签列（如果在光谱列范围内）
                spectrum_indices = [i for i in spectrum_indices if i != label_idx]

            if not spectrum_indices:
                raise ValueError("No valid spectrum columns found")

            # 提取光谱数据
            self.X = self.data.iloc[:, spectrum_indices].values

            # 跳过缺失标签的行
            valid_mask = ~pd.isna(self.y)
            original_samples = len(self.y)
            self.X = self.X[valid_mask]
            self.y = self.y[valid_mask]

            self.feature_names = [self.data.columns[i] for i in spectrum_indices]

            skipped_samples = original_samples - len(self.y)
            print(f"Successfully loaded data: {self.X.shape[0]} samples, {self.X.shape[1]} features")
            print(f"Label column: {label_idx} ({self.data.columns[label_idx]})")
            print(f"Spectrum column range: {min(spectrum_indices)}-{max(spectrum_indices)}")
            if skipped_samples > 0:
                print(f"Rows skipped due to missing labels: {skipped_samples}")
            print(f"Label range: {self.y.min():.4f} - {self.y.max():.4f}")
            print(f"Data type check: X type {self.X.dtype}, y type {self.y.dtype}")

            # Check and process data types
            if self.X.dtype != np.float64:
                self.X = self.X.astype(np.float64)
            if self.y.dtype != np.float64:
                self.y = self.y.astype(np.float64)

            return True

        except Exception as e:
            print(f"Failed to load data: {str(e)}")
            return False

    def preprocess_data(self, test_size=None, random_state=None, scale_method=None):
        """
        Split the data into training and test sets, then scale the features.

        Any argument left as None is taken from ``self.config.data``. The scaler
        is fitted on the training features only and then applied to the test
        features; the fitted scaler is stored in ``self.scalers['X']``.

        Parameters:
        test_size (float, optional): test-set proportion passed to train_test_split
        random_state (int, optional): random seed passed to train_test_split
        scale_method (str, optional): 'standard' (StandardScaler) or 'minmax' (MinMaxScaler)

        Returns:
        bool: True on success, False if any step raised (the error is printed)
        """
        # Fall back to the configured defaults for anything not supplied
        if test_size is None:
            test_size = self.config.data.test_size
        if random_state is None:
            random_state = self.config.data.random_state
        if scale_method is None:
            scale_method = self.config.data.scale_method

        try:
            # Hold out the test split
            split = train_test_split(
                self.X, self.y, test_size=test_size, random_state=random_state
            )
            self.X_train, self.X_test, self.y_train, self.y_test = split

            # Pick the scaler; an unsupported name is reported via the except branch
            if scale_method == 'standard':
                feature_scaler = StandardScaler()
            elif scale_method == 'minmax':
                feature_scaler = MinMaxScaler()
            else:
                raise ValueError("scale_method must be 'standard' or 'minmax'")
            self.scalers['X'] = feature_scaler

            # Fit on the training features only, then reuse the fit for the test features
            self.X_train_scaled = feature_scaler.fit_transform(self.X_train)
            self.X_test_scaled = feature_scaler.transform(self.X_test)

            print("Data preprocessing completed:")
            print(f"Training set: {self.X_train.shape[0]} samples")
            print(f"Test set: {self.X_test.shape[0]} samples")

            return True

        except Exception as e:
            print(f"Data preprocessing failed: {str(e)}")
            return False

    def add_linear_models(self):
        """Register the linear-family regressors in ``self.models``.

        Adds, in this order: 'linear', 'lasso', 'ridge', 'elasticnet' and
        'bayesianridge'. Each entry holds the estimator under 'model' and a
        display name under 'name'; entries that can be tuned also carry a
        'params' search grid.
        """
        # Regularisation strengths shared by LASSO and ridge (each gets its own copy)
        alpha_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

        entries = (
            ('linear', {
                'model': LinearRegression(),
                'name': '多元线性回归',
            }),
            ('lasso', {
                'model': Lasso(random_state=42),
                'name': 'LASSO回归',
                'params': {'alpha': list(alpha_grid)},
            }),
            ('ridge', {
                'model': Ridge(random_state=42),
                'name': '岭回归',
                'params': {'alpha': list(alpha_grid)},
            }),
            ('elasticnet', {
                'model': ElasticNet(random_state=42),
                'name': '弹性网络回归',
                'params': {
                    # Same strengths as above without the largest value
                    'alpha': alpha_grid[:-1],
                    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                },
            }),
            ('bayesianridge', {
                'model': BayesianRidge(),
                'name': '贝叶斯岭回归',
            }),
        )

        for key, entry in entries:
            self.models[key] = entry

    def add_boosting_models(self):
        """Register the gradient-boosting regressors in ``self.models``.

        Adds 'lsboost' (scikit-learn GradientBoostingRegressor), 'xgboost' and
        'lightgbm'. Each entry holds the estimator under 'model', a display
        name under 'name' and a hyperparameter search grid under 'params'.
        """
        self.models['lsboost'] = {
            'model': GradientBoostingRegressor(random_state=42),
            'name': 'LSBoost回归',
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 0.9, 1.0]
            }
        }

        self.models['xgboost'] = {
            'model': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
            'name': 'XGBoost回归',
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0]
            }
        }

        self.models['lightgbm'] = {
            # LightGBM only applies row subsampling when subsample_freq > 0.
            # With the default of 0 every 'subsample' value in the grid below
            # would train the same model, tripling search cost for nothing.
            'model': lgb.LGBMRegressor(random_state=42, subsample_freq=1),
            'name': 'LightGBM回归',
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0]
            }
        }

    def add_kernel_models(self):
        """Register the kernel-based regressors in ``self.models``.

        Adds 'gaussian' (Gaussian process with a constant * RBF kernel),
        'gaussiansvm' (SVR fixed to the RBF kernel) and 'svm' (SVR whose kernel
        is itself part of the search grid).
        """
        # Gaussian process prior: constant amplitude times an RBF kernel,
        # each with its own hyperparameter bounds
        amplitude = C(1.0, (1e-3, 1e3))
        radial = RBF(1.0, (1e-2, 1e2))
        gp_kernel = amplitude * radial

        entries = (
            ('gaussian', {
                'model': GaussianProcessRegressor(kernel=gp_kernel, random_state=42),
                'name': '高斯过程回归',
            }),
            # SVR restricted to the RBF kernel
            ('gaussiansvm', {
                'model': SVR(kernel='rbf'),
                'name': '高斯核SVM回归',
                'params': {
                    'C': [0.1, 1.0, 10.0, 100.0],
                    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1.0],
                },
            }),
            # General SVR: the kernel is tuned along with C and gamma
            ('svm', {
                'model': SVR(),
                'name': 'SVM回归',
                'params': {
                    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                    'C': [0.1, 1.0, 10.0, 100.0],
                    'gamma': ['scale', 'auto'],
                },
            }),
        )

        for key, entry in entries:
            self.models[key] = entry

    def add_neural_networks(self, training_config: Optional[TrainingConfig] = None):
        """Register the neural-network regressors in ``self.models``.

        Adds 'elm', 'mlp', 'lstm' and 'gru', in that order.

        Parameters:
        training_config (TrainingConfig, optional): passed as ``config`` to the
            LSTM and GRU regressors; the ELM and MLP entries do not use it
        """
        def recurrent_grid():
            # LSTM and GRU search the same values; build a fresh dict per model
            return {
                'units': [32, 64, 128],
                'dropout': [0.1, 0.2, 0.3],
                'epochs': [50, 100, 200],
            }

        # Extreme learning machine
        self.models['elm'] = {
            'model': ExtremeLearningMachine(random_state=42),
            'name': 'ELM回归',
            'params': {
                'n_hidden': [50, 100, 200, 500],
                'activation': ['sigmoid', 'tanh', 'relu'],
            },
        }

        # Back-propagation multilayer perceptron
        self.models['mlp'] = {
            'model': MLPRegressor(random_state=42, max_iter=1000),
            'name': 'BP/MLP回归',
            'params': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'activation': ['relu', 'tanh'],
                'learning_rate_init': [0.001, 0.01, 0.1],
                'alpha': [0.0001, 0.001, 0.01],
            },
        }

        # Recurrent models (spectrum treated as a sequence, one band per time step)
        self.models['lstm'] = {
            'model': LSTMRegressor(random_state=42, config=training_config),
            'name': 'LSTM回归',
            'params': recurrent_grid(),
        }
        self.models['gru'] = {
            'model': GRURegressor(random_state=42, config=training_config),
            'name': 'GRU回归',
            'params': recurrent_grid(),
        }

    def add_specialized_models(self):
        """Register the GAM, tree-based and AdaBoost regressors in ``self.models``.

        Adds 'gam', 'decisiontree', 'randomforest', 'extratrees' and
        'adaboost'. Each entry holds the estimator under 'model', a display
        name under 'name' and a hyperparameter search grid under 'params'.

        The tree grids use ``None`` for "consider all features". The older
        spelling ``'auto'`` meant the same thing for regressors but was removed
        in scikit-learn 1.3, where it makes every candidate using it fail.
        """
        # Generalized additive model
        self.models['gam'] = {
            'model': GeneralizedAdditiveModel(),
            'name': 'GAM回归',
            'params': {
                'n_splines': [5, 10, 15, 20],
                'degree': [3, 4],
                'lambda_': [0.001, 0.01, 0.1, 1.0]
            }
        }

        # Single decision tree
        self.models['decisiontree'] = {
            'model': DecisionTreeRegressor(random_state=42),
            'name': '决策树回归',
            'params': {
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': [None, 'sqrt', 'log2']
            }
        }

        # Random forest
        self.models['randomforest'] = {
            'model': RandomForestRegressor(random_state=42),
            'name': '随机森林回归',
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': [None, 'sqrt', 'log2']
            }
        }

        # Extremely randomized trees
        self.models['extratrees'] = {
            'model': ExtraTreesRegressor(random_state=42),
            'name': '极端随机树回归',
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': [None, 'sqrt', 'log2']
            }
        }

        # AdaBoost
        self.models['adaboost'] = {
            'model': AdaBoostRegressor(random_state=42),
            'name': 'AdaBoost回归',
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 1.0],
                'loss': ['linear', 'square', 'exponential']
            }
        }

    def initialize_all_models(self, use_config: bool = True):
        """Populate ``self.models`` by calling every ``add_*`` registration method.

        Parameters:
        use_config (bool): when True and the instance has a ``config``
            attribute, ``self.config.training`` is passed to the neural-network
            registration; otherwise that registration is called without arguments
        """
        for register in (self.add_linear_models,
                         self.add_boosting_models,
                         self.add_kernel_models):
            register()

        # The neural-network models are the only ones that take the training config
        if not (use_config and hasattr(self, 'config')):
            self.add_neural_networks()
        else:
            self.add_neural_networks(self.config.training)

        self.add_specialized_models()

        model_count = len(self.models)
        print(f"Initialized {model_count} regression models")

    def get_available_models(self):
        """Return a dict mapping each registered model key to its display name."""
        available = {}
        for key, entry in self.models.items():
            available[key] = entry['name']
        return available

    def hyperparameter_tuning(self, model_name, method=None, cv=None, n_iter=None):
        """
        Search the model's 'params' grid and keep the best estimator found.

        The search is scored with negative mean squared error on the scaled
        training data. On success the best parameters are stored in
        ``self.best_params[model_name]`` and the registered estimator is
        replaced by the search's best estimator.

        Parameters:
        model_name (str): key of the model in ``self.models``
        method (str, optional): 'grid' or 'random'; defaults to the configured tuning method
        cv (int, optional): number of cross-validation folds; defaults to the configured value
        n_iter (int, optional): iterations for random search; defaults to the configured value

        Returns:
        bool: True when the search completed; False if the model is unknown,
            has no 'params' grid, the method is unsupported, or the search raised
        """
        # Fall back to the configured defaults for anything not supplied
        if method is None:
            method = self.config.models.tuning_method
        if cv is None:
            cv = self.config.models.cv_folds
        if n_iter is None:
            n_iter = self.config.models.random_search_iter

        if model_name not in self.models:
            print(f"Model '{model_name}' does not exist")
            return False

        entry = self.models[model_name]
        if 'params' not in entry:
            print(f"Model '{model_name}' has no tunable parameters")
            return False

        print(f"Starting hyperparameter tuning for model: {entry['name']}")

        # Options common to both search strategies
        shared_options = {
            'cv': cv,
            'scoring': 'neg_mean_squared_error',
            'n_jobs': -1,
            'verbose': 1,
        }

        if method == 'grid':
            search = GridSearchCV(entry['model'], entry['params'], **shared_options)
        elif method == 'random':
            search = RandomizedSearchCV(
                entry['model'],
                entry['params'],
                n_iter=n_iter,
                random_state=42,
                **shared_options
            )
        else:
            print("Tuning method must be 'grid' or 'random'")
            return False

        try:
            search.fit(self.X_train_scaled, self.y_train)
            winning_params = search.best_params_
            self.best_params[model_name] = winning_params
            self.models[model_name]['model'] = search.best_estimator_

            print(f"Best parameters: {winning_params}")
            # The score is a negated MSE, so flip the sign for display
            print(f"Best score: {-search.best_score_:.4f}")

            return True

        except Exception as e:
            print(f"Tuning failed: {str(e)}")
            return False

    def train_model(self, model_name):
        """Fit one registered model and record its predictions and metrics.

        The model is fitted on the scaled training data, then used to predict
        both the training and the test sets. The estimator, the metrics from
        ``calculate_metrics`` and both prediction arrays are stored in
        ``self.results[model_name]``.

        Parameters:
        model_name (str): key of the model in ``self.models``

        Returns:
        bool: True on success; False if the model is unknown or any step raised
        """
        if model_name not in self.models:
            print(f"Model '{model_name}' does not exist")
            return False

        try:
            entry = self.models[model_name]
            estimator = entry['model']
            display_name = entry['name']

            print(f"Training model: {display_name}")

            estimator.fit(self.X_train_scaled, self.y_train)

            # Predict on both splits so train and test scores can be compared
            train_pred = estimator.predict(self.X_train_scaled)
            test_pred = estimator.predict(self.X_test_scaled)

            scores = self.calculate_metrics(self.y_train, train_pred, self.y_test, test_pred)

            self.results[model_name] = dict(
                model=estimator,
                metrics=scores,
                y_pred_train=train_pred,
                y_pred_test=test_pred,
            )

            print(f"{display_name} training completed")
            print(f"Training R²: {scores['train_r2']:.4f}, Test R²: {scores['test_r2']:.4f}")

        except Exception as e:
            print(f"Training failed: {str(e)}")
            return False

        return True

    def train_all_models(self, tune_hyperparams=False, tuning_method='grid'):
        """Train every model currently registered in ``self.models``.

        Parameters:
        tune_hyperparams (bool): when True, models that have a 'params' grid
            are tuned with ``hyperparameter_tuning`` before being trained
        tuning_method (str): search method forwarded to ``hyperparameter_tuning``
        """
        # Snapshot the keys so the loop works from a fixed list
        pending = list(self.models)
        print(f"Training {len(pending)} models: {', '.join(pending)}")

        for name in pending:
            wants_tuning = tune_hyperparams and 'params' in self.models[name]
            if wants_tuning:
                self.hyperparameter_tuning(name, method=tuning_method)
            # Training runs whether or not tuning happened (or succeeded)
            self.train_model(name)

    def calculate_metrics(self, y_train, y_pred_train, y_test, y_pred_test):
        """Compute MSE, RMSE, MAE and R² for the training and test predictions.

        Parameters:
        y_train, y_pred_train: true and predicted targets for the training set
        y_test, y_pred_test: true and predicted targets for the test set

        Returns:
        dict: keys '<split>_mse', '<split>_rmse', '<split>_mae' and
            '<split>_r2' for split 'train' then 'test'
        """
        splits = (
            ('train', y_train, y_pred_train),
            ('test', y_test, y_pred_test),
        )

        metrics = {}
        for split_name, y_true, y_pred in splits:
            mse = mean_squared_error(y_true, y_pred)
            metrics[f'{split_name}_mse'] = mse
            # RMSE is derived from the MSE just computed
            metrics[f'{split_name}_rmse'] = np.sqrt(mse)
            metrics[f'{split_name}_mae'] = mean_absolute_error(y_true, y_pred)
            metrics[f'{split_name}_r2'] = r2_score(y_true, y_pred)

        return metrics

    def plot_results(self, save_path=None, plot_type='comprehensive'):
        """
        Draw the requested kind of result plot.

        Parameters:
        save_path (str, optional): output path, forwarded to every plot type
            except 'comprehensive', which writes to ``self.config.output.plot_dir``
        plot_type (str): one of 'basic', 'comprehensive', 'prediction',
            'residual', 'metrics', 'error_dist', 'ranking'

        Prints a message and returns without plotting when there are no
        results or the plot type is unknown.
        """
        if not self.results:
            print("No training results to plot")
            return

        if plot_type == 'comprehensive':
            # Full report: always written to the configured plot directory
            self.visualizer.generate_comprehensive_report(
                save_dir=self.config.output.plot_dir,
                prefix='regression_analysis'
            )
            return

        if plot_type == 'basic':
            # Legacy figure drawn by this class rather than by the visualizer
            self._plot_basic_comparison(save_path)
            return

        # Remaining plot types map one-to-one onto visualizer methods
        visualizer_plots = (
            ('prediction', 'plot_prediction_scatter'),
            ('residual', 'plot_residual_analysis'),
            ('metrics', 'plot_metrics_comparison'),
            ('error_dist', 'plot_error_distribution'),
            ('ranking', 'plot_model_ranking_matrix'),
        )
        for type_key, method_name in visualizer_plots:
            if plot_type == type_key:
                getattr(self.visualizer, method_name)(save_path=save_path)
                return

        print(f"Unknown plot type: {plot_type}")
        return

    def _plot_basic_comparison(self, save_path=None):
        """Draw the legacy 2x2 comparison figure (kept for backward compatibility).

        Panels: grouped R² bars, grouped RMSE bars, predicted-vs-actual scatter
        on the test set, and test-set residuals against predictions.

        Parameters:
        save_path (str, optional): when given, the figure is written there at 300 dpi

        The figure is neither shown nor closed here.
        """
        # Gather display names and metrics, in result order
        trained = list(self.results.items())
        display_names = [self.models[key]['name'] for key, _ in trained]

        def metric_series(metric_key):
            return [entry['metrics'][metric_key] for _, entry in trained]

        train_r2 = metric_series('train_r2')
        test_r2 = metric_series('test_r2')
        train_rmse = metric_series('train_rmse')
        test_rmse = metric_series('test_rmse')

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        (ax_r2, ax_rmse), (ax_scatter, ax_resid) = axes

        # Top row: grouped train/test bars, one group per model
        positions = np.arange(len(display_names))
        bar_width = 0.35
        bar_panels = (
            (ax_r2, train_r2, test_r2, 'R² Score', 'R² Score Comparison'),
            (ax_rmse, train_rmse, test_rmse, 'RMSE', 'RMSE Comparison'),
        )
        for axis, train_values, test_values, y_label, title in bar_panels:
            axis.bar(positions - bar_width/2, train_values, bar_width, label='Training Set', alpha=0.8)
            axis.bar(positions + bar_width/2, test_values, bar_width, label='Test Set', alpha=0.8)
            axis.set_xlabel('Model')
            axis.set_ylabel(y_label)
            axis.set_title(title)
            axis.set_xticks(positions)
            axis.set_xticklabels(display_names, rotation=45, ha='right')
            axis.legend()
            axis.grid(True, alpha=0.3)

        # Bottom row: one colour per model, shared by the scatter and residual panels
        colors = plt.cm.tab10(np.linspace(0, 1, len(self.results)))
        for color, (model_name, result) in zip(colors, self.results.items()):
            predicted = result['y_pred_test']
            series_label = self.models[model_name]['name']
            ax_scatter.scatter(self.y_test, predicted, alpha=0.6, color=color,
                               label=series_label, s=20)
            residuals = self.y_test - predicted
            ax_resid.scatter(predicted, residuals, alpha=0.6, color=color,
                             label=series_label, s=20)

        # Diagonal reference line: where predicted equals actual
        lowest, highest = self.y_test.min(), self.y_test.max()
        ax_scatter.plot([lowest, highest], [lowest, highest],
                        'k--', linewidth=2, label='Perfect Prediction')
        ax_scatter.set_xlabel('Actual Values')
        ax_scatter.set_ylabel('Predicted Values')
        ax_scatter.set_title('Predicted vs Actual Values (Test Set)')
        ax_scatter.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_scatter.grid(True, alpha=0.3)

        # Zero line: residuals above it are under-predictions
        ax_resid.axhline(y=0, color='k', linestyle='--', linewidth=2)
        ax_resid.set_xlabel('Predicted Values')
        ax_resid.set_ylabel('Residuals')
        ax_resid.set_title('Residual Plot (Test Set)')
        ax_resid.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_resid.grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Chart saved to: {save_path}")

    def plot_prediction_scatter(self, save_path=None, **kwargs):
        """Delegate to the visualizer's predicted-vs-actual scatter plot.

        ``save_path`` and any extra keyword arguments are forwarded unchanged;
        the visualizer's return value is discarded.
        """
        draw = self.visualizer.plot_prediction_scatter
        draw(save_path=save_path, **kwargs)

    def plot_residual_analysis(self, save_path=None, **kwargs):
        """Delegate to the visualizer's residual-analysis plot.

        ``save_path`` and any extra keyword arguments are forwarded unchanged;
        the visualizer's return value is discarded.
        """
        draw = self.visualizer.plot_residual_analysis
        draw(save_path=save_path, **kwargs)

    def plot_metrics_comparison(self, save_path=None, **kwargs):
        """Delegate to the visualizer's performance-metrics comparison plot.

        ``save_path`` and any extra keyword arguments are forwarded unchanged;
        the visualizer's return value is discarded.
        """
        draw = self.visualizer.plot_metrics_comparison
        draw(save_path=save_path, **kwargs)

    def plot_error_distribution(self, save_path=None, **kwargs):
        """Delegate to the visualizer's error-distribution plot.

        ``save_path`` and any extra keyword arguments are forwarded unchanged;
        the visualizer's return value is discarded.
        """
        draw = self.visualizer.plot_error_distribution
        draw(save_path=save_path, **kwargs)

    def plot_model_ranking(self, save_path=None, **kwargs):
        """Delegate to the visualizer's model-ranking matrix plot.

        ``save_path`` and any extra keyword arguments are forwarded unchanged
        to ``plot_model_ranking_matrix``; its return value is discarded.
        """
        draw = self.visualizer.plot_model_ranking_matrix
        draw(save_path=save_path, **kwargs)

    def generate_visualization_report(self, save_dir=None, prefix=None):
        """Produce the visualizer's comprehensive report and return its result.

        Parameters:
        save_dir (str, optional): output directory; a falsy value selects
            ``self.config.output.plot_dir``
        prefix (str, optional): report prefix; a falsy value selects
            'regression_analysis'
        """
        if not save_dir:
            save_dir = self.config.output.plot_dir
        if not prefix:
            prefix = 'regression_analysis'

        build_report = self.visualizer.generate_comprehensive_report
        return build_report(save_dir=save_dir, prefix=prefix)

    def save_model(self, model_name, save_dir='models'):
        """Persist a trained model, the feature scaler and a JSON info file.

        Three timestamped files are written to ``save_dir``: the estimator and
        the scaler (both via joblib) and a JSON file holding the model key,
        display name, timestamp, metrics, best parameters and feature names.

        Parameters:
        model_name (str): key of a model present in ``self.results``
        save_dir (str): target directory, created if it does not exist

        Returns:
        bool: True when all three files were written; False if the model has
            no training results or writing failed (the error is printed)
        """
        if model_name not in self.results:
            print(f"Model '{model_name}' has no training results")
            return False

        # exist_ok avoids the race between a separate exists() check and the creation
        os.makedirs(save_dir, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        model_path = os.path.join(save_dir, f'{model_name}_{timestamp}.pkl')
        scaler_path = os.path.join(save_dir, f'scaler_{timestamp}.pkl')
        info_path = os.path.join(save_dir, f'info_{model_name}_{timestamp}.json')

        def _to_builtin(value):
            # json fallback: NumPy scalars/arrays (e.g. integer column labels)
            # are not serialisable as-is, so convert them to plain Python values
            if isinstance(value, np.generic):
                return value.item()
            if isinstance(value, np.ndarray):
                return value.tolist()
            raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

        try:
            # Estimator
            joblib.dump(self.results[model_name]['model'], model_path)

            # Feature scaler
            joblib.dump(self.scalers['X'], scaler_path)

            # Model metadata
            info = {
                'model_name': model_name,
                'full_name': self.models[model_name]['name'],
                'timestamp': timestamp,
                'metrics': self.results[model_name]['metrics'],
                'best_params': self.best_params.get(model_name, {}),
                'feature_names': self.feature_names
            }

            # Serialise before opening the file so a failure cannot leave a
            # truncated JSON file behind
            payload = json.dumps(info, indent=4, ensure_ascii=False, default=_to_builtin)
            with open(info_path, 'w', encoding='utf-8') as f:
                f.write(payload)

            print(f"Model saved:")
            print(f"  Model file: {model_path}")
            print(f"  Scaler: {scaler_path}")
            print(f"  Info file: {info_path}")

            return True

        except Exception as e:
            print(f"Save failed: {str(e)}")
            return False

    def save_all_models(self, save_dir='models'):
        """Call ``save_model`` for every model that has training results.

        Parameters:
        save_dir (str): directory forwarded to each ``save_model`` call
        """
        for trained_name in self.results:
            self.save_model(trained_name, save_dir)

    def load_model(self, model_path, scaler_path=None):
        """Load a saved model and, optionally, its scaler with joblib.

        Parameters:
        model_path (str): path of the saved model file
        scaler_path (str, optional): path of the saved scaler; skipped when falsy

        Returns:
        tuple: ``(model, scaler)`` where scaler is None if no path was given,
            or ``(None, None)`` if loading raised (the error is printed)

        NOTE(review): joblib files are pickle-based, so loading one can execute
        arbitrary code. Only load files from a trusted source.
        """
        try:
            loaded_model = joblib.load(model_path)
            loaded_scaler = joblib.load(scaler_path) if scaler_path else None
        except Exception as e:
            print(f"Load failed: {str(e)}")
            return None, None

        return loaded_model, loaded_scaler

    def print_summary(self):
        """Print a table of metrics for every trained model.

        One row is printed per entry in ``self.results``, sorted by test R²
        from best to worst, showing the model's display name followed by train
        R², test R², test RMSE and test MAE. Prints a short notice and returns
        when there are no results.
        """
        if not self.results:
            print("No training results")
            return

        print("\n" + "="*80)
        print("Regression Model Performance Summary")
        print("="*80)

        # (column title, key in the metrics dict, column width)
        metric_columns = (
            ('Train R²', 'train_r2', 10),
            ('Test R²', 'test_r2', 10),
            ('Test RMSE', 'test_rmse', 12),
            ('Test MAE', 'test_mae', 12),
        )
        name_width = 30

        # Header
        header = f"{'Model':<{name_width}}" + "".join(
            f"{title:>{width}}" for title, _, width in metric_columns
        )
        print(header)
        print("-"*80)

        # Result rows, best test R² first
        ranked = sorted(self.results.items(),
                        key=lambda item: item[1]['metrics']['test_r2'], reverse=True)
        for model_name, result in ranked:
            metrics = result['metrics']
            model_full_name = self.models[model_name]['name']
            row = f"{model_full_name:<{name_width}}" + "".join(
                f"{metrics[key]:>{width}.4f}" for _, key, width in metric_columns
            )
            print(row)

        print("-"*80)
        print("Note: R² closer to 1 is better, RMSE/MAE smaller is better")
        print("="*80)

    def run_analysis_from_config(self) -> bool:
        """
        Run the whole analysis pipeline using the values in ``self.config``.

        Steps: load the CSV, preprocess, register all models, optionally
        restrict them to ``config.models.model_names``, train (with optional
        tuning), print the summary, then save models and generate the
        visualization report if the output config asks for them.

        ``model_names`` may be a single name or a list of names. Unknown names
        are reported and ignored.

        Returns:
        bool: True when the pipeline ran to completion; False if loading or
            preprocessing failed, or if none of the requested models exist
        """
        print("Starting regression analysis from configuration...")

        # 1. Load data
        if not self.load_csv(self.config.data.csv_path, self.config.data.label_column,
                           self.config.data.spectrum_columns):
            return False

        # 2. Preprocess
        if not self.preprocess_data():
            return False

        # 3. Register models
        self.initialize_all_models(use_config=True)

        # 4. Restrict to the requested models, if any were specified
        requested = self.config.models.model_names
        if requested is not None:
            # A bare string is one model name; iterating it directly would
            # compare single characters against the model keys
            if isinstance(requested, str):
                requested = [requested]

            invalid_models = [name for name in requested if name not in self.models]
            if invalid_models:
                print(f"Warning: The following models do not exist: {invalid_models}")

            self.models = {name: self.models[name] for name in requested if name in self.models}
            print(f"Filtered to {len(self.models)} specified models")

            if not self.models:
                # Nothing to train: report failure rather than a hollow success
                print("No valid models left to train")
                return False

        # 5. Train
        self.train_all_models(tune_hyperparams=self.config.models.tune_hyperparams,
                            tuning_method=self.config.models.tuning_method)

        # 6. Summary
        self.print_summary()

        # 7. Save models
        if self.config.output.save_models:
            self.save_all_models(save_dir=self.config.output.save_dir)

        # 8. Plots
        if self.config.output.plot_results:
            os.makedirs(self.config.output.plot_dir, exist_ok=True)
            self.generate_visualization_report(
                save_dir=self.config.output.plot_dir,
                prefix='regression_analysis'
            )

        print("Analysis completed!")
        return True

    def run_complete_analysis(self, csv_path=None, label_column=None, spectrum_columns=None,
                            test_size=None, scale_method=None, tune_hyperparams=None,
                            tuning_method=None, save_models=None, plot_results=None,
                            model_names=None):
        """
        Run the full pipeline with keyword overrides (backward-compatible entry point).

        Every argument that is not None is written into ``self.config`` before
        ``run_analysis_from_config`` is called; None keeps the configured value.

        Parameters:
        csv_path (str, optional): CSV file path
        label_column (str or int, optional): label column
        spectrum_columns (str or list, optional): spectrum columns
        test_size (float, optional): test-set proportion
        scale_method (str, optional): scaling method
        tune_hyperparams (bool, optional): whether to tune hyperparameters
        tuning_method (str, optional): tuning method
        save_models (bool, optional): whether to save the trained models
        plot_results (bool, optional): whether to generate plots
        model_names (list, optional): names of the models to train

        Returns:
        The result of ``run_analysis_from_config``.
        """
        # (config section, field name, supplied value), applied in this order
        overrides = (
            ('data', 'csv_path', csv_path),
            ('data', 'label_column', label_column),
            ('data', 'spectrum_columns', spectrum_columns),
            ('data', 'test_size', test_size),
            ('data', 'scale_method', scale_method),
            ('models', 'tune_hyperparams', tune_hyperparams),
            ('models', 'tuning_method', tuning_method),
            ('output', 'save_models', save_models),
            ('output', 'plot_results', plot_results),
            ('models', 'model_names', model_names),
        )
        for section_name, field_name, supplied in overrides:
            if supplied is not None:
                section = getattr(self.config, section_name)
                setattr(section, field_name, supplied)

        # Hand over to the configuration-driven pipeline
        return self.run_analysis_from_config()


class RegressionVisualizer:
    """
    回归分析可视化器 - 提供丰富的可视化功能
    支持预测值vs真实值散点图、残差图、性能指标对比等
    """

    def __init__(self, analyzer: Optional['RegressionAnalyzer'] = None):
        """
        Create a visualizer, optionally bound to an analyzer.

        Besides storing the analyzer this defines the 15-colour palette used by
        the plotting methods, activates the ``seaborn-v0_8`` matplotlib style
        and finally runs the font configuration.

        Parameters:
        analyzer (RegressionAnalyzer, optional): analyzer instance to plot from
        """
        # Activate the style before configuring fonts, as the original does.
        plt.style.use('seaborn-v0_8')

        palette_hex = (
            '#1f77b4 #ff7f0e #2ca02c #d62728 #9467bd '
            '#8c564b #e377c2 #7f7f7f #bcbd22 #17becf '
            '#aec7e8 #ffbb78 #98df8a #ff9896 #c5b0d5'
        )
        self.colorblind_friendly_palette = palette_hex.split()
        self.analyzer = analyzer

        # Configure matplotlib so that Chinese text can be displayed.
        self._configure_chinese_font()

    def _configure_chinese_font(self):
        """
        Put an installed Chinese-capable font first in matplotlib's sans-serif list.

        The candidates are looked up in the list of font families known to
        matplotlib's font manager. Drawing a test string cannot be used as a
        probe: creating a text artist with an unknown ``fontname`` does not
        raise, so every candidate would appear to be available.

        Side effects: updates ``font.sans-serif`` and ``axes.unicode_minus`` in
        ``matplotlib.rcParams`` and prints which font (if any) was selected.
        """
        import matplotlib as mpl
        from matplotlib import font_manager

        # Candidate families in order of preference.
        chinese_fonts = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS',
                         'WenQuanYi Micro Hei', 'AR PL UMing CN']

        installed = {entry.name for entry in font_manager.fontManager.ttflist}
        chosen = next((font for font in chinese_fonts if font in installed), None)

        # Render the minus sign with an ASCII hyphen instead of U+2212.
        mpl.rcParams['axes.unicode_minus'] = False

        if chosen is not None:
            # Prepend the font without duplicating it on repeated calls.
            remaining = [name for name in mpl.rcParams['font.sans-serif'] if name != chosen]
            mpl.rcParams['font.sans-serif'] = [chosen] + remaining
            print(f"Successfully set Chinese font to: {chosen}")
        else:
            # No candidate is installed: fall back to the default list.
            mpl.rcParams['font.sans-serif'] = ['DejaVu Sans', 'SimHei']
            print("Warning: Could not find suitable Chinese font. Using default fonts.")

    def _ensure_chinese_text(self, text):
        """确保文本正确显示中文"""
        if isinstance(text, str):
            try:
                # 尝试编码和解码以确保UTF-8格式
                return text.encode('utf-8').decode('utf-8')
            except:
                return text
        return text

    def set_colorblind_palette(self):
        """Install this visualizer's palette as matplotlib's default colour cycle."""
        import matplotlib as mpl

        palette_cycle = mpl.cycler(color=self.colorblind_friendly_palette)
        mpl.rcParams['axes.prop_cycle'] = palette_cycle

    def plot_prediction_scatter(self, figsize=(16, 12), save_path=None, show_individual=True, show_overlay=True):
        """
        绘制预测值vs真实值散点图

        Parameters:
        figsize (tuple): 图形尺寸
        save_path (str, optional): 保存路径
        show_individual (bool): 是否显示多子图
        show_overlay (bool): 是否显示叠加图
        """
        if not self.analyzer or not self.analyzer.results:
            print("No analyzer results available for plotting")
            return

        self.set_colorblind_palette()
        n_models = len(self.analyzer.results)

        if show_individual and show_overlay:
            # 创建复合图：上方多子图，下方叠加图
            fig = plt.figure(figsize=figsize)

            # 上方：多子图（每个模型一个子图）
            n_cols = min(4, n_models)
            n_rows = (n_models + n_cols - 1) // n_cols

            gs = fig.add_gridspec(n_rows + 1, n_cols, hspace=0.3, wspace=0.3)
            axes_scatter = []
            for i in range(n_rows):
                for j in range(n_cols):
                    if i * n_cols + j < n_models:
                        axes_scatter.append(fig.add_subplot(gs[i, j]))

            # 下方：叠加图
            ax_overlay = fig.add_subplot(gs[n_rows, :])

        elif show_individual:
            # 只有多子图
            n_cols = min(4, n_models)
            n_rows = (n_models + n_cols - 1) // n_cols
            fig, axes_scatter = plt.subplots(n_rows, n_cols, figsize=figsize)
            if n_models == 1:
                axes_scatter = [axes_scatter]
            else:
                axes_scatter = axes_scatter.flatten()
            ax_overlay = None

        elif show_overlay:
            # 只有叠加图
            fig, ax_overlay = plt.subplots(1, 1, figsize=(10, 8))
            axes_scatter = []
        else:
            print("At least one of show_individual or show_overlay must be True")
            return

        # 绘制多子图
        if show_individual:
            for idx, (model_name, result) in enumerate(self.analyzer.results.items()):
                if idx < len(axes_scatter):
                    ax = axes_scatter[idx]
                    y_true = self.analyzer.y_test
                    y_pred = result['y_pred_test']

                    # 散点图
                    ax.scatter(y_true, y_pred, alpha=0.6, s=30, color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)])

                    # 45度参考线
                    min_val = min(y_true.min(), y_pred.min())
                    max_val = max(y_true.max(), y_pred.max())
                    ax.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7)

                    # 添加R²和RMSE文本
                    r2 = result['metrics']['test_r2']
                    rmse = result['metrics']['test_rmse']
                    ax.text(0.05, 0.95, f'R² = {r2:.3f}\nRMSE = {rmse:.3f}',
                           transform=ax.transAxes, fontsize=10,
                           verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

                    ax.set_xlabel(self._ensure_chinese_text('True Values'))
                    ax.set_ylabel(self._ensure_chinese_text('Predicted Values'))
                    ax.set_title(self._ensure_chinese_text(f'{self.analyzer.models[model_name]["name"]}'))
                    ax.grid(True, alpha=0.3)
                    ax.axis('equal')

        # 绘制叠加图
        if show_overlay:
            for idx, (model_name, result) in enumerate(self.analyzer.results.items()):
                y_true = self.analyzer.y_test
                y_pred = result['y_pred_test']

                ax_overlay.scatter(y_true, y_pred, alpha=0.6, s=30,
                                 color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)],
                                 label=f'{self.analyzer.models[model_name]["name"]} (R²={result["metrics"]["test_r2"]:.3f})')

            # 45度参考线
            min_val = min(self.analyzer.y_test.min(), min([r['y_pred_test'].min() for r in self.analyzer.results.values()]))
            max_val = max(self.analyzer.y_test.max(), max([r['y_pred_test'].max() for r in self.analyzer.results.values()]))
            ax_overlay.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, alpha=0.7, label='Perfect Prediction')

            ax_overlay.set_xlabel(self._ensure_chinese_text('True Values'))
            ax_overlay.set_ylabel(self._ensure_chinese_text('Predicted Values'))
            ax_overlay.set_title(self._ensure_chinese_text('Predicted vs True Values - All Models Overlay'))
            ax_overlay.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            ax_overlay.grid(True, alpha=0.3)
            ax_overlay.axis('equal')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Prediction scatter plot saved to: {save_path}")

        # plt.show()

    def plot_residual_analysis(self, figsize=(16, 8), save_path=None, n_feature_plots=3):
        """
        绘制残差分析图

        Parameters:
        figsize (tuple): 图形尺寸
        save_path (str, optional): 保存路径
        n_feature_plots (int): 显示多少个特征的残差vs特征图
        """
        if not self.analyzer or not self.analyzer.results:
            print("No analyzer results available for plotting")
            return

        self.set_colorblind_palette()

        # 选择表现最好的几个模型进行详细分析
        sorted_models = sorted(self.analyzer.results.items(),
                             key=lambda x: x[1]['metrics']['test_r2'], reverse=True)
        top_models = sorted_models[:min(3, len(sorted_models))]

        fig, axes = plt.subplots(2, 3, figsize=figsize)

        for idx, (model_name, result) in enumerate(top_models):
            y_true = self.analyzer.y_test
            y_pred = result['y_pred_test']
            residuals = y_true - y_pred

            # 残差vs预测值
            ax1 = axes[0, 0] if idx == 0 else axes[0, idx]
            ax1.scatter(y_pred, residuals, alpha=0.6, s=20,
                       color=self.colorblind_friendly_palette[idx % len(self.colorblind_friendly_palette)])
            ax1.axhline(y=0, color='k', linestyle='--', linewidth=2)
            ax1.set_xlabel('Predicted Values')
            ax1.set_ylabel('Residuals')
            ax1.set_title(f'Residuals vs Predicted\n{self.analyzer.models[model_name]["name"]}')
            ax1.grid(True, alpha=0.3)

            # Q-Q图
            ax2 = axes[1, 0] if idx == 0 else axes[1, idx]
            stats.probplot(residuals, dist="norm", plot=ax2)
            ax2.set_title(f'Normal Q-Q Plot\n{self.analyzer.models[model_name]["name"]}')

            # 残差vs重要特征（如果有特征重要性）
            if idx < n_feature_plots - 2 and hasattr(result['model'], 'feature_importances_'):
                ax3 = axes[idx // 3 + 1, idx % 3 + 1] if idx > 0 else axes[0, 2]
                if idx < 2:  # 只显示前两个模型的特征残差图
                    try:
                        importances = result['model'].feature_importances_
                        top_features_idx = np.argsort(importances)[-2:]  # 最重要的两个特征

                        for i, feat_idx in enumerate(top_features_idx):
                            feat_name = self.analyzer.feature_names[feat_idx] if hasattr(self.analyzer, 'feature_names') else f'Feature {feat_idx}'
                            ax3.scatter(self.analyzer.X_test[:, feat_idx], residuals,
                                      alpha=0.6, s=20, label=f'{feat_name}',
                                      color=self.colorblind_friendly_palette[(idx*2 + i) % len(self.colorblind_friendly_palette)])
                        ax3.axhline(y=0, color='k', linestyle='--', linewidth=2)
                        ax3.set_xlabel('Feature Values')
                        ax3.set_ylabel('Residuals')
                        ax3.set_title(f'Residuals vs Top Features\n{self.analyzer.models[model_name]["name"]}')
                        ax3.legend()
                        ax3.grid(True, alpha=0.3)
                    except:
                        ax3.text(0.5, 0.5, 'Feature importance\nnot available',
                                transform=ax3.transAxes, ha='center', va='center')
                        ax3.set_title(f'Feature Analysis\n{self.analyzer.models[model_name]["name"]}')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Residual analysis plot saved to: {save_path}")

        # plt.show()

    def plot_metrics_comparison(self, figsize=(16, 10), save_path=None):
        """
        绘制性能指标对比图

        Parameters:
        figsize (tuple): 图形尺寸
        save_path (str, optional): 保存路径
        """
        if not self.analyzer or not self.analyzer.results:
            print("No analyzer results available for plotting")
            return

        self.set_colorblind_palette()

        # 准备数据
        model_names = []
        model_full_names = []
        r2_scores = []
        rmse_scores = []
        mae_scores = []
        training_times = []
        memory_usage = []

        # 模拟训练时间和内存使用（实际应用中需要测量）
        for model_name, result in self.analyzer.results.items():
            model_names.append(model_name)
            model_full_names.append(self.analyzer.models[model_name]['name'])
            r2_scores.append(result['metrics']['test_r2'])
            rmse_scores.append(result['metrics']['test_rmse'])
            mae_scores.append(result['metrics']['test_mae'])
            training_times.append(np.random.uniform(0.1, 5.0))  # 模拟时间
            memory_usage.append(np.random.uniform(50, 500))  # 模拟内存

        # 创建子图
        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # 雷达图 - R², RMSE, MAE
        ax_radar = axes[0, 0]

        # 标准化指标到0-1范围
        r2_norm = (r2_scores - np.min(r2_scores)) / (np.max(r2_scores) - np.min(r2_scores))
        rmse_norm = 1 - (rmse_scores - np.min(rmse_scores)) / (np.max(rmse_scores) - np.min(rmse_scores))  # RMSE越小越好，反转
        mae_norm = 1 - (mae_scores - np.min(mae_scores)) / (np.max(mae_scores) - np.min(mae_scores))   # MAE越小越好，反转

        # 雷达图数据
        categories = ['R² Score', 'RMSE (inv)', 'MAE (inv)']
        n_models = len(model_names)

        # 计算角度
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]  # 闭合图形

        for i in range(n_models):
            values = [r2_norm[i], rmse_norm[i], mae_norm[i]]
            values += values[:1]  # 闭合图形
            ax_radar.plot(angles, values, 'o-', linewidth=2,
                         color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                         label=model_full_names[i])
            ax_radar.fill(angles, values, alpha=0.25,
                         color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)])

        ax_radar.set_xticks(angles[:-1])
        ax_radar.set_xticklabels(categories)
        ax_radar.set_title('Performance Metrics Radar Chart')
        ax_radar.legend(bbox_to_anchor=(1.1, 1), loc='upper left')
        ax_radar.grid(True, alpha=0.3)

        # 分组柱状图 - 不同指标的比较
        ax_bar = axes[0, 1]
        x = np.arange(len(model_names))
        width = 0.25

        bars1 = ax_bar.bar(x - width, r2_scores, width, label='R²', alpha=0.8,
                          color=self.colorblind_friendly_palette[0])
        bars2 = ax_bar.bar(x, [1/s for s in rmse_scores], width, label='1/RMSE', alpha=0.8,
                          color=self.colorblind_friendly_palette[1])
        bars3 = ax_bar.bar(x + width, [1/s for s in mae_scores], width, label='1/MAE', alpha=0.8,
                          color=self.colorblind_friendly_palette[2])

        ax_bar.set_xlabel('Models')
        ax_bar.set_ylabel('Normalized Scores')
        ax_bar.set_title('Normalized Performance Comparison')
        ax_bar.set_xticks(x)
        ax_bar.set_xticklabels(model_full_names, rotation=45, ha='right')
        ax_bar.legend()
        ax_bar.grid(True, alpha=0.3, axis='y')

        # 堆叠柱状图 - 误差分解（偏差vs方差）
        ax_stack = axes[1, 0]
        bias_errors = [abs(np.mean(residuals)) for residuals in
                      [self.analyzer.y_test - result['y_pred_test'] for result in self.analyzer.results.values()]]
        variance_errors = [np.var(residuals) for residuals in
                          [self.analyzer.y_test - result['y_pred_test'] for result in self.analyzer.results.values()]]

        bars_bias = ax_stack.bar(model_names, bias_errors, label='Bias (Mean Abs Error)', alpha=0.8,
                                color=self.colorblind_friendly_palette[0])
        bars_var = ax_stack.bar(model_names, variance_errors, bottom=bias_errors,
                               label='Variance (Residual Var)', alpha=0.8,
                               color=self.colorblind_friendly_palette[1])

        ax_stack.set_xlabel('Models')
        ax_stack.set_ylabel('Error Components')
        ax_stack.set_title('Bias-Variance Decomposition')
        ax_stack.set_xticklabels(model_full_names, rotation=45, ha='right')
        ax_stack.legend()
        ax_stack.grid(True, alpha=0.3, axis='y')

        # 气泡图 - 综合评估（R² vs 1/RMSE，气泡大小表示1/MAE）
        ax_bubble = axes[1, 1]
        bubble_sizes = [100 * (1/s) for s in mae_scores]  # MAE越大气泡越小

        scatter = ax_bubble.scatter(r2_scores, [1/s for s in rmse_scores], s=bubble_sizes,
                                  c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black')

        # 添加模型名称标签
        for i, name in enumerate(model_full_names):
            ax_bubble.annotate(name, (r2_scores[i], 1/rmse_scores[i]),
                             xytext=(5, 5), textcoords='offset points', fontsize=8)

        ax_bubble.set_xlabel('R² Score')
        ax_bubble.set_ylabel('1/RMSE')
        ax_bubble.set_title('Comprehensive Performance Assessment\n(Bubble size ∝ 1/MAE)')
        ax_bubble.grid(True, alpha=0.3)

        # 添加颜色条
        cbar = plt.colorbar(scatter, ax=ax_bubble)
        cbar.set_label('Model Index')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Metrics comparison plot saved to: {save_path}")

        # plt.show()

    def plot_error_distribution(self, figsize=(16, 8), save_path=None):
        """
        绘制误差分布图

        Parameters:
        figsize (tuple): 图形尺寸
        save_path (str, optional): 保存路径
        """
        if not self.analyzer or not self.analyzer.results:
            print("No analyzer results available for plotting")
            return

        self.set_colorblind_palette()

        # 计算所有模型的误差
        model_errors = {}
        for model_name, result in self.analyzer.results.items():
            errors = self.analyzer.y_test - result['y_pred_test']
            model_errors[model_name] = errors

        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # 误差分布直方图 - 所有模型并排比较
        ax_hist = axes[0, 0]
        bins = np.linspace(min([min(errors) for errors in model_errors.values()]),
                          max([max(errors) for errors in model_errors.values()]), 30)

        for i, (model_name, errors) in enumerate(model_errors.items()):
            ax_hist.hist(errors, bins=bins, alpha=0.7, label=self.analyzer.models[model_name]['name'],
                        color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                        density=True)

        ax_hist.set_xlabel('Prediction Error')
        ax_hist.set_ylabel('Density')
        ax_hist.set_title('Error Distribution Histogram')
        ax_hist.legend()
        ax_hist.grid(True, alpha=0.3)

        # 核密度估计曲线
        ax_kde = axes[0, 1]
        for i, (model_name, errors) in enumerate(model_errors.items()):
            try:
                sns.kdeplot(data=errors, ax=ax_kde, label=self.analyzer.models[model_name]['name'],
                           color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                           fill=True, alpha=0.3)
            except:
                # 如果seaborn不可用，使用matplotlib
                ax_kde.hist(errors, bins=30, alpha=0.3, density=True,
                           label=self.analyzer.models[model_name]['name'],
                           color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)])

        ax_kde.set_xlabel('Prediction Error')
        ax_kde.set_ylabel('Density')
        ax_kde.set_title('Error Distribution KDE')
        ax_kde.legend()
        ax_kde.grid(True, alpha=0.3)

        # 累积分布函数
        ax_cdf = axes[1, 0]
        error_range = np.linspace(min([min(errors) for errors in model_errors.values()]),
                                 max([max(errors) for errors in model_errors.values()]), 100)

        for i, (model_name, errors) in enumerate(model_errors.items()):
            sorted_errors = np.sort(errors)
            y_vals = np.arange(len(sorted_errors)) / float(len(sorted_errors))
            ax_cdf.plot(sorted_errors, y_vals,
                       label=self.analyzer.models[model_name]['name'],
                       color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                       linewidth=2)

        ax_cdf.set_xlabel('Prediction Error')
        ax_cdf.set_ylabel('Cumulative Probability')
        ax_cdf.set_title('Cumulative Distribution Function')
        ax_cdf.legend()
        ax_cdf.grid(True, alpha=0.3)

        # 箱线图
        ax_box = axes[1, 1]
        error_data = [errors for errors in model_errors.values()]
        model_labels = [self.analyzer.models[name]['name'] for name in model_errors.keys()]

        bp = ax_box.boxplot(error_data, labels=model_labels, patch_artist=True)
        for patch, color in zip(bp['boxes'], self.colorblind_friendly_palette):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)

        # 添加均值点
        for i, errors in enumerate(error_data):
            ax_box.plot(i+1, np.mean(errors), 'ro', markersize=8, label='Mean' if i == 0 else "")

        ax_box.set_xlabel('Models')
        ax_box.set_ylabel('Prediction Error')
        ax_box.set_title('Error Distribution Box Plot')
        ax_box.legend()
        ax_box.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Error distribution plot saved to: {save_path}")

        # plt.show()

    def plot_model_ranking_matrix(self, figsize=(14, 10), save_path=None):
        """
        绘制模型排名矩阵

        Parameters:
        figsize (tuple): 图形尺寸
        save_path (str, optional): 保存路径
        """
        if not self.analyzer or not self.analyzer.results:
            print("No analyzer results available for plotting")
            return

        self.set_colorblind_palette()

        # 准备指标数据
        metrics_data = []
        model_names = []
        metric_names = ['R²', 'RMSE', 'MAE', 'Training_R²', 'Training_RMSE', 'Training_MAE']

        for model_name, result in self.analyzer.results.items():
            model_names.append(self.analyzer.models[model_name]['name'])
            metrics = result['metrics']
            metrics_data.append([
                metrics['test_r2'],
                metrics['test_rmse'],
                metrics['test_mae'],
                metrics['train_r2'],
                metrics['train_rmse'],
                metrics['train_mae']
            ])

        metrics_array = np.array(metrics_data)

        # 计算排名（对于R²，越高越好；对于RMSE/MAE，越低越好）
        rankings = np.zeros_like(metrics_array)
        rankings[:, 0] = len(model_names) - stats.rankdata(metrics_array[:, 0]) + 1  # R²排名（反转）
        rankings[:, 1] = stats.rankdata(metrics_array[:, 1])  # RMSE排名
        rankings[:, 2] = stats.rankdata(metrics_array[:, 2])  # MAE排名
        rankings[:, 3] = len(model_names) - stats.rankdata(metrics_array[:, 3]) + 1  # Training R²排名（反转）
        rankings[:, 4] = stats.rankdata(metrics_array[:, 4])  # Training RMSE排名
        rankings[:, 5] = stats.rankdata(metrics_array[:, 5])  # Training MAE排名

        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # 热力图 - 模型vs指标的排名
        ax_heatmap = axes[0, 0]
        im = ax_heatmap.imshow(rankings, cmap='RdYlGn_r', aspect='auto', alpha=0.8)

        # 设置标签
        ax_heatmap.set_xticks(np.arange(len(metric_names)))
        ax_heatmap.set_yticks(np.arange(len(model_names)))
        ax_heatmap.set_xticklabels(metric_names, rotation=45, ha='right')
        ax_heatmap.set_yticklabels(model_names)

        # 添加数值标签
        for i in range(len(model_names)):
            for j in range(len(metric_names)):
                text = ax_heatmap.text(j, i, f'{rankings[i, j]:.0f}',
                                     ha="center", va="center", color="black", fontsize=10)

        ax_heatmap.set_title('Model Ranking Matrix\n(Lower rank = Better performance)')
        plt.colorbar(im, ax=ax_heatmap, label='Rank')

        # 平行坐标图
        ax_parallel = axes[0, 1]

        # 标准化数据到0-1范围
        normalized_data = np.zeros_like(metrics_array)
        for j in range(metrics_array.shape[1]):
            if j in [0, 3]:  # R²指标，越高越好
                normalized_data[:, j] = (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min())
            else:  # RMSE/MAE指标，越低越好，反转标准化
                normalized_data[:, j] = 1 - (metrics_array[:, j] - metrics_array[:, j].min()) / (metrics_array[:, j].max() - metrics_array[:, j].min())

        for i in range(len(model_names)):
            ax_parallel.plot(range(len(metric_names)), normalized_data[i],
                           marker='o', linewidth=2, markersize=6,
                           color=self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)],
                           label=model_names[i], alpha=0.8)

        ax_parallel.set_xticks(range(len(metric_names)))
        ax_parallel.set_xticklabels(metric_names, rotation=45, ha='right')
        ax_parallel.set_ylabel('Normalized Score (Higher = Better)')
        ax_parallel.set_title('Parallel Coordinates Plot')
        ax_parallel.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_parallel.grid(True, alpha=0.3)

        # 气泡图 - R² vs RMSE，气泡大小表示MAE
        ax_bubble = axes[1, 0]
        r2_scores = metrics_array[:, 0]
        rmse_scores = metrics_array[:, 1]
        mae_scores = metrics_array[:, 2]

        # 气泡大小（MAE越小气泡越大）
        bubble_sizes = 1000 / (mae_scores + 0.01)  # 避免除零

        scatter = ax_bubble.scatter(r2_scores, rmse_scores, s=bubble_sizes,
                                  c=range(len(model_names)), cmap='viridis', alpha=0.6, edgecolors='black')

        # 添加模型名称标签
        for i, name in enumerate(model_names):
            ax_bubble.annotate(name, (r2_scores[i], rmse_scores[i]),
                             xytext=(5, 5), textcoords='offset points', fontsize=8)

        ax_bubble.set_xlabel('R² Score')
        ax_bubble.set_ylabel('RMSE')
        ax_bubble.set_title('Performance Bubble Chart\n(Bubble size ∝ 1/MAE)')
        ax_bubble.grid(True, alpha=0.3)

        # 添加颜色条
        cbar = plt.colorbar(scatter, ax=ax_bubble)
        cbar.set_label('Model Index')

        # 综合排名条形图
        ax_ranking = axes[1, 1]
        avg_rankings = np.mean(rankings, axis=1)
        sorted_indices = np.argsort(avg_rankings)

        bars = ax_ranking.bar(range(len(model_names)),
                             avg_rankings[sorted_indices],
                             color=[self.colorblind_friendly_palette[i % len(self.colorblind_friendly_palette)]
                                   for i in range(len(model_names))], alpha=0.7)

        ax_ranking.set_xlabel('Models (Sorted by Average Rank)')
        ax_ranking.set_ylabel('Average Rank')
        ax_ranking.set_title('Overall Model Ranking')
        ax_ranking.set_xticks(range(len(model_names)))
        ax_ranking.set_xticklabels([model_names[i] for i in sorted_indices], rotation=45, ha='right')
        ax_ranking.grid(True, alpha=0.3, axis='y')

        # 添加数值标签
        for i, bar in enumerate(bars):
            height = bar.get_height()
            ax_ranking.text(bar.get_x() + bar.get_width()/2., height,
                           '.2f', ha='center', va='bottom')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Model ranking matrix plot saved to: {save_path}")

        # plt.show()

    def generate_comprehensive_report(self, save_dir='plots', prefix='regression_analysis'):
        """
        生成综合可视化报告

        Parameters:
        save_dir (str): 保存目录
        prefix (str): 文件名前缀
        """
        if not self.analyzer:
            print("No analyzer available for report generation")
            return

        os.makedirs(save_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # 生成各种图表
        plot_configs = [
            ('prediction_scatter', self.plot_prediction_scatter),
            ('residual_analysis', self.plot_residual_analysis),
            ('metrics_comparison', self.plot_metrics_comparison),
            ('error_distribution', self.plot_error_distribution),
            ('model_ranking', self.plot_model_ranking_matrix)
        ]

        saved_files = []
        for plot_name, plot_func in plot_configs:
            try:
                save_path = f'{save_dir}/{prefix}_{plot_name}_{timestamp}.png'
                plot_func(save_path=save_path)
                saved_files.append(save_path)
                plt.close('all')  # 关闭所有图形以释放内存
            except Exception as e:
                print(f"Failed to generate {plot_name} plot: {str(e)}")

        if saved_files:
            print("Comprehensive visualization report generated:")
            for file in saved_files:
                print(f"  - {file}")
        else:
            print("No plots were successfully generated")

        return saved_files


def main():
    """Demo entry point for the configuration-driven analysis workflow.

    Builds a ``RegressionConfig`` for a hard-coded CSV file, runs the analysis
    through ``RegressionAnalyzer.run_analysis_from_config`` and, when that
    succeeds, calls each plotting method once and then generates the full
    visualization report. The legacy parameter-passing workflow is kept at the
    end of the function as commented-out reference code.
    """

    print("="*60)
    print("Regression Analysis Tool - Configuration-Driven Interface")
    print("="*60)

    # Method 1: configuration-driven (recommended for GUI integration).
    print("\n--- Method 1: Configuration-Driven (Recommended for GUI) ---")

    # Build the configuration object.
    csv_file_path = r"E:\code\content\change\6.csv"

    config = RegressionConfig.create_default(
        csv_path=csv_file_path,
        label_column="0"
    )
    # Optional: customise the default configuration.
    config.data.spectrum_columns = "8:"  # spectrum column range
    config.models.model_names = 'all'  # presumably selects every available model - confirm
    config.models.tune_hyperparams = False  # quick run: skip hyperparameter tuning
    config.output.save_models = True  # save the trained model files
    config.output.plot_results = True  # enable plotting
    # Raw string: the backslashes in a Windows path must not be parsed as
    # escape sequences (non-raw '\c', '\p', '\y' trigger a SyntaxWarning).
    config.output.plot_dir = r'E:\code\content\change\plot\yellow'  # plot output directory

    # Create the analyzer from the configuration.
    analyzer = RegressionAnalyzer(config)

    # List the available models.
    analyzer.initialize_all_models()
    print("Available models:")
    for model_key, model_name in analyzer.get_available_models().items():
        print(f"  {model_key}: {model_name}")

    # Run the configuration-driven analysis; nothing to plot if it failed.
    success = analyzer.run_analysis_from_config()
    if not success:
        print("Configuration-driven analysis failed!")
        return

    print("Configuration-driven analysis completed successfully!")

    # Demonstrate each visualization method.
    print("\n--- Visualization Demo ---")

    # Create the demo output directory.
    viz_dir = 'visualization_demo'
    os.makedirs(viz_dir, exist_ok=True)

    print("Generating various visualization plots...")

    # 1. Predicted vs. true values scatter plot.
    print("1. Prediction vs True Values Scatter Plot...")
    analyzer.plot_prediction_scatter(
        save_path=f'{viz_dir}/prediction_scatter.png',
        show_individual=True,
        show_overlay=True
    )

    # 2. Residual analysis plot.
    print("2. Residual Analysis Plot...")
    analyzer.plot_residual_analysis(
        save_path=f'{viz_dir}/residual_analysis.png'
    )

    # 3. Performance metrics comparison plot.
    print("3. Performance Metrics Comparison...")
    analyzer.plot_metrics_comparison(
        save_path=f'{viz_dir}/metrics_comparison.png'
    )

    # 4. Error distribution plot.
    print("4. Error Distribution Analysis...")
    analyzer.plot_error_distribution(
        save_path=f'{viz_dir}/error_distribution.png'
    )

    # 5. Model ranking matrix.
    print("5. Model Ranking Matrix...")
    analyzer.plot_model_ranking(
        save_path=f'{viz_dir}/model_ranking.png'
    )

    # 6. Full visualization report.
    print("6. Generating Comprehensive Visualization Report...")
    saved_plots = analyzer.generate_visualization_report(
        save_dir=viz_dir,
        prefix='demo_report'
    )

    print(f"\nVisualization completed! Generated {len(saved_plots)} plot files in '{viz_dir}' directory:")
    for plot_file in saved_plots:
        print(f"  - {plot_file}")

    print("\nAvailable visualization methods:")
    print("  - analyzer.plot_prediction_scatter()      # 预测值vs真实值散点图")
    print("  - analyzer.plot_residual_analysis()      # 残差分析图")
    print("  - analyzer.plot_metrics_comparison()     # 性能指标对比图")
    print("  - analyzer.plot_error_distribution()     # 误差分布图")
    print("  - analyzer.plot_model_ranking()          # 模型排名矩阵")
    print("  - analyzer.generate_visualization_report() # 生成完整报告")

    # # Method 2: backward-compatible way (legacy parameter passing) - disabled reference code
    # print("\n--- Method 2: Backward Compatible (Legacy Parameter Passing) ---")
    #
    # analyzer2 = RegressionAnalyzer()  # use the default configuration
    #
    # # Pass everything as plain keyword arguments
    # success2 = analyzer2.run_complete_analysis(
    #     csv_path=r"E:\code\WQ\pipeline_result\work_dir\5_training_spectra\training_spectra.csv",
    #     label_column="0",
    #     spectrum_columns="13:",
    #     test_size=0.2,
    #     scale_method='standard',
    #     tune_hyperparams=False,
    #     save_models=False,
    #     plot_results=True,
    #     model_names=['xgboost', 'lightgbm']  # train only these two models
    # )
    #
    # if success2:
    #     print("Backward-compatible analysis completed successfully!")
    # else:
    #     print("Backward-compatible analysis failed!")
    #
    # print("\n" + "="*60)
    # print("Both methods are supported. Configuration-driven is recommended for GUI integration.")
    # print("="*60)


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()