Initial commit of WQ_GUI

2026-04-08 15:25:08 +08:00
commit 91e36407ae
302 changed files with 40872 additions and 0 deletions


@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

File diff suppressed because it is too large


@@ -0,0 +1,894 @@
import numpy as np
import pandas as pd
import joblib
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple, Optional
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import scipy.stats as stats
warnings.filterwarnings('ignore')
# Font setup: CJK-capable fallbacks so Chinese target names render in figures
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 12
# Machine-learning model imports (regression models)
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cross_decomposition import PLSRegression
# Optional third-party boosters: import when available, otherwise disable them
try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    LGB_AVAILABLE = False
try:
    import catboost as cb
    CB_AVAILABLE = True
except ImportError:
    CB_AVAILABLE = False
# Spectral preprocessing module
from src.preprocessing.spectral_Preprocessing import Preprocessing
class WaterQualityScatterBatch:
"""水质参数反演批量散点图绘制类"""
def __init__(self):
"""初始化批量散点图绘制类"""
# 定义支持的回归模型及其参数网格
self.model_configs = {
'SVR': {
'model': SVR,
'params': {
'C': [0.1, 1, 10, 100],
'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
'kernel': ['rbf', 'poly', 'sigmoid'],
'epsilon': [0.01, 0.1, 0.2]
},
'available': True
},
'RF': {
'model': RandomForestRegressor,
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
},
'available': True
},
'KNN': {
'model': KNeighborsRegressor,
'params': {
'n_neighbors': [3, 5, 7, 9, 11],
'weights': ['uniform', 'distance'],
'metric': ['euclidean', 'manhattan', 'minkowski']
},
'available': True
},
'LinearRegression': {
'model': LinearRegression,
'params': {
'fit_intercept': [True, False]
},
'available': True
},
'Ridge': {
'model': Ridge,
'params': {
'alpha': [0.01, 0.1, 1, 10, 100],
'fit_intercept': [True, False]
},
'available': True
},
'Lasso': {
'model': Lasso,
'params': {
'alpha': [0.01, 0.1, 1, 10, 100],
'fit_intercept': [True, False],
'max_iter': [1000, 2000]
},
'available': True
},
'ElasticNet': {
'model': ElasticNet,
'params': {
'alpha': [0.01, 0.1, 1, 10],
'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
'fit_intercept': [True, False],
'max_iter': [1000, 2000]
},
'available': True
},
'XGBoost': {
                'model': None,  # xgboost was removed from the dependencies; entry kept but disabled
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.8, 0.9, 1.0]
},
'available': False
},
'LightGBM': {
'model': lgb.LGBMRegressor if LGB_AVAILABLE else None,
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'num_leaves': [31, 50, 100]
},
'available': LGB_AVAILABLE
},
'CatBoost': {
'model': cb.CatBoostRegressor if CB_AVAILABLE else None,
'params': {
'iterations': [50, 100, 200],
'depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'l2_leaf_reg': [1, 3, 5]
},
'available': CB_AVAILABLE
},
'PLS': {
'model': PLSRegression,
'params': {
'n_components': [2, 3, 5, 7, 10]
},
'available': True
}
}
        # Available preprocessing methods
self.preprocessing_methods = [
"None", "MMS", "SS", "CT", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "WVAE"
]
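        # Likely expansions (assumption; the authoritative list lives in spectral_Preprocessing):
        # MMS=min-max scaling, SS=standard scaling, CT=centering, SNV=standard normal variate,
        # MA=moving average, SG=Savitzky-Golay, MSC=multiplicative scatter correction,
        # D1/D2=first/second derivative, DT=detrending, WVAE=wavelet-based denoising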
        # Sample-splitting methods
self.split_methods = ["random", "spxy", "ks"]
    def load_data(self, csv_path: str, target_column_name: str = None, target_column: int = None, feature_start_column: int = 13) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Load data from a CSV file.
        Args:
            csv_path: path to the CSV file
            target_column_name: name of the target column (takes precedence)
            target_column: index of the target column (used when no name is given)
            feature_start_column: index of the first feature column
        Returns:
            X: feature data
            y: target values
        """
        data = pd.read_csv(csv_path)
        # Select the target column by name or by index
        if target_column_name and target_column_name in data.columns:
            print(f"Using column name '{target_column_name}' as the target")
            y = data[target_column_name]
            target_col_index = data.columns.get_loc(target_column_name)
        elif target_column is not None:
            print(f"Using column index {target_column} as the target")
            y = data.iloc[:, target_column]
            target_col_index = target_column
        else:
            raise ValueError("Either target_column_name or target_column must be given")
        # Drop rows with missing target values, then extract the features
        mask = ~y.isna()
        data_cleaned = data[mask]
        if target_column_name and target_column_name in data.columns:
            y = data_cleaned[target_column_name]
        else:
            y = data_cleaned.iloc[:, target_col_index]
        X = data_cleaned.iloc[:, feature_start_column:]
        print("Data loaded:")
        print(f"  target column: {target_column_name if target_column_name else f'index {target_col_index}'}")
        print(f"  samples: {X.shape[0]}")
        print(f"  features: {X.shape[1]}")
        print(f"  target range: {y.min():.4f} ~ {y.max():.4f}")
        print(f"  target mean: {y.mean():.4f}")
        return X, y
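    # Assumed CSV layout: columns [0, feature_start_column) hold sample metadata and
    # measured targets; spectral features start at feature_start_column (default 13).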
    def preprocess_data(self, X: pd.DataFrame, method: str) -> np.ndarray:
        """
        Preprocess the data.
        Args:
            X: raw feature data
            method: preprocessing method
        Returns:
            the preprocessed data
        """
        print(f"Applying preprocessing method: {method}")
        # "None" means no preprocessing: return the raw data
        if method == "None" or method is None:
            print("Skipping preprocessing; using raw data")
            return X.values
        try:
            X_processed = Preprocessing(method, X)
            # Make sure a NumPy array is returned
            if isinstance(X_processed, pd.DataFrame):
                X_processed = X_processed.values
            print(f"Preprocessing done; data shape: {X_processed.shape}")
            return X_processed
        except Exception as e:
            print(f"Preprocessing failed: {e}")
            print("Falling back to raw data")
            return X.values
def random(self, data, label, test_ratio=0.2, random_state=123):
"""随机划分数据集"""
X_train, X_test, y_train, y_test = train_test_split(
data, label, test_size=test_ratio, random_state=random_state
)
return X_train, X_test, y_train, y_test
def spxy(self, data, label, test_size=0.2):
"""SPXY算法划分数据集"""
# 确保 data 和 label 是 NumPy 数组
data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
label = label.to_numpy() if isinstance(label, pd.Series) else label
        # Keep the original labels; the standardized copy below is only used for distances
        y_backup = label
M = data.shape[0]
N = round((1 - test_size) * M)
samples = np.arange(M)
        # Standardize the labels
label = (label - np.mean(label)) / np.std(label)
D = np.zeros((M, M))
Dy = np.zeros((M, M))
        # Pairwise distances between samples, in X and in y
for i in range(M - 1):
xa = data[i, :]
ya = label[i]
for j in range((i + 1), M):
xb = data[j, :]
yb = label[j]
D[i, j] = np.linalg.norm(xa - xb)
Dy[i, j] = np.linalg.norm(ya - yb)
        # Normalize and combine the two distance matrices
Dmax = np.max(D)
Dymax = np.max(Dy)
D = D / Dmax + Dy / Dymax
        # Seed the selection with the two most distant samples
maxD = D.max(axis=0)
index_row = D.argmax(axis=0)
index_column = maxD.argmax()
m = np.zeros(N, dtype=int)
m[0] = index_row[index_column]
m[1] = index_column
dminmax = np.zeros(N)
dminmax[1] = D[m[0], m[1]]
        # Greedily add the sample with the largest minimum distance to the selected set
for i in range(2, N):
pool = np.delete(samples, m[:i])
dmin = np.zeros(M - i)
for j in range(M - i):
indexa = pool[j]
d = np.zeros(i)
for k in range(i):
indexb = m[k]
if indexa < indexb:
d[k] = D[indexa, indexb]
else:
d[k] = D[indexb, indexa]
dmin[j] = np.min(d)
dminmax[i] = np.max(dmin)
index = np.argmax(dmin)
m[i] = pool[index]
m_complement = np.delete(samples, m)
        # Split into training and test sets (with the original labels)
X_train = data[m, :]
y_train = y_backup[m]
X_test = data[m_complement, :]
y_test = y_backup[m_complement]
return X_train, X_test, y_train, y_test
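    # Minimal SPXY usage sketch (shapes assumed): for spectra X of shape (n, p) and
    # labels y of shape (n,),
    #   X_tr, X_te, y_tr, y_te = WaterQualityScatterBatch().spxy(X, y, test_size=0.2)
    # Extreme samples land in the training set, and the O(n^2) distance matrix
    # limits n in practice.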
def ks(self, data, label, test_size=0.2):
"""Kennard-Stone算法划分数据集"""
# 确保 data 和 label 是 NumPy 数组
data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
label = label.to_numpy() if isinstance(label, pd.Series) else label
M = data.shape[0]
N = round((1 - test_size) * M)
samples = np.arange(M)
D = np.zeros((M, M))
for i in range((M - 1)):
xa = data[i, :]
for j in range((i + 1), M):
xb = data[j, :]
D[i, j] = np.linalg.norm(xa - xb)
maxD = np.max(D, axis=0)
index_row = np.argmax(D, axis=0)
index_column = np.argmax(maxD)
        m = np.zeros(N, dtype=int)
        m[0] = index_row[index_column]
        m[1] = index_column
dminmax = np.zeros(N)
dminmax[1] = D[m[0], m[1]]
for i in range(2, N):
pool = np.delete(samples, m[:i])
dmin = np.zeros((M - i))
for j in range((M - i)):
indexa = pool[j]
d = np.zeros(i)
for k in range(i):
indexb = m[k]
if indexa < indexb:
d[k] = D[indexa, indexb]
else:
d[k] = D[indexb, indexa]
dmin[j] = np.min(d)
dminmax[i] = np.max(dmin)
index = np.argmax(dmin)
m[i] = pool[index]
m_complement = np.delete(np.arange(data.shape[0]), m)
X_train = data[m, :]
y_train = label[m]
X_test = data[m_complement, :]
y_test = label[m_complement]
return X_train, X_test, y_train, y_test
def split_data(self, X: np.ndarray, y: pd.Series, method: str = "random",
test_size: float = 0.2, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""
根据指定方法划分数据集
"""
print(f"使用 {method} 方法划分数据集")
if method == "random":
return self.random(X, y, test_ratio=test_size, random_state=random_state)
elif method == "spxy":
return self.spxy(X, y, test_size=test_size)
elif method == "ks":
return self.ks(X, y, test_size=test_size)
else:
raise ValueError(f"不支持的划分方法: {method}. 支持的方法: {self.split_methods}")
def plot_scatter_with_confidence(self, y_train, y_pred_train, y_test, y_pred_test,
r2_train, mae_train, r2_test, mae_test,
folder_name, split_method, preprocess_method, model_name,
save_path):
"""
绘制带置信区间的散点图,模仿提供的代码样式
参数:
- y_train, y_pred_train: 训练集的真实值和预测值
- y_test, y_pred_test: 测试集的真实值和预测值
- r2_train, mae_train: 训练集的R²和MAE指标
- r2_test, mae_test: 测试集的R²和MAE指标
- folder_name: 文件夹名称
- split_method: 数据划分方法
- preprocess_method: 预处理方法
- model_name: 模型名称
- save_path: 保存路径
"""
# scale_factor 用于放大置信区间
scale_factor = 1.5 # 调整这个值,越大置信区间越宽 scale_factor = 1 是理论上的标准置信区间宽度
confidence = 0.95 # 95% 的置信水平
        # Fit a line to the training set
z_train = np.polyfit(y_train, y_pred_train, 1)
p_train = np.poly1d(z_train)
predicted_values_train = p_train(y_train)
residuals_train = y_pred_train - predicted_values_train
mean_error_train = np.mean(residuals_train**2)
t_value_train = stats.t.ppf((1 + confidence) / 2., len(y_train) - 1)
x_extended_train = np.linspace(min(y_train), max(y_train), 100)
predicted_extended_train = p_train(x_extended_train)
ci_extended_train = t_value_train * scale_factor * np.sqrt(mean_error_train) * np.sqrt(1 / len(y_train) + (x_extended_train - np.mean(y_train))**2 / np.sum((y_train - np.mean(y_train))**2))
        # Fit a line to the test set
z_test = np.polyfit(y_test, y_pred_test, 1)
p_test = np.poly1d(z_test)
predicted_values_test = p_test(y_test)
residuals_test = y_pred_test - predicted_values_test
mean_error_test = np.mean(residuals_test**2)
t_value_test = stats.t.ppf((1 + confidence) / 2., len(y_test) - 1)
x_extended_test = np.linspace(min(y_test), max(y_test), 100)
predicted_extended_test = p_test(x_extended_test)
ci_extended_test = t_value_test * scale_factor * np.sqrt(mean_error_test) * np.sqrt(1 / len(y_test) + (x_extended_test - np.mean(y_test))**2 / np.sum((y_test - np.mean(y_test))**2))
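        # Band shape: the nominal 95% CI of the fitted mean response at x is
        #   t * sqrt(MSE) * sqrt(1/n + (x - x_mean)^2 / sum((x_i - x_mean)^2)),
        # evaluated on a 100-point grid and widened by scale_factor.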
        # Color scheme
        train_color = '#1f77b4'   # train: blue
        test_color = '#ff7f0e'    # test: orange
        confidence_train_color = '#aec7e8'  # train confidence band: light blue
        confidence_test_color = '#ffbb78'   # test confidence band: light orange
        # Figure layout: main scatter panel plus marginal histograms
        fig = plt.figure(figsize=(10, 8), dpi=300)  # moderate dpi for compatibility
        gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)
        ax_main = fig.add_subplot(gs[1:, :-1])                   # main panel
        ax_hist_x = fig.add_subplot(gs[0, :-1], sharex=ax_main)  # top histogram
        ax_hist_y = fig.add_subplot(gs[1:, -1], sharey=ax_main)  # right histogram
        # Training set: scatter, fit line, confidence band
        ax_main.scatter(y_train, y_pred_train, color=train_color, label="Train predictions", alpha=0.6)
        ax_main.plot(y_train, p_train(y_train), color=train_color, alpha=0.9,
                     label=f"Train fit\n$R^2$ = {r2_train:.2f}, MAE = {mae_train:.2f}")
        ax_main.fill_between(x_extended_train, predicted_extended_train - ci_extended_train,
                             predicted_extended_train + ci_extended_train,
                             color=confidence_train_color, alpha=0.5, label="Train 95% CI")
        # Test set: scatter, fit line, confidence band
        ax_main.scatter(y_test, y_pred_test, color=test_color, label="Test predictions", alpha=0.6)
        ax_main.plot(y_test, p_test(y_test), color=test_color, alpha=0.9,
                     label=f"Test fit\n$R^2$ = {r2_test:.2f}, MAE = {mae_test:.2f}")
        ax_main.fill_between(x_extended_test, predicted_extended_test - ci_extended_test,
                             predicted_extended_test + ci_extended_test,
                             color=confidence_test_color, alpha=0.5, label="Test 95% CI")
        # 1:1 reference line
        ax_main.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
                     [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
                     color='grey', linestyle='--', alpha=0.6, label="1:1 line")
        # Main-panel cosmetics
        ax_main.set_xlabel("Observed", fontsize=12)
        ax_main.set_ylabel("Predicted", fontsize=12)
        ax_main.legend(loc="upper left", fontsize=10)
        ax_main.grid(True, alpha=0.3)
        # Top histogram: distribution of observed values
        ax_hist_x.hist(y_train, bins=20, color=train_color, alpha=0.7, edgecolor='black', label="Train observed")
        ax_hist_x.hist(y_test, bins=20, color=test_color, alpha=0.7, edgecolor='black', label="Test observed")
        ax_hist_x.tick_params(labelbottom=False)  # hide x tick labels
        ax_hist_x.set_ylabel("Count", fontsize=10)
        ax_hist_x.legend(fontsize=8)
        # Right histogram: distribution of predicted values
        ax_hist_y.hist(y_pred_train, bins=20, orientation='horizontal', color=train_color, alpha=0.7, edgecolor='black')
        ax_hist_y.hist(y_pred_test, bins=20, orientation='horizontal', color=test_color, alpha=0.7, edgecolor='black')
        ax_hist_y.set_xlabel("Count", fontsize=10)
        ax_hist_y.tick_params(labelleft=False)  # hide y tick labels
        # Title
        title = f'{folder_name} - best-model prediction comparison\n'
        title += f'{split_method}_{preprocess_method}_{model_name}'
        fig.suptitle(title, fontsize=14, fontweight='bold')
        # Save the figure
        plt.tight_layout()
        plt.savefig(save_path, format='png', bbox_inches='tight', dpi=300)
        print(f"Scatter plot saved to: {save_path}")
    def get_best_model_from_summary(self, artifacts_dir: Path, metric: str = 'test_r2', target_column_name: str = None) -> Tuple[str, str, Dict]:
        """
        Pick the best model from the training summary.
        Args:
            artifacts_dir: model directory
            metric: evaluation metric
            target_column_name: target column name (used to build file paths)
        Returns:
            model_file_prefix: "<split_method>_<preprocess_method>" file prefix
            model_name: model name
            best_result: details of the best model
        """
        if target_column_name:
            # Sanitize the target column name for use in file names
            safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
            # Prefer result files prefixed with the target column name
            detailed_path = artifacts_dir / f"{safe_target_name}_detailed_results.csv"
            summary_path = artifacts_dir / f"{safe_target_name}_training_summary.csv"
        else:
            # Legacy layout with fixed file names
            detailed_path = artifacts_dir / "detailed_results.csv"
            summary_path = artifacts_dir / "training_summary.csv"
summary_df = None
        # Prefer the detailed results file
        if detailed_path.exists():
            print(f"Using detailed results file: {detailed_path}")
            summary_df = pd.read_csv(detailed_path)
            # Map English metric keys to the Chinese column headers used in the CSV
            metric_mapping = {
                'test_r2': '测试集R²',
                'train_r2': '训练集R²',
                'test_rmse': '测试集RMSE',
                'train_rmse': '训练集RMSE',
                'cv_mean': 'CV均值'
            }
            if metric in metric_mapping and metric_mapping[metric] in summary_df.columns:
                metric_col = metric_mapping[metric]
            else:
                metric_col = metric
        elif summary_path.exists():
            print(f"Using training summary file: {summary_path}")
            summary_df = pd.read_csv(summary_path)
            metric_col = metric
        else:
            # Fall back to the legacy (unprefixed) file names
            if target_column_name:
                old_detailed_path = artifacts_dir / "detailed_results.csv"
                old_summary_path = artifacts_dir / "training_summary.csv"
                if old_detailed_path.exists():
                    print(f"Using legacy detailed results file: {old_detailed_path}")
                    summary_df = pd.read_csv(old_detailed_path)
                    # Map English metric keys to the Chinese column headers used in the CSV
                    metric_mapping = {
                        'test_r2': '测试集R²',
                        'train_r2': '训练集R²',
                        'test_rmse': '测试集RMSE',
                        'train_rmse': '训练集RMSE',
                        'cv_mean': 'CV均值'
                    }
                    if metric in metric_mapping and metric_mapping[metric] in summary_df.columns:
                        metric_col = metric_mapping[metric]
                    else:
                        metric_col = metric
                elif old_summary_path.exists():
                    print(f"Using legacy training summary file: {old_summary_path}")
                    summary_df = pd.read_csv(old_summary_path)
                    metric_col = metric
                else:
                    raise FileNotFoundError(
                        f"No training summary found; looked for {summary_path}, {detailed_path}, "
                        f"{old_summary_path} and {old_detailed_path}"
                    )
            else:
                raise FileNotFoundError(f"No training summary found; looked for {summary_path} and {detailed_path}")
        if summary_df.empty:
            raise ValueError("Training summary is empty")
        # Check that the metric column exists
        if metric_col not in summary_df.columns:
            available_cols = list(summary_df.columns)
            raise ValueError(f"Metric '{metric_col}' not found. Available columns: {available_cols}")
        # Higher is better for R²/score metrics; lower is better for RMSE/MAE
        if 'r2' in metric.lower() or 'score' in metric.lower():
            best_idx = summary_df[metric_col].idxmax()
        else:
            best_idx = summary_df[metric_col].idxmin()
        best_row = summary_df.loc[best_idx]
        # Parse the model information; the layout depends on the file type
        if '划分方法' in summary_df.columns:
            # Detailed results file (Chinese column headers)
            split_method = best_row['划分方法']          # split method
            preprocess_method = best_row['预处理方法']   # preprocessing method
            model_name = best_row['建模方法']            # modeling method
            best_combination = f"{split_method}_{preprocess_method}_{model_name}"
        else:
            # Simplified results file (English column headers)
            best_combination = best_row['combination']
            # Combination format: split_method_preprocess_method_model_name
            parts = best_combination.split('_')
            if len(parts) < 3:
                raise ValueError(f"Invalid model combination name: {best_combination}")
            split_method = parts[0]
            preprocess_method = parts[1]
            model_name = '_'.join(parts[2:])
        print(f"Best model combination: {best_combination}")
        print(f"  split method: {split_method}")
        print(f"  preprocessing method: {preprocess_method}")
        print(f"  model name: {model_name}")
        print(f"  {metric_col}: {best_row[metric_col]:.4f}")
        # File prefix used when loading the saved model
        model_file_prefix = f"{split_method}_{preprocess_method}"
        # Assemble the result record
        best_result = {
            'combination': best_combination,
            'split_method': split_method,
            'preprocess_method': preprocess_method,
            'model_name': model_name,
            'metric_value': best_row[metric_col],
            'model_file_prefix': model_file_prefix
        }
        # Copy any additional metric columns (skip the descriptive Chinese headers)
        for col in summary_df.columns:
            if col not in ['combination', '划分方法', '预处理方法', '建模方法', '最佳参数']:
                try:
                    best_result[col] = best_row[col]
                except Exception:
                    pass
        return model_file_prefix, model_name, best_result
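    # Hypothetical call (the path and target name are illustrative only):
    #   prefix, name, info = self.get_best_model_from_summary(Path("models/COD"), "test_r2", "COD")
    # would return something like ("spxy_SNV", "SVR", {...}) for the best-scoring row.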
    def load_model(self, artifacts_dir: Path, preprocess_method: str, model_name: str, target_column_name: str = None):
        """
        Load a saved model.
        Args:
            artifacts_dir: model directory
            preprocess_method: preprocessing method name (or a "<split>_<preprocess>" prefix)
            model_name: model name
            target_column_name: target column name (used to build file paths)
        Returns:
            the loaded model payload
        """
        if target_column_name:
            # Sanitize the target column name for use in file names
            safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
            # Prefer the model file prefixed with the target column name
            filename = f"{safe_target_name}_{preprocess_method}_{model_name}.joblib"
            filepath = artifacts_dir / filename
            if filepath.exists():
                print(f"Loading model file: {filepath}")
                return joblib.load(filepath)
            # Fall back to the legacy (unprefixed) file name
            old_filename = f"{preprocess_method}_{model_name}.joblib"
            old_filepath = artifacts_dir / old_filename
            if old_filepath.exists():
                print(f"Loading legacy model file: {old_filepath}")
                return joblib.load(old_filepath)
            raise FileNotFoundError(f"Model file not found: {filepath} or {old_filepath}")
        else:
            # Legacy layout with fixed file names
            filename = f"{preprocess_method}_{model_name}.joblib"
            filepath = artifacts_dir / filename
            if not filepath.exists():
                raise FileNotFoundError(f"Model file not found: {filepath}")
            return joblib.load(filepath)
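    # The loaded .joblib payload is assumed to be a dict with at least a 'model' key
    # holding the fitted estimator; plot_best_model_scatter below relies on that.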
def plot_best_model_scatter(self, artifacts_dir: str, csv_path: str, output_dir: str,
folder_name: str, metric: str = 'test_r2',
target_column: int = None, feature_start_column: int = 13,
test_size: float = 0.2, random_state: int = 42):
"""
绘制最佳模型的散点图
Args:
artifacts_dir: 模型目录
csv_path: 原始CSV数据文件路径
output_dir: 输出目录
folder_name: 文件夹名称(用作图片名称和目标列名)
metric: 评估指标
target_column: 目标值列索引如果为None则使用folder_name作为列名
feature_start_column: 特征开始列索引
test_size: 测试集比例
random_state: 随机种子
"""
artifacts_path = Path(artifacts_dir)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
        try:
            print(f"\n{'='*60}")
            print(f"Processing folder: {folder_name}")
            print(f"{'='*60}")
            # Look up the best model
            model_file_prefix, model_name, best_result = self.get_best_model_from_summary(
                artifacts_path, metric, folder_name
            )
            # Load the data, preferring the folder name as the target column name
            X_raw, y_true = self.load_data(csv_path, target_column_name=folder_name, target_column=target_column, feature_start_column=feature_start_column)
            # Preprocessing and split settings of the best model
            actual_preprocess_method = best_result['preprocess_method']
            split_method = best_result['split_method']
            # Load the best model
            best_model_data = self.load_model(artifacts_path, model_file_prefix, model_name, folder_name)
            best_model = best_model_data['model']
            # Apply the same preprocessing as during training
            X_processed = self.preprocess_data(X_raw, actual_preprocess_method)
            # Apply the same data split as during training
            X_train, X_test, y_train, y_test = self.split_data(
                X_processed, y_true, method=split_method,
                test_size=test_size, random_state=random_state
            )
            # Predict on the training and test sets
            y_pred_train = best_model.predict(X_train)
            y_pred_test = best_model.predict(X_test)
            # Evaluation metrics
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)
            train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
            train_mae = mean_absolute_error(y_train, y_pred_train)
            test_mae = mean_absolute_error(y_test, y_pred_test)
            # Draw the scatter plot with confidence bands
            self.plot_scatter_with_confidence(
                y_train, y_pred_train, y_test, y_pred_test,
                train_r2, train_mae, test_r2, test_mae,
                folder_name, split_method, actual_preprocess_method, model_name,
                output_path / f"{folder_name}_scatter_with_confidence.png"
            )
            plt.close()  # release figure memory
return {
'status': 'success',
'save_path': str(output_path / f"{folder_name}_scatter_with_confidence.png"),
'best_result': best_result,
'metrics': {
'train_r2': train_r2,
'test_r2': test_r2,
'train_rmse': train_rmse,
'test_rmse': test_rmse,
'train_mae': train_mae,
'test_mae': test_mae
}
}
        except Exception as e:
            print(f"Failed to process folder {folder_name}: {e}")
return {
'status': 'error',
'error': str(e)
}
def batch_plot_scatter(self, models_root_dir: str, csv_path: str, output_dir: str,
metric: str = 'test_r2', target_column: int = None,
feature_start_column: int = 13, test_size: float = 0.2,
random_state: int = 42):
"""
批量处理多个子文件夹中的模型并绘制散点图
Args:
models_root_dir: 包含多个子文件夹的根目录
csv_path: 原始CSV数据文件路径
output_dir: 输出目录
metric: 评估指标
target_column: 目标值列索引如果为None则使用文件夹名称作为列名
feature_start_column: 特征开始列索引
test_size: 测试集比例
random_state: 随机种子
"""
models_root = Path(models_root_dir)
        # Collect all sub-folders
        subdirs = [d for d in models_root.iterdir() if d.is_dir()]
        if not subdirs:
            print(f"No sub-folders found in {models_root_dir}")
            return {}
        print("=" * 80)
        print("Batch scatter-plot task")
        print("=" * 80)
        print(f"Model root directory: {models_root_dir}")
        print(f"Data file: {csv_path}")
        print(f"Output directory: {output_dir}")
        print(f"Evaluation metric: {metric}")
        print(f"Found {len(subdirs)} model sub-folders")
        print("=" * 80)
all_results = {}
for subdir in subdirs:
folder_name = subdir.name
result = self.plot_best_model_scatter(
artifacts_dir=str(subdir),
csv_path=csv_path,
output_dir=output_dir,
folder_name=folder_name,
metric=metric,
target_column=target_column,
feature_start_column=feature_start_column,
test_size=test_size,
random_state=random_state
)
all_results[folder_name] = result
print(f"\n{'='*80}")
print(f"批量散点图绘制完成,共处理 {len(subdirs)} 个模型文件夹")
print(f"{'='*80}")
# 打印汇总信息
print("\n汇总结果:")
success_count = 0
for folder_name, result in all_results.items():
if result['status'] == 'success':
metrics = result['metrics']
print(f"{folder_name}: 测试集R²={metrics['test_r2']:.4f}, "
f"RMSE={metrics['test_rmse']:.4f}")
success_count += 1
else:
print(f"{folder_name}: 失败 - {result['error']}")
print(f"\n成功处理: {success_count}/{len(subdirs)} 个文件夹")
print(f"输出目录: {output_dir}")
return all_results
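    # Expected artifact layout (assumption, inferred from the loaders above):
    #   models_root_dir/<target>/<target>_detailed_results.csv
    #   models_root_dir/<target>/<target>_<split>_<preprocess>_<model>.joblib
    # where each sub-folder is named after the target column it models.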
def main():
    """Example entry point"""
    # Create the batch scatter plotter
    scatter_batch = WaterQualityScatterBatch()
    # Configure paths
    models_root_dir = r"E:\code\WQ\yaobao925\qvchuyaoban"  # root directory with model sub-folders
    csv_path = r"E:\code\WQ\yaobao925\data\qvyaoban\data.csv"  # original data file
    output_dir = r"E:\code\WQ\yaobao925\plot\qvyaoban_sctter"  # scatter-plot output directory
    # Draw the scatter plots in batch
    results = scatter_batch.batch_plot_scatter(
        models_root_dir=models_root_dir,
        csv_path=csv_path,
        output_dir=output_dir,
        metric='test_r2',          # evaluation metric
        target_column=None,        # use folder names as target column names
        feature_start_column=13,   # index of the first feature column
        test_size=0.2,             # test-set fraction
        random_state=42            # random seed
    )
    print("\nDone!")
if __name__ == "__main__":
main()
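# Note: the paths in main() are machine-specific; point models_root_dir, csv_path and
# output_dir at your own training artifacts before running this script directly.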