# -*- coding: utf-8 -*- """ 建模步骤 包含 step6_train_models, step6_5_non_empirical_modeling, step6_75_custom_regression """ import time import json from pathlib import Path from typing import Optional, List, Union, Callable, Dict import pandas as pd import numpy as np # ============================================================ # 汉化 -> 英文 反向映射字典(UI 复选框显示文本 -> 底层算法键名) # ============================================================ # 模型名称:中文 (缩写) -> 英文键名 MODEL_NAME_MAP = { "多元线性回归 (MLR)": "LinearRegression", "岭回归 (Ridge)": "Ridge", "套索回归 (Lasso)": "Lasso", "弹性网络 (ElasticNet)": "ElasticNet", "偏最小二乘 (PLSR)": "PLS", "决策树 (CART)": "DecisionTree", "随机森林 (RF)": "RF", "极端随机树 (ET)": "ExtraTrees", "极值梯度提升 (XGBoost)": "XGBoost", "轻量梯度提升 (LightGBM)": "LightGBM", "类别梯度提升 (CatBoost)": "CatBoost", "梯度提升树 (GBDT)": "GradientBoosting", "自适应提升 (AdaBoost)": "AdaBoost", "支持向量回归 (SVR)": "SVR", "K近邻回归 (KNN)": "KNN", "多层感知机 (BP神经网络)": "MLP", } # 预处理方法:各种可能的中文变体 -> 标准键名 PREPROC_NAME_MAP = { # 无处理 "无 (None)": "None", "None": "None", # MMS "最小-最大归一化 (MMS)": "MMS", "MMS": "MMS", # SS "标度化 (SS)": "SS", "SS": "SS", # SNV "标准正态变换 (SNV)": "SNV", "SNV": "SNV", # MA "移动平均 (MA)": "MA", "MA": "MA", # SG "Savitzky-Golay (SG)": "SG", "SG": "SG", # MSC "多元散射校正 (MSC)": "MSC", "MSC": "MSC", # D1 "一阶导数 (D1)": "D1", "D1": "D1", # D2 "二阶导数 (D2)": "D2", "D2": "D2", # DT "去趋势 (DT)": "DT", "DT": "DT", # CT "中心化 (CT)": "CT", "CT": "CT", } # 数据划分方法:各种可能的中文变体 -> 标准键名 SPLIT_NAME_MAP = { "SPXY 算法 (考量X-Y空间)": "spxy", "spxy": "spxy", "KS 算法 (考量X空间)": "ks", "ks": "ks", "随机划分 (Random)": "random", "random": "random", } def _normalize_model_names(model_names: List[str]) -> List[str]: """清洗模型名称列表:将汉化显示文本还原为英文键名""" result = [] for name in model_names: if name in MODEL_NAME_MAP: result.append(MODEL_NAME_MAP[name]) else: # 已经是英文键名,直接保留 result.append(name) return result def _normalize_preprocessing_methods(methods: List[str]) -> List[str]: """清洗预处理方法列表:将汉化显示文本还原为标准键名""" result = [] for method in methods: if method in PREPROC_NAME_MAP: result.append(PREPROC_NAME_MAP[method]) else: # 已经是标准键名,直接保留 result.append(method) return result def _normalize_split_methods(methods: List[str]) -> List[str]: """清洗数据划分方法列表:将汉化显示文本还原为标准键名""" result = [] for method in methods: if method in SPLIT_NAME_MAP: result.append(SPLIT_NAME_MAP[method]) else: # 已经是标准键名,直接保留 result.append(method) return result class ModelingStep: """建模步骤""" # ---- Step 6: 训练机器学习模型 ---- @staticmethod def train_models( feature_start_column: str = "374.285004", preprocessing_methods: Optional[List[str]] = None, model_names: Optional[List[str]] = None, split_methods: Optional[List[str]] = None, cv_folds: int = 5, training_csv_path: Optional[str] = None, output_dir: Union[str, Path] = "./7_Supervised_Model_Training", callback: Optional[Callable] = None, _report_generator=None, ) -> str: """使用采样点光谱和实测值建立机器学习模型""" from src.core.modeling.modeling_batch import WaterQualityModelingBatch output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) def notify(status, msg=""): if callback: callback("步骤6", status, msg) print("\n" + "=" * 80) print("步骤6: 训练机器学习模型") print("=" * 80) step_start_time = time.time() if training_csv_path is None: raise ValueError("必须提供 training_csv_path 参数") # 检查模型目录是否已有模型 if output_dir.exists() and any(output_dir.iterdir()): has_models = False for item in output_dir.iterdir(): if item.is_dir(): model_files = ( list(item.glob("*.pkl")) + list(item.glob("*.joblib")) + list(item.glob("*.h5")) ) if model_files: has_models = True break if has_models: print(f"检测到已存在的模型文件,直接使用: {output_dir}") notify("skipped", f"模型目录已设置: {output_dir}") return str(output_dir) if preprocessing_methods is None: preprocessing_methods = ["None", "MMS", "SS", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "CT"] if model_names is None: model_names = ["SVR", "RF", "Ridge", "Lasso"] if split_methods is None: split_methods = ["spxy", "ks", "random"] # ---- 汉化清洗:将 UI 传来的中文/混合名称转换为底层英文键名 ---- preprocessing_methods = _normalize_preprocessing_methods(preprocessing_methods) model_names = _normalize_model_names(model_names) split_methods = _normalize_split_methods(split_methods) print(f"[参数清洗] 预处理方法: {preprocessing_methods}") print(f"[参数清洗] 模型名称: {model_names}") print(f"[参数清洗] 划分方法: {split_methods}") modeler = WaterQualityModelingBatch(str(output_dir)) modeler.train_models_batch( csv_path=training_csv_path, feature_start_column=feature_start_column, preprocessing_methods=preprocessing_methods, model_names=model_names, split_methods=split_methods, cv_folds=cv_folds, ) print(f"模型训练完成,结果保存在: {output_dir}") if _report_generator is not None: try: summary_path = _report_generator.generate_training_summary(str(output_dir)) print(f"训练摘要报告已生成: {summary_path}") except Exception as e: print(f"生成训练摘要报告时出错: {e}") notify("completed", f"模型训练完成: {output_dir}") return str(output_dir) # ---- Step 6.5: 非经验统计回归模型训练 ---- @staticmethod def train_non_empirical_models( csv_path: Optional[str] = None, preprocessing_methods: Optional[List[str]] = None, algorithms: Optional[List[str]] = None, value_cols: Union[int, Dict[str, int]] = 0, spectral_start_col: int = 1, spectral_end_col: Optional[int] = None, window: int = 5, output_dir: Optional[str] = None, enabled: bool = True, callback: Optional[Callable] = None, ) -> Dict[str, str]: """非经验统计回归模型训练""" def notify(status, msg=""): if callback: callback("步骤6.5", status, msg) print("\n" + "=" * 80) print("步骤6.5: 非经验统计回归模型训练") print("=" * 80) step_start_time = time.time() if not enabled: print("已设置跳过非经验模型训练(enabled=False)。") notify("skipped", "跳过的经验模型训练") return {} if csv_path is None: raise ValueError("必须提供 csv_path 参数") if output_dir is not None: non_empirical_dir = Path(output_dir) else: non_empirical_dir = Path.cwd() / "8_Regression_Modeling" non_empirical_dir.mkdir(parents=True, exist_ok=True) if preprocessing_methods is None: preprocessing_methods = ["None"] if algorithms is None: algorithms = ["chl_a", "nh3", "mno4", "tn", "tp", "tss"] if isinstance(value_cols, int): value_cols_dict = {algorithm: value_cols for algorithm in algorithms} elif isinstance(value_cols, dict): value_cols_dict = value_cols else: raise ValueError("value_cols 参数必须是整数或字典") if spectral_end_col is None: df = pd.read_csv(csv_path) spectral_end_col = len(df.columns) - 1 all_model_results = {} for preprocess in preprocessing_methods: preprocess_dir = non_empirical_dir / preprocess preprocess_dir.mkdir(parents=True, exist_ok=True) processed_csv_path = _apply_preprocessing_internal( csv_path, preprocess, preprocess_dir, spectral_start_col ) for algorithm in algorithms: algorithm_value_col = value_cols_dict[algorithm] print(f"\n训练 {preprocess} + {algorithm} 模型 (实测值列: {algorithm_value_col})...") model_outpath = str(preprocess_dir / f"{preprocess}_{algorithm}.json") if Path(model_outpath).exists(): print(f"检测到已存在的模型文件,直接使用: {model_outpath}") all_model_results[f"{preprocess}_{algorithm}"] = model_outpath continue try: from src.core.non_empirical_model_correction import run_model_correction run_model_correction( algorithm=algorithm, csv_file=processed_csv_path if Path(processed_csv_path).exists() else csv_path, value_col=algorithm_value_col, spectral_start=spectral_start_col, spectral_end=spectral_end_col, model_info_outpath=model_outpath, window=window, ) all_model_results[f"{preprocess}_{algorithm}"] = model_outpath print(f"模型训练完成: {model_outpath}") except Exception as e: print(f"训练 {preprocess}_{algorithm} 模型时出错: {e}") continue summary_path = _generate_non_empirical_summary(all_model_results, non_empirical_dir) notify("completed", f"非经验模型训练完成: {non_empirical_dir}") return all_model_results # ---- Step 6.75: 自定义回归分析 ---- @staticmethod def custom_regression( csv_path: Optional[str] = None, x_columns: Optional[Union[str, List[str]]] = None, y_columns: Optional[Union[str, List[str]]] = None, methods: Union[str, List[str]] = "all", output_dir: Optional[str] = None, enabled: bool = True, callback: Optional[Callable] = None, work_dir: Union[str, Path] = "./work_dir", ) -> Optional[str]: """使用自定义回归方法分析指标与目标参数之间的关系""" def notify(status, msg=""): if callback: callback("步骤6.75", status, msg) print("\n" + "=" * 80) print("步骤6.75: 自定义回归分析") print("=" * 80) step_start_time = time.time() if not enabled: print("已设置跳过自定义回归分析(enabled=False)。") notify("skipped", "跳过自定义回归分析") return None if csv_path is None: raise ValueError("必须提供 csv_path 参数") if y_columns is None: raise ValueError("必须指定 y_columns") if x_columns is None: raise ValueError("必须指定 x_columns") if isinstance(x_columns, str): x_columns = [x_columns] if isinstance(y_columns, str): y_columns = [y_columns] df = pd.read_csv(csv_path) missing_x = [col for col in x_columns if col not in df.columns] missing_y = [col for col in y_columns if col not in df.columns] if missing_x: raise ValueError(f"自变量列不存在: {missing_x}") if missing_y: raise ValueError(f"因变量列不存在: {missing_y}") if output_dir is None: custom_regression_dir = Path(work_dir) / "9_Custom_Regression_Modeling" else: custom_regression_dir = Path(work_dir) / output_dir custom_regression_dir.mkdir(parents=True, exist_ok=True) from src.core.modeling.regression import SingleVariableRegressionAnalysis analyzer = SingleVariableRegressionAnalysis() analyzer.batch_single_variable_regression( data=df, x_columns=x_columns, y_columns=y_columns, methods=methods, output_dir=str(custom_regression_dir), ) notify("completed", f"自定义回归结果已保存到目录: {custom_regression_dir}") return str(custom_regression_dir) # ============================================================ # 内部辅助函数(供 ModelingStep 内部使用) # ============================================================ def _apply_preprocessing_internal( csv_path: str, preprocess_method: str, output_dir: Path, spectral_start_col: int = 4, ) -> str: """应用预处理到CSV数据(内部函数)""" raw_p = str(preprocess_method).lower() if raw_p == "none" or "无" in raw_p or "跳过" in raw_p: preprocess_method = "None" elif raw_p == "mms" or "minmax" in raw_p or "最大最小" in raw_p: preprocess_method = "MMS" elif raw_p == "ss" or "标准" in raw_p or "标准化" in raw_p: preprocess_method = "SS" elif raw_p == "snv" or "标准正态" in raw_p: preprocess_method = "SNV" elif raw_p == "ma" or "移动" in raw_p: preprocess_method = "MA" elif raw_p == "sg" or "savitzky" in raw_p or "平滑" in raw_p: preprocess_method = "SG" elif raw_p == "msc" or "多元散射" in raw_p: preprocess_method = "MSC" elif raw_p in ("d1", "d2", "dt"): preprocess_method = {"d1": "D1", "d2": "D2", "dt": "DT"}.get(raw_p, raw_p.upper()) elif raw_p == "ct" or "去趋势" in raw_p: preprocess_method = "CT" if preprocess_method == "None": return csv_path output_filename = f"preprocessed_{preprocess_method}.csv" output_path = str(output_dir / output_filename) if Path(output_path).exists(): print(f"检测到已存在的预处理文件,直接使用: {output_path}") return output_path df = pd.read_csv(csv_path) non_spectral_cols = df.iloc[:, :spectral_start_col] spectral_data = df.iloc[:, spectral_start_col:] from src.preprocessing.spectral_Preprocessing import Preprocessing save_path = None if preprocess_method == "SS": models_dir = output_dir.parent.parent / "7_Supervised_Model_Training" models_dir.mkdir(parents=True, exist_ok=True) save_path = str(models_dir / "scaler_params.pkl") print(f"SS预处理: scaler模型将保存到 {save_path}") processed_spectral = Preprocessing(preprocess_method, spectral_data, save_path=save_path) if isinstance(processed_spectral, pd.DataFrame): processed_df = pd.concat([non_spectral_cols, processed_spectral], axis=1) else: processed_spectral_df = pd.DataFrame( processed_spectral, columns=spectral_data.columns, index=spectral_data.index ) processed_df = pd.concat([non_spectral_cols, processed_spectral_df], axis=1) processed_df.to_csv(output_path, index=False) print(f"预处理完成: {output_path}") return output_path def _generate_non_empirical_summary(model_results: Dict[str, str], output_dir: Path) -> str: """生成非经验模型训练结果汇总CSV""" summary_path = str(output_dir / "non_empirical_models_summary.csv") summary_data = [] for model_key, model_path in model_results.items(): try: parts = model_key.split("_") preprocess_method = parts[0] algorithm_name = "_".join(parts[1:]) if len(parts) > 2 else parts[1] with open(model_path, "r", encoding="utf-8") as f: model_info = json.load(f) accuracy_list = model_info.get("accuracy", []) summary_row = { "Preprocessing Method": preprocess_method, "Algorithm Name": algorithm_name, "Model Type": model_info.get("model_type", ""), "Coefficient Count": len(model_info.get("model_info", [])), "Average Accuracy(%)": np.mean(accuracy_list) if accuracy_list else 0, "Min Accuracy(%)": np.min(accuracy_list) if accuracy_list else 0, "Max Accuracy(%)": np.max(accuracy_list) if accuracy_list else 0, "Sample Count": len(model_info.get("long", [])), "Model File": model_path, } coefficients = model_info.get("model_info", []) for i, coeff in enumerate(coefficients[:5]): summary_row[f"系数_{i+1}"] = coeff summary_data.append(summary_row) except Exception as e: print(f"读取模型文件 {model_path} 时出错: {e}") continue if summary_data: df_summary = pd.DataFrame(summary_data) df_summary.to_csv(summary_path, index=False, encoding="utf-8-sig") print(f"汇总文件已生成: {summary_path}") else: print("警告: 没有有效的模型数据可汇总") summary_path = "" return summary_path