WQ_GUI/src/core/steps/modeling_step.py

# -*- coding: utf-8 -*-
"""
建模步骤

包含 step6_train_models, step6_5_non_empirical_modeling, step6_75_custom_regression
"""

import time
import json
from pathlib import Path
from typing import Optional, List, Union, Callable, Dict

import pandas as pd
import numpy as np


# ============================================================
# 汉化 -> 英文 反向映射字典（UI 复选框显示文本 -> 底层算法键名）
# ============================================================

# 模型名称：中文 (缩写) -> 英文键名
MODEL_NAME_MAP = {
    "多元线性回归 (MLR)": "LinearRegression",
    "岭回归 (Ridge)": "Ridge",
    "套索回归 (Lasso)": "Lasso",
    "弹性网络 (ElasticNet)": "ElasticNet",
    "偏最小二乘 (PLSR)": "PLS",
    "决策树 (CART)": "DecisionTree",
    "随机森林 (RF)": "RF",
    "极端随机树 (ET)": "ExtraTrees",
    "极值梯度提升 (XGBoost)": "XGBoost",
    "轻量梯度提升 (LightGBM)": "LightGBM",
    "类别梯度提升 (CatBoost)": "CatBoost",
    "梯度提升树 (GBDT)": "GradientBoosting",
    "自适应提升 (AdaBoost)": "AdaBoost",
    "支持向量回归 (SVR)": "SVR",
    "K近邻回归 (KNN)": "KNN",
    "多层感知机 (BP神经网络)": "MLP",
}

# 预处理方法：各种可能的中文变体 -> 标准键名
PREPROC_NAME_MAP = {
    # 无处理
    "无 (None)": "None",
    "None": "None",
    # MMS
    "最小-最大归一化 (MMS)": "MMS",
    "MMS": "MMS",
    # SS
    "标度化 (SS)": "SS",
    "SS": "SS",
    # SNV
    "标准正态变换 (SNV)": "SNV",
    "SNV": "SNV",
    # MA
    "移动平均 (MA)": "MA",
    "MA": "MA",
    # SG
    "Savitzky-Golay (SG)": "SG",
    "SG": "SG",
    # MSC
    "多元散射校正 (MSC)": "MSC",
    "MSC": "MSC",
    # D1
    "一阶导数 (D1)": "D1",
    "D1": "D1",
    # D2
    "二阶导数 (D2)": "D2",
    "D2": "D2",
    # DT
    "去趋势 (DT)": "DT",
    "DT": "DT",
    # CT
    "中心化 (CT)": "CT",
    "CT": "CT",
}

# 数据划分方法：各种可能的中文变体 -> 标准键名
SPLIT_NAME_MAP = {
    "SPXY 算法 (考量X-Y空间)": "spxy",
    "spxy": "spxy",
    "KS 算法 (考量X空间)": "ks",
    "ks": "ks",
    "随机划分 (Random)": "random",
    "random": "random",
}


def _normalize_model_names(model_names: List[str]) -> List[str]:
    """清洗模型名称列表：将汉化显示文本还原为英文键名"""
    result = []
    for name in model_names:
        if name in MODEL_NAME_MAP:
            result.append(MODEL_NAME_MAP[name])
        else:
            # 已经是英文键名，直接保留
            result.append(name)
    return result


def _normalize_preprocessing_methods(methods: List[str]) -> List[str]:
    """清洗预处理方法列表：将汉化显示文本还原为标准键名"""
    result = []
    for method in methods:
        if method in PREPROC_NAME_MAP:
            result.append(PREPROC_NAME_MAP[method])
        else:
            # 已经是标准键名，直接保留
            result.append(method)
    return result


def _normalize_split_methods(methods: List[str]) -> List[str]:
    """清洗数据划分方法列表：将汉化显示文本还原为标准键名"""
    result = []
    for method in methods:
        if method in SPLIT_NAME_MAP:
            result.append(SPLIT_NAME_MAP[method])
        else:
            # 已经是标准键名，直接保留
            result.append(method)
    return result


class ModelingStep:
    """建模步骤"""

    # ---- Step 6: 训练机器学习模型 ----

    @staticmethod
    def train_models(
        feature_start_column: str = "374.285004",
        preprocessing_methods: Optional[List[str]] = None,
        model_names: Optional[List[str]] = None,
        split_methods: Optional[List[str]] = None,
        cv_folds: int = 5,
        training_csv_path: Optional[str] = None,
        output_dir: Union[str, Path] = "./7_Supervised_Model_Training",
        callback: Optional[Callable] = None,
        _report_generator=None,
    ) -> str:
        """使用采样点光谱和实测值建立机器学习模型"""
        from src.core.modeling.modeling_batch import WaterQualityModelingBatch

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        def notify(status, msg=""):
            if callback:
                callback("步骤6", status, msg)

        print("\n" + "=" * 80)
        print("步骤6: 训练机器学习模型")
        print("=" * 80)

        step_start_time = time.time()

        if training_csv_path is None:
            raise ValueError("必须提供 training_csv_path 参数")

        # 检查模型目录是否已有模型
        if output_dir.exists() and any(output_dir.iterdir()):
            has_models = False
            for item in output_dir.iterdir():
                if item.is_dir():
                    model_files = (
                        list(item.glob("*.pkl"))
                        + list(item.glob("*.joblib"))
                        + list(item.glob("*.h5"))
                    )
                    if model_files:
                        has_models = True
                        break
            if has_models:
                print(f"检测到已存在的模型文件，直接使用: {output_dir}")
                notify("skipped", f"模型目录已设置: {output_dir}")
                return str(output_dir)

        if preprocessing_methods is None:
            preprocessing_methods = ["None", "MMS", "SS", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "CT"]
        if model_names is None:
            model_names = ["SVR", "RF", "Ridge", "Lasso"]
        if split_methods is None:
            split_methods = ["spxy", "ks", "random"]

        # ---- 汉化清洗：将 UI 传来的中文/混合名称转换为底层英文键名 ----
        preprocessing_methods = _normalize_preprocessing_methods(preprocessing_methods)
        model_names = _normalize_model_names(model_names)
        split_methods = _normalize_split_methods(split_methods)

        print(f"[参数清洗] 预处理方法: {preprocessing_methods}")
        print(f"[参数清洗] 模型名称: {model_names}")
        print(f"[参数清洗] 划分方法: {split_methods}")

        modeler = WaterQualityModelingBatch(str(output_dir))
        modeler.train_models_batch(
            csv_path=training_csv_path,
            feature_start_column=feature_start_column,
            preprocessing_methods=preprocessing_methods,
            model_names=model_names,
            split_methods=split_methods,
            cv_folds=cv_folds,
        )

        print(f"模型训练完成，结果保存在: {output_dir}")

        if _report_generator is not None:
            try:
                summary_path = _report_generator.generate_training_summary(str(output_dir))
                print(f"训练摘要报告已生成: {summary_path}")
            except Exception as e:
                print(f"生成训练摘要报告时出错: {e}")

        notify("completed", f"模型训练完成: {output_dir}")
        return str(output_dir)

    # ---- Step 6.5: 非经验统计回归模型训练 ----

    @staticmethod
    def train_non_empirical_models(
        csv_path: Optional[str] = None,
        preprocessing_methods: Optional[List[str]] = None,
        algorithms: Optional[List[str]] = None,
        value_cols: Union[int, Dict[str, int]] = 0,
        spectral_start_col: int = 1,
        spectral_end_col: Optional[int] = None,
        window: int = 5,
        output_dir: Optional[str] = None,
        enabled: bool = True,
        callback: Optional[Callable] = None,
    ) -> Dict[str, str]:
        """非经验统计回归模型训练"""
        def notify(status, msg=""):
            if callback:
                callback("步骤6.5", status, msg)

        print("\n" + "=" * 80)
        print("步骤6.5: 非经验统计回归模型训练")
        print("=" * 80)

        step_start_time = time.time()

        if not enabled:
            print("已设置跳过非经验模型训练（enabled=False）。")
            notify("skipped", "跳过的经验模型训练")
            return {}

        if csv_path is None:
            raise ValueError("必须提供 csv_path 参数")

        if output_dir is not None:
            non_empirical_dir = Path(output_dir)
        else:
            non_empirical_dir = Path.cwd() / "8_Regression_Modeling"
        non_empirical_dir.mkdir(parents=True, exist_ok=True)

        if preprocessing_methods is None:
            preprocessing_methods = ["None"]
        if algorithms is None:
            algorithms = ["chl_a", "nh3", "mno4", "tn", "tp", "tss"]

        if isinstance(value_cols, int):
            value_cols_dict = {algorithm: value_cols for algorithm in algorithms}
        elif isinstance(value_cols, dict):
            value_cols_dict = value_cols
        else:
            raise ValueError("value_cols 参数必须是整数或字典")

        if spectral_end_col is None:
            df = pd.read_csv(csv_path)
            spectral_end_col = len(df.columns) - 1

        all_model_results = {}

        for preprocess in preprocessing_methods:
            preprocess_dir = non_empirical_dir / preprocess
            preprocess_dir.mkdir(parents=True, exist_ok=True)

            processed_csv_path = _apply_preprocessing_internal(
                csv_path, preprocess, preprocess_dir, spectral_start_col
            )

            for algorithm in algorithms:
                algorithm_value_col = value_cols_dict[algorithm]
                print(f"\n训练 {preprocess} + {algorithm} 模型 (实测值列: {algorithm_value_col})...")

                model_outpath = str(preprocess_dir / f"{preprocess}_{algorithm}.json")

                if Path(model_outpath).exists():
                    print(f"检测到已存在的模型文件，直接使用: {model_outpath}")
                    all_model_results[f"{preprocess}_{algorithm}"] = model_outpath
                    continue

                try:
                    from src.core.non_empirical_model_correction import run_model_correction
                    run_model_correction(
                        algorithm=algorithm,
                        csv_file=processed_csv_path if Path(processed_csv_path).exists() else csv_path,
                        value_col=algorithm_value_col,
                        spectral_start=spectral_start_col,
                        spectral_end=spectral_end_col,
                        model_info_outpath=model_outpath,
                        window=window,
                    )
                    all_model_results[f"{preprocess}_{algorithm}"] = model_outpath
                    print(f"模型训练完成: {model_outpath}")
                except Exception as e:
                    print(f"训练 {preprocess}_{algorithm} 模型时出错: {e}")
                    continue

        summary_path = _generate_non_empirical_summary(all_model_results, non_empirical_dir)
        notify("completed", f"非经验模型训练完成: {non_empirical_dir}")
        return all_model_results

    # ---- Step 6.75: 自定义回归分析 ----

    @staticmethod
    def custom_regression(
        csv_path: Optional[str] = None,
        x_columns: Optional[Union[str, List[str]]] = None,
        y_columns: Optional[Union[str, List[str]]] = None,
        methods: Union[str, List[str]] = "all",
        output_dir: Optional[str] = None,
        enabled: bool = True,
        callback: Optional[Callable] = None,
        work_dir: Union[str, Path] = "./work_dir",
    ) -> Optional[str]:
        """使用自定义回归方法分析指标与目标参数之间的关系"""
        def notify(status, msg=""):
            if callback:
                callback("步骤6.75", status, msg)

        print("\n" + "=" * 80)
        print("步骤6.75: 自定义回归分析")
        print("=" * 80)

        step_start_time = time.time()

        if not enabled:
            print("已设置跳过自定义回归分析（enabled=False）。")
            notify("skipped", "跳过自定义回归分析")
            return None

        if csv_path is None:
            raise ValueError("必须提供 csv_path 参数")
        if y_columns is None:
            raise ValueError("必须指定 y_columns")
        if x_columns is None:
            raise ValueError("必须指定 x_columns")

        if isinstance(x_columns, str):
            x_columns = [x_columns]
        if isinstance(y_columns, str):
            y_columns = [y_columns]

        df = pd.read_csv(csv_path)
        missing_x = [col for col in x_columns if col not in df.columns]
        missing_y = [col for col in y_columns if col not in df.columns]
        if missing_x:
            raise ValueError(f"自变量列不存在: {missing_x}")
        if missing_y:
            raise ValueError(f"因变量列不存在: {missing_y}")

        if output_dir is None:
            custom_regression_dir = Path(work_dir) / "9_Custom_Regression_Modeling"
        else:
            custom_regression_dir = Path(work_dir) / output_dir
        custom_regression_dir.mkdir(parents=True, exist_ok=True)

        from src.core.modeling.regression import SingleVariableRegressionAnalysis
        analyzer = SingleVariableRegressionAnalysis()
        analyzer.batch_single_variable_regression(
            data=df,
            x_columns=x_columns,
            y_columns=y_columns,
            methods=methods,
            output_dir=str(custom_regression_dir),
        )

        notify("completed", f"自定义回归结果已保存到目录: {custom_regression_dir}")
        return str(custom_regression_dir)


# ============================================================
# 内部辅助函数（供 ModelingStep 内部使用）
# ============================================================

def _apply_preprocessing_internal(
    csv_path: str,
    preprocess_method: str,
    output_dir: Path,
    spectral_start_col: int = 4,
) -> str:
    """应用预处理到CSV数据（内部函数）"""
    raw_p = str(preprocess_method).lower()
    if raw_p == "none" or "无" in raw_p or "跳过" in raw_p:
        preprocess_method = "None"
    elif raw_p == "mms" or "minmax" in raw_p or "最大最小" in raw_p:
        preprocess_method = "MMS"
    elif raw_p == "ss" or "标准" in raw_p or "标准化" in raw_p:
        preprocess_method = "SS"
    elif raw_p == "snv" or "标准正态" in raw_p:
        preprocess_method = "SNV"
    elif raw_p == "ma" or "移动" in raw_p:
        preprocess_method = "MA"
    elif raw_p == "sg" or "savitzky" in raw_p or "平滑" in raw_p:
        preprocess_method = "SG"
    elif raw_p == "msc" or "多元散射" in raw_p:
        preprocess_method = "MSC"
    elif raw_p in ("d1", "d2", "dt"):
        preprocess_method = {"d1": "D1", "d2": "D2", "dt": "DT"}.get(raw_p, raw_p.upper())
    elif raw_p == "ct" or "去趋势" in raw_p:
        preprocess_method = "CT"

    if preprocess_method == "None":
        return csv_path

    output_filename = f"preprocessed_{preprocess_method}.csv"
    output_path = str(output_dir / output_filename)

    if Path(output_path).exists():
        print(f"检测到已存在的预处理文件，直接使用: {output_path}")
        return output_path

    df = pd.read_csv(csv_path)
    non_spectral_cols = df.iloc[:, :spectral_start_col]
    spectral_data = df.iloc[:, spectral_start_col:]

    from src.preprocessing.spectral_Preprocessing import Preprocessing

    save_path = None
    if preprocess_method == "SS":
        models_dir = output_dir.parent.parent / "7_Supervised_Model_Training"
        models_dir.mkdir(parents=True, exist_ok=True)
        save_path = str(models_dir / "scaler_params.pkl")
        print(f"SS预处理: scaler模型将保存到 {save_path}")

    processed_spectral = Preprocessing(preprocess_method, spectral_data, save_path=save_path)

    if isinstance(processed_spectral, pd.DataFrame):
        processed_df = pd.concat([non_spectral_cols, processed_spectral], axis=1)
    else:
        processed_spectral_df = pd.DataFrame(
            processed_spectral, columns=spectral_data.columns, index=spectral_data.index
        )
        processed_df = pd.concat([non_spectral_cols, processed_spectral_df], axis=1)

    processed_df.to_csv(output_path, index=False)
    print(f"预处理完成: {output_path}")
    return output_path


def _generate_non_empirical_summary(model_results: Dict[str, str], output_dir: Path) -> str:
    """生成非经验模型训练结果汇总CSV"""
    summary_path = str(output_dir / "non_empirical_models_summary.csv")
    summary_data = []

    for model_key, model_path in model_results.items():
        try:
            parts = model_key.split("_")
            preprocess_method = parts[0]
            algorithm_name = "_".join(parts[1:]) if len(parts) > 2 else parts[1]

            with open(model_path, "r", encoding="utf-8") as f:
                model_info = json.load(f)

            accuracy_list = model_info.get("accuracy", [])
            summary_row = {
                "Preprocessing Method": preprocess_method,
                "Algorithm Name": algorithm_name,
                "Model Type": model_info.get("model_type", ""),
                "Coefficient Count": len(model_info.get("model_info", [])),
                "Average Accuracy(%)": np.mean(accuracy_list) if accuracy_list else 0,
                "Min Accuracy(%)": np.min(accuracy_list) if accuracy_list else 0,
                "Max Accuracy(%)": np.max(accuracy_list) if accuracy_list else 0,
                "Sample Count": len(model_info.get("long", [])),
                "Model File": model_path,
            }

            coefficients = model_info.get("model_info", [])
            for i, coeff in enumerate(coefficients[:5]):
                summary_row[f"系数_{i+1}"] = coeff

            summary_data.append(summary_row)
        except Exception as e:
            print(f"读取模型文件 {model_path} 时出错: {e}")
            continue

    if summary_data:
        df_summary = pd.DataFrame(summary_data)
        df_summary.to_csv(summary_path, index=False, encoding="utf-8-sig")
        print(f"汇总文件已生成: {summary_path}")
    else:
        print("警告: 没有有效的模型数据可汇总")
        summary_path = ""

    return summary_path