refactor(step4): 剥离 Steps 层 - step4~step9 业务逻辑下沉到独立模块

2026-05-09 17:55:58 +08:00
parent d0eb458392
commit 14278739bf
6 changed files with 1202 additions and 829 deletions
--- a/src/core/steps/modeling_step.py
+++ b/src/core/steps/modeling_step.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+"""
+建模步骤
+
+包含 step6_train_models, step6_5_non_empirical_modeling, step6_75_custom_regression
+"""
+
+import time
+import json
+from pathlib import Path
+from typing import Optional, List, Union, Callable, Dict
+
+import pandas as pd
+import numpy as np
+
+
+class ModelingStep:
+    """Modeling steps of the pipeline (steps 6, 6.5 and 6.75).
+
+    Every entry point is a static method: all inputs arrive as arguments,
+    progress is reported through an optional ``callback`` and the location(s)
+    of the produced artefacts are returned.
+    """
+
+    # ---- Shared helpers ----
+
+    @staticmethod
+    def _print_banner(title: str) -> None:
+        """Print the framed console banner that opens a step."""
+        print("\n" + "=" * 80)
+        print(title)
+        print("=" * 80)
+
+    @staticmethod
+    def _make_notifier(step_label: str, callback: Optional[Callable]) -> Callable[..., None]:
+        """Build ``notify(status, msg="")`` bound to one step label.
+
+        The returned function calls ``callback(step_label, status, msg)`` and
+        does nothing when ``callback`` is falsy.
+        """
+        def notify(status, msg=""):
+            if callback:
+                callback(step_label, status, msg)
+
+        return notify
+
+    # ---- Step 6: train machine-learning models ----
+
+    @staticmethod
+    def train_models(
+        feature_start_column: str = "374.285004",
+        preprocessing_methods: Optional[List[str]] = None,
+        model_names: Optional[List[str]] = None,
+        split_methods: Optional[List[str]] = None,
+        cv_folds: int = 5,
+        training_csv_path: Optional[str] = None,
+        output_dir: Union[str, Path] = "./7_Supervised_Model_Training",
+        callback: Optional[Callable] = None,
+        _report_generator=None,
+    ) -> str:
+        """Train machine-learning models from a training CSV.
+
+        Training is skipped when any immediate sub-directory of ``output_dir``
+        already holds a ``*.pkl``, ``*.joblib`` or ``*.h5`` file; the existing
+        directory is then returned as-is.
+
+        Args:
+            feature_start_column: Forwarded unchanged to
+                ``WaterQualityModelingBatch.train_models_batch``.
+                NOTE(review): the default looks like a wavelength column
+                name - confirm against the training CSV layout.
+            preprocessing_methods: Preprocessing names to try; ``None`` selects
+                the built-in list of eleven methods.
+            model_names: Model names to train; ``None`` selects
+                ``["SVR", "RF", "Ridge", "Lasso"]``.
+            split_methods: Data-split strategies; ``None`` selects
+                ``["spxy", "ks", "random"]``.
+            cv_folds: Forwarded unchanged to ``train_models_batch``.
+            training_csv_path: Path of the training CSV (required).
+            output_dir: Directory that receives the trained models; created
+                when missing.
+            callback: Optional ``callback("步骤6", status, msg)`` progress hook;
+                ``status`` is ``"skipped"`` or ``"completed"``.
+            _report_generator: Optional object exposing
+                ``generate_training_summary(output_dir)``. A failure while
+                generating the summary is printed and otherwise ignored.
+
+        Returns:
+            ``output_dir`` as a string.
+
+        Raises:
+            ValueError: If ``training_csv_path`` is ``None``.
+        """
+        # Validate first so that an invalid call neither imports project code
+        # nor creates directories.
+        if training_csv_path is None:
+            raise ValueError("必须提供 training_csv_path 参数")
+
+        from src.core.modeling.modeling_batch import WaterQualityModelingBatch
+
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        notify = ModelingStep._make_notifier("步骤6", callback)
+        ModelingStep._print_banner("步骤6: 训练机器学习模型")
+
+        # Reuse earlier results: one model file in any first-level
+        # sub-directory is enough to treat the directory as already trained.
+        model_patterns = ("*.pkl", "*.joblib", "*.h5")
+        has_models = any(
+            next(item.glob(pattern), None) is not None
+            for item in output_dir.iterdir()
+            if item.is_dir()
+            for pattern in model_patterns
+        )
+        if has_models:
+            print(f"检测到已存在的模型文件，直接使用: {output_dir}")
+            notify("skipped", f"模型目录已设置: {output_dir}")
+            return str(output_dir)
+
+        if preprocessing_methods is None:
+            preprocessing_methods = ["None", "MMS", "SS", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "CT"]
+        if model_names is None:
+            model_names = ["SVR", "RF", "Ridge", "Lasso"]
+        if split_methods is None:
+            split_methods = ["spxy", "ks", "random"]
+
+        modeler = WaterQualityModelingBatch(str(output_dir))
+        modeler.train_models_batch(
+            csv_path=training_csv_path,
+            feature_start_column=feature_start_column,
+            preprocessing_methods=preprocessing_methods,
+            model_names=model_names,
+            split_methods=split_methods,
+            cv_folds=cv_folds,
+        )
+
+        print(f"模型训练完成，结果保存在: {output_dir}")
+
+        if _report_generator is not None:
+            # Best effort: the trained models are valid even if the summary
+            # report cannot be produced.
+            try:
+                summary_path = _report_generator.generate_training_summary(str(output_dir))
+                print(f"训练摘要报告已生成: {summary_path}")
+            except Exception as e:
+                print(f"生成训练摘要报告时出错: {e}")
+
+        notify("completed", f"模型训练完成: {output_dir}")
+        return str(output_dir)
+
+    # ---- Step 6.5: non-empirical statistical regression models ----
+
+    @staticmethod
+    def train_non_empirical_models(
+        csv_path: Optional[str] = None,
+        preprocessing_methods: Optional[List[str]] = None,
+        algorithms: Optional[List[str]] = None,
+        value_cols: Union[int, Dict[str, int]] = 0,
+        spectral_start_col: int = 1,
+        spectral_end_col: Optional[int] = None,
+        window: int = 5,
+        output_dir: Optional[str] = None,
+        enabled: bool = True,
+        callback: Optional[Callable] = None,
+    ) -> Dict[str, str]:
+        """Train one non-empirical regression model per preprocessing/algorithm pair.
+
+        For every preprocessing method a sub-directory named after it is
+        created below the output directory; each model is written there as
+        ``<preprocess>_<algorithm>.json``. A model file that already exists is
+        reused instead of retrained. A failure while training one model is
+        printed and the remaining models are still processed.
+
+        Args:
+            csv_path: Input CSV (required when ``enabled`` is true).
+            preprocessing_methods: Preprocessing names; ``None`` selects
+                ``["None"]``.
+            algorithms: Algorithm names; ``None`` selects
+                ``["chl_a", "nh3", "mno4", "tn", "tp", "tss"]``.
+            value_cols: Measured-value column index, either one int shared by
+                all algorithms or a dict keyed by algorithm name.
+            spectral_start_col: Index of the first spectral column.
+            spectral_end_col: Index of the last spectral column; ``None``
+                selects the last column of the CSV.
+            window: Forwarded unchanged to ``run_model_correction``.
+            output_dir: Output directory; ``None`` selects
+                ``<cwd>/8_Regression_Modeling``.
+            enabled: When false the step is skipped and ``{}`` is returned.
+            callback: Optional ``callback("步骤6.5", status, msg)`` progress hook.
+
+        Returns:
+            Mapping ``"<preprocess>_<algorithm>"`` -> model JSON path for every
+            model that was trained or reused.
+
+        Raises:
+            ValueError: If ``csv_path`` is ``None`` or ``value_cols`` is neither
+                an int nor a dict.
+            KeyError: If ``value_cols`` is a dict that lacks an entry for one
+                of the requested algorithms.
+        """
+        notify = ModelingStep._make_notifier("步骤6.5", callback)
+        ModelingStep._print_banner("步骤6.5: 非经验统计回归模型训练")
+
+        if not enabled:
+            print("已设置跳过非经验模型训练（enabled=False）。")
+            notify("skipped", "跳过非经验模型训练")
+            return {}
+
+        if csv_path is None:
+            raise ValueError("必须提供 csv_path 参数")
+
+        if preprocessing_methods is None:
+            preprocessing_methods = ["None"]
+        if algorithms is None:
+            algorithms = ["chl_a", "nh3", "mno4", "tn", "tp", "tss"]
+
+        if isinstance(value_cols, int):
+            value_cols_dict = {algorithm: value_cols for algorithm in algorithms}
+        elif isinstance(value_cols, dict):
+            value_cols_dict = value_cols
+        else:
+            raise ValueError("value_cols 参数必须是整数或字典")
+
+        # Fail fast: a missing entry would otherwise surface as a bare KeyError
+        # in the middle of the batch, after earlier models were already trained.
+        missing_algorithms = [name for name in algorithms if name not in value_cols_dict]
+        if missing_algorithms:
+            raise KeyError(f"value_cols 缺少以下算法的实测值列: {missing_algorithms}")
+
+        if output_dir is not None:
+            non_empirical_dir = Path(output_dir)
+        else:
+            non_empirical_dir = Path.cwd() / "8_Regression_Modeling"
+        non_empirical_dir.mkdir(parents=True, exist_ok=True)
+
+        if spectral_end_col is None:
+            # Only the column count is needed, so read the header row alone.
+            header_only = pd.read_csv(csv_path, nrows=0)
+            spectral_end_col = len(header_only.columns) - 1
+
+        all_model_results = {}
+
+        for preprocess in preprocessing_methods:
+            preprocess_dir = non_empirical_dir / preprocess
+            preprocess_dir.mkdir(parents=True, exist_ok=True)
+
+            processed_csv_path = _apply_preprocessing_internal(
+                csv_path, preprocess, preprocess_dir, spectral_start_col
+            )
+
+            for algorithm in algorithms:
+                algorithm_value_col = value_cols_dict[algorithm]
+                print(f"\n训练 {preprocess} + {algorithm} 模型 (实测值列: {algorithm_value_col})...")
+
+                model_key = f"{preprocess}_{algorithm}"
+                model_outpath = str(preprocess_dir / f"{model_key}.json")
+
+                if Path(model_outpath).exists():
+                    print(f"检测到已存在的模型文件，直接使用: {model_outpath}")
+                    all_model_results[model_key] = model_outpath
+                    continue
+
+                # Best effort per model: the import stays inside the try so a
+                # missing backend is reported per model instead of aborting.
+                try:
+                    from src.core.non_empirical_model_correction import run_model_correction
+                    run_model_correction(
+                        algorithm=algorithm,
+                        # Fall back to the raw CSV when the preprocessed file is absent.
+                        csv_file=processed_csv_path if Path(processed_csv_path).exists() else csv_path,
+                        value_col=algorithm_value_col,
+                        spectral_start=spectral_start_col,
+                        spectral_end=spectral_end_col,
+                        model_info_outpath=model_outpath,
+                        window=window,
+                    )
+                    all_model_results[model_key] = model_outpath
+                    print(f"模型训练完成: {model_outpath}")
+                except Exception as e:
+                    print(f"训练 {preprocess}_{algorithm} 模型时出错: {e}")
+
+        # Called for its side effect (writes the summary CSV); the returned
+        # path is not needed here.
+        _generate_non_empirical_summary(all_model_results, non_empirical_dir)
+        notify("completed", f"非经验模型训练完成: {non_empirical_dir}")
+        return all_model_results
+
+    # ---- Step 6.75: custom regression analysis ----
+
+    @staticmethod
+    def custom_regression(
+        csv_path: Optional[str] = None,
+        x_columns: Optional[Union[str, List[str]]] = None,
+        y_columns: Optional[Union[str, List[str]]] = None,
+        methods: Union[str, List[str]] = "all",
+        output_dir: Optional[str] = None,
+        enabled: bool = True,
+        callback: Optional[Callable] = None,
+        work_dir: Union[str, Path] = "./work_dir",
+    ) -> Optional[str]:
+        """Run single-variable regressions between chosen columns of a CSV.
+
+        Args:
+            csv_path: Input CSV (required when ``enabled`` is true).
+            x_columns: Independent-variable column name(s); a single string is
+                treated as a one-element list.
+            y_columns: Dependent-variable column name(s); a single string is
+                treated as a one-element list.
+            methods: Forwarded unchanged to
+                ``SingleVariableRegressionAnalysis.batch_single_variable_regression``.
+            output_dir: Result directory, joined onto ``work_dir`` (an absolute
+                path therefore replaces ``work_dir``); ``None`` selects
+                ``9_Custom_Regression_Modeling``.
+            enabled: When false the step is skipped and ``None`` is returned.
+            callback: Optional ``callback("步骤6.75", status, msg)`` progress hook.
+            work_dir: Base directory for the result directory.
+
+        Returns:
+            The result directory as a string, or ``None`` when skipped.
+
+        Raises:
+            ValueError: If ``csv_path``, ``y_columns`` or ``x_columns`` is
+                ``None``, or if a requested column is absent from the CSV.
+        """
+        notify = ModelingStep._make_notifier("步骤6.75", callback)
+        ModelingStep._print_banner("步骤6.75: 自定义回归分析")
+
+        if not enabled:
+            print("已设置跳过自定义回归分析（enabled=False）。")
+            notify("skipped", "跳过自定义回归分析")
+            return None
+
+        if csv_path is None:
+            raise ValueError("必须提供 csv_path 参数")
+        if y_columns is None:
+            raise ValueError("必须指定 y_columns")
+        if x_columns is None:
+            raise ValueError("必须指定 x_columns")
+
+        if isinstance(x_columns, str):
+            x_columns = [x_columns]
+        if isinstance(y_columns, str):
+            y_columns = [y_columns]
+
+        df = pd.read_csv(csv_path)
+        missing_x = [col for col in x_columns if col not in df.columns]
+        missing_y = [col for col in y_columns if col not in df.columns]
+        if missing_x:
+            raise ValueError(f"自变量列不存在: {missing_x}")
+        if missing_y:
+            raise ValueError(f"因变量列不存在: {missing_y}")
+
+        if output_dir is None:
+            custom_regression_dir = Path(work_dir) / "9_Custom_Regression_Modeling"
+        else:
+            custom_regression_dir = Path(work_dir) / output_dir
+        custom_regression_dir.mkdir(parents=True, exist_ok=True)
+
+        from src.core.modeling.regression import SingleVariableRegressionAnalysis
+        analyzer = SingleVariableRegressionAnalysis()
+        analyzer.batch_single_variable_regression(
+            data=df,
+            x_columns=x_columns,
+            y_columns=y_columns,
+            methods=methods,
+            output_dir=str(custom_regression_dir),
+        )
+
+        notify("completed", f"自定义回归结果已保存到目录: {custom_regression_dir}")
+        return str(custom_regression_dir)
+
+
+# ============================================================
+# 内部辅助函数（供 ModelingStep 内部使用）
+# ============================================================
+
+def _canonical_preprocess_name(preprocess_method):
+    """Map a preprocessing alias (English or Chinese) to its canonical code.
+
+    Matching is case-insensitive. Each rule accepts an exact code or any of a
+    set of substrings; the first matching rule wins. A name that matches no
+    rule is returned unchanged.
+    """
+    raw = str(preprocess_method).lower()
+    # (canonical code, exact matches, substring matches), in priority order.
+    # SNV must precede SS: "标准正态" contains "标准", so testing SS first
+    # would classify every Chinese SNV name as SS.
+    rules = (
+        ("None", ("none",), ("无", "跳过")),
+        ("MMS", ("mms",), ("minmax", "最大最小")),
+        ("SNV", ("snv",), ("标准正态",)),
+        ("SS", ("ss",), ("标准",)),
+        ("MA", ("ma",), ("移动",)),
+        ("SG", ("sg",), ("savitzky", "平滑")),
+        ("MSC", ("msc",), ("多元散射",)),
+        ("D1", ("d1",), ()),
+        ("D2", ("d2",), ()),
+        ("DT", ("dt",), ()),
+        # NOTE(review): "去趋势" (detrend) maps to CT rather than DT, as in the
+        # original code - confirm this is intended.
+        ("CT", ("ct",), ("去趋势",)),
+    )
+    for canonical, exact_names, fragments in rules:
+        if raw in exact_names or any(fragment in raw for fragment in fragments):
+            return canonical
+    return preprocess_method
+
+
+def _apply_preprocessing_internal(
+    csv_path: str,
+    preprocess_method: str,
+    output_dir: Path,
+    spectral_start_col: int = 4,
+) -> str:
+    """Apply one spectral preprocessing method to a CSV and cache the result.
+
+    The method name is first normalised to its canonical code. ``"None"``
+    returns ``csv_path`` untouched. Otherwise the result is written to
+    ``<output_dir>/preprocessed_<code>.csv``; if that file already exists it
+    is reused without reading ``csv_path``.
+
+    Args:
+        csv_path: Input CSV path.
+        preprocess_method: Preprocessing name or alias.
+        output_dir: Directory that receives the preprocessed CSV.
+        spectral_start_col: Positional index of the first spectral column;
+            columns before it are copied through unchanged.
+
+    Returns:
+        Path of the CSV to use downstream: ``csv_path`` for ``"None"``,
+        otherwise the preprocessed file.
+    """
+    preprocess_method = _canonical_preprocess_name(preprocess_method)
+    if preprocess_method == "None":
+        return csv_path
+
+    # Accept a plain string as well as a Path.
+    output_dir = Path(output_dir)
+    output_path = str(output_dir / f"preprocessed_{preprocess_method}.csv")
+
+    if Path(output_path).exists():
+        print(f"检测到已存在的预处理文件，直接使用: {output_path}")
+        return output_path
+
+    df = pd.read_csv(csv_path)
+    non_spectral_cols = df.iloc[:, :spectral_start_col]
+    spectral_data = df.iloc[:, spectral_start_col:]
+
+    from src.preprocessing.spectral_Preprocessing import Preprocessing
+
+    save_path = None
+    if preprocess_method == "SS":
+        # The scaler file goes next to the supervised models, two levels above
+        # this method's output directory.
+        # NOTE(review): presumably Preprocessing persists the fitted scaler to
+        # save_path - confirm.
+        models_dir = output_dir.parent.parent / "7_Supervised_Model_Training"
+        models_dir.mkdir(parents=True, exist_ok=True)
+        save_path = str(models_dir / "scaler_params.pkl")
+        print(f"SS预处理: scaler模型将保存到 {save_path}")
+
+    processed_spectral = Preprocessing(preprocess_method, spectral_data, save_path=save_path)
+
+    if not isinstance(processed_spectral, pd.DataFrame):
+        # Array-like result: restore the original column labels and row index
+        # so the concat below aligns with the non-spectral columns.
+        processed_spectral = pd.DataFrame(
+            processed_spectral, columns=spectral_data.columns, index=spectral_data.index
+        )
+    processed_df = pd.concat([non_spectral_cols, processed_spectral], axis=1)
+
+    processed_df.to_csv(output_path, index=False)
+    print(f"预处理完成: {output_path}")
+    return output_path
+
+
+def _generate_non_empirical_summary(model_results: Dict[str, str], output_dir: Path) -> str:
+    """Write a summary CSV covering the given non-empirical model files.
+
+    Each key of ``model_results`` is split at ``"_"``: the first token is the
+    preprocessing method and the remainder is the algorithm name. The JSON
+    file at the mapped path supplies the other columns. An entry that cannot
+    be parsed or read is reported on stdout and skipped.
+
+    Args:
+        model_results: Mapping ``"<preprocess>_<algorithm>"`` -> model JSON path.
+        output_dir: Directory that receives ``non_empirical_models_summary.csv``.
+
+    Returns:
+        Path of the written CSV, or ``""`` when no entry could be summarised.
+    """
+    summary_path = str(output_dir / "non_empirical_models_summary.csv")
+    rows = []
+
+    for model_key, model_path in model_results.items():
+        try:
+            tokens = model_key.split("_")
+            preprocess_method = tokens[0]
+            # A key without "_" raises IndexError here and is skipped below.
+            algorithm_name = tokens[1] if len(tokens) <= 2 else "_".join(tokens[1:])
+
+            with open(model_path, "r", encoding="utf-8") as handle:
+                model_info = json.load(handle)
+
+            accuracies = model_info.get("accuracy", [])
+            model_type = model_info.get("model_type", "")
+            coefficient_count = len(model_info.get("model_info", []))
+            if accuracies:
+                mean_accuracy = np.mean(accuracies)
+                min_accuracy = np.min(accuracies)
+                max_accuracy = np.max(accuracies)
+            else:
+                mean_accuracy = min_accuracy = max_accuracy = 0
+
+            row = {
+                "Preprocessing Method": preprocess_method,
+                "Algorithm Name": algorithm_name,
+                "Model Type": model_type,
+                "Coefficient Count": coefficient_count,
+                "Average Accuracy(%)": mean_accuracy,
+                "Min Accuracy(%)": min_accuracy,
+                "Max Accuracy(%)": max_accuracy,
+                "Sample Count": len(model_info.get("long", [])),
+                "Model File": model_path,
+            }
+
+            # Only the first five coefficients get their own columns.
+            leading_coefficients = model_info.get("model_info", [])[:5]
+            for rank, value in enumerate(leading_coefficients, start=1):
+                row[f"系数_{rank}"] = value
+
+            rows.append(row)
+        except Exception as e:
+            print(f"读取模型文件 {model_path} 时出错: {e}")
+
+    if not rows:
+        print("警告: 没有有效的模型数据可汇总")
+        return ""
+
+    pd.DataFrame(rows).to_csv(summary_path, index=False, encoding="utf-8-sig")
+    print(f"汇总文件已生成: {summary_path}")
+    return summary_path