Files
WQ_GUI/src/core/steps/modeling_step.py

498 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
建模步骤
包含 step6_train_models, step6_5_non_empirical_modeling, step6_75_custom_regression
"""
import time
import json
from pathlib import Path
from typing import Optional, List, Union, Callable, Dict
import pandas as pd
import numpy as np
# ============================================================
# 汉化 -> 英文 反向映射字典UI 复选框显示文本 -> 底层算法键名)
# ============================================================
# 模型名称:中文 (缩写) -> 英文键名
MODEL_NAME_MAP = {
"多元线性回归 (MLR)": "LinearRegression",
"岭回归 (Ridge)": "Ridge",
"套索回归 (Lasso)": "Lasso",
"弹性网络 (ElasticNet)": "ElasticNet",
"偏最小二乘 (PLSR)": "PLS",
"决策树 (CART)": "DecisionTree",
"随机森林 (RF)": "RF",
"极端随机树 (ET)": "ExtraTrees",
"极值梯度提升 (XGBoost)": "XGBoost",
"轻量梯度提升 (LightGBM)": "LightGBM",
"类别梯度提升 (CatBoost)": "CatBoost",
"梯度提升树 (GBDT)": "GradientBoosting",
"自适应提升 (AdaBoost)": "AdaBoost",
"支持向量回归 (SVR)": "SVR",
"K近邻回归 (KNN)": "KNN",
"多层感知机 (BP神经网络)": "MLP",
}
# 预处理方法:各种可能的中文变体 -> 标准键名
PREPROC_NAME_MAP = {
# 无处理
"无 (None)": "None",
"None": "None",
# MMS
"最小-最大归一化 (MMS)": "MMS",
"MMS": "MMS",
# SS
"标度化 (SS)": "SS",
"SS": "SS",
# SNV
"标准正态变换 (SNV)": "SNV",
"SNV": "SNV",
# MA
"移动平均 (MA)": "MA",
"MA": "MA",
# SG
"Savitzky-Golay (SG)": "SG",
"SG": "SG",
# MSC
"多元散射校正 (MSC)": "MSC",
"MSC": "MSC",
# D1
"一阶导数 (D1)": "D1",
"D1": "D1",
# D2
"二阶导数 (D2)": "D2",
"D2": "D2",
# DT
"去趋势 (DT)": "DT",
"DT": "DT",
# CT
"中心化 (CT)": "CT",
"CT": "CT",
}
# 数据划分方法:各种可能的中文变体 -> 标准键名
SPLIT_NAME_MAP = {
"SPXY 算法 (考量X-Y空间)": "spxy",
"spxy": "spxy",
"KS 算法 (考量X空间)": "ks",
"ks": "ks",
"随机划分 (Random)": "random",
"random": "random",
}
def _normalize_model_names(model_names: List[str]) -> List[str]:
"""清洗模型名称列表:将汉化显示文本还原为英文键名"""
result = []
for name in model_names:
if name in MODEL_NAME_MAP:
result.append(MODEL_NAME_MAP[name])
else:
# 已经是英文键名,直接保留
result.append(name)
return result
def _normalize_preprocessing_methods(methods: List[str]) -> List[str]:
"""清洗预处理方法列表:将汉化显示文本还原为标准键名"""
result = []
for method in methods:
if method in PREPROC_NAME_MAP:
result.append(PREPROC_NAME_MAP[method])
else:
# 已经是标准键名,直接保留
result.append(method)
return result
def _normalize_split_methods(methods: List[str]) -> List[str]:
"""清洗数据划分方法列表:将汉化显示文本还原为标准键名"""
result = []
for method in methods:
if method in SPLIT_NAME_MAP:
result.append(SPLIT_NAME_MAP[method])
else:
# 已经是标准键名,直接保留
result.append(method)
return result
class ModelingStep:
"""建模步骤"""
# ---- Step 6: 训练机器学习模型 ----
@staticmethod
def train_models(
feature_start_column: str = "374.285004",
preprocessing_methods: Optional[List[str]] = None,
model_names: Optional[List[str]] = None,
split_methods: Optional[List[str]] = None,
cv_folds: int = 5,
training_csv_path: Optional[str] = None,
output_dir: Union[str, Path] = "./7_Supervised_Model_Training",
callback: Optional[Callable] = None,
_report_generator=None,
) -> str:
"""使用采样点光谱和实测值建立机器学习模型"""
from src.core.modeling.modeling_batch import WaterQualityModelingBatch
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
def notify(status, msg=""):
if callback:
callback("步骤6", status, msg)
print("\n" + "=" * 80)
print("步骤6: 训练机器学习模型")
print("=" * 80)
step_start_time = time.time()
if training_csv_path is None:
raise ValueError("必须提供 training_csv_path 参数")
# 检查模型目录是否已有模型
if output_dir.exists() and any(output_dir.iterdir()):
has_models = False
for item in output_dir.iterdir():
if item.is_dir():
model_files = (
list(item.glob("*.pkl"))
+ list(item.glob("*.joblib"))
+ list(item.glob("*.h5"))
)
if model_files:
has_models = True
break
if has_models:
print(f"检测到已存在的模型文件,直接使用: {output_dir}")
notify("skipped", f"模型目录已设置: {output_dir}")
return str(output_dir)
if preprocessing_methods is None:
preprocessing_methods = ["None", "MMS", "SS", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "CT"]
if model_names is None:
model_names = ["SVR", "RF", "Ridge", "Lasso"]
if split_methods is None:
split_methods = ["spxy", "ks", "random"]
# ---- 汉化清洗:将 UI 传来的中文/混合名称转换为底层英文键名 ----
preprocessing_methods = _normalize_preprocessing_methods(preprocessing_methods)
model_names = _normalize_model_names(model_names)
split_methods = _normalize_split_methods(split_methods)
print(f"[参数清洗] 预处理方法: {preprocessing_methods}")
print(f"[参数清洗] 模型名称: {model_names}")
print(f"[参数清洗] 划分方法: {split_methods}")
modeler = WaterQualityModelingBatch(str(output_dir))
modeler.train_models_batch(
csv_path=training_csv_path,
feature_start_column=feature_start_column,
preprocessing_methods=preprocessing_methods,
model_names=model_names,
split_methods=split_methods,
cv_folds=cv_folds,
)
print(f"模型训练完成,结果保存在: {output_dir}")
if _report_generator is not None:
try:
summary_path = _report_generator.generate_training_summary(str(output_dir))
print(f"训练摘要报告已生成: {summary_path}")
except Exception as e:
print(f"生成训练摘要报告时出错: {e}")
notify("completed", f"模型训练完成: {output_dir}")
return str(output_dir)
# ---- Step 6.5: 非经验统计回归模型训练 ----
@staticmethod
def train_non_empirical_models(
csv_path: Optional[str] = None,
preprocessing_methods: Optional[List[str]] = None,
algorithms: Optional[List[str]] = None,
value_cols: Union[int, Dict[str, int]] = 0,
spectral_start_col: int = 1,
spectral_end_col: Optional[int] = None,
window: int = 5,
output_dir: Optional[str] = None,
enabled: bool = True,
callback: Optional[Callable] = None,
) -> Dict[str, str]:
"""非经验统计回归模型训练"""
def notify(status, msg=""):
if callback:
callback("步骤6.5", status, msg)
print("\n" + "=" * 80)
print("步骤6.5: 非经验统计回归模型训练")
print("=" * 80)
step_start_time = time.time()
if not enabled:
print("已设置跳过非经验模型训练enabled=False")
notify("skipped", "跳过的经验模型训练")
return {}
if csv_path is None:
raise ValueError("必须提供 csv_path 参数")
if output_dir is not None:
non_empirical_dir = Path(output_dir)
else:
non_empirical_dir = Path.cwd() / "8_Regression_Modeling"
non_empirical_dir.mkdir(parents=True, exist_ok=True)
if preprocessing_methods is None:
preprocessing_methods = ["None"]
if algorithms is None:
algorithms = ["chl_a", "nh3", "mno4", "tn", "tp", "tss"]
if isinstance(value_cols, int):
value_cols_dict = {algorithm: value_cols for algorithm in algorithms}
elif isinstance(value_cols, dict):
value_cols_dict = value_cols
else:
raise ValueError("value_cols 参数必须是整数或字典")
if spectral_end_col is None:
df = pd.read_csv(csv_path)
spectral_end_col = len(df.columns) - 1
all_model_results = {}
for preprocess in preprocessing_methods:
preprocess_dir = non_empirical_dir / preprocess
preprocess_dir.mkdir(parents=True, exist_ok=True)
processed_csv_path = _apply_preprocessing_internal(
csv_path, preprocess, preprocess_dir, spectral_start_col
)
for algorithm in algorithms:
algorithm_value_col = value_cols_dict[algorithm]
print(f"\n训练 {preprocess} + {algorithm} 模型 (实测值列: {algorithm_value_col})...")
model_outpath = str(preprocess_dir / f"{preprocess}_{algorithm}.json")
if Path(model_outpath).exists():
print(f"检测到已存在的模型文件,直接使用: {model_outpath}")
all_model_results[f"{preprocess}_{algorithm}"] = model_outpath
continue
try:
from src.core.non_empirical_model_correction import run_model_correction
run_model_correction(
algorithm=algorithm,
csv_file=processed_csv_path if Path(processed_csv_path).exists() else csv_path,
value_col=algorithm_value_col,
spectral_start=spectral_start_col,
spectral_end=spectral_end_col,
model_info_outpath=model_outpath,
window=window,
)
all_model_results[f"{preprocess}_{algorithm}"] = model_outpath
print(f"模型训练完成: {model_outpath}")
except Exception as e:
print(f"训练 {preprocess}_{algorithm} 模型时出错: {e}")
continue
summary_path = _generate_non_empirical_summary(all_model_results, non_empirical_dir)
notify("completed", f"非经验模型训练完成: {non_empirical_dir}")
return all_model_results
# ---- Step 6.75: 自定义回归分析 ----
@staticmethod
def custom_regression(
csv_path: Optional[str] = None,
x_columns: Optional[Union[str, List[str]]] = None,
y_columns: Optional[Union[str, List[str]]] = None,
methods: Union[str, List[str]] = "all",
output_dir: Optional[str] = None,
enabled: bool = True,
callback: Optional[Callable] = None,
work_dir: Union[str, Path] = "./work_dir",
) -> Optional[str]:
"""使用自定义回归方法分析指标与目标参数之间的关系"""
def notify(status, msg=""):
if callback:
callback("步骤6.75", status, msg)
print("\n" + "=" * 80)
print("步骤6.75: 自定义回归分析")
print("=" * 80)
step_start_time = time.time()
if not enabled:
print("已设置跳过自定义回归分析enabled=False")
notify("skipped", "跳过自定义回归分析")
return None
if csv_path is None:
raise ValueError("必须提供 csv_path 参数")
if y_columns is None:
raise ValueError("必须指定 y_columns")
if x_columns is None:
raise ValueError("必须指定 x_columns")
if isinstance(x_columns, str):
x_columns = [x_columns]
if isinstance(y_columns, str):
y_columns = [y_columns]
df = pd.read_csv(csv_path)
missing_x = [col for col in x_columns if col not in df.columns]
missing_y = [col for col in y_columns if col not in df.columns]
if missing_x:
raise ValueError(f"自变量列不存在: {missing_x}")
if missing_y:
raise ValueError(f"因变量列不存在: {missing_y}")
if output_dir is None:
custom_regression_dir = Path(work_dir) / "9_Custom_Regression_Modeling"
else:
custom_regression_dir = Path(work_dir) / output_dir
custom_regression_dir.mkdir(parents=True, exist_ok=True)
from src.core.modeling.regression import SingleVariableRegressionAnalysis
analyzer = SingleVariableRegressionAnalysis()
analyzer.batch_single_variable_regression(
data=df,
x_columns=x_columns,
y_columns=y_columns,
methods=methods,
output_dir=str(custom_regression_dir),
)
notify("completed", f"自定义回归结果已保存到目录: {custom_regression_dir}")
return str(custom_regression_dir)
# ============================================================
# 内部辅助函数(供 ModelingStep 内部使用)
# ============================================================
def _apply_preprocessing_internal(
csv_path: str,
preprocess_method: str,
output_dir: Path,
spectral_start_col: int = 4,
) -> str:
"""应用预处理到CSV数据内部函数"""
raw_p = str(preprocess_method).lower()
if raw_p == "none" or "" in raw_p or "跳过" in raw_p:
preprocess_method = "None"
elif raw_p == "mms" or "minmax" in raw_p or "最大最小" in raw_p:
preprocess_method = "MMS"
elif raw_p == "ss" or "标准" in raw_p or "标准化" in raw_p:
preprocess_method = "SS"
elif raw_p == "snv" or "标准正态" in raw_p:
preprocess_method = "SNV"
elif raw_p == "ma" or "移动" in raw_p:
preprocess_method = "MA"
elif raw_p == "sg" or "savitzky" in raw_p or "平滑" in raw_p:
preprocess_method = "SG"
elif raw_p == "msc" or "多元散射" in raw_p:
preprocess_method = "MSC"
elif raw_p in ("d1", "d2", "dt"):
preprocess_method = {"d1": "D1", "d2": "D2", "dt": "DT"}.get(raw_p, raw_p.upper())
elif raw_p == "ct" or "去趋势" in raw_p:
preprocess_method = "CT"
if preprocess_method == "None":
return csv_path
output_filename = f"preprocessed_{preprocess_method}.csv"
output_path = str(output_dir / output_filename)
if Path(output_path).exists():
print(f"检测到已存在的预处理文件,直接使用: {output_path}")
return output_path
df = pd.read_csv(csv_path)
non_spectral_cols = df.iloc[:, :spectral_start_col]
spectral_data = df.iloc[:, spectral_start_col:]
from src.preprocessing.spectral_Preprocessing import Preprocessing
save_path = None
if preprocess_method == "SS":
models_dir = output_dir.parent.parent / "7_Supervised_Model_Training"
models_dir.mkdir(parents=True, exist_ok=True)
save_path = str(models_dir / "scaler_params.pkl")
print(f"SS预处理: scaler模型将保存到 {save_path}")
processed_spectral = Preprocessing(preprocess_method, spectral_data, save_path=save_path)
if isinstance(processed_spectral, pd.DataFrame):
processed_df = pd.concat([non_spectral_cols, processed_spectral], axis=1)
else:
processed_spectral_df = pd.DataFrame(
processed_spectral, columns=spectral_data.columns, index=spectral_data.index
)
processed_df = pd.concat([non_spectral_cols, processed_spectral_df], axis=1)
processed_df.to_csv(output_path, index=False)
print(f"预处理完成: {output_path}")
return output_path
def _generate_non_empirical_summary(model_results: Dict[str, str], output_dir: Path) -> str:
"""生成非经验模型训练结果汇总CSV"""
summary_path = str(output_dir / "non_empirical_models_summary.csv")
summary_data = []
for model_key, model_path in model_results.items():
try:
parts = model_key.split("_")
preprocess_method = parts[0]
algorithm_name = "_".join(parts[1:]) if len(parts) > 2 else parts[1]
with open(model_path, "r", encoding="utf-8") as f:
model_info = json.load(f)
accuracy_list = model_info.get("accuracy", [])
summary_row = {
"Preprocessing Method": preprocess_method,
"Algorithm Name": algorithm_name,
"Model Type": model_info.get("model_type", ""),
"Coefficient Count": len(model_info.get("model_info", [])),
"Average Accuracy(%)": np.mean(accuracy_list) if accuracy_list else 0,
"Min Accuracy(%)": np.min(accuracy_list) if accuracy_list else 0,
"Max Accuracy(%)": np.max(accuracy_list) if accuracy_list else 0,
"Sample Count": len(model_info.get("long", [])),
"Model File": model_path,
}
coefficients = model_info.get("model_info", [])
for i, coeff in enumerate(coefficients[:5]):
summary_row[f"系数_{i+1}"] = coeff
summary_data.append(summary_row)
except Exception as e:
print(f"读取模型文件 {model_path} 时出错: {e}")
continue
if summary_data:
df_summary = pd.DataFrame(summary_data)
df_summary.to_csv(summary_path, index=False, encoding="utf-8-sig")
print(f"汇总文件已生成: {summary_path}")
else:
print("警告: 没有有效的模型数据可汇总")
summary_path = ""
return summary_path