# -*- coding: utf-8 -*-
"""
Modeling steps.

Contains step6_train_models, step6_5_non_empirical_modeling and
step6_75_custom_regression.
"""
|
||
|
||
import time
|
||
import json
|
||
from pathlib import Path
|
||
from typing import Optional, List, Union, Callable, Dict
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
|
||
|
||
# ============================================================
# Reverse lookup tables: localized (Chinese) UI checkbox labels -> internal algorithm keys
# ============================================================
|
||
|
||
# Model names: Chinese display label (with abbreviation) -> internal English key.
# Consumed by _normalize_model_names(); labels that are not listed here are
# passed through unchanged by that function.
MODEL_NAME_MAP = {
    "多元线性回归 (MLR)": "LinearRegression",
    "岭回归 (Ridge)": "Ridge",
    "套索回归 (Lasso)": "Lasso",
    "弹性网络 (ElasticNet)": "ElasticNet",
    "偏最小二乘 (PLSR)": "PLS",
    "决策树 (CART)": "DecisionTree",
    "随机森林 (RF)": "RF",
    "极端随机树 (ET)": "ExtraTrees",
    "极值梯度提升 (XGBoost)": "XGBoost",
    "轻量梯度提升 (LightGBM)": "LightGBM",
    "类别梯度提升 (CatBoost)": "CatBoost",
    "梯度提升树 (GBDT)": "GradientBoosting",
    "自适应提升 (AdaBoost)": "AdaBoost",
    "支持向量回归 (SVR)": "SVR",
    "K近邻回归 (KNN)": "KNN",
    "多层感知机 (BP神经网络)": "MLP",
}
|
||
|
||
# Preprocessing methods: every accepted spelling -> canonical key.
# Each canonical key is registered right after its Chinese display label, so a
# value that is already canonical maps to itself.
PREPROC_NAME_MAP = {
    alias: canonical
    for display_label, canonical in (
        ("无 (None)", "None"),
        ("最小-最大归一化 (MMS)", "MMS"),
        ("标度化 (SS)", "SS"),
        ("标准正态变换 (SNV)", "SNV"),
        ("移动平均 (MA)", "MA"),
        ("Savitzky-Golay (SG)", "SG"),
        ("多元散射校正 (MSC)", "MSC"),
        ("一阶导数 (D1)", "D1"),
        ("二阶导数 (D2)", "D2"),
        ("去趋势 (DT)", "DT"),
        ("中心化 (CT)", "CT"),
    )
    for alias in (display_label, canonical)
}
|
||
|
||
# Data split methods: every accepted spelling -> canonical key.
# Each canonical key is registered right after its display label, so a value
# that is already canonical maps to itself.
SPLIT_NAME_MAP = {
    alias: canonical
    for display_label, canonical in (
        ("SPXY 算法 (考量X-Y空间)", "spxy"),
        ("KS 算法 (考量X空间)", "ks"),
        ("随机划分 (Random)", "random"),
    )
    for alias in (display_label, canonical)
}
|
||
|
||
|
||
def _normalize_model_names(model_names: List[str]) -> List[str]:
    """Translate localized model labels back to their internal English keys.

    Labels found in ``MODEL_NAME_MAP`` are replaced by the mapped key; any
    other entry is kept unchanged. Order and length are preserved.
    """
    return [MODEL_NAME_MAP.get(label, label) for label in model_names]
|
||
|
||
|
||
def _normalize_preprocessing_methods(methods: List[str]) -> List[str]:
    """Translate localized preprocessing labels back to their canonical keys.

    Labels found in ``PREPROC_NAME_MAP`` are replaced by the mapped key; any
    other entry is kept unchanged. Order and length are preserved.
    """
    return [PREPROC_NAME_MAP.get(label, label) for label in methods]
|
||
|
||
|
||
def _normalize_split_methods(methods: List[str]) -> List[str]:
    """Translate localized data-split labels back to their canonical keys.

    Labels found in ``SPLIT_NAME_MAP`` are replaced by the mapped key; any
    other entry is kept unchanged. Order and length are preserved.
    """
    return [SPLIT_NAME_MAP.get(label, label) for label in methods]
|
||
|
||
|
||
class ModelingStep:
    """Namespace for the modeling steps of the pipeline (steps 6, 6.5 and 6.75).

    Every step is a static method. Progress is reported through an optional
    ``callback(step_label, status, message)``; the statuses emitted here are
    ``"skipped"`` and ``"completed"``.
    """

    # ---- Step 6: train machine-learning models ----

    @staticmethod
    def train_models(
        feature_start_column: str = "374.285004",
        preprocessing_methods: Optional[List[str]] = None,
        model_names: Optional[List[str]] = None,
        split_methods: Optional[List[str]] = None,
        cv_folds: int = 5,
        training_csv_path: Optional[str] = None,
        output_dir: Union[str, Path] = "./7_Supervised_Model_Training",
        callback: Optional[Callable] = None,
        _report_generator=None,
    ) -> str:
        """Train machine-learning models from sampled spectra and measured values.

        Training is skipped when a direct sub-directory of ``output_dir``
        already contains a ``*.pkl``, ``*.joblib`` or ``*.h5`` file.

        Args:
            feature_start_column: Forwarded to ``train_models_batch``.
            preprocessing_methods: Preprocessing labels (canonical keys or UI
                display labels). Defaults to all eleven canonical keys.
            model_names: Model labels (internal keys or UI display labels).
                Defaults to ``["SVR", "RF", "Ridge", "Lasso"]``.
            split_methods: Split labels. Defaults to ``["spxy", "ks", "random"]``.
            cv_folds: Forwarded to ``train_models_batch``.
            training_csv_path: CSV file with the training data (required).
            output_dir: Directory that receives the trained models.
            callback: Optional progress callback, called with
                ``("步骤6", status, message)``.
            _report_generator: Optional object exposing
                ``generate_training_summary(output_dir)``; failures while
                generating the report are printed and otherwise ignored.

        Returns:
            ``output_dir`` as a string.

        Raises:
            ValueError: If ``training_csv_path`` is ``None``.
        """

        def notify(status, msg=""):
            if callback:
                callback("步骤6", status, msg)

        print("\n" + "=" * 80)
        print("步骤6: 训练机器学习模型")
        print("=" * 80)

        # Validate first so that a bad call neither creates the output
        # directory nor pays for the heavy project import.
        if training_csv_path is None:
            raise ValueError("必须提供 training_csv_path 参数")

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Reuse existing models: any direct sub-directory holding a model file counts.
        has_models = any(
            next(sub_dir.glob(pattern), None) is not None
            for sub_dir in output_dir.iterdir()
            if sub_dir.is_dir()
            for pattern in ("*.pkl", "*.joblib", "*.h5")
        )
        if has_models:
            print(f"检测到已存在的模型文件,直接使用: {output_dir}")
            notify("skipped", f"模型目录已设置: {output_dir}")
            return str(output_dir)

        if preprocessing_methods is None:
            preprocessing_methods = ["None", "MMS", "SS", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "CT"]
        if model_names is None:
            model_names = ["SVR", "RF", "Ridge", "Lasso"]
        if split_methods is None:
            split_methods = ["spxy", "ks", "random"]

        # Convert localized / mixed UI labels into the internal keys.
        preprocessing_methods = _normalize_preprocessing_methods(preprocessing_methods)
        model_names = _normalize_model_names(model_names)
        split_methods = _normalize_split_methods(split_methods)

        print(f"[参数清洗] 预处理方法: {preprocessing_methods}")
        print(f"[参数清洗] 模型名称: {model_names}")
        print(f"[参数清洗] 划分方法: {split_methods}")

        # Imported lazily: only needed when training actually runs.
        from src.core.modeling.modeling_batch import WaterQualityModelingBatch

        modeler = WaterQualityModelingBatch(str(output_dir))
        modeler.train_models_batch(
            csv_path=training_csv_path,
            feature_start_column=feature_start_column,
            preprocessing_methods=preprocessing_methods,
            model_names=model_names,
            split_methods=split_methods,
            cv_folds=cv_folds,
        )

        print(f"模型训练完成,结果保存在: {output_dir}")

        if _report_generator is not None:
            # Best effort: a failing report must not invalidate a finished training run.
            try:
                summary_path = _report_generator.generate_training_summary(str(output_dir))
                print(f"训练摘要报告已生成: {summary_path}")
            except Exception as e:
                print(f"生成训练摘要报告时出错: {e}")

        notify("completed", f"模型训练完成: {output_dir}")
        return str(output_dir)

    # ---- Step 6.5: non-empirical statistical regression models ----

    @staticmethod
    def train_non_empirical_models(
        csv_path: Optional[str] = None,
        preprocessing_methods: Optional[List[str]] = None,
        algorithms: Optional[List[str]] = None,
        value_cols: Union[int, Dict[str, int]] = 0,
        spectral_start_col: int = 1,
        spectral_end_col: Optional[int] = None,
        window: int = 5,
        output_dir: Optional[str] = None,
        enabled: bool = True,
        callback: Optional[Callable] = None,
    ) -> Dict[str, str]:
        """Train the non-empirical statistical regression models.

        One model is trained per (preprocessing method, algorithm) pair and
        stored as ``<output_dir>/<preprocess>/<preprocess>_<algorithm>.json``.
        Model files that already exist are reused. A failure while training a
        single pair is printed and that pair is skipped.

        Args:
            csv_path: CSV file with the training data (required when enabled).
            preprocessing_methods: Preprocessing labels (canonical keys or UI
                display labels). Defaults to ``["None"]``.
            algorithms: Algorithm names. Defaults to
                ``["chl_a", "nh3", "mno4", "tn", "tp", "tss"]``.
            value_cols: Measured-value column index, either one integer used
                for every algorithm or a dict keyed by algorithm name.
            spectral_start_col: Index of the first spectral column.
            spectral_end_col: Index of the last spectral column; defaults to
                the last column of the CSV.
            window: Forwarded to ``run_model_correction``.
            output_dir: Target directory; defaults to
                ``<cwd>/8_Regression_Modeling``.
            enabled: When ``False`` the step is skipped and ``{}`` is returned.
            callback: Optional progress callback, called with
                ``("步骤6.5", status, message)``.

        Returns:
            Mapping ``"<preprocess>_<algorithm>"`` -> model JSON path.

        Raises:
            ValueError: If ``csv_path`` is ``None`` or ``value_cols`` is
                neither an int nor a dict.
            KeyError: If ``value_cols`` is a dict without an entry for one of
                the requested algorithms.
        """

        def notify(status, msg=""):
            if callback:
                callback("步骤6.5", status, msg)

        print("\n" + "=" * 80)
        print("步骤6.5: 非经验统计回归模型训练")
        print("=" * 80)

        if not enabled:
            print("已设置跳过非经验模型训练(enabled=False)。")
            # Wording fixed: the previous message was garbled and named the wrong step.
            notify("skipped", "跳过非经验模型训练")
            return {}

        if csv_path is None:
            raise ValueError("必须提供 csv_path 参数")

        if output_dir is not None:
            non_empirical_dir = Path(output_dir)
        else:
            non_empirical_dir = Path.cwd() / "8_Regression_Modeling"
        non_empirical_dir.mkdir(parents=True, exist_ok=True)

        if preprocessing_methods is None:
            preprocessing_methods = ["None"]
        # Same label cleaning as train_models, so directory names and result
        # keys use the canonical keys even when UI display labels are passed.
        preprocessing_methods = _normalize_preprocessing_methods(preprocessing_methods)
        if algorithms is None:
            algorithms = ["chl_a", "nh3", "mno4", "tn", "tp", "tss"]

        if isinstance(value_cols, int):
            value_cols_dict = {algorithm: value_cols for algorithm in algorithms}
        elif isinstance(value_cols, dict):
            value_cols_dict = value_cols
        else:
            raise ValueError("value_cols 参数必须是整数或字典")

        if spectral_end_col is None:
            # Only the header is needed to count the columns.
            header = pd.read_csv(csv_path, nrows=0)
            spectral_end_col = len(header.columns) - 1

        all_model_results = {}

        for preprocess in preprocessing_methods:
            preprocess_dir = non_empirical_dir / preprocess
            preprocess_dir.mkdir(parents=True, exist_ok=True)

            processed_csv_path = _apply_preprocessing_internal(
                csv_path, preprocess, preprocess_dir, spectral_start_col
            )

            for algorithm in algorithms:
                algorithm_value_col = value_cols_dict[algorithm]
                print(f"\n训练 {preprocess} + {algorithm} 模型 (实测值列: {algorithm_value_col})...")

                model_key = f"{preprocess}_{algorithm}"
                model_outpath = str(preprocess_dir / f"{model_key}.json")

                if Path(model_outpath).exists():
                    print(f"检测到已存在的模型文件,直接使用: {model_outpath}")
                    all_model_results[model_key] = model_outpath
                    continue

                # Best effort per model: one failing combination must not stop the batch.
                try:
                    from src.core.non_empirical_model_correction import run_model_correction

                    run_model_correction(
                        algorithm=algorithm,
                        # Fall back to the raw CSV when the preprocessed file is missing.
                        csv_file=processed_csv_path if Path(processed_csv_path).exists() else csv_path,
                        value_col=algorithm_value_col,
                        spectral_start=spectral_start_col,
                        spectral_end=spectral_end_col,
                        model_info_outpath=model_outpath,
                        window=window,
                    )
                    all_model_results[model_key] = model_outpath
                    print(f"模型训练完成: {model_outpath}")
                except Exception as e:
                    print(f"训练 {model_key} 模型时出错: {e}")
                    continue

        # Called for its side effect: writes the summary CSV into non_empirical_dir.
        _generate_non_empirical_summary(all_model_results, non_empirical_dir)
        notify("completed", f"非经验模型训练完成: {non_empirical_dir}")
        return all_model_results

    # ---- Step 6.75: custom regression analysis ----

    @staticmethod
    def custom_regression(
        csv_path: Optional[str] = None,
        x_columns: Optional[Union[str, List[str]]] = None,
        y_columns: Optional[Union[str, List[str]]] = None,
        methods: Union[str, List[str]] = "all",
        output_dir: Optional[str] = None,
        enabled: bool = True,
        callback: Optional[Callable] = None,
        work_dir: Union[str, Path] = "./work_dir",
    ) -> Optional[str]:
        """Run single-variable regressions between chosen X and Y columns.

        Args:
            csv_path: CSV file with the data (required when enabled).
            x_columns: Independent-variable column name(s).
            y_columns: Dependent-variable column name(s).
            methods: Forwarded to ``batch_single_variable_regression``.
            output_dir: Result directory, joined onto ``work_dir``; defaults
                to ``9_Custom_Regression_Modeling``.
            enabled: When ``False`` the step is skipped and ``None`` is returned.
            callback: Optional progress callback, called with
                ``("步骤6.75", status, message)``.
            work_dir: Base directory for the result directory.

        Returns:
            The result directory as a string, or ``None`` when skipped.

        Raises:
            ValueError: If ``csv_path``, ``y_columns`` or ``x_columns`` is
                ``None``, or if a requested column is missing from the CSV.
        """

        def notify(status, msg=""):
            if callback:
                callback("步骤6.75", status, msg)

        print("\n" + "=" * 80)
        print("步骤6.75: 自定义回归分析")
        print("=" * 80)

        if not enabled:
            print("已设置跳过自定义回归分析(enabled=False)。")
            notify("skipped", "跳过自定义回归分析")
            return None

        if csv_path is None:
            raise ValueError("必须提供 csv_path 参数")
        if y_columns is None:
            raise ValueError("必须指定 y_columns")
        if x_columns is None:
            raise ValueError("必须指定 x_columns")

        # Accept a single column name as well as a list of names.
        if isinstance(x_columns, str):
            x_columns = [x_columns]
        if isinstance(y_columns, str):
            y_columns = [y_columns]

        df = pd.read_csv(csv_path)
        missing_x = [col for col in x_columns if col not in df.columns]
        missing_y = [col for col in y_columns if col not in df.columns]
        if missing_x:
            raise ValueError(f"自变量列不存在: {missing_x}")
        if missing_y:
            raise ValueError(f"因变量列不存在: {missing_y}")

        if output_dir is None:
            custom_regression_dir = Path(work_dir) / "9_Custom_Regression_Modeling"
        else:
            custom_regression_dir = Path(work_dir) / output_dir
        custom_regression_dir.mkdir(parents=True, exist_ok=True)

        from src.core.modeling.regression import SingleVariableRegressionAnalysis

        analyzer = SingleVariableRegressionAnalysis()
        analyzer.batch_single_variable_regression(
            data=df,
            x_columns=x_columns,
            y_columns=y_columns,
            methods=methods,
            output_dir=str(custom_regression_dir),
        )

        notify("completed", f"自定义回归结果已保存到目录: {custom_regression_dir}")
        return str(custom_regression_dir)
|
||
|
||
|
||
# ============================================================
# Internal helper functions (used by ModelingStep)
# ============================================================
|
||
|
||
def _apply_preprocessing_internal(
|
||
csv_path: str,
|
||
preprocess_method: str,
|
||
output_dir: Path,
|
||
spectral_start_col: int = 4,
|
||
) -> str:
|
||
"""应用预处理到CSV数据(内部函数)"""
|
||
raw_p = str(preprocess_method).lower()
|
||
if raw_p == "none" or "无" in raw_p or "跳过" in raw_p:
|
||
preprocess_method = "None"
|
||
elif raw_p == "mms" or "minmax" in raw_p or "最大最小" in raw_p:
|
||
preprocess_method = "MMS"
|
||
elif raw_p == "ss" or "标准" in raw_p or "标准化" in raw_p:
|
||
preprocess_method = "SS"
|
||
elif raw_p == "snv" or "标准正态" in raw_p:
|
||
preprocess_method = "SNV"
|
||
elif raw_p == "ma" or "移动" in raw_p:
|
||
preprocess_method = "MA"
|
||
elif raw_p == "sg" or "savitzky" in raw_p or "平滑" in raw_p:
|
||
preprocess_method = "SG"
|
||
elif raw_p == "msc" or "多元散射" in raw_p:
|
||
preprocess_method = "MSC"
|
||
elif raw_p in ("d1", "d2", "dt"):
|
||
preprocess_method = {"d1": "D1", "d2": "D2", "dt": "DT"}.get(raw_p, raw_p.upper())
|
||
elif raw_p == "ct" or "去趋势" in raw_p:
|
||
preprocess_method = "CT"
|
||
|
||
if preprocess_method == "None":
|
||
return csv_path
|
||
|
||
output_filename = f"preprocessed_{preprocess_method}.csv"
|
||
output_path = str(output_dir / output_filename)
|
||
|
||
if Path(output_path).exists():
|
||
print(f"检测到已存在的预处理文件,直接使用: {output_path}")
|
||
return output_path
|
||
|
||
df = pd.read_csv(csv_path)
|
||
non_spectral_cols = df.iloc[:, :spectral_start_col]
|
||
spectral_data = df.iloc[:, spectral_start_col:]
|
||
|
||
from src.preprocessing.spectral_Preprocessing import Preprocessing
|
||
|
||
save_path = None
|
||
if preprocess_method == "SS":
|
||
models_dir = output_dir.parent.parent / "7_Supervised_Model_Training"
|
||
models_dir.mkdir(parents=True, exist_ok=True)
|
||
save_path = str(models_dir / "scaler_params.pkl")
|
||
print(f"SS预处理: scaler模型将保存到 {save_path}")
|
||
|
||
processed_spectral = Preprocessing(preprocess_method, spectral_data, save_path=save_path)
|
||
|
||
if isinstance(processed_spectral, pd.DataFrame):
|
||
processed_df = pd.concat([non_spectral_cols, processed_spectral], axis=1)
|
||
else:
|
||
processed_spectral_df = pd.DataFrame(
|
||
processed_spectral, columns=spectral_data.columns, index=spectral_data.index
|
||
)
|
||
processed_df = pd.concat([non_spectral_cols, processed_spectral_df], axis=1)
|
||
|
||
processed_df.to_csv(output_path, index=False)
|
||
print(f"预处理完成: {output_path}")
|
||
return output_path
|
||
|
||
|
||
def _generate_non_empirical_summary(model_results: Dict[str, str], output_dir: Path) -> str:
|
||
"""生成非经验模型训练结果汇总CSV"""
|
||
summary_path = str(output_dir / "non_empirical_models_summary.csv")
|
||
summary_data = []
|
||
|
||
for model_key, model_path in model_results.items():
|
||
try:
|
||
parts = model_key.split("_")
|
||
preprocess_method = parts[0]
|
||
algorithm_name = "_".join(parts[1:]) if len(parts) > 2 else parts[1]
|
||
|
||
with open(model_path, "r", encoding="utf-8") as f:
|
||
model_info = json.load(f)
|
||
|
||
accuracy_list = model_info.get("accuracy", [])
|
||
summary_row = {
|
||
"Preprocessing Method": preprocess_method,
|
||
"Algorithm Name": algorithm_name,
|
||
"Model Type": model_info.get("model_type", ""),
|
||
"Coefficient Count": len(model_info.get("model_info", [])),
|
||
"Average Accuracy(%)": np.mean(accuracy_list) if accuracy_list else 0,
|
||
"Min Accuracy(%)": np.min(accuracy_list) if accuracy_list else 0,
|
||
"Max Accuracy(%)": np.max(accuracy_list) if accuracy_list else 0,
|
||
"Sample Count": len(model_info.get("long", [])),
|
||
"Model File": model_path,
|
||
}
|
||
|
||
coefficients = model_info.get("model_info", [])
|
||
for i, coeff in enumerate(coefficients[:5]):
|
||
summary_row[f"系数_{i+1}"] = coeff
|
||
|
||
summary_data.append(summary_row)
|
||
except Exception as e:
|
||
print(f"读取模型文件 {model_path} 时出错: {e}")
|
||
continue
|
||
|
||
if summary_data:
|
||
df_summary = pd.DataFrame(summary_data)
|
||
df_summary.to_csv(summary_path, index=False, encoding="utf-8-sig")
|
||
print(f"汇总文件已生成: {summary_path}")
|
||
else:
|
||
print("警告: 没有有效的模型数据可汇总")
|
||
summary_path = ""
|
||
|
||
return summary_path
|