refactor(pipeline): 路径直接传输 — 统一 ctx 字段名/panel key/step 形参名

This commit is contained in:
DXC
2026-06-03 17:29:41 +08:00
parent 517bb28611
commit 343e316799
99 changed files with 9127 additions and 91 deletions

View File

@ -20,23 +20,28 @@ class PipelineContext:
"""流水线运行上下文(在 14 个 step 之间传递的内存字典)
字段命名约定:
- 路径字段统一 `_path` 后缀(如 water_mask_path
- 目录类字段无 `_path` 后缀(如 models_dir
- 路径字段名 = panel key 名 = step 形参名(全链路无翻译
- 训练/产物 CSV 用 `_path` 后缀(如 training_csv_path / water_mask_path
- 入参影像/CSV 沿用 panel 原名img_path / csv_path无 `_path` 后缀
- 目录类字段无 `_path` 后缀(如 models_dir / prediction_dir
- 元信息字段无后缀(如 user_config / status / log
"""
# ── 9 步主路径(按 step 输出顺序排列) ──
raw_img_path: Optional[str] = None # Step 1 入参:原始影像
# ── 11 个 step 的入参/产物(按 step 顺序排列;字段名 = panel key = step 形参 ──
img_path: Optional[str] = None # Step 1/2/3 入参:原始影像
water_mask_path: Optional[str] = None # Step 1 出 → Step 2/3/7 入
glint_mask_path: Optional[str] = None # Step 2 出 → Step 3/7 入
deglint_img_path: Optional[str] = None # Step 3 出 → Step 5/7 入
raw_csv_path: Optional[str] = None # Step 4 入:原始 CSV
csv_path: Optional[str] = None # Step 4/5/6_5/6_75:原始/训练 CSV
processed_csv_path: Optional[str] = None # Step 4 出 → Step 5 入
training_spectra_path: Optional[str] = None # Step 5 出 → Step 6
training_csv_path: Optional[str] = None # Step 5 出 → Step 5_5/6/6_5/6_75
boundary_path: Optional[str] = None # Step 5 入参:边界 SHPpanel step5 名)
indices_path: Optional[str] = None # Step 5.5 出
sampling_csv_path: Optional[str] = None # Step 7 出 → Step 8/9 入
prediction_csv_path: Optional[str] = None # Step 8 出
sampling_csv_path: Optional[str] = None # Step 7 出 → Step 8/8_5/8_75/9 入
prediction_csv_path: Optional[str] = None # Step 8 出 → Step 9 入
distribution_map_path: Optional[str] = None # Step 9 出
boundary_shp_path: Optional[str] = None # Step 9 入参:边界 SHPpanel step9 名)
formula_csv_path: Optional[str] = None # Step 8_75 入参:公式 CSV
# ── 目录类(命名不带 _path 以示区别) ──
models_dir: Optional[str] = None

View File

@ -4,10 +4,8 @@ PipelineRunner基于 StepSpec 声明式调度 14 个 step。
设计要点:
- StepSpec 声明 requiresctx 字段名列表)+ producesctx 字段名列表)
- 默认约定ctx 字段名去掉 `_path` 后缀 = step 方法形参名
ctx.water_mask_path → 形参 water_mask
ctx.raw_img_path → 形参 raw_img
- 可被 spec.parameter_map 覆盖
- 命名约定ctx 字段名 == panel key 名 == step 形参名(全链路无翻译)
- 保留 spec.parameter_map 字段骨架供极少数特例覆盖(默认空 dict
- 调度顺序:按 PIPELINE_STEPS 列表顺序requires 缺则 skip
- 软取消:在每个 step 前检查 ctx.is_cancelled()
- duck-typed pipelinerunner 只调 getattr(pipeline, method_name),不强依赖类层级
@ -48,101 +46,76 @@ class StepSpec:
PIPELINE_STEPS: List[StepSpec] = [
StepSpec(
step_id="step1", method_name="step1_generate_water_mask",
requires=["raw_img_path"], produces=["water_mask_path"],
# ctx.raw_img_path → 形参 img_path老 step1 形参名是 img_path不是 raw_img
parameter_map={"raw_img_path": "img_path"},
requires=["img_path"], produces=["water_mask_path"],
description="水域掩膜生成NDWI 或 SHP",
),
StepSpec(
step_id="step2", method_name="step2_find_glint_area",
requires=["raw_img_path", "water_mask_path"], produces=["glint_mask_path"],
# raw_img_path→img_pathwater_mask_path 不变
parameter_map={"raw_img_path": "img_path"},
requires=["img_path", "water_mask_path"], produces=["glint_mask_path"],
description="耀斑区域检测",
),
StepSpec(
step_id="step3", method_name="step3_remove_glint",
requires=["deglint_img_path"], produces=["deglint_img_path"],
# deglint_img_path→img_path老 step3 形参名是 img_path
# 注意glint_mask_path 不在 requires 中——step3 形参表无该参数,内部走 self.glint_mask_path 回退
parameter_map={"deglint_img_path": "img_path"},
requires=["img_path", "water_mask_path", "glint_mask_path"],
produces=["deglint_img_path"],
description="耀斑去除",
),
StepSpec(
step_id="step4", method_name="step4_process_csv",
requires=["raw_csv_path"], produces=["processed_csv_path"],
# raw_csv_path→csv_path老 step4 形参名是 csv_path
parameter_map={"raw_csv_path": "csv_path"},
requires=["csv_path"], produces=["processed_csv_path"],
description="CSV 异常值清洗",
),
StepSpec(
step_id="step5", method_name="step5_extract_training_spectra",
requires=["deglint_img_path", "processed_csv_path"], produces=["training_spectra_path"],
# processed_csv_path→csv_path老 step5 形参名是 csv_pathdeglint_img_path 不变
parameter_map={"processed_csv_path": "csv_path"},
requires=["deglint_img_path", "csv_path", "boundary_path", "glint_mask_path"],
produces=["training_csv_path"],
description="实测样本点光谱提取",
),
StepSpec(
step_id="step5_5", method_name="step5_5_calculate_water_quality_indices",
requires=["training_spectra_path"], produces=["indices_path"],
# 老 step5.5 形参是 training_spectra_pathctx 字段同名,无需映射
parameter_map={},
requires=["training_csv_path"], produces=["indices_path"],
description="水质光谱指数计算optional",
),
StepSpec(
step_id="step6", method_name="step6_train_models",
requires=["training_spectra_path"], produces=["models_dir"],
# training_spectra_path→training_csv_path老 step6 形参名是 training_csv_path
parameter_map={"training_spectra_path": "training_csv_path"},
requires=["training_csv_path"], produces=["models_dir"],
description="ML 建模GridSearchCV / AutoML",
),
StepSpec(
step_id="step6_5", method_name="step6_5_non_empirical_modeling",
requires=["training_spectra_path"], produces=["models_dir"],
# training_spectra_path→csv_path老 step6.5 形参名是 csv_path
parameter_map={"training_spectra_path": "csv_path"},
requires=["training_csv_path"], produces=["models_dir"],
description="非经验统计回归",
),
StepSpec(
step_id="step6_75", method_name="step6_75_custom_regression",
requires=["training_spectra_path"], produces=["models_dir"],
# training_spectra_path→csv_path老 step6.75 形参名是 csv_path
parameter_map={"training_spectra_path": "csv_path"},
requires=["training_csv_path"], produces=["models_dir"],
description="自定义回归分析",
),
StepSpec(
step_id="step7", method_name="step7_generate_sampling_points",
requires=["deglint_img_path", "water_mask_path"], produces=["sampling_csv_path"],
# 老 step7 形参是 deglint_img_path / water_mask_pathctx 字段同名
parameter_map={},
description="整景密集采样点生成 + 光谱提取",
),
StepSpec(
step_id="step8", method_name="step8_predict_water_quality",
requires=["sampling_csv_path", "models_dir"], produces=["prediction_csv_path"],
parameter_map={},
description="ML 模型预测(采样点)",
),
StepSpec(
step_id="step8_5", method_name="step8_5_predict_with_non_empirical_models",
requires=["sampling_csv_path"], produces=["prediction_dir"],
parameter_map={},
requires=["sampling_csv_path", "models_dir"], produces=["prediction_dir"],
description="非经验模型预测",
),
StepSpec(
step_id="step8_75", method_name="step8_75_predict_with_custom_regression",
requires=["sampling_csv_path"], produces=["prediction_dir"],
parameter_map={},
requires=["sampling_csv_path", "models_dir", "formula_csv_path"],
produces=["prediction_dir"],
description="自定义回归预测",
),
StepSpec(
step_id="step9", method_name="step9_generate_distribution_map",
requires=["prediction_csv_path"],
requires=["prediction_csv_path", "boundary_shp_path"],
produces=["distribution_map_path"],
# 老 step9 形参是 prediction_csv_path / boundary_shp_pathctx 字段同名
# 注意sampling_csv_path / water_mask_path 不在 requires 中——step9 形参表无该参数,
# 内部走 self.sampling_csv_path / self.water_mask_path 回退
parameter_map={},
description="克里金插值成图",
),
]
@ -157,7 +130,7 @@ class PipelineRunner:
用法:
runner = PipelineRunner(pipeline_instance)
ctx = PipelineContext(raw_img_path=..., ...)
ctx = PipelineContext(img_path=..., ...)
result_ctx = runner.run(ctx)
"""

View File

@ -0,0 +1,544 @@
# -*- coding: utf-8 -*-
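"""
Optuna + smart-subsampling AutoML trainer ("route B" anti-blow-up engine).

Why this module exists:
- Legacy path: 11 preprocessings x 4 models x 3 splits = 132 GridSearchCV runs.
  That takes 10+ minutes on small/medium datasets and runs out of memory (OOM)
  on large datasets (50,000+ rows).
- AutoML path: 1 preprocessing x N models (hyperparameters tuned by Optuna),
  using smart subsampling to avoid OOM, then a refit on the **full data** with
  the best hyperparameters; finally a single model is saved.

Design notes:
- Entry point: train_with_automl(csv, feature_start_column, model_names, ...)
- Results are returned as AutoMLResult dataclass instances (one per target column).
- smart_subsample: random down-sampling when N > max_samples.
- Failure fallback: optuna not installed / every trial failed -> fall back to
  WaterQualityModelingBatch.
- File naming convention: {target}_{preprocess}_{model}_AUTOML.joblib
- save_data["metadata"]["automl"] = True marks a model produced by this path.

Usage:
    from src.core.prediction.automl_trainer import train_with_automl
    results = train_with_automl(
        training_csv_path=".../training_spectra.csv",
        feature_start_column="374.285004",
        model_names=["RF", "SVR", "Ridge"],
        n_trials=20,
        timeout_sec=300,
    )
"""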
from __future__ import annotations
import json
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
# ============================================================
# 常量
# ============================================================
# Default cap on the number of samples used during the hyperparameter search
# (keeps the search phase from running out of memory). The original author
# judged 5000 samples sufficient for a stable CV estimate with RF/SVR/Ridge.
DEFAULT_MAX_SAMPLES = 5000
# Default value of ``timeout_sec`` in train_with_automl, in seconds. That
# function divides this budget across the requested models (with a 10 s floor
# per model), so it is a per-target budget rather than a per-trial timeout.
DEFAULT_TIMEOUT = 300.0
# Default number of Optuna trials run for each model.
DEFAULT_N_TRIALS = 20
# Directory-name suffix for AutoML output.
# NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still needed.
AUTOML_DIR_SUFFIX = "_AutoML"
# ============================================================
# 数据类
# ============================================================
@dataclass
class AutoMLResult:
    """Outcome of AutoML training for a single target column.

    Every field has a default, so a failure can be reported with just
    ``AutoMLResult(success=False, error=...)``.
    """

    # True when a model file was produced for this target.
    success: bool = False
    # Path of the saved model file, or None when nothing was saved.
    model_path: Optional[str] = None
    # Best cross-validation score; negative infinity means "no score yet",
    # so any real score compares greater.
    cv_score: float = float("-inf")
    # Hyperparameters of the best trial, when a search was run.
    best_params: Optional[Dict[str, Any]] = None
    # Name of the target (y) column this result belongs to.
    target_column: str = ""
    # Name of the preprocessing method that was applied.
    preprocessing: str = ""
    # Key of the model that produced this result.
    model_name: str = ""
    # Number of Optuna trials recorded for the winning model.
    n_trials_done: int = 0
    # Number of samples used during the hyperparameter search.
    n_samples_used: int = 0
    # True when the result came from the legacy fallback trainer.
    fallback_used: bool = False
    # Wall-clock time spent on this target, in seconds.
    elapsed_sec: float = 0.0
    # Human-readable failure reason, if any.
    error: Optional[str] = None
    # Free-form extra information; each instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
# ============================================================
# 智能子采样
# ============================================================
def smart_subsample(
    X: np.ndarray,
    y: np.ndarray,
    max_samples: int = DEFAULT_MAX_SAMPLES,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, bool]:
    """Randomly down-sample ``X``/``y`` when there are more than ``max_samples`` rows.

    Args:
        X: Feature array; rows are counted via ``X.shape[0]``.
        y: Target array indexed with the same row indices as ``X``.
        max_samples: Largest number of rows allowed through unchanged.
        random_state: Seed for the NumPy generator that picks the rows.

    Returns:
        ``(X_sub, y_sub, was_subsampled)``. When no sampling is needed the
        input objects themselves are returned and the flag is False.
    """
    total_rows = X.shape[0]
    if total_rows > max_samples:
        # Draw distinct row indices (no replacement) from a seeded generator
        # so the same seed always yields the same subset.
        generator = np.random.default_rng(random_state)
        picked = generator.choice(total_rows, size=max_samples, replace=False)
        return X[picked], y[picked], True
    return X, y, False
# ============================================================
# 模型工厂
# ============================================================
def _build_model(model_name: str, random_state: int = 42):
    """Return a builder for the scikit-learn regressor registered as ``model_name``.

    The returned callable takes hyperparameters as keyword arguments and
    returns a new, unfitted estimator with this module's fixed settings
    already applied (seed, ``n_jobs=1``, iteration limits).

    Args:
        model_name: Model key, e.g. ``"RF"``, ``"SVR"``, ``"Ridge"``.
        random_state: Seed passed to the estimators that accept one.

    Returns:
        The builder callable, or None when the key is unknown or is ``"PLS"``
        (registered but not integrated yet).
    """
    from sklearn.ensemble import (
        AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor,
        RandomForestRegressor,
    )
    from sklearn.linear_model import (
        ElasticNet, Lasso, LinearRegression, Ridge,
    )
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor

    # model key -> (estimator class, keyword arguments fixed by this module)
    registry = {
        "RF": (RandomForestRegressor, {"random_state": random_state, "n_jobs": 1}),
        "ET": (ExtraTreesRegressor, {"random_state": random_state, "n_jobs": 1}),
        "GradientBoosting": (GradientBoostingRegressor, {"random_state": random_state}),
        "AdaBoost": (AdaBoostRegressor, {"random_state": random_state}),
        "Ridge": (Ridge, {}),
        "Lasso": (Lasso, {"max_iter": 5000}),
        "ElasticNet": (ElasticNet, {"max_iter": 5000}),
        "LinearRegression": (LinearRegression, {}),
        "SVR": (SVR, {}),
        "KNN": (KNeighborsRegressor, {"n_jobs": 1}),
        "MLP": (MLPRegressor, {"max_iter": 500, "random_state": random_state}),
        "DecisionTree": (DecisionTreeRegressor, {"random_state": random_state}),
        "PLS": None,  # sklearn.cross_decomposition.PLSRegression not integrated yet
    }
    entry = registry.get(model_name)
    if entry is None:
        return None
    estimator_cls, fixed_kwargs = entry

    def builder(**kw):
        # A hyperparameter that collides with a fixed keyword raises TypeError.
        return estimator_cls(**fixed_kwargs, **kw)

    return builder
# ============================================================
# Optuna 超参 search space
# ============================================================
def _get_search_space(model_name: str, trial) -> Dict[str, Any]:
    """Sample one hyperparameter set for ``model_name`` from an Optuna trial.

    The returned dict is meant to be passed as keyword arguments to the
    builder returned by ``_build_model``, so every key must be a valid
    constructor argument of the corresponding estimator.

    Args:
        model_name: Model key (same keys as ``_build_model``).
        trial: Object exposing Optuna's ``suggest_int`` / ``suggest_float`` /
            ``suggest_categorical`` methods.

    Returns:
        Mapping of hyperparameter name to the value suggested for this trial.
    """
    sp: Dict[str, Any] = {}
    if model_name == "RF":
        sp["n_estimators"] = trial.suggest_int("n_estimators", 50, 300, step=50)
        sp["max_depth"] = trial.suggest_int("max_depth", 3, 20)
        sp["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 10)
        sp["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 1, 5)
    elif model_name == "ET":
        sp["n_estimators"] = trial.suggest_int("n_estimators", 50, 300, step=50)
        sp["max_depth"] = trial.suggest_int("max_depth", 3, 20)
    elif model_name == "GradientBoosting":
        sp["n_estimators"] = trial.suggest_int("n_estimators", 50, 300, step=50)
        sp["max_depth"] = trial.suggest_int("max_depth", 3, 8)
        sp["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    elif model_name == "SVR":
        sp["C"] = trial.suggest_float("C", 0.1, 100.0, log=True)
        sp["epsilon"] = trial.suggest_float("epsilon", 0.001, 1.0, log=True)
        sp["kernel"] = trial.suggest_categorical("kernel", ["rbf", "linear"])
    elif model_name == "KNN":
        sp["n_neighbors"] = trial.suggest_int("n_neighbors", 3, 20)
        sp["weights"] = trial.suggest_categorical("weights", ["uniform", "distance"])
    elif model_name in ("Ridge", "Lasso", "ElasticNet"):
        sp["alpha"] = trial.suggest_float("alpha", 0.01, 100.0, log=True)
        if model_name == "ElasticNet":
            sp["l1_ratio"] = trial.suggest_float("l1_ratio", 0.0, 1.0)
    elif model_name == "LinearRegression":
        # LinearRegression has no n_estimators; falling through to the generic
        # branch below made every trial fail with TypeError. Tune the only
        # structural option it offers instead.
        sp["fit_intercept"] = trial.suggest_categorical("fit_intercept", [True, False])
    elif model_name == "MLP":
        # NOTE(review): tuple-valued categorical choices work with in-memory
        # studies; Optuna may warn about them for persistent storages.
        sp["hidden_layer_sizes"] = trial.suggest_categorical(
            "hidden_layer_sizes", [(50,), (100,), (50, 50), (100, 50)]
        )
        sp["alpha"] = trial.suggest_float("alpha", 1e-5, 1e-1, log=True)
        sp["learning_rate_init"] = trial.suggest_float("learning_rate_init", 1e-4, 1e-2, log=True)
    elif model_name == "DecisionTree":
        sp["max_depth"] = trial.suggest_int("max_depth", 3, 20)
        sp["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 10)
    elif model_name == "AdaBoost":
        sp["n_estimators"] = trial.suggest_int("n_estimators", 30, 200, step=30)
        sp["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
    else:
        # Legacy default for keys without a dedicated space; it is only valid
        # for estimators that accept n_estimators.
        sp["n_estimators"] = trial.suggest_int("n_estimators", 50, 200, step=50)
    return sp
def _make_objective(model_name: str, X: np.ndarray, y: np.ndarray,
                    cv_folds: int, random_state: int):
    """Build the Optuna objective: mean R² over a shuffled ``cv_folds``-fold CV.

    Args:
        model_name: Model key understood by ``_build_model``.
        X: Feature array used for cross-validation.
        y: Target array used for cross-validation.
        cv_folds: Number of KFold splits.
        random_state: Seed for both the estimator and the KFold shuffle.

    Returns:
        A function ``objective(trial) -> float``. It returns -1.0 when the
        model key has no builder or when anything raises while building or
        scoring the model, so a failed trial never aborts the study.
    """
    from sklearn.model_selection import KFold, cross_val_score

    failure_score = -1.0

    def objective(trial):
        hyperparams = _get_search_space(model_name, trial)
        try:
            make_estimator = _build_model(model_name, random_state=random_state)
            if make_estimator is None:
                return failure_score
            estimator = make_estimator(**hyperparams)
            splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
            fold_scores = cross_val_score(
                estimator, X, y, cv=splitter, scoring="r2", n_jobs=1,
            )
            return float(np.mean(fold_scores))
        except Exception:
            # Deliberate best-effort: a bad hyperparameter combination is
            # reported as the sentinel score instead of propagating.
            return failure_score

    return objective
def _refit_full(model_name: str, best_params: Dict[str, Any],
                X: np.ndarray, y: np.ndarray, random_state: int):
    """Fit a fresh ``model_name`` estimator with ``best_params`` on ``X``/``y``.

    The caller passes the full (non-subsampled) data here, so the saved model
    is trained on every available row.

    Returns:
        The fitted estimator, or None when ``_build_model`` has no builder for
        ``model_name``.
    """
    make_estimator = _build_model(model_name, random_state=random_state)
    if make_estimator is not None:
        estimator = make_estimator(**best_params)
        estimator.fit(X, y)
        return estimator
    return None
# ============================================================
# 失败兜底(回退到老 GridSearchCV 路径)
# ============================================================
def _fallback_train(
    training_csv_path: str,
    feature_start_column,
    preprocessing: str,
    model_name: str,
    split_method: str,
    cv_folds: int,
    output_dir: Path,
    target_column: str,
) -> AutoMLResult:
    """Train through the legacy ``WaterQualityModelingBatch`` when AutoML cannot.

    The legacy trainer is run for one preprocessing / model / split
    combination inside ``output_dir / preprocessing``. That directory is then
    searched recursively for ``{target_column}_{preprocessing}_{model_name}.joblib``.

    Returns:
        An ``AutoMLResult`` with ``fallback_used=True``. ``success`` is True
        only when the expected model file was found. Import errors and any
        exception raised during training are reported in ``error`` rather
        than raised.
    """
    # Fields shared by every result this function can return.
    common = dict(
        target_column=target_column,
        preprocessing=preprocessing,
        model_name=model_name,
        fallback_used=True,
    )

    try:
        from src.core.modeling.modeling_batch import WaterQualityModelingBatch
    except ImportError as e:
        return AutoMLResult(success=False, error=f"fallback 导入失败: {e!r}", **common)

    try:
        work_dir = output_dir / preprocessing
        work_dir.mkdir(parents=True, exist_ok=True)
        WaterQualityModelingBatch(str(work_dir)).train_models_batch(
            csv_path=training_csv_path,
            feature_start_column=feature_start_column,
            preprocessing_methods=[preprocessing],
            model_names=[model_name],
            split_methods=[split_method],
            cv_folds=cv_folds,
        )
        # Locate the artefact the legacy trainer is expected to have written.
        expected_name = f"{target_column}_{preprocessing}_{model_name}.joblib"
        found = list(work_dir.rglob(expected_name))
        if found:
            model_path = str(found[0])
        else:
            model_path = None
        return AutoMLResult(
            success=model_path is not None,
            model_path=model_path,
            metadata={"source": "WaterQualityModelingBatch"},
            **common,
        )
    except Exception as e:
        return AutoMLResult(success=False, error=f"fallback 失败: {e!r}", **common)
# ============================================================
# 主入口
# ============================================================
def _split_target_and_feature_columns(df, feature_start_column):
    """Split ``df`` columns into (numeric target columns, feature columns).

    Columns before ``feature_start_column`` (a positional index or a column
    name) are target candidates; only the numeric ones are kept. Columns from
    ``feature_start_column`` onward are the features. Returns two empty lists
    when a column name is given but not present.
    """
    columns = list(df.columns)
    if isinstance(feature_start_column, int):
        start = feature_start_column
    else:
        try:
            start = columns.index(feature_start_column)
        except ValueError:
            return [], []
    target_cols = [c for c in columns[:start]
                   if pd.api.types.is_numeric_dtype(df[c])]
    return target_cols, columns[start:]


def train_with_automl(
    training_csv_path: str,
    feature_start_column,
    preprocessing_methods: Optional[List[str]] = None,
    model_names: Optional[List[str]] = None,
    split_methods: Optional[List[str]] = None,
    cv_folds: int = 5,
    output_dir: Optional[str] = None,
    n_trials: int = DEFAULT_N_TRIALS,
    timeout_sec: float = DEFAULT_TIMEOUT,
    max_samples: int = DEFAULT_MAX_SAMPLES,
    random_state: int = 42,
    callback: Optional[Callable[[str, str, str], None]] = None,
) -> List[AutoMLResult]:
    """Run Optuna-based AutoML per target column, falling back to the legacy trainer.

    For every numeric column located before ``feature_start_column`` the
    function tunes each requested model with Optuna on an (optionally
    subsampled) dataset. It then refits the best hyperparameters on all valid
    rows, saves every refitted model as
    ``{target}_{preprocess}_{model}_AUTOML.joblib``, and reports the model
    with the highest CV score. When Optuna is unavailable, or no model
    succeeds for a target, ``_fallback_train`` is used with the first model
    name.

    Args:
        training_csv_path: Training CSV (the Step 5 product).
        feature_start_column: Name or positional index of the first feature
            column. Numeric columns before it are targets; columns from it
            onward are features.
        preprocessing_methods: Candidate preprocessings; **only the first is
            used** (defaults to ``["MMS"]``; an empty list is treated as None).
        model_names: Models to tune, one Optuna study each (defaults to
            ``["RF", "SVR", "Ridge"]``; an empty list is treated as None).
        split_methods: Candidate split methods; only the first is used, and
            only as metadata / for the fallback (defaults to ``["spxy"]``).
        cv_folds: Number of cross-validation folds.
        output_dir: Output directory (defaults to
            ``./7_Supervised_Model_Training_AutoML``).
        n_trials: Optuna trials per model.
        timeout_sec: Time budget in seconds per target; it is divided evenly
            across ``model_names`` with a floor of 10 s per model.
        max_samples: Maximum number of rows used during the search phase.
        random_state: Seed for subsampling, the sampler, CV and the models.
        callback: Optional ``callback(step_name, status, message)`` hook.

    Returns:
        One ``AutoMLResult`` per target column, or a single failed result when
        the CSV is missing or no target column can be identified. A summary
        is also written to ``automl_summary.json`` in the output directory.
    """
    def notify(status: str, msg: str = "") -> None:
        if callback:
            callback("步骤6_AutoML", status, msg)

    # ---- 1) Defaults (an empty list would otherwise raise IndexError below) ----
    if not preprocessing_methods:
        preprocessing_methods = ["MMS"]
    if not model_names:
        model_names = ["RF", "SVR", "Ridge"]
    if not split_methods:
        split_methods = ["spxy"]
    # Design decision: only the first preprocessing and the first split are
    # used, to avoid a Cartesian explosion of combinations.
    preproc = preprocessing_methods[0]
    split_method = split_methods[0]
    if output_dir is None:
        output_dir = "./7_Supervised_Model_Training_AutoML"
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # ---- 2) Load data ----
    notify("start", f"AutoML 训练开始 (n_trials={n_trials}, timeout={timeout_sec}s, max_samples={max_samples})")
    if not Path(training_csv_path).exists():
        missing_msg = f"训练 CSV 不存在: {training_csv_path}"
        notify("error", missing_msg)
        return [AutoMLResult(success=False, error=missing_msg)]
    df = pd.read_csv(training_csv_path)
    # Targets: numeric columns before feature_start_column.
    # Features: columns from feature_start_column onward. Non-numeric columns
    # that precede it (IDs, dates, ...) are neither, so they can no longer
    # break the float conversion below.
    y_cols, feat_cols = _split_target_and_feature_columns(df, feature_start_column)
    if not y_cols:
        notify("error", "AutoML: 未识别出目标列feature_start_column 之前的所有数值列)")
        return [AutoMLResult(success=False, error="未识别出目标列")]
    X_all = df[feat_cols].values.astype(np.float64)

    # ---- 3) Preprocessing (first method only) ----
    if preproc != "None":
        try:
            from src.preprocessing.spectral_Preprocessing import Preprocessing
            processed = Preprocessing(preproc, df[feat_cols])
            if isinstance(processed, pd.DataFrame):
                X_all = processed.values.astype(np.float64)
            else:
                X_all = np.asarray(processed, dtype=np.float64)
        except Exception as e:
            notify("warning", f"预处理 {preproc} 失败: {e!r},改用 None")
            preproc = "None"
    # Created only now so the directory name matches the preprocessing that
    # was actually applied (it may have been downgraded to "None" above).
    preproc_dir = out_dir / preproc
    preproc_dir.mkdir(parents=True, exist_ok=True)

    # ---- 4) Is Optuna available? ----
    try:
        import optuna
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        optuna_available = True
    except ImportError:
        optuna_available = False
        notify("warning", "optuna 未安装,全目标列回退到 GridSearchCVpip install \"optuna>=3.6\"")

    # ---- 5) One pass per target column ----
    results: List[AutoMLResult] = []
    total = len(y_cols)
    # Split the time budget across models, never going below 10 s each.
    per_model_timeout = max(10.0, timeout_sec / max(1, len(model_names)))
    for ti, tgt in enumerate(y_cols, 1):
        t0 = time.time()
        yv = df[tgt].values.astype(np.float64)
        # Drop rows whose target value is missing.
        mask = ~np.isnan(yv)
        X_t = X_all[mask]
        y_t = yv[mask]
        if X_t.shape[0] < cv_folds * 2:
            notify("warning", f"目标 {tgt}: 有效样本 {X_t.shape[0]} 不足,跳过")
            results.append(AutoMLResult(
                success=False, target_column=tgt, error=f"样本不足({X_t.shape[0]})",
                preprocessing=preproc,
            ))
            continue
        X_sub, y_sub, was_sub = smart_subsample(X_t, y_t, max_samples=max_samples, random_state=random_state)
        if was_sub:
            notify("info", f"目标 {tgt}: {X_t.shape[0]} 样本 → 子采样 {X_sub.shape[0]}(寻优用)")
        best_overall = AutoMLResult(success=False, target_column=tgt, preprocessing=preproc)
        if not optuna_available:
            # No Optuna: go straight to the legacy trainer.
            best_overall = _fallback_train(
                training_csv_path, feature_start_column, preproc, model_names[0], split_method,
                cv_folds, out_dir, tgt,
            )
        else:
            for model_name in model_names:
                try:
                    builder = _build_model(model_name, random_state=random_state)
                    if builder is None:
                        notify("warning", f"模型 {model_name} 暂不支持 AutoML 寻优")
                        continue
                    study = optuna.create_study(
                        direction="maximize",
                        sampler=optuna.samplers.TPESampler(seed=random_state),
                    )
                    study.optimize(
                        _make_objective(model_name, X_sub, y_sub, cv_folds, random_state),
                        n_trials=n_trials,
                        timeout=per_model_timeout,
                        show_progress_bar=False,
                    )
                    # The objective reports failures as -1.0, so a best value
                    # at or below that is treated as "every trial failed".
                    if study.best_value is None or study.best_value <= -1.0:
                        notify("warning", f"{tgt}/{model_name}: 全部 trial 失败CV 全部 <= -1")
                        continue
                    # Refit on the FULL (non-subsampled) data.
                    final_model = _refit_full(model_name, study.best_params, X_t, y_t, random_state)
                    if final_model is None:
                        continue
                    # Persist the refitted model together with its metadata.
                    import joblib
                    fname = f"{tgt}_{preproc}_{model_name}_AUTOML.joblib"
                    fpath = preproc_dir / fname
                    joblib.dump({
                        "model": final_model,
                        "target_column_name": tgt,
                        "preprocess_method": preproc,
                        "model_name": model_name,
                        "metadata": {
                            "automl": True,
                            "best_params": study.best_params,
                            "cv_score": float(study.best_value),
                            "n_trials_done": len(study.trials),
                            "n_samples_used_full": int(X_t.shape[0]),
                            "n_samples_used_for_search": int(X_sub.shape[0]),
                            "was_subsampled": was_sub,
                            "split_method": split_method,
                        },
                    }, fpath)
                    cand = AutoMLResult(
                        success=True,
                        model_path=str(fpath),
                        cv_score=float(study.best_value),
                        best_params=study.best_params,
                        target_column=tgt,
                        preprocessing=preproc,
                        model_name=model_name,
                        n_trials_done=len(study.trials),
                        n_samples_used=int(X_sub.shape[0]),
                        metadata={"refit_on_full": True, "n_samples_full": int(X_t.shape[0])},
                    )
                    # Keep the model with the highest CV score for this target.
                    if cand.cv_score > best_overall.cv_score:
                        best_overall = cand
                except Exception as e:
                    # One failing model must not abort the remaining ones.
                    notify("warning", f"目标 {tgt} / 模型 {model_name} 失败: {e!r}")
                    continue
            if not best_overall.success:
                notify("warning", f"目标 {tgt} 全部 Optuna trial 失败,回退 GridSearchCV")
                best_overall = _fallback_train(
                    training_csv_path, feature_start_column, preproc, model_names[0], split_method,
                    cv_folds, out_dir, tgt,
                )
        best_overall.elapsed_sec = time.time() - t0
        results.append(best_overall)
        notify("info", f"AutoML 目标 {tgt} 完成 ({ti}/{total}) cv={best_overall.cv_score:.4f}")

    # ---- 6) Summary JSON (best-effort: a write failure is only a warning) ----
    summary_path = out_dir / "automl_summary.json"
    try:
        with open(summary_path, "w", encoding="utf-8") as f:
            json.dump([asdict(r) for r in results], f, ensure_ascii=False, indent=2, default=str)
    except Exception as e:
        notify("warning", f"写 automl_summary.json 失败: {e!r}")
    success_n = sum(1 for r in results if r.success)
    fallback_n = sum(1 for r in results if r.fallback_used)
    notify("completed", f"AutoML 训练完成 {success_n}/{len(results)} 成功({fallback_n} 走 fallback汇总 {summary_path}")
    return results
# ============================================================
# CLI 自测
# ============================================================
if __name__ == "__main__":
    # Minimal command-line harness for manually exercising train_with_automl.
    import argparse

    p = argparse.ArgumentParser(description="AutoML 训练器 CLI 自测")
    p.add_argument("--csv", required=True, help="训练用 CSVfeature_start_column 之前的列为目标 y")
    p.add_argument("--feature-start", default="0", help="特征起始列名或索引(默认 0")
    p.add_argument("--n-trials", type=int, default=DEFAULT_N_TRIALS)
    p.add_argument("--timeout", type=float, default=DEFAULT_TIMEOUT)
    p.add_argument("--max-samples", type=int, default=DEFAULT_MAX_SAMPLES)
    p.add_argument("--out", default="./7_Supervised_Model_Training_AutoML")
    args = p.parse_args()

    # Infer the type of feature_start_column: a value that parses as an
    # integer is used as a positional index, anything else as a column name.
    # NOTE(review): with the default index 0 no column precedes the features,
    # so train_with_automl reports that no target column was found.
    fsc: Any = args.feature_start
    try:
        fsc = int(fsc)
    except ValueError:
        pass

    res = train_with_automl(
        training_csv_path=args.csv,
        feature_start_column=fsc,
        n_trials=args.n_trials,
        timeout_sec=args.timeout,
        max_samples=args.max_samples,
        output_dir=args.out,
    )
    print(f"\n训练完成 {len(res)} 个目标")
    for r in res:
        # The marker was an empty string in both branches, so successful and
        # failed targets printed identically; make the outcome visible.
        marker = "[OK]" if r.success else "[FAIL]"
        fb = " [fallback]" if r.fallback_used else ""
        print(f" {marker} {r.target_column}: cv={r.cv_score:.4f} path={r.model_path}{fb}")

View File

@ -126,7 +126,7 @@ class DataPreparationStep:
@staticmethod
def calculate_water_quality_indices(
training_spectra_path: Optional[str] = None,
training_csv_path: Optional[str] = None,
formula_csv_file: Optional[str] = None,
formula_names: Optional[List[str]] = None,
output_file: Optional[str] = None,
@ -153,8 +153,8 @@ class DataPreparationStep:
notify("skipped", "跳过水质指数计算")
return None
if training_spectra_path is None:
raise ValueError("必须提供 training_spectra_path 参数")
if training_csv_path is None:
raise ValueError("必须提供 training_csv_path 参数")
if formula_csv_file is None:
raise ValueError("必须提供 formula_csv_file 参数")
@ -170,7 +170,7 @@ class DataPreparationStep:
from src.utils.band_math import BandMathCalculator
calculator = BandMathCalculator(training_spectra_path)
calculator = BandMathCalculator(training_csv_path)
result_df = calculator.process_formulas_from_csv(
formula_csv_file=formula_csv_file,
formula_names=formula_names,

View File

@ -173,7 +173,7 @@ class WaterQualityInversionPipeline:
self.interpolated_img_path = None # 存储插值后的影像路径
self.deglint_img_path = None
self.processed_csv_path = None
self.training_spectra_path = None
self.training_csv_path = None
self.indices_path = None
self.custom_regression_path = None
@ -511,7 +511,7 @@ class WaterQualityInversionPipeline:
left_shoulder_wave: Optional[float] = None,
valley_wave: Optional[float] = None,
right_shoulder_wave: Optional[float] = None,
water_mask: Optional[Union[str, np.ndarray]] = None,
water_mask_path: Optional[Union[str, np.ndarray]] = None,
interpolate_zeros: bool = False,
interpolation_method: str = 'nearest',
enabled: bool = True,
@ -546,7 +546,7 @@ class WaterQualityInversionPipeline:
left_shoulder_wave=left_shoulder_wave,
valley_wave=valley_wave,
right_shoulder_wave=right_shoulder_wave,
water_mask=water_mask,
water_mask=water_mask_path,
interpolate_zeros=interpolate_zeros,
interpolation_method=interpolation_method,
enabled=enabled,
@ -655,13 +655,13 @@ class WaterQualityInversionPipeline:
water_mask_path=self.water_mask_path,
output_dir=str(self.training_spectra_dir),
)
self.training_spectra_path = result
self.training_csv_path = result
self._record_step_time("步骤5: 提取训练样本点光谱", 0, 0)
self._notify("completed", f"训练光谱数据已保存: {result}")
return result
def step5_5_calculate_water_quality_indices(self,
training_spectra_path: Optional[str] = None,
training_csv_path: Optional[str] = None,
formula_csv_file: Optional[str] = None,
formula_names: Optional[List[str]] = None,
output_file: Optional[str] = None,
@ -669,29 +669,29 @@ class WaterQualityInversionPipeline:
skip_dependency_check: bool = False) -> str:
"""
步骤5.5: 根据训练光谱计算水质光谱指数
使用band_math.py中的方法实现支持从公式CSV文件中批量计算指定公式
Args:
training_spectra_path: 训练光谱数据CSV路径如果为None使用步骤5的结果
training_csv_path: 训练光谱数据CSV路径如果为None使用步骤5的结果
formula_csv_file: 公式CSV文件路径包含公式名称和具体公式
formula_names: 要计算的公式名称列表如果为None则计算所有公式
output_file: 输出文件完整路径支持绝对路径如果为None则使用默认路径
Returns:
包含计算结果的新CSV文件路径
"""
# 参数解析(保留原逻辑)
if training_spectra_path is not None:
csv_path = training_spectra_path
elif self.training_spectra_path is not None:
csv_path = self.training_spectra_path
if training_csv_path is not None:
csv_path = training_csv_path
elif self.training_csv_path is not None:
csv_path = self.training_csv_path
else:
csv_path = None
self._notify("started", "步骤5.5: 计算水质光谱指数")
result = DataPreparationStep.calculate_water_quality_indices(
training_spectra_path=csv_path,
training_csv_path=csv_path,
formula_csv_file=formula_csv_file,
formula_names=formula_names,
output_file=output_file,
@ -727,8 +727,8 @@ class WaterQualityInversionPipeline:
# 参数解析(保留原逻辑)
if training_csv_path is not None:
final_csv_path = training_csv_path
elif self.training_spectra_path is not None:
final_csv_path = self.training_spectra_path
elif self.training_csv_path is not None:
final_csv_path = self.training_csv_path
else:
final_csv_path = None
@ -911,7 +911,7 @@ class WaterQualityInversionPipeline:
print("="*80)
if training_csv_path is None:
training_csv_path = self.training_spectra_path
training_csv_path = self.training_csv_path
if training_csv_path is None:
raise ValueError("请提供训练数据CSV路径或先执行步骤5")
@ -1033,7 +1033,7 @@ class WaterQualityInversionPipeline:
print("="*80)
if csv_path is None:
csv_path = self.training_spectra_path
csv_path = self.training_csv_path
if csv_path is None:
raise ValueError("请提供CSV文件路径或先执行步骤5")
@ -1506,7 +1506,7 @@ class WaterQualityInversionPipeline:
if 'step5' in config:
self._notify("步骤5: 光谱提取", "start")
self.step5_extract_training_spectra(**config['step5'])
self._notify("步骤5: 光谱提取", "completed", f"(输出: {self.training_spectra_path})")
self._notify("步骤5: 光谱提取", "completed", f"(输出: {self.training_csv_path})")
else:
self._notify("步骤5: 光谱提取", "skipped", "未配置")
@ -1615,7 +1615,7 @@ class WaterQualityInversionPipeline:
# 生成散点图
if 'visualization' in config and config['visualization'].get('generate_scatter', True):
if self.training_spectra_path and self.models_dir.exists():
if self.training_csv_path and self.models_dir.exists():
try:
self._notify("可视化", "info", "生成模型评估散点图...")
scatter_config = config['visualization'].get('scatter_config', {})
@ -1653,7 +1653,7 @@ class WaterQualityInversionPipeline:
# 生成光谱曲线图
if 'visualization' in config and config['visualization'].get('generate_spectrum', True):
if self.training_spectra_path:
if self.training_csv_path:
try:
self._notify("可视化", "info", "生成光谱曲线对比图...")
spectrum_paths = self.generate_spectrum_comparison_plots(
@ -1701,7 +1701,7 @@ class WaterQualityInversionPipeline:
pipeline_info['step2'] = {'status': 'completed', 'output_file': str(self.glint_mask_path) if self.glint_mask_path else 'N/A'}
pipeline_info['step3'] = {'status': 'completed', 'output_file': str(self.deglint_img_path) if self.deglint_img_path else 'N/A'}
pipeline_info['step4'] = {'status': 'completed', 'output_file': str(self.processed_csv_path) if self.processed_csv_path else 'N/A'}
pipeline_info['step5'] = {'status': 'completed', 'output_file': str(self.training_spectra_path) if self.training_spectra_path else 'N/A'}
pipeline_info['step5'] = {'status': 'completed', 'output_file': str(self.training_csv_path) if self.training_csv_path else 'N/A'}
pipeline_info['step5_5'] = {'status': 'completed', 'output_file': str(self.indices_path) if self.indices_path else 'N/A'}
pipeline_info['step6'] = {'status': 'completed', 'output_file': str(self.models_dir)}
pipeline_info['step6_75'] = {'status': 'completed', 'output_file': str(self.custom_regression_path) if self.custom_regression_path else 'N/A'}
@ -1784,8 +1784,8 @@ class WaterQualityInversionPipeline:
# 参数解析(保留原逻辑)
if csv_path is not None:
final_csv_path = csv_path
elif self.training_spectra_path is not None:
final_csv_path = self.training_spectra_path
elif self.training_csv_path is not None:
final_csv_path = self.training_csv_path
else:
final_csv_path = None
@ -2109,7 +2109,7 @@ def main():
'interpolation_method': 'bilinear', # 插值方法: 'nearest'(邻近), 'bilinear'(双线性),
# 'spline'(样条), 'kriging'(克里金)
# 水域掩膜参数(可选):
'water_mask':r"D:\BaiduNetdiskDownload\yaobao\roi\roi.shp", # None表示自动使用步骤1生成的掩膜也可以提供
'water_mask_path':r"D:\BaiduNetdiskDownload\yaobao\roi\roi.shp", # None表示自动使用步骤1生成的掩膜也可以提供
# # - numpy数组
# # - 栅格文件路径(.dat/.tif)
# # - shapefile路径(.shp)