1137 lines
44 KiB
Python
1137 lines
44 KiB
Python
import numpy as np
|
||
import pandas as pd
|
||
import joblib
|
||
import os
|
||
from pathlib import Path
|
||
from typing import List, Dict, Union, Tuple, Optional
|
||
import warnings
|
||
|
||
warnings.filterwarnings('ignore')
|
||
|
||
# 机器学习模型导入 - 改为回归模型
|
||
from sklearn.svm import SVR
|
||
from sklearn.ensemble import RandomForestRegressor
|
||
from sklearn.neighbors import KNeighborsRegressor
|
||
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
|
||
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
|
||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||
from sklearn.cross_decomposition import PLSRegression
|
||
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
|
||
from sklearn.tree import DecisionTreeRegressor
|
||
from sklearn.neural_network import MLPRegressor
|
||
from joblib import parallel_backend
|
||
# 第三方模型导入
|
||
# try:
|
||
# import lightgbm as lgb
|
||
# LGB_AVAILABLE = True
|
||
# except ImportError:
|
||
# LGB_AVAILABLE = False
|
||
LGB_AVAILABLE = False # 注释掉lightgbm
|
||
|
||
# try:
|
||
# import catboost as cb
|
||
# CB_AVAILABLE = True
|
||
# except ImportError:
|
||
# CB_AVAILABLE = False
|
||
CB_AVAILABLE = False # 注释掉catboost
|
||
|
||
# 导入预处理模块
|
||
# 动态导入预处理模块
|
||
import sys
|
||
import os
|
||
|
||
from src.preprocessing.spectral_Preprocessing import Preprocessing
|
||
|
||
|
||
class WaterQualityModelingBatch:
|
||
"""水质参数反演批量建模类"""
|
||
|
||
def __init__(self, artifacts_dir: str = "models/artifacts"):
|
||
"""
|
||
初始化批量建模类
|
||
|
||
Args:
|
||
artifacts_dir: 模型保存目录
|
||
"""
|
||
self.artifacts_dir = Path(artifacts_dir)
|
||
self.artifacts_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 定义支持的回归模型及其参数网格
|
||
self.model_configs = {
|
||
'SVR': {
|
||
'model': SVR,
|
||
'params': {
|
||
'C': [0.1, 1, 10, 100],
|
||
'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
|
||
'kernel': ['rbf', 'poly', 'sigmoid'],
|
||
'epsilon': [0.01, 0.1, 0.2]
|
||
},
|
||
'available': True
|
||
},
|
||
'RF': {
|
||
'model': RandomForestRegressor,
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'max_depth': [None, 10, 20, 30],
|
||
'min_samples_split': [2, 5, 10],
|
||
'min_samples_leaf': [1, 2, 4]
|
||
},
|
||
'available': True
|
||
},
|
||
'KNN': {
|
||
'model': KNeighborsRegressor,
|
||
'params': {
|
||
'n_neighbors': [3, 5, 7, 9, 11],
|
||
'weights': ['uniform', 'distance'],
|
||
'metric': ['euclidean', 'manhattan', 'minkowski']
|
||
},
|
||
'available': True
|
||
},
|
||
'LinearRegression': {
|
||
'model': LinearRegression,
|
||
'params': {
|
||
'fit_intercept': [True, False]
|
||
},
|
||
'available': True
|
||
},
|
||
'Ridge': {
|
||
'model': Ridge,
|
||
'params': {
|
||
'alpha': [0.01, 0.1, 1, 10, 100],
|
||
'fit_intercept': [True, False]
|
||
},
|
||
'available': True
|
||
},
|
||
'Lasso': {
|
||
'model': Lasso,
|
||
'params': {
|
||
'alpha': [0.01, 0.1, 1, 10, 100],
|
||
'fit_intercept': [True, False],
|
||
'max_iter': [1000, 2000]
|
||
},
|
||
'available': True
|
||
},
|
||
'ElasticNet': {
|
||
'model': ElasticNet,
|
||
'params': {
|
||
'alpha': [0.01, 0.1, 1, 10],
|
||
'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
|
||
'fit_intercept': [True, False],
|
||
'max_iter': [1000, 2000]
|
||
},
|
||
'available': True
|
||
},
|
||
'XGBoost': {
|
||
'model': None, # xgboost is removed, so set to None
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'max_depth': [3, 6, 9],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'subsample': [0.8, 0.9, 1.0]
|
||
},
|
||
'available': False
|
||
},
|
||
'LightGBM': {
|
||
'model': lgb.LGBMRegressor if LGB_AVAILABLE else None,
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'max_depth': [3, 6, 9],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'num_leaves': [31, 50, 100]
|
||
},
|
||
'available': LGB_AVAILABLE
|
||
},
|
||
'CatBoost': {
|
||
'model': cb.CatBoostRegressor if CB_AVAILABLE else None,
|
||
'params': {
|
||
'iterations': [50, 100, 200],
|
||
'depth': [3, 6, 9],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'l2_leaf_reg': [1, 3, 5]
|
||
},
|
||
'available': CB_AVAILABLE
|
||
},
|
||
'PLS': {
|
||
'model': PLSRegression,
|
||
'params': {
|
||
'n_components': [2, 3, 5, 7, 10]
|
||
},
|
||
'available': True
|
||
},
|
||
'GradientBoosting': {
|
||
'model': GradientBoostingRegressor,
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'max_depth': [3, 5, 7],
|
||
'subsample': [0.8, 0.9, 1.0],
|
||
'min_samples_split': [2, 5, 10],
|
||
'min_samples_leaf': [1, 2, 4]
|
||
},
|
||
'available': True
|
||
},
|
||
'AdaBoost': {
|
||
'model': AdaBoostRegressor,
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'learning_rate': [0.01, 0.1, 0.2],
|
||
'loss': ['linear', 'square', 'exponential']
|
||
},
|
||
'available': True
|
||
},
|
||
'DecisionTree': {
|
||
'model': DecisionTreeRegressor,
|
||
'params': {
|
||
'max_depth': [None, 5, 10, 20, 30],
|
||
'min_samples_split': [2, 5, 10],
|
||
'min_samples_leaf': [1, 2, 4],
|
||
'max_features': ['auto', 'sqrt', 'log2']
|
||
},
|
||
'available': True
|
||
},
|
||
'MLP': {
|
||
'model': MLPRegressor,
|
||
'params': {
|
||
'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
|
||
'activation': ['relu', 'tanh', 'logistic'],
|
||
'solver': ['adam', 'sgd'],
|
||
'alpha': [0.0001, 0.001, 0.01],
|
||
'learning_rate': ['constant', 'invscaling', 'adaptive'],
|
||
'max_iter': [1000, 2000]
|
||
},
|
||
'available': True
|
||
},
|
||
'ExtraTrees': {
|
||
'model': ExtraTreesRegressor,
|
||
'params': {
|
||
'n_estimators': [50, 100, 200],
|
||
'max_depth': [None, 10, 20, 30],
|
||
'min_samples_split': [2, 5, 10],
|
||
'min_samples_leaf': [1, 2, 4],
|
||
'max_features': ['auto', 'sqrt', 'log2']
|
||
},
|
||
'available': True
|
||
}
|
||
}
|
||
|
||
# 预处理方法列表
|
||
self.preprocessing_methods = [
|
||
"None", "MMS", "SS", "CT", "SNV", "MA", "SG", "MSC", "D1", "D2", "DT", "WVAE"
|
||
]
|
||
|
||
# 样本划分方法列表
|
||
self.split_methods = ["random", "spxy", "ks"]
|
||
|
||
self.results = {}
|
||
self.best_models = {}
|
||
|
||
def load_data_batch(self, csv_path: str, feature_start_column: Union[int, str]) -> Tuple[pd.DataFrame, Dict[str, pd.Series]]:
|
||
"""
|
||
批量加载CSV数据,将指定列之前的列作为目标值
|
||
|
||
Args:
|
||
csv_path: CSV文件路径
|
||
feature_start_column: 特征开始列索引(int)或列名(str)
|
||
|
||
Returns:
|
||
X: 特征数据
|
||
y_dict: 目标值数据字典,键为列名
|
||
"""
|
||
# 读取CSV数据,处理空字符串和缺失值
|
||
try:
|
||
data = pd.read_csv(csv_path, na_values=['', ' ', 'NaN', 'nan', 'NULL', 'null'])
|
||
except pd.errors.EmptyDataError:
|
||
raise ValueError(f"CSV文件 '{csv_path}' 为空或不存在")
|
||
except Exception as e:
|
||
raise ValueError(f"读取CSV文件 '{csv_path}' 时出错: {e}")
|
||
|
||
# 检查并清理数据中的空字符串和其他无效值
|
||
print("数据清理...")
|
||
original_shape = data.shape
|
||
|
||
# 将空字符串替换为NaN
|
||
data = data.replace(r'^\s*$', np.nan, regex=True)
|
||
|
||
# 对于数值列,将无法转换为数字的字符串替换为NaN
|
||
for col in data.columns:
|
||
try:
|
||
# 尝试将列转换为数值类型
|
||
data[col] = pd.to_numeric(data[col], errors='coerce')
|
||
except Exception:
|
||
# 如果转换失败,保持原样(可能是字符串列)
|
||
pass
|
||
|
||
cleaned_shape = data.shape
|
||
if cleaned_shape != original_shape:
|
||
print(f"数据清理完成: {original_shape[0]}行{original_shape[1]}列 -> {cleaned_shape[0]}行{cleaned_shape[1]}列")
|
||
|
||
print(f"数据加载完成,总列数: {data.shape[1]}")
|
||
print(f"所有列名: {list(data.columns)}")
|
||
|
||
# 如果feature_start_column是列名,转换为索引
|
||
if isinstance(feature_start_column, str):
|
||
if feature_start_column not in data.columns:
|
||
raise ValueError(f"指定的特征开始列 '{feature_start_column}' 不存在于数据中")
|
||
feature_start_index = data.columns.get_loc(feature_start_column)
|
||
print(f"特征开始列 '{feature_start_column}' 对应索引: {feature_start_index}")
|
||
else:
|
||
feature_start_index = feature_start_column
|
||
print(f"特征开始列索引: {feature_start_index}")
|
||
|
||
# 提取特征数据(从feature_start_index开始)
|
||
X = data.iloc[:, feature_start_index:]
|
||
|
||
# 提取所有目标列(从0列到feature_start_index-1列)
|
||
y_dict = {}
|
||
target_columns = data.columns[:feature_start_index]
|
||
|
||
print(f"检测到的目标列: {list(target_columns)}")
|
||
|
||
for col_name in target_columns:
|
||
y_series = data[col_name]
|
||
# 检查是否有非空值
|
||
if not y_series.isna().all():
|
||
y_dict[col_name] = y_series
|
||
print(f" 目标列 '{col_name}': {y_series.count()} 个非空值, 范围: {y_series.min():.4f} ~ {y_series.max():.4f}")
|
||
else:
|
||
print(f" 跳过目标列 '{col_name}': 所有值为空")
|
||
|
||
print(f"特征数据形状: {X.shape}")
|
||
print(f"有效目标列数量: {len(y_dict)}")
|
||
|
||
return X, y_dict
|
||
|
||
def load_data_single(self, csv_path: str, target_column_name: str, feature_start_column: Union[int, str]) -> Tuple[pd.DataFrame, pd.Series]:
|
||
"""
|
||
加载单个目标列的CSV数据
|
||
|
||
Args:
|
||
csv_path: CSV文件路径
|
||
target_column_name: 目标列名
|
||
feature_start_column: 特征开始列索引(int)或列名(str)
|
||
|
||
Returns:
|
||
X: 特征数据
|
||
y: 目标值数据
|
||
"""
|
||
data = pd.read_csv(csv_path)
|
||
|
||
# 检查目标列是否存在
|
||
if target_column_name not in data.columns:
|
||
raise ValueError(f"目标列 '{target_column_name}' 不存在于数据中")
|
||
|
||
# 如果feature_start_column是列名,转换为索引
|
||
if isinstance(feature_start_column, str):
|
||
if feature_start_column not in data.columns:
|
||
raise ValueError(f"指定的特征开始列 '{feature_start_column}' 不存在于数据中")
|
||
feature_start_index = data.columns.get_loc(feature_start_column)
|
||
else:
|
||
feature_start_index = feature_start_column
|
||
|
||
# 提取目标值和特征
|
||
y = data[target_column_name]
|
||
X = data.iloc[:, feature_start_index:]
|
||
|
||
# 去除y值为空的行
|
||
mask = ~y.isna()
|
||
data_cleaned = data[mask]
|
||
|
||
# 重新定义y和X,去除对应的空值行
|
||
y = data_cleaned[target_column_name]
|
||
X = data_cleaned.iloc[:, feature_start_column:]
|
||
|
||
print(f"目标列 '{target_column_name}' 数据加载完成:")
|
||
print(f" 样本数量: {X.shape[0]}")
|
||
print(f" 特征数量: {X.shape[1]}")
|
||
print(f" 目标值范围: {y.min():.4f} ~ {y.max():.4f}")
|
||
print(f" 目标值均值: {y.mean():.4f}")
|
||
|
||
return X, y
|
||
|
||
def preprocess_data(self, X: pd.DataFrame, method: str) -> np.ndarray:
|
||
"""
|
||
数据预处理
|
||
|
||
Args:
|
||
X: 原始特征数据
|
||
method: 预处理方法
|
||
|
||
Returns:
|
||
预处理后的数据
|
||
"""
|
||
print(f"应用预处理方法: {method}")
|
||
|
||
# 如果方法为None,直接返回原始数据
|
||
if method == "None" or method is None:
|
||
print("跳过预处理,使用原始数据")
|
||
return X.values
|
||
|
||
try:
|
||
X_processed = Preprocessing(method, X)
|
||
|
||
# 确保返回的是numpy数组
|
||
if isinstance(X_processed, pd.DataFrame):
|
||
X_processed = X_processed.values
|
||
|
||
print(f"预处理完成,数据形状: {X_processed.shape}")
|
||
return X_processed
|
||
|
||
except Exception as e:
|
||
print(f"预处理失败: {e}")
|
||
print("使用原始数据")
|
||
return X.values
|
||
|
||
def random(self, data, label, test_ratio=0.2, random_state=123):
|
||
"""
|
||
随机划分数据集
|
||
|
||
Args:
|
||
data: shape (n_samples, n_features)
|
||
label: shape (n_sample, )
|
||
test_ratio: 测试集比例,默认: 0.2
|
||
random_state: 随机种子,默认: 123
|
||
|
||
Returns:
|
||
X_train: (n_samples, n_features)
|
||
X_test: (n_samples, n_features)
|
||
y_train: (n_sample, )
|
||
y_test: (n_sample, )
|
||
"""
|
||
X_train, X_test, y_train, y_test = train_test_split(
|
||
data, label, test_size=test_ratio, random_state=random_state
|
||
)
|
||
return X_train, X_test, y_train, y_test
|
||
|
||
def spxy(self, data, label, test_size=0.2):
|
||
"""
|
||
SPXY算法划分数据集(考虑X和Y空间的距离)
|
||
|
||
Args:
|
||
data: shape (n_samples, n_features)
|
||
label: shape (n_samples, )
|
||
test_size: 测试集比例,默认: 0.2
|
||
|
||
Returns:
|
||
X_train: (n_samples, n_features)
|
||
X_test: (n_samples, n_features)
|
||
y_train: (n_samples, )
|
||
y_test: (n_samples, )
|
||
"""
|
||
# 确保 data 和 label 是 NumPy 数组
|
||
data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
|
||
label = label.to_numpy() if isinstance(label, pd.Series) else label
|
||
|
||
# 备份原始数据和标签
|
||
x_backup = data
|
||
y_backup = label
|
||
|
||
M = data.shape[0]
|
||
N = round((1 - test_size) * M)
|
||
samples = np.arange(M)
|
||
|
||
# 归一化标签数据
|
||
label = (label - np.mean(label)) / np.std(label)
|
||
D = np.zeros((M, M))
|
||
Dy = np.zeros((M, M))
|
||
|
||
# 计算样本之间的距离
|
||
for i in range(M - 1):
|
||
xa = data[i, :]
|
||
ya = label[i]
|
||
for j in range((i + 1), M):
|
||
xb = data[j, :]
|
||
yb = label[j]
|
||
D[i, j] = np.linalg.norm(xa - xb)
|
||
Dy[i, j] = np.linalg.norm(ya - yb)
|
||
|
||
# 距离归一化
|
||
Dmax = np.max(D)
|
||
Dymax = np.max(Dy)
|
||
D = D / Dmax + Dy / Dymax
|
||
|
||
# 找到最远的两个点
|
||
maxD = D.max(axis=0)
|
||
index_row = D.argmax(axis=0)
|
||
index_column = maxD.argmax()
|
||
|
||
m = np.zeros(N, dtype=int)
|
||
m[0] = index_row[index_column]
|
||
m[1] = index_column
|
||
|
||
dminmax = np.zeros(N)
|
||
dminmax[1] = D[m[0], m[1]]
|
||
|
||
# 根据距离选择训练集
|
||
for i in range(2, N):
|
||
pool = np.delete(samples, m[:i])
|
||
dmin = np.zeros(M - i)
|
||
for j in range(M - i):
|
||
indexa = pool[j]
|
||
d = np.zeros(i)
|
||
for k in range(i):
|
||
indexb = m[k]
|
||
if indexa < indexb:
|
||
d[k] = D[indexa, indexb]
|
||
else:
|
||
d[k] = D[indexb, indexa]
|
||
dmin[j] = np.min(d)
|
||
dminmax[i] = np.max(dmin)
|
||
index = np.argmax(dmin)
|
||
m[i] = pool[index]
|
||
|
||
m_complement = np.delete(samples, m)
|
||
|
||
# 划分训练集和测试集
|
||
X_train = data[m, :]
|
||
y_train = y_backup[m]
|
||
X_test = data[m_complement, :]
|
||
y_test = y_backup[m_complement]
|
||
|
||
return X_train, X_test, y_train, y_test
|
||
|
||
def ks(self, data, label, test_size=0.2):
|
||
"""
|
||
Kennard-Stone算法划分数据集
|
||
|
||
Args:
|
||
data: shape (n_samples, n_features)
|
||
label: shape (n_sample, )
|
||
test_size: 测试集比例,默认: 0.2
|
||
|
||
Returns:
|
||
X_train: (n_samples, n_features)
|
||
X_test: (n_samples, n_features)
|
||
y_train: (n_samples, )
|
||
y_test: (n_samples, )
|
||
"""
|
||
# 确保 data 和 label 是 NumPy 数组
|
||
data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
|
||
label = label.to_numpy() if isinstance(label, pd.Series) else label
|
||
|
||
M = data.shape[0]
|
||
N = round((1 - test_size) * M)
|
||
samples = np.arange(M)
|
||
|
||
D = np.zeros((M, M))
|
||
|
||
for i in range((M - 1)):
|
||
xa = data[i, :]
|
||
for j in range((i + 1), M):
|
||
xb = data[j, :]
|
||
D[i, j] = np.linalg.norm(xa - xb)
|
||
|
||
maxD = np.max(D, axis=0)
|
||
index_row = np.argmax(D, axis=0)
|
||
index_column = np.argmax(maxD)
|
||
|
||
m = np.zeros(N)
|
||
m[0] = np.array(index_row[index_column])
|
||
m[1] = np.array(index_column)
|
||
m = m.astype(int)
|
||
dminmax = np.zeros(N)
|
||
dminmax[1] = D[m[0], m[1]]
|
||
|
||
for i in range(2, N):
|
||
pool = np.delete(samples, m[:i])
|
||
dmin = np.zeros((M - i))
|
||
for j in range((M - i)):
|
||
indexa = pool[j]
|
||
d = np.zeros(i)
|
||
for k in range(i):
|
||
indexb = m[k]
|
||
if indexa < indexb:
|
||
d[k] = D[indexa, indexb]
|
||
else:
|
||
d[k] = D[indexb, indexa]
|
||
dmin[j] = np.min(d)
|
||
dminmax[i] = np.max(dmin)
|
||
index = np.argmax(dmin)
|
||
m[i] = pool[index]
|
||
|
||
m_complement = np.delete(np.arange(data.shape[0]), m)
|
||
|
||
X_train = data[m, :]
|
||
y_train = label[m]
|
||
X_test = data[m_complement, :]
|
||
y_test = label[m_complement]
|
||
|
||
return X_train, X_test, y_train, y_test
|
||
|
||
def split_data(self, X: np.ndarray, y: pd.Series, method: str = "random",
|
||
test_size: float = 0.2, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
|
||
"""
|
||
根据指定方法划分数据集
|
||
|
||
Args:
|
||
X: 特征数据
|
||
y: 目标值数据
|
||
method: 划分方法 ("random", "spxy", "ks")
|
||
test_size: 测试集比例
|
||
random_state: 随机种子(仅对random方法有效)
|
||
|
||
Returns:
|
||
X_train, X_test, y_train, y_test
|
||
"""
|
||
print(f"使用 {method} 方法划分数据集")
|
||
|
||
if method == "random":
|
||
return self.random(X, y, test_ratio=test_size, random_state=random_state)
|
||
elif method == "spxy":
|
||
return self.spxy(X, y, test_size=test_size)
|
||
elif method == "ks":
|
||
return self.ks(X, y, test_size=test_size)
|
||
else:
|
||
raise ValueError(f"不支持的划分方法: {method}. 支持的方法: {self.split_methods}")
|
||
|
||
def train_single_model(self, X: np.ndarray, y: pd.Series, model_name: str,
|
||
cv_folds: int = 5, scoring: str = 'neg_mean_squared_error',
|
||
test_size: float = 0.2, random_state: int = 42,
|
||
split_method: str = "random") -> Dict:
|
||
"""
|
||
训练单个回归模型
|
||
|
||
Args:
|
||
X: 特征数据
|
||
y: 目标值数据
|
||
model_name: 模型名称
|
||
cv_folds: 交叉验证折数
|
||
scoring: 评分指标
|
||
test_size: 测试集比例
|
||
random_state: 随机种子
|
||
split_method: 数据划分方法
|
||
|
||
Returns:
|
||
训练结果字典
|
||
"""
|
||
if model_name not in self.model_configs:
|
||
raise ValueError(f"不支持的模型: {model_name}")
|
||
|
||
config = self.model_configs[model_name]
|
||
|
||
if not config['available']:
|
||
print(f"模型 {model_name} 不可用,请安装相应的库")
|
||
return None
|
||
|
||
print(f"开始训练模型: {model_name}")
|
||
|
||
# 使用指定方法分割训练集和测试集
|
||
X_train, X_test, y_train, y_test = self.split_data(
|
||
X, y, method=split_method, test_size=test_size, random_state=random_state
|
||
)
|
||
|
||
print(f"数据分割完成:")
|
||
print(f" 训练集样本数: {X_train.shape[0]}")
|
||
print(f" 测试集样本数: {X_test.shape[0]}")
|
||
|
||
# 创建模型实例
|
||
if callable(config['model']):
|
||
base_model = config['model']()
|
||
else:
|
||
base_model = config['model']
|
||
|
||
# 特殊处理某些模型
|
||
if model_name == 'CatBoost':
|
||
base_model.set_params(verbose=False)
|
||
elif model_name == 'LightGBM':
|
||
base_model.set_params(verbose=-1)
|
||
|
||
# 网格搜索 - 使用KFold代替StratifiedKFold
|
||
cv_strategy = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
|
||
|
||
grid_search = GridSearchCV(
|
||
base_model,
|
||
config['params'],
|
||
cv=cv_strategy,
|
||
scoring=scoring,
|
||
n_jobs=-1,
|
||
verbose=1
|
||
)
|
||
|
||
# 在训练集上训练模型
|
||
# with parallel_backend("threading", n_jobs=-1):
|
||
# grid_search.fit(X_train, y_train)
|
||
grid_search.fit(X_train, y_train)
|
||
|
||
# 获取最佳模型
|
||
best_model = grid_search.best_estimator_
|
||
|
||
# 交叉验证评估(在训练集上)
|
||
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv_strategy, scoring=scoring)
|
||
|
||
# 计算训练集上的回归指标
|
||
y_train_pred = best_model.predict(X_train)
|
||
train_mse = mean_squared_error(y_train, y_train_pred)
|
||
train_mae = mean_absolute_error(y_train, y_train_pred)
|
||
train_r2 = r2_score(y_train, y_train_pred)
|
||
train_rmse = np.sqrt(train_mse)
|
||
|
||
# 计算测试集上的回归指标
|
||
y_test_pred = best_model.predict(X_test)
|
||
test_mse = mean_squared_error(y_test, y_test_pred)
|
||
test_mae = mean_absolute_error(y_test, y_test_pred)
|
||
test_r2 = r2_score(y_test, y_test_pred)
|
||
test_rmse = np.sqrt(test_mse)
|
||
|
||
result = {
|
||
'model': best_model,
|
||
'best_params': grid_search.best_params_,
|
||
'best_score': grid_search.best_score_,
|
||
'cv_mean': cv_scores.mean(),
|
||
'cv_std': cv_scores.std(),
|
||
'cv_scores': cv_scores,
|
||
# 训练集指标
|
||
'train_mse': train_mse,
|
||
'train_mae': train_mae,
|
||
'train_rmse': train_rmse,
|
||
'train_r2': train_r2,
|
||
# 测试集指标
|
||
'test_mse': test_mse,
|
||
'test_mae': test_mae,
|
||
'test_rmse': test_rmse,
|
||
'test_r2': test_r2,
|
||
# 数据分割信息
|
||
'train_size': X_train.shape[0],
|
||
'test_size': X_test.shape[0],
|
||
'split_method': split_method
|
||
}
|
||
|
||
print(f"模型 {model_name} 训练完成:")
|
||
print(f" 最佳参数: {result['best_params']}")
|
||
print(f" 最佳得分: {result['best_score']:.4f}")
|
||
print(f" CV均值: {result['cv_mean']:.4f} ± {result['cv_std']:.4f}")
|
||
print(f" 训练集指标:")
|
||
print(f" R²: {result['train_r2']:.4f}")
|
||
print(f" RMSE: {result['train_rmse']:.4f}")
|
||
print(f" MAE: {result['train_mae']:.4f}")
|
||
print(f" 测试集指标:")
|
||
print(f" R²: {result['test_r2']:.4f}")
|
||
print(f" RMSE: {result['test_rmse']:.4f}")
|
||
print(f" MAE: {result['test_mae']:.4f}")
|
||
|
||
return result
|
||
|
||
def save_model(self, model, target_column_name: str, preprocess_method: str, model_name: str,
|
||
metadata: Dict = None):
|
||
"""
|
||
保存模型,使用目标列名作为文件名的一部分
|
||
|
||
Args:
|
||
model: 训练好的模型
|
||
target_column_name: 目标列名
|
||
preprocess_method: 预处理方法名称
|
||
model_name: 模型名称
|
||
metadata: 模型元数据
|
||
"""
|
||
# 清理目标列名,移除可能的特殊字符
|
||
safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
|
||
|
||
filename = f"{safe_target_name}_{preprocess_method}_{model_name}.joblib"
|
||
filepath = self.artifacts_dir / filename
|
||
|
||
# 保存模型和元数据
|
||
save_data = {
|
||
'model': model,
|
||
'target_column_name': target_column_name,
|
||
'preprocess_method': preprocess_method,
|
||
'model_name': model_name,
|
||
'metadata': metadata or {}
|
||
}
|
||
|
||
joblib.dump(save_data, filepath)
|
||
print(f"模型已保存: {filepath}")
|
||
|
||
def train_models_batch(self, csv_path: str, feature_start_column: Union[int, str],
|
||
preprocessing_methods: Union[str, List[str]] = "None",
|
||
model_names: Union[str, List[str]] = "RF",
|
||
split_methods: Union[str, List[str]] = "random",
|
||
cv_folds: int = 5,
|
||
scoring: str = 'neg_mean_squared_error',
|
||
test_size: float = 0.2,
|
||
random_state: int = 42) -> Dict:
|
||
"""
|
||
批量训练多个目标列的模型
|
||
|
||
Args:
|
||
csv_path: 数据文件路径
|
||
feature_start_column: 特征开始列索引(int)或列名(str)
|
||
preprocessing_methods: 预处理方法列表
|
||
model_names: 模型名称列表
|
||
split_methods: 数据划分方法列表
|
||
cv_folds: 交叉验证折数
|
||
scoring: 评分指标(回归指标)
|
||
test_size: 测试集比例
|
||
random_state: 随机种子
|
||
|
||
Returns:
|
||
所有模型的训练结果
|
||
"""
|
||
# 转换为列表
|
||
if isinstance(preprocessing_methods, str):
|
||
preprocessing_methods = [preprocessing_methods]
|
||
if isinstance(model_names, str):
|
||
model_names = [model_names]
|
||
if isinstance(split_methods, str):
|
||
split_methods = [split_methods]
|
||
|
||
# 加载数据
|
||
X_raw, y_dict = self.load_data_batch(csv_path, feature_start_column)
|
||
|
||
all_results = {}
|
||
|
||
# 对每个目标列进行训练
|
||
for target_column_name, y in y_dict.items():
|
||
print(f"\n{'='*80}")
|
||
print(f"开始训练目标列: {target_column_name}")
|
||
print(f"{'='*80}")
|
||
|
||
# 创建该目标列的子目录
|
||
target_artifacts_dir = self.artifacts_dir / target_column_name
|
||
target_artifacts_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 临时更改artifacts_dir
|
||
original_artifacts_dir = self.artifacts_dir
|
||
self.artifacts_dir = target_artifacts_dir
|
||
|
||
try:
|
||
# 去除该目标列的空值
|
||
mask = ~y.isna()
|
||
if mask.sum() == 0:
|
||
print(f"目标列 '{target_column_name}' 无有效数据,跳过")
|
||
continue
|
||
|
||
X_clean = X_raw[mask]
|
||
y_clean = y[mask]
|
||
|
||
print(f"有效样本数: {len(y_clean)}")
|
||
|
||
# 训练该目标列的所有模型组合
|
||
target_results = self.train_models_single_target(
|
||
X_clean, y_clean, target_column_name,
|
||
preprocessing_methods, model_names, split_methods,
|
||
cv_folds, scoring, test_size, random_state
|
||
)
|
||
|
||
all_results[target_column_name] = target_results
|
||
|
||
except Exception as e:
|
||
print(f"训练目标列 '{target_column_name}' 时出错: {e}")
|
||
continue
|
||
finally:
|
||
# 恢复原始artifacts_dir
|
||
self.artifacts_dir = original_artifacts_dir
|
||
|
||
# 保存所有结果的汇总
|
||
self._save_batch_results_summary(all_results)
|
||
|
||
return all_results
|
||
|
||
def train_models_single_target(self, X_raw: pd.DataFrame, y: pd.Series, target_column_name: str,
|
||
preprocessing_methods: List[str], model_names: List[str],
|
||
split_methods: List[str], cv_folds: int, scoring: str,
|
||
test_size: float, random_state: int) -> Dict:
|
||
"""
|
||
训练单个目标列的所有模型组合
|
||
"""
|
||
results = {}
|
||
|
||
# 遍历所有组合
|
||
for split_method in split_methods:
|
||
for preprocess_method in preprocessing_methods:
|
||
for model_name in model_names:
|
||
combo_key = f"{split_method}_{preprocess_method}_{model_name}"
|
||
print(f"\n{'-' * 60}")
|
||
print(f"训练组合: {combo_key}")
|
||
print(f"{'-' * 60}")
|
||
|
||
try:
|
||
# 数据预处理
|
||
X_processed = self.preprocess_data(X_raw, preprocess_method)
|
||
|
||
# 训练模型
|
||
result = self.train_single_model(X_processed, y, model_name,
|
||
cv_folds, scoring, test_size, random_state, split_method)
|
||
|
||
if result is not None:
|
||
# 保存模型
|
||
metadata = {
|
||
'target_column_name': target_column_name,
|
||
'cv_mean': result['cv_mean'],
|
||
'cv_std': result['cv_std'],
|
||
'best_params': result['best_params'],
|
||
'data_shape': X_processed.shape,
|
||
'target_range': [float(y.min()), float(y.max())],
|
||
'train_r2': result['train_r2'],
|
||
'train_rmse': result['train_rmse'],
|
||
'train_mae': result['train_mae'],
|
||
'test_r2': result['test_r2'],
|
||
'test_rmse': result['test_rmse'],
|
||
'test_mae': result['test_mae'],
|
||
'train_size': result['train_size'],
|
||
'test_size': result['test_size'],
|
||
'split_method': result['split_method']
|
||
}
|
||
|
||
self.save_model(result['model'], target_column_name,
|
||
f"{split_method}_{preprocess_method}",
|
||
model_name, metadata)
|
||
|
||
results[combo_key] = result
|
||
|
||
except Exception as e:
|
||
print(f"训练组合 {combo_key} 失败: {e}")
|
||
continue
|
||
|
||
# 保存该目标列的结果摘要
|
||
self._save_single_target_results_summary(target_column_name, results)
|
||
|
||
return results
|
||
|
||
def _save_single_target_results_summary(self, target_column_name: str, results: Dict):
|
||
"""保存单个目标列的结果摘要"""
|
||
if not results:
|
||
print(f"目标列 '{target_column_name}' 没有训练结果")
|
||
return
|
||
|
||
summary_data = []
|
||
|
||
for combo_key, result in results.items():
|
||
# 分离划分方法、预处理方法和建模方法
|
||
parts = combo_key.split('_', 2)
|
||
split_method = parts[0] if len(parts) > 0 else ''
|
||
preprocess_method = parts[1] if len(parts) > 1 else ''
|
||
model_method = parts[2] if len(parts) > 2 else ''
|
||
|
||
summary_data.append({
|
||
'划分方法': split_method,
|
||
'预处理方法': preprocess_method,
|
||
'建模方法': model_method,
|
||
'CV均值': result['cv_mean'],
|
||
'CV标准差': result['cv_std'],
|
||
'最佳得分': result['best_score'],
|
||
'训练集R²': result['train_r2'],
|
||
'训练集RMSE': result['train_rmse'],
|
||
'训练集MAE': result['train_mae'],
|
||
'训练集MSE': result['train_mse'],
|
||
'测试集R²': result['test_r2'],
|
||
'测试集RMSE': result['test_rmse'],
|
||
'测试集MAE': result['test_mae'],
|
||
'测试集MSE': result['test_mse'],
|
||
'训练样本数': result['train_size'],
|
||
'测试样本数': result['test_size'],
|
||
'最佳参数': str(result['best_params'])
|
||
})
|
||
|
||
summary_df = pd.DataFrame(summary_data)
|
||
# 按测试集R²降序排列(R²越大越好)
|
||
summary_df = summary_df.sort_values('测试集R²', ascending=False)
|
||
|
||
# 清理目标列名,移除可能的特殊字符
|
||
safe_target_name = "".join(c for c in target_column_name if c.isalnum() or c in ('-', '_')).rstrip()
|
||
|
||
# 保存详细结果CSV(中文版)
|
||
detailed_path = self.artifacts_dir / f"{safe_target_name}_detailed_results.csv"
|
||
summary_df.to_csv(detailed_path, index=False, encoding='utf-8-sig')
|
||
|
||
# 保存简化版本用于兼容性(英文版)
|
||
summary_data_simple = []
|
||
for combo_key, result in results.items():
|
||
summary_data_simple.append({
|
||
'combination': combo_key,
|
||
'cv_mean': result['cv_mean'],
|
||
'cv_std': result['cv_std'],
|
||
'best_score': result['best_score'],
|
||
'train_r2': result['train_r2'],
|
||
'train_rmse': result['train_rmse'],
|
||
'train_mae': result['train_mae'],
|
||
'test_r2': result['test_r2'],
|
||
'test_rmse': result['test_rmse'],
|
||
'test_mae': result['test_mae'],
|
||
'train_size': result['train_size'],
|
||
'test_size': result['test_size'],
|
||
'split_method': result.get('split_method', 'unknown'),
|
||
'best_params': str(result['best_params'])
|
||
})
|
||
|
||
summary_df_simple = pd.DataFrame(summary_data_simple)
|
||
summary_df_simple = summary_df_simple.sort_values('test_r2', ascending=False)
|
||
simple_summary_path = self.artifacts_dir / f"{safe_target_name}_training_summary.csv"
|
||
summary_df_simple.to_csv(simple_summary_path, index=False)
|
||
|
||
print(f"\n{'-' * 60}")
|
||
print(f"目标列 '{target_column_name}' 训练结果摘要:")
|
||
print(f"{'-' * 60}")
|
||
print(summary_df[
|
||
['划分方法', '预处理方法', '建模方法', '训练集R²', '测试集R²', '训练集RMSE', '测试集RMSE', 'CV均值']].to_string(
|
||
index=False))
|
||
print(f"\n详细结果已保存: {detailed_path}")
|
||
print(f"简化结果已保存: {simple_summary_path}")
|
||
|
||
def _save_batch_results_summary(self, all_results: Dict):
|
||
"""保存批量训练结果汇总"""
|
||
all_summary_data = []
|
||
|
||
for target_column_name, target_results in all_results.items():
|
||
for combo_key, result in target_results.items():
|
||
# 分离划分方法、预处理方法和建模方法
|
||
parts = combo_key.split('_', 2)
|
||
split_method = parts[0] if len(parts) > 0 else ''
|
||
preprocess_method = parts[1] if len(parts) > 1 else ''
|
||
model_method = parts[2] if len(parts) > 2 else ''
|
||
|
||
all_summary_data.append({
|
||
'目标列': target_column_name,
|
||
'划分方法': split_method,
|
||
'预处理方法': preprocess_method,
|
||
'建模方法': model_method,
|
||
'CV均值': result['cv_mean'],
|
||
'CV标准差': result['cv_std'],
|
||
'最佳得分': result['best_score'],
|
||
'训练集R²': result['train_r2'],
|
||
'训练集RMSE': result['train_rmse'],
|
||
'训练集MAE': result['train_mae'],
|
||
'训练集MSE': result['train_mse'],
|
||
'测试集R²': result['test_r2'],
|
||
'测试集RMSE': result['test_rmse'],
|
||
'测试集MAE': result['test_mae'],
|
||
'测试集MSE': result['test_mse'],
|
||
'训练样本数': result['train_size'],
|
||
'测试样本数': result['test_size'],
|
||
'最佳参数': str(result['best_params'])
|
||
})
|
||
|
||
if all_summary_data:
|
||
summary_df = pd.DataFrame(all_summary_data)
|
||
# 按目标列和测试集R²排序
|
||
summary_df = summary_df.sort_values(['目标列', '测试集R²'], ascending=[True, False])
|
||
|
||
# 保存详细结果CSV(中文版)
|
||
detailed_path = self.artifacts_dir / "batch_detailed_results.csv"
|
||
summary_df.to_csv(detailed_path, index=False, encoding='utf-8-sig')
|
||
|
||
# 保持原有的批量训练汇总结果(中文版)
|
||
batch_summary_path = self.artifacts_dir / "batch_training_summary.csv"
|
||
summary_df.to_csv(batch_summary_path, index=False, encoding='utf-8-sig')
|
||
|
||
# 创建简化版本用于兼容性(英文版)
|
||
all_summary_data_simple = []
|
||
for target_column_name, target_results in all_results.items():
|
||
for combo_key, result in target_results.items():
|
||
all_summary_data_simple.append({
|
||
'target_column': target_column_name,
|
||
'combination': combo_key,
|
||
'cv_mean': result['cv_mean'],
|
||
'cv_std': result['cv_std'],
|
||
'best_score': result['best_score'],
|
||
'train_r2': result['train_r2'],
|
||
'train_rmse': result['train_rmse'],
|
||
'train_mae': result['train_mae'],
|
||
'test_r2': result['test_r2'],
|
||
'test_rmse': result['test_rmse'],
|
||
'test_mae': result['test_mae'],
|
||
'train_size': result['train_size'],
|
||
'test_size': result['test_size'],
|
||
'split_method': result.get('split_method', 'unknown'),
|
||
'best_params': str(result['best_params'])
|
||
})
|
||
|
||
summary_df_simple = pd.DataFrame(all_summary_data_simple)
|
||
summary_df_simple = summary_df_simple.sort_values(['target_column', 'test_r2'], ascending=[True, False])
|
||
simple_summary_path = self.artifacts_dir / "batch_training_summary_simple.csv"
|
||
summary_df_simple.to_csv(simple_summary_path, index=False)
|
||
|
||
print(f"\n{'='*80}")
|
||
print("批量训练结果汇总:")
|
||
print(f"{'='*80}")
|
||
|
||
# 显示每个目标列的最佳模型
|
||
for target_col in summary_df['目标列'].unique():
|
||
target_data = summary_df[summary_df['目标列'] == target_col]
|
||
best_row = target_data.iloc[0] # 已经按R²降序排列
|
||
print(f"\n目标列 '{target_col}' 最佳模型:")
|
||
print(f" 组合: {best_row['划分方法']}_{best_row['预处理方法']}_{best_row['建模方法']}")
|
||
print(f" 测试集R²: {best_row['测试集R²']:.4f}")
|
||
print(f" 测试集RMSE: {best_row['测试集RMSE']:.4f}")
|
||
print(f" 最佳参数: {best_row['最佳参数']}")
|
||
|
||
print(f"\n详细结果已保存: {detailed_path}")
|
||
print(f"批量训练汇总结果已保存: {batch_summary_path}")
|
||
print(f"简化结果已保存: {simple_summary_path}")
|
||
|
||
def load_model(self, preprocess_method: str, model_name: str):
|
||
"""
|
||
加载保存的模型
|
||
|
||
Args:
|
||
preprocess_method: 预处理方法名称
|
||
model_name: 模型名称
|
||
|
||
Returns:
|
||
加载的模型数据
|
||
"""
|
||
filename = f"{preprocess_method}_{model_name}.joblib"
|
||
filepath = self.artifacts_dir / filename
|
||
|
||
if not filepath.exists():
|
||
raise FileNotFoundError(f"模型文件不存在: {filepath}")
|
||
|
||
return joblib.load(filepath)
|
||
|
||
def get_best_model(self, metric: str = 'test_r2') -> Tuple[str, Dict]:
|
||
"""
|
||
获取最佳模型
|
||
|
||
Args:
|
||
metric: 评估指标(默认使用测试集R²)
|
||
可选:'test_r2', 'train_r2', 'test_rmse', 'test_mae',
|
||
'train_rmse', 'train_mae', 'cv_mean', 'best_score'
|
||
|
||
Returns:
|
||
最佳模型的组合名称和结果
|
||
"""
|
||
if not self.results:
|
||
raise ValueError("没有训练结果,请先训练模型")
|
||
|
||
# 对于回归指标,R²和负MSE需要取最大值,RMSE和MAE需要取最小值
|
||
if metric in ['test_r2', 'train_r2', 'cv_mean', 'best_score']:
|
||
best_combo = max(self.results.keys(),
|
||
key=lambda k: self.results[k][metric])
|
||
else: # rmse, mae等,越小越好
|
||
best_combo = min(self.results.keys(),
|
||
key=lambda k: self.results[k][metric])
|
||
|
||
return best_combo, self.results[best_combo]
|
||
|
||
|
||
def main():
|
||
"""主函数示例 - 批量训练"""
|
||
# 创建批量建模实例
|
||
modeler = WaterQualityModelingBatch(r"D:\BaiduNetdiskDownload\yaobao\model")
|
||
|
||
# 批量训练多个目标列的模型
|
||
all_results = modeler.train_models_batch(
|
||
csv_path=r"D:\BaiduNetdiskDownload\yaobao\csv\yangdian_output.csv",
|
||
feature_start_column="374.285004", # 使用列名指定特征开始位置
|
||
preprocessing_methods=['None', 'MMS', 'SS', 'SNV', 'MA', 'SG', 'MSC', 'D1', 'D2', 'DT', 'CT'],#
|
||
model_names=['SVR', 'RF', 'Ridge', 'Lasso'],#, 'ElasticNet', 'XGBoost', 'LightGBM', 'CatBoost'
|
||
split_methods=['spxy', 'ks','random' ], #
|
||
cv_folds=5
|
||
)
|
||
|
||
print(f"\n批量训练完成,共训练了 {len(all_results)} 个目标列的模型")
|
||
|
||
# 显示每个目标列的最佳模型
|
||
for target_column_name, target_results in all_results.items():
|
||
if target_results:
|
||
best_combo = max(target_results.keys(),
|
||
key=lambda k: target_results[k]['test_r2'])
|
||
best_result = target_results[best_combo]
|
||
|
||
print(f"\n目标列 '{target_column_name}' 最佳模型:")
|
||
print(f" 组合: {best_combo}")
|
||
print(f" 测试集R²: {best_result['test_r2']:.4f}")
|
||
print(f" 测试集RMSE: {best_result['test_rmse']:.4f}")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|