Files
micro_plastic/classification_model/Parallel/test.py
2026-02-25 09:42:51 +08:00

832 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from imblearn.over_sampling import SMOTE
import pandas as pd
from classification_model.DataLoad.DataLoad import SetSplit, LoadNirtest
from classification_model.Preprocessing.Preprocessing import Preprocessing
from classification_model.WaveSelect.WaveSelcet import SpctrumFeatureSelcet
from classification_model.Classification.ClassicCls import (
LogisticRegressionModel, SVM as SVM_Classic, PLS_DA, RF,
XGBoost, LightGBM, CatBoost, AdaBoost, KNN
)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import sklearn.svm as svm
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
import numpy as np
import joblib
import os
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
def cross_validate_model(model, X, y, cv=5, n_jobs=None):
    """
    Run k-fold cross-validation and print the mean/std accuracy.
    :param model: estimator implementing the scikit-learn fit/predict API
    :param X: feature matrix (n_samples, n_features)
    :param y: target labels (n_samples,)
    :param cv: number of folds (default 5)
    :param n_jobs: parallel jobs for scoring (default None, i.e. serial);
        added because train_RF forwards an n_jobs argument here
    :return: array of per-fold scores
    """
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    print(f"Cross-validation accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
    return scores
# ==================== Spectral data-augmentation module ====================
def augment_spectrum(spectrum, noise_level=0.01, offset_range=0.02, multiplier_range=(0.95, 1.05), slope_range=(-0.001, 0.001), random_state=None):
    """
    Augment a single spectrum with random noise, a baseline offset, a
    multiplicative factor and a random linear slope (baseline drift).
    :param spectrum: 1D array of spectrum values
    :param noise_level: std of the Gaussian noise relative to np.std(spectrum),
        default 0.01 (1%)
    :param offset_range: absolute range of the baseline offset, default 0.02
    :param multiplier_range: (min, max) multiplicative factor, default (0.95, 1.05)
    :param slope_range: (min, max) of the linear slope, default (-0.001, 0.001)
    :param random_state: random seed for reproducibility
    :return: augmented spectrum as a 1D numpy array
    """
    # Use a local RandomState instead of np.random.seed so the caller's global
    # RNG state is not clobbered; the draws are identical to legacy seeding.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    spectrum = np.array(spectrum).flatten()
    n_features = len(spectrum)
    # 1. Additive Gaussian noise.
    noise = rng.normal(0, noise_level * np.std(spectrum), n_features)
    augmented = spectrum + noise
    # 2. Constant baseline offset.
    offset = rng.uniform(-offset_range, offset_range)
    augmented = augmented + offset
    # 3. Multiplicative scaling (multiplicative-scatter-like variation).
    multiplier = rng.uniform(multiplier_range[0], multiplier_range[1])
    augmented = augmented * multiplier
    # 4. Linear baseline drift.
    slope = rng.uniform(slope_range[0], slope_range[1])
    x_indices = np.arange(n_features)
    augmented = augmented + slope * x_indices
    return augmented
def augment_dataset(X, y, augmentation_factor=1, noise_level=0.01, offset_range=0.02,
                    multiplier_range=(0.95, 1.05), slope_range=(-0.001, 0.001),
                    random_state=None, preserve_original=True):
    """
    Apply spectral augmentation to every sample of a dataset.
    :param X: feature matrix (n_samples, n_features); sparse input is densified
    :param y: labels (n_samples,)
    :param augmentation_factor: augmented copies generated per sample (default 1)
    :param noise_level: noise level, default 0.01
    :param offset_range: baseline-offset range, default 0.02
    :param multiplier_range: multiplicative-factor range, default (0.95, 1.05)
    :param slope_range: slope range, default (-0.001, 0.001)
    :param random_state: random seed, default None
    :param preserve_original: keep the original samples in the output (default True)
    :return: (X_augmented, y_augmented)
    """
    # Densify sparse matrices, otherwise coerce to ndarray.
    X = X.toarray() if hasattr(X, 'toarray') else np.array(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)
    y = np.array(y).flatten()
    if random_state is not None:
        np.random.seed(random_state)
    rows = []
    labels = []
    for idx in range(X.shape[0]):
        # Work on a flat copy of the current sample.
        sample = np.array(X[idx]).flatten()
        if preserve_original:
            # Originals are stored first, as (1, n_features) rows.
            rows.append(sample.reshape(1, -1))
            labels.append(y[idx])
        for rep in range(augmentation_factor):
            # Derive a distinct, reproducible seed for each augmented copy.
            seed = None if random_state is None else random_state + idx * augmentation_factor + rep
            augmented = augment_spectrum(
                sample,
                noise_level=noise_level,
                offset_range=offset_range,
                multiplier_range=multiplier_range,
                slope_range=slope_range,
                random_state=seed
            )
            rows.append(augmented.reshape(1, -1))
            labels.append(y[idx])
    if not rows:
        # Nothing generated (factor 0 and originals dropped): return input unchanged.
        return X, y
    return np.vstack(rows), np.array(labels)
def augment_dataset_with_params(X, y, augmentation_params=None, random_state=None, preserve_original=True):
    """
    Augment a dataset using a parameter dictionary (a more flexible front-end
    to augment_dataset).
    :param X: feature matrix (n_samples, n_features)
    :param y: labels (n_samples,)
    :param augmentation_params: optional dict with any of the keys
        'augmentation_factor' (default 1), 'noise_level' (default 0.01),
        'offset_range' (default 0.02), 'multiplier_range' (default (0.95, 1.05)),
        'slope_range' (default (-0.001, 0.001)); unknown keys are ignored
    :param random_state: random seed, default None
    :param preserve_original: keep original samples in the output (default True)
    :return: (X_augmented, y_augmented)
    """
    supplied = augmentation_params or {}
    defaults = {
        'augmentation_factor': 1,
        'noise_level': 0.01,
        'offset_range': 0.02,
        'multiplier_range': (0.95, 1.05),
        'slope_range': (-0.001, 0.001),
    }
    # Only known keys are forwarded, matching the original .get() behavior.
    merged = {key: supplied.get(key, default) for key, default in defaults.items()}
    return augment_dataset(
        X, y,
        random_state=random_state,
        preserve_original=preserve_original,
        **merged
    )
# Confusion matrix and classification report
def evaluate_model(y_true, y_pred, dataset_name="Test", title="Confusion Matrix", cmap='Blues'):
    """
    Print a classification report and return weighted performance metrics.
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param dataset_name: dataset name for the report header (e.g. "Train", "Test")
    :param title: chart title (kept for the optional heatmap rendering)
    :param cmap: heatmap colormap (kept for the optional heatmap rendering)
    :return: dict with accuracy, weighted precision/recall/f1 and the confusion matrix
    """
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    # An optional seaborn heatmap of `cm` (using `title`/`cmap`) can be drawn
    # here if visual inspection is needed; it is disabled in batch runs.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm,
    }
    return metrics
# Qualitative spectral analysis
def SpectralQualitativeAnalysis(data, label, ProcessMethods, ProcessMethods2, FslecetedMethods, SetSplitMethods,
                                use_smote=False, use_augmentation=False, augmentation_params=None, random_state=42):
    """
    Qualitative spectral-analysis pipeline: preprocess twice, select features,
    split into train/test, then optionally augment the training set and/or
    rebalance it with SMOTE.
    :param data: input spectra
    :param label: labels
    :param ProcessMethods: first preprocessing method
    :param ProcessMethods2: second preprocessing method
    :param FslecetedMethods: feature-selection method
    :param SetSplitMethods: dataset-split method
    :param use_smote: apply SMOTE to the training set (default False)
    :param use_augmentation: apply spectral augmentation (default False)
    :param augmentation_params: augmentation parameter dict (None = defaults)
    :param random_state: random seed (default 42)
    :return: X_train, X_test, y_train, y_test
    """
    # Two preprocessing passes applied in sequence.
    stage1 = Preprocessing(ProcessMethods, data)
    stage2 = Preprocessing(ProcessMethods2, stage1)
    # Wavelength / feature selection (selected column info is not needed here).
    features, labels, _selected_columns = SpctrumFeatureSelcet(FslecetedMethods, stage2, label)
    # 70/30 train-test split.
    X_train, X_test, y_train, y_test = SetSplit(SetSplitMethods, features, labels, test_size=0.3, randomseed=random_state)
    # Spectral augmentation runs before SMOTE so synthetic minority samples
    # are generated from the augmented pool.
    if use_augmentation:
        print(f"Original training set size: {len(y_train)}")
        X_train, y_train = augment_dataset_with_params(
            X_train, y_train,
            augmentation_params=augmentation_params,
            random_state=random_state,
            preserve_original=True
        )
        print(f"Training set size after augmentation: {len(y_train)}")
    # SMOTE oversampling of minority classes.
    if use_smote:
        X_train, y_train = SMOTE(random_state=random_state).fit_resample(X_train, y_train)
        print("SMOTE applied: Training set size after resampling:", len(y_train))
    return X_train, X_test, y_train, y_test
def Procesed(data, ProcessMethods1, ProcessMethods2, model_path):
    """
    Apply the two-stage preprocessing used at training time to new data.
    :param data: input data (DataFrame or array-like)
    :param ProcessMethods1: first preprocessing method (e.g. 'SS', 'MMS', 'None')
    :param ProcessMethods2: second preprocessing method (e.g. 'SG', 'D1', 'None')
    :param model_path: model path, used to locate scaler_params.pkl when
        ProcessMethods1 == 'SS'
    :return: preprocessed data as a DataFrame
    :raises FileNotFoundError: when 'SS' is requested but no saved scaler exists
    """
    # BUG FIX: the old local `from classification_model.Preprocessing import
    # Preprocessing` shadowed the module-level import of the Preprocessing
    # *function* (see the file header) with the Preprocessing *package module*,
    # making the non-'SS' branches call a module object. The redundant local
    # `import os` is dropped too; both names come from the file-level imports.
    # --- Stage 1 ---
    if ProcessMethods1 == 'SS':
        # 'SS' must reuse the exact scaler fitted at training time.
        model_dir = os.path.dirname(model_path)
        scaler_path = os.path.join(model_dir, 'scaler_params.pkl')
        if not os.path.exists(scaler_path):
            raise FileNotFoundError(f"Scaler file not found at {scaler_path}. Please ensure the model was trained with SS preprocessing.")
        loaded_scaler = joblib.load(scaler_path)
        transformed_data = loaded_scaler.transform(data)
        # Keep a DataFrame layout for the second stage.
        transformed_data_layout = pd.DataFrame(transformed_data)
    elif ProcessMethods1 == 'None' or ProcessMethods1 is None:
        # No first-stage preprocessing: pass the data through unchanged.
        transformed_data_layout = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    else:
        # Any other method is delegated to the shared Preprocessing function.
        transformed_data_layout = Preprocessing(ProcessMethods1, data)
        if isinstance(transformed_data_layout, np.ndarray):
            transformed_data_layout = pd.DataFrame(transformed_data_layout)
    # --- Stage 2 ---
    if ProcessMethods2 == 'None' or ProcessMethods2 is None:
        ProcesedData2 = transformed_data_layout
    else:
        ProcesedData2 = Preprocessing(ProcessMethods2, transformed_data_layout)
        if isinstance(ProcesedData2, np.ndarray):
            ProcesedData2 = pd.DataFrame(ProcesedData2)
    return ProcesedData2
def SVM_with_kernels_visualization(X_train, X_test, y_train, y_test, param_grid=None):
    """
    Grid-search an SVC over one or more kernels, validate the best estimator
    on the test set, and draw one hyperparameter plot per kernel.
    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param param_grid: hyperparameter grid; None selects a default grid
    :return: the best fitted estimator found by the search
    """
    if param_grid is None:
        # Default search space.
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['linear', 'rbf', 'poly']
        }
    search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best Parameters:", search.best_params_)
    print("Best Cross-Validation Score:", search.best_score_)
    # Validate the winning configuration on held-out data.
    best_model = search.best_estimator_
    predicted = best_model.predict(X_test)
    print("\nTest Set Evaluation:")
    print(classification_report(y_test, predicted))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted))
    print(f"Test Accuracy: {accuracy_score(y_test, predicted):.4f}")
    cv_results = search.cv_results_
    # One plot per kernel type.
    for kernel in np.unique(param_grid['kernel']):
        idx = [i for i, params in enumerate(cv_results['params']) if params['kernel'] == kernel]
        C_vals = [cv_results['params'][i]['C'] for i in idx]
        gamma_vals = [cv_results['params'][i]['gamma'] for i in idx if 'gamma' in cv_results['params'][i]]
        kernel_scores = cv_results['mean_test_score'][idx]
        if kernel == 'linear':
            # The linear kernel ignores gamma, so only C is plotted.
            plot_linear_kernel(C_vals, kernel_scores, kernel)
        else:
            plot_3D_grid(C_vals, gamma_vals, kernel_scores, kernel)
    return best_model
def plot_3D_grid(C_values, gamma_values, scores, kernel):
    """
    Draw a 3D surface of cross-validation accuracy over (C, gamma) on log10
    axes, with a color gradient (for the RBF and polynomial kernels).
    :param C_values: list of C values
    :param gamma_values: list of gamma values
    :param scores: matching cross-validation scores
    :param kernel: kernel name, used in the plot title
    """
    # Build the (C, gamma) grid from the unique parameter values.
    C_unique = np.unique(C_values)
    gamma_unique = np.unique(gamma_values)
    C_grid, gamma_grid = np.meshgrid(C_unique, gamma_unique)
    # Fill the Z surface with the score of the first matching (C, gamma) pair.
    Z = np.zeros_like(C_grid)
    for col, c in enumerate(C_unique):
        for row, g in enumerate(gamma_unique):
            matches = [k for k, val in enumerate(C_values) if val == c and gamma_values[k] == g]
            if matches:
                Z[row, col] = scores[matches[0]]
    # Log scale keeps the decades of C and gamma evenly spaced.
    log_C_grid = np.log10(C_grid)
    log_gamma_grid = np.log10(gamma_grid)
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    surface = ax.plot_surface(
        log_C_grid, log_gamma_grid, Z, cmap='viridis', edgecolor='k', alpha=0.8
    )
    colorbar = fig.colorbar(surface, pad=0.1, shrink=0.5, aspect=10)
    colorbar.set_label('Mean Accuracy', fontsize=12)
    ax.set_title(f'3D Hyperparameter Grid ({kernel} kernel)', fontsize=16)
    ax.set_xlabel('Log10(C)', fontsize=12)
    ax.set_ylabel('Log10(Gamma)', fontsize=12)
    ax.set_zlabel('Mean Accuracy', fontsize=12)
    plt.show()
def plot_linear_kernel(C_values, scores, kernel):
    """
    Plot cross-validation accuracy versus log10(C) (linear kernel only).
    :param C_values: list of C values
    :param scores: matching cross-validation scores
    :param kernel: kernel name, used in the plot title
    """
    # Log scale keeps the decades of C evenly spaced.
    log_C = np.log10(C_values)
    plt.figure(figsize=(8, 6))
    plt.plot(log_C, scores, marker='o', label='Mean Accuracy')
    plt.xlabel('Log10(C)', fontsize=12)
    plt.ylabel('Mean Accuracy', fontsize=12)
    plt.title(f'Hyperparameter Tuning ({kernel} kernel)', fontsize=16)
    plt.grid(True)
    plt.legend()
    plt.show()
# Classify each superpixel and fill the result into the label array
def classify_and_fill(segments, superpixel_features, model, label_array):
    """
    Predict a class for every superpixel and stamp it onto the label array.
    :param segments: array of superpixel ids, same shape as label_array
    :param superpixel_features: mapping of segment id -> mean feature vector
    :param model: fitted classifier exposing predict()
    :param label_array: array filled in place with the predicted classes
    :return: the filled label array
    """
    for segment_id, mean_feature in superpixel_features.items():
        # Predict the class from the superpixel's mean spectrum...
        predicted = model.predict([mean_feature])[0]
        # ...and write it to every pixel belonging to that superpixel.
        label_array[segments == segment_id] = predicted
    return label_array
def save_model(model, model_path, model_type='SVM'):
    """
    Persist a trained model to disk with joblib.
    :param model: trained model object
    :param model_path: destination file path
    :param model_type: model type, used only in the log message
    :return: the path the model was saved to
    """
    target_dir = os.path.dirname(model_path)
    # BUG FIX: os.makedirs('') raises FileNotFoundError, so only create a
    # directory when the path actually contains one (a bare filename saved
    # to the current directory previously crashed here).
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(model, model_path)
    print(f"{model_type} model saved to: {model_path}")
    return model_path
def load_model(model_path):
    """
    Deserialize a model previously persisted with joblib (all model types).
    :param model_path: path of the saved model file
    :return: the loaded model object
    """
    loaded = joblib.load(model_path)
    return loaded
def predict_and_save(df, model_path, model_type='SVM', ProcessMethods1='SS', ProcessMethods2='SG'):
    """
    Load a saved model, preprocess the numeric features of df the same way as
    at training time, and append a 'Predictions' column.
    :param df: dataframe holding reflectance and shape features
    :param model_path: path of the persisted model
    :param model_type: model type (reserved for model-specific handling)
    :param ProcessMethods1: first preprocessing step, default 'SS'
        (loads scaler_params.pkl when 'SS')
    :param ProcessMethods2: second preprocessing step, default 'SG'
    :return: df with the added 'Predictions' column
    """
    model = load_model(model_path)
    # Locate the contour column (if any) so it can be excluded from features.
    contour_col_idx = df.columns.get_loc('contour') if 'contour' in df.columns else None
    # Collect numeric feature columns, skipping column 0 (likely class/id)
    # and the contour column.
    numeric_cols = [
        df.columns[i]
        for i in range(1, df.shape[1])
        if i != contour_col_idx and df[df.columns[i]].dtype in ['int64', 'float64']
    ]
    features = df[numeric_cols]
    # Apply the same two-stage preprocessing used at training time.
    processed = Procesed(features, ProcessMethods1, ProcessMethods2, model_path)
    if isinstance(processed, pd.DataFrame):
        processed = processed.values
    df['Predictions'] = model.predict(processed)
    return df
def SVM(X_train, X_test, y_train, y_test, kernel='linear', C=1, gamma=1e-3):
    """
    Train an SVC, report cross-validation accuracy and train/test metrics.
    :param kernel: SVC kernel, default 'linear'
    :param C: regularization strength, default 1
    :param gamma: kernel coefficient, default 1e-3
    :return: the fitted classifier
    """
    classifier = svm.SVC(C=C, kernel=kernel, gamma=gamma)
    # Cross-validation on the (still unfitted) estimator.
    cross_validate_model(classifier, X_train, y_train)
    classifier.fit(X_train, y_train.ravel())
    # evaluate_model prints the reports; the returned dicts are not used here.
    evaluate_model(y_train, classifier.predict(X_train), dataset_name="Train")
    evaluate_model(y_test, classifier.predict(X_test), dataset_name="Test")
    return classifier
# ==================== Training functions for all models (each returns the model object) ====================
def train_LogisticRegression(X_train, X_test, y_train, y_test, penalty='l2', C=1.0, solver='lbfgs', max_iter=200):
    """Train a multinomial logistic-regression classifier and return the fitted model."""
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=max_iter,
                                    multi_class='multinomial', random_state=1)
    cross_validate_model(classifier, X_train, y_train)
    classifier.fit(X_train, y_train.ravel())
    # evaluate_model prints the reports for both splits.
    evaluate_model(y_train, classifier.predict(X_train), dataset_name="Train")
    evaluate_model(y_test, classifier.predict(X_test), dataset_name="Test")
    return classifier
def train_PLS_DA(X_train, X_test, y_train, y_test, n_components=40):
    """
    Train a PLS-DA model (PLS regression on one-hot labels) and return it.
    :param n_components: number of PLS components (default 40)
    :return: the fitted PLSRegression model
    """
    from sklearn.cross_decomposition import PLSRegression
    y_train_encoded = pd.get_dummies(y_train)
    # The one-hot column order maps argmax indices back to the original class
    # labels. BUG FIX: the old code compared raw test labels against bare
    # argmax indices, which mislabels every sample whenever the classes are
    # not 0-based consecutive integers.
    class_labels = np.asarray(y_train_encoded.columns)
    model = PLSRegression(n_components=n_components)
    model.fit(X_train, y_train_encoded)
    y_train_pred = class_labels[np.argmax(model.predict(X_train), axis=1)]
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = class_labels[np.argmax(model.predict(X_test), axis=1)]
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model
def train_RF(X_train, X_test, y_train, y_test, n_estimators=200, max_depth=15, n_jobs=-1):
    """
    Train a random-forest classifier and return the fitted model.
    :param n_estimators: number of trees (default 200)
    :param max_depth: maximum tree depth (default 15)
    :param n_jobs: parallel jobs for the forest itself (default -1, all cores)
    :return: the fitted RandomForestClassifier
    """
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=n_jobs)
    # BUG FIX: cross_validate_model() does not accept an n_jobs keyword, so
    # forwarding it raised TypeError before any training happened.
    cross_validate_model(model, X_train, y_train)
    model.fit(X_train, y_train.ravel())
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model
def train_XGBoost(X_train, X_test, y_train, y_test, n_estimators=100, learning_rate=0.1, max_depth=3):
    """Train an XGBoost classifier and return the fitted model."""
    import xgboost as xgb
    # NOTE(review): gpu_id assumes a GPU is available and is deprecated in
    # xgboost >= 2.0 (replaced by device='cuda') — confirm against the
    # installed xgboost version.
    classifier = xgb.XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=1,
        gpu_id=0
    )
    classifier.fit(X_train, y_train)
    # evaluate_model prints the reports for both splits.
    evaluate_model(y_train, classifier.predict(X_train), dataset_name="Train")
    evaluate_model(y_test, classifier.predict(X_test), dataset_name="Test")
    return classifier
def train_LightGBM(X_train, X_test, y_train, y_test, n_estimators=100, learning_rate=0.1, max_depth=-1, num_leaves=31):
    """Train a LightGBM classifier and return the fitted model."""
    import lightgbm as lgb
    classifier = lgb.LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        random_state=1
    )
    classifier.fit(X_train, y_train)
    # evaluate_model prints the reports for both splits.
    evaluate_model(y_train, classifier.predict(X_train), dataset_name="Train")
    evaluate_model(y_test, classifier.predict(X_test), dataset_name="Test")
    return classifier
def train_CatBoost(X_train, X_test, y_train, y_test, iterations=500, learning_rate=0.1, depth=6):
    """Train a CatBoost classifier (silent mode) and return the fitted model."""
    import catboost as cb
    classifier = cb.CatBoostClassifier(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        random_seed=1,
        verbose=0
    )
    classifier.fit(X_train, y_train)
    # evaluate_model prints the reports for both splits.
    evaluate_model(y_train, classifier.predict(X_train), dataset_name="Train")
    evaluate_model(y_test, classifier.predict(X_test), dataset_name="Test")
    return classifier
def train_AdaBoost(X_train, X_test, y_train, y_test, n_estimators=50, learning_rate=1.0):
    """
    Train an AdaBoost classifier with a decision-stump base learner and
    return the fitted model.
    :param n_estimators: number of boosting rounds (default 50)
    :param learning_rate: boosting learning rate (default 1.0)
    :return: the fitted AdaBoostClassifier
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    stump = DecisionTreeClassifier(max_depth=1)
    # COMPAT FIX: scikit-learn renamed base_estimator -> estimator in 1.2 and
    # removed the old name in 1.4; try the new keyword first and fall back so
    # the code runs on both old and new versions.
    try:
        model = AdaBoostClassifier(
            estimator=stump,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=1
        )
    except TypeError:
        model = AdaBoostClassifier(
            base_estimator=stump,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=1
        )
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model
def train_KNN(X_train, X_test, y_train, y_test, n_neighbors=5, weights='uniform', algorithm='auto'):
    """Train a k-nearest-neighbours classifier and return the fitted model."""
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
    cross_validate_model(classifier, X_train, y_train)
    classifier.fit(X_train, y_train)
    # evaluate_model prints the reports for both splits.
    evaluate_model(y_train, classifier.predict(X_train), dataset_name="Train")
    evaluate_model(y_test, classifier.predict(X_test), dataset_name="Test")
    return classifier
# ==================== Unified model training-and-saving helper ====================
def train_and_save_model(model_name, X_train, X_test, y_train, y_test, model_save_dir, **kwargs):
    """
    Train the requested model and persist it under model_save_dir.
    :param model_name: one of 'SVM', 'LogisticRegression', 'PLS_DA', 'RF',
        'XGBoost', 'LightGBM', 'CatBoost', 'AdaBoost', 'KNN'
    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param model_save_dir: directory the model file is written to
    :param kwargs: model-specific hyperparameters forwarded to the trainer
    :return: (trained model, saved model path)
    :raises ValueError: when model_name is not a supported model
    """
    # Dispatch table from model name to its training function.
    model_trainers = {
        'SVM': SVM,
        'LogisticRegression': train_LogisticRegression,
        'PLS_DA': train_PLS_DA,
        'RF': train_RF,
        'XGBoost': train_XGBoost,
        'LightGBM': train_LightGBM,
        'CatBoost': train_CatBoost,
        'AdaBoost': train_AdaBoost,
        'KNN': train_KNN
    }
    if model_name not in model_trainers:
        raise ValueError(f"Unsupported model: {model_name}. Supported models: {list(model_trainers.keys())}")
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training {model_name} model...")
    print(banner)
    # Train via the dispatched trainer.
    model = model_trainers[model_name](X_train, X_test, y_train, y_test, **kwargs)
    # Persist under a lowercase "<name>.m" filename.
    os.makedirs(model_save_dir, exist_ok=True)
    model_path = os.path.join(model_save_dir, f"{model_name.lower()}.m")
    save_model(model, model_path, model_type=model_name)
    return model, model_path
# ==================== Prediction helpers for the different models ====================
def predict_with_model(df, model_path, model_type='SVM', ProcessMethods1='SS', ProcessMethods2='SG'):
    """
    Thin wrapper around predict_and_save: run predictions with the given model.
    :param df: dataframe holding reflectance and shape features
    :param model_path: path of the persisted model
    :param model_type: model type
    :param ProcessMethods1: first preprocessing step, default 'SS'
        (loads scaler_params.pkl when 'SS')
    :param ProcessMethods2: second preprocessing step, default 'SG'
    :return: dataframe including the prediction column
    """
    result = predict_and_save(
        df,
        model_path,
        model_type=model_type,
        ProcessMethods1=ProcessMethods1,
        ProcessMethods2=ProcessMethods2
    )
    return result
# Main entry point, used for training
if __name__ == "__main__":
    # Load the combined spectra table (column 0 = class label, rest = features).
    data = pd.read_csv(r"E:\plastic\plastic\output\20251113\数据增强\all.csv")
    df = pd.DataFrame(data)
    # x = df.iloc[:, 1:]
    # x = df.iloc[:, np.r_[1:94, 119:]].values
    # Drop feature columns 87..109 — presumably an excluded wavelength band; TODO confirm.
    cols_to_remove = df.columns[87:110]
    # Remove those columns.
    df_filtered = df.drop(columns=cols_to_remove)
    # Extract features (all remaining columns after the first) and labels (first column).
    x = df_filtered.iloc[:, 1:].values
    y = df.iloc[:, 0]
    # Example: no spectral augmentation (original pipeline).
    # X_train, X_test, y_train, y_test = SpectralQualitativeAnalysis(x, y, 'None', 'None', 'None', 'random', use_smote=True)
    # Spectral-augmentation parameters (only applied when use_augmentation=True below).
    augmentation_params = {
        'augmentation_factor': 2,  # generate 2 augmented copies per sample
        'noise_level': 0.01,  # Gaussian noise level: 1%
        'offset_range': 0.02,  # baseline offset within ±0.02
        'multiplier_range': (0.9, 1.1),  # multiplicative factor range (0.9 to 1.1)
        'slope_range': (0, 0.1)  # slope (baseline drift) range (0 to 0.1)
    }
    # Pipeline: first-derivative preprocessing ('D1'), random split, SMOTE on.
    X_train, X_test, y_train, y_test = SpectralQualitativeAnalysis(
        x, y, 'D1', 'None', 'None', 'random',
        use_smote=True,
        use_augmentation=False,  # spectral augmentation currently disabled
        augmentation_params=augmentation_params,
        random_state=42
    )
    # Grid-search SVM with per-kernel 3D visualization (disabled):
    # param_grid = {
    #     'C': np.logspace(-3, 3, 13),  # 13 values between 10^-3 and 10^3
    #     'gamma': np.logspace(-4, 1, 13),  # 13 values between 10^-4 and 10^1
    #     'kernel': ['rbf']  # RBF kernel only
    # }
    # clf = SVM_with_kernels_visualization(X_train, X_test, y_train, y_test, param_grid)
    # joblib.dump(clf, "./classification_model/model_save/pre_salinas_MODEL.m")
    # clf1 = joblib.load("./classification_model/model_save/pre_salinas_MODEL.m")
    # Example 1: train and save an SVM model (legacy API, still supported):
    # clf = SVM(X_train, X_test, y_train, y_test)
    # save_model(clf, r"D:\WQ\plastic\classification_model\modelsave\svm.m", model_type='SVM')
    # Example 2: unified train-and-save helper (recommended).
    save_dir = r"E:\plastic\plastic\output\20251113\一阶导数"
    # Train and save the selected models.
    models_to_train = ['CatBoost']  # also supported: 'SVM', 'RF', 'XGBoost', 'LogisticRegression'
    for model_name in models_to_train:
        model, model_path = train_and_save_model(
            model_name=model_name,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            model_save_dir=save_dir
        )
        print(f"{model_name} model saved at: {model_path}")
    # Example 3: load a model and predict (disabled):
    # model_path = r"D:\WQ\plastic\classification_model\modelsave\svm.m"
    # loaded_model = load_model(model_path)
    # # Use the same preprocessing as at training time:
    # # ProcessMethods1='SS' automatically loads scaler_params.pkl
    # # ProcessMethods2='SG' applies Savitzky-Golay filtering
    # predictions_df = predict_with_model(df, model_path, model_type='SVM', ProcessMethods1='SS', ProcessMethods2='SG')
    # print(f"Predictions completed. Results shape: {predictions_df.shape}")