from imblearn.over_sampling import SMOTE
import pandas as pd
from classification_model.DataLoad.DataLoad import SetSplit, LoadNirtest
from classification_model.Preprocessing.Preprocessing import Preprocessing
from classification_model.WaveSelect.WaveSelcet import SpctrumFeatureSelcet
from classification_model.Classification.ClassicCls import (
    LogisticRegressionModel, SVM as SVM_Classic, PLS_DA, RF, XGBoost,
    LightGBM, CatBoost, AdaBoost, KNN
)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import sklearn.svm as svm
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             precision_score, recall_score, f1_score)
import numpy as np
import joblib
import os
from sklearn.svm import SVC
import matplotlib.pyplot as plt


def cross_validate_model(model, X, y, cv=5, n_jobs=None):
    """Run k-fold cross-validation and print mean/std accuracy.

    :param model: unfitted estimator
    :param X: feature matrix
    :param y: labels
    :param cv: number of folds (default 5)
    :param n_jobs: parallelism forwarded to cross_val_score (default None)
    :return: array of per-fold scores
    """
    # n_jobs added so callers (e.g. train_RF) can forward their parallelism
    # setting; previously train_RF passed n_jobs and raised TypeError.
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    print(f"Cross-validation accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
    return scores


# ==================== Spectral data augmentation ====================
def augment_spectrum(spectrum, noise_level=0.01, offset_range=0.02,
                     multiplier_range=(0.95, 1.05), slope_range=(-0.001, 0.001),
                     random_state=None):
    """Augment a single spectrum with random noise, baseline offset,
    multiplicative scaling and linear slope drift.

    :param spectrum: 1D array of spectral values
    :param noise_level: noise std as a fraction of the spectrum's std, default 0.01 (1%)
    :param offset_range: absolute baseline-offset range, default 0.02
    :param multiplier_range: (min, max) multiplicative factor, default (0.95, 1.05)
    :param slope_range: (min, max) slope drift, default (-0.001, 0.001)
    :param random_state: seed for reproducibility (seeds the global numpy RNG)
    :return: augmented spectrum (1D numpy array)
    """
    if random_state is not None:
        np.random.seed(random_state)
    spectrum = np.array(spectrum).flatten()
    n_features = len(spectrum)
    # 1. Additive Gaussian noise scaled by the spectrum's own std
    noise = np.random.normal(0, noise_level * np.std(spectrum), n_features)
    augmented = spectrum + noise
    # 2. Baseline offset
    offset = np.random.uniform(-offset_range, offset_range)
    augmented = augmented + offset
    # 3. Multiplicative scaling (multiplicative-scatter-style variation)
    multiplier = np.random.uniform(multiplier_range[0], multiplier_range[1])
    augmented = augmented * multiplier
    # 4. Linear baseline drift across the wavelength axis
    slope = np.random.uniform(slope_range[0], slope_range[1])
    x_indices = np.arange(n_features)
    augmented = augmented + slope * x_indices
    return augmented


def augment_dataset(X, y, augmentation_factor=1, noise_level=0.01, offset_range=0.02,
                    multiplier_range=(0.95, 1.05), slope_range=(-0.001, 0.001),
                    random_state=None, preserve_original=True):
    """Apply spectral augmentation to a whole dataset.

    :param X: features (n_samples, n_features); sparse matrices are densified
    :param y: labels (n_samples,)
    :param augmentation_factor: augmented samples generated per original sample, default 1
    :param noise_level: noise level, default 0.01
    :param offset_range: baseline-offset range, default 0.02
    :param multiplier_range: multiplicative-factor range, default (0.95, 1.05)
    :param slope_range: slope-drift range, default (-0.001, 0.001)
    :param random_state: base random seed, default None
    :param preserve_original: keep the original samples in the output, default True
    :return: (X_augmented, y_augmented)
    """
    # Ensure X is a dense numpy array
    if hasattr(X, 'toarray'):  # handle sparse matrices
        X = X.toarray()
    else:
        X = np.array(X)
    # Ensure X is 2D
    if X.ndim == 1:
        X = X.reshape(1, -1)
    y = np.array(y).flatten()
    if random_state is not None:
        np.random.seed(random_state)
    augmented_X_list = []
    augmented_y_list = []
    n_samples = X.shape[0]
    n_features = X.shape[1]
    # Process each sample
    for i in range(n_samples):
        # Current sample as a flat 1D array
        current_sample = np.array(X[i]).flatten()
        # Optionally keep the original sample first
        if preserve_original:
            # Store as a 2D row (1, n_features) for vstack later
            original_sample = current_sample.reshape(1, -1)
            augmented_X_list.append(original_sample)
            augmented_y_list.append(y[i])
        # Generate augmented samples
        for j in range(augmentation_factor):
            # Derive a distinct seed per augmented sample for reproducibility
            if random_state is not None:
                seed = random_state + i * augmentation_factor + j
            else:
                seed = None
            augmented_spectrum = augment_spectrum(
                current_sample,
                noise_level=noise_level,
                offset_range=offset_range,
                multiplier_range=multiplier_range,
                slope_range=slope_range,
                random_state=seed
            )
            # Store as a 2D row (1, n_features)
            augmented_X_list.append(augmented_spectrum.reshape(1, -1))
            augmented_y_list.append(y[i])
    # Stack everything together
    if len(augmented_X_list) > 0:
        X_augmented = np.vstack(augmented_X_list)
        y_augmented = np.array(augmented_y_list)
    else:
        X_augmented = X
        y_augmented = y
    return X_augmented, y_augmented


def augment_dataset_with_params(X, y, augmentation_params=None, random_state=None,
                                preserve_original=True):
    """Augment a dataset using a parameter dict (more flexible front-end).

    :param X: features (n_samples, n_features)
    :param y: labels (n_samples,)
    :param augmentation_params: dict with optional keys:
        - 'augmentation_factor': default 1
        - 'noise_level': default 0.01
        - 'offset_range': default 0.02
        - 'multiplier_range': default (0.95, 1.05)
        - 'slope_range': default (-0.001, 0.001)
    :param random_state: random seed, default None
    :param preserve_original: keep original samples, default True
    :return: (X_augmented, y_augmented)
    """
    if augmentation_params is None:
        augmentation_params = {}
    return augment_dataset(
        X, y,
        augmentation_factor=augmentation_params.get('augmentation_factor', 1),
        noise_level=augmentation_params.get('noise_level', 0.01),
        offset_range=augmentation_params.get('offset_range', 0.02),
        multiplier_range=augmentation_params.get('multiplier_range', (0.95, 1.05)),
        slope_range=augmentation_params.get('slope_range', (-0.001, 0.001)),
        random_state=random_state,
        preserve_original=preserve_original
    )


# Confusion matrix and classification report
def evaluate_model(y_true, y_pred, dataset_name="Test", title="Confusion Matrix", cmap='Blues'):
    """Print a classification report and return summary metrics.

    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param dataset_name: dataset name (e.g. "Train" or "Test")
    :param title: plot title (used only if the heatmap code is re-enabled)
    :param cmap: heatmap colormap
    :return: dict with accuracy / weighted precision / recall / f1 and the confusion matrix
    """
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Heatmap plotting (optional; uncomment to display)
    # plt.figure(figsize=(8, 6))
    # ax = sns.heatmap(cm, annot=True, fmt='g', cmap=cmap, cbar=True,
    #                  linewidths=0.5, linecolor='black', square=True,
    #                  annot_kws={"size": 12})
    # ax.set_title(f"{dataset_name} {title}", fontsize=16)
    # ax.set_xlabel('Predicted Label', fontsize=14)
    # ax.set_ylabel('True Label', fontsize=14)
    # plt.tight_layout()
    # plt.show()
    # Return several performance metrics, including the confusion matrix
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm
    }


# Spectral qualitative analysis
def SpectralQualitativeAnalysis(data, label, ProcessMethods, ProcessMethods2,
                                FslecetedMethods, SetSplitMethods,
                                use_smote=False, use_augmentation=False,
                                augmentation_params=None, random_state=42):
    """Spectral qualitative analysis with optional augmentation and SMOTE.

    :param data: input spectra
    :param label: labels
    :param ProcessMethods: first preprocessing method
    :param ProcessMethods2: second preprocessing method
    :param FslecetedMethods: feature-selection method
    :param SetSplitMethods: train/test split method
    :param use_smote: apply SMOTE oversampling, default False
    :param use_augmentation: apply spectral augmentation, default False
    :param augmentation_params: augmentation parameter dict, default None (use defaults)
    :param random_state: random seed, default 42
    :return: X_train, X_test, y_train, y_test
    """
    # Preprocessing (two chained steps)
    ProcesedData = Preprocessing(ProcessMethods, data)
    ProcesedData2 = Preprocessing(ProcessMethods2, ProcesedData)
    # Feature selection
    FeatrueData, labels, selected_columns = SpctrumFeatureSelcet(FslecetedMethods, ProcesedData2, label)
    # Train/test split
    X_train, X_test, y_train, y_test = SetSplit(SetSplitMethods, FeatrueData, labels,
                                                test_size=0.3, randomseed=random_state)
    # Spectral augmentation (applied before SMOTE)
    if use_augmentation:
        print(f"Original training set size: {len(y_train)}")
        X_train, y_train = augment_dataset_with_params(
            X_train, y_train,
            augmentation_params=augmentation_params,
            random_state=random_state,
            preserve_original=True
        )
        print(f"Training set size after augmentation: {len(y_train)}")
    # SMOTE oversampling of minority classes
    if use_smote:
        smote = SMOTE(random_state=random_state)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE applied: Training set size after resampling:", len(y_train))
    # Ready for model training and evaluation
    return X_train, X_test, y_train, y_test


def Procesed(data, ProcessMethods1, ProcessMethods2, model_path):
    """Preprocess data with up to two chained preprocessing methods.

    :param data: input data
    :param ProcessMethods1: first method (e.g. 'SS', 'MMS', 'None')
    :param ProcessMethods2: second method (e.g. 'SG', 'D1', 'None')
    :param model_path: model path (used to locate scaler_params.pkl for 'SS')
    :return: preprocessed data as a DataFrame
    """
    # NOTE: the previous local `from classification_model.Preprocessing import
    # Preprocessing` shadowed the module-level Preprocessing *function* with the
    # submodule object (which is not callable); the module-level import is used.
    # First preprocessing step
    if ProcessMethods1 == 'SS':
        # 'SS' requires the scaler fitted at training time
        model_dir = os.path.dirname(model_path)
        scaler_path = os.path.join(model_dir, 'scaler_params.pkl')
        if not os.path.exists(scaler_path):
            raise FileNotFoundError(f"Scaler file not found at {scaler_path}. Please ensure the model was trained with SS preprocessing.")
        loaded_scaler = joblib.load(scaler_path)
        transformed_data = loaded_scaler.transform(data)
        # Back to DataFrame for the next step
        transformed_data_layout = pd.DataFrame(transformed_data)
    elif ProcessMethods1 == 'None' or ProcessMethods1 is None:
        # No first step: pass data through unchanged
        transformed_data_layout = pd.DataFrame(data) if not isinstance(data, pd.DataFrame) else data
    else:
        # Any other method goes through the generic Preprocessing dispatcher
        transformed_data_layout = Preprocessing(ProcessMethods1, data)
        if isinstance(transformed_data_layout, np.ndarray):
            transformed_data_layout = pd.DataFrame(transformed_data_layout)
    # Second preprocessing step
    if ProcessMethods2 == 'None' or ProcessMethods2 is None:
        ProcesedData2 = transformed_data_layout
    else:
        ProcesedData2 = Preprocessing(ProcessMethods2, transformed_data_layout)
        if isinstance(ProcesedData2, np.ndarray):
            ProcesedData2 = pd.DataFrame(ProcesedData2)
    return ProcesedData2


def SVM_with_kernels_visualization(X_train, X_test, y_train, y_test, param_grid=None):
    """Grid-search SVM hyperparameters per kernel, validate the best model on the
    test set, and plot a 3D grid per kernel.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param param_grid: hyperparameter grid; None uses a default range
    :return: the best fitted estimator
    """
    if param_grid is None:
        # Default parameter grid
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['linear', 'rbf', 'poly']
        }
    # Initialize the SVM
    svc = SVC()
    # Grid search
    grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5,
                               scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    # Report best parameters and CV score
    print("Best Parameters:", grid_search.best_params_)
    print("Best Cross-Validation Score:", grid_search.best_score_)
    # Evaluate the best model on the test set
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)
    print("\nTest Set Evaluation:")
    print(classification_report(y_test, y_test_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
    # Search results
    results = grid_search.cv_results_
    # Plot a 3D grid per kernel type
    kernels = np.unique(param_grid['kernel'])
    for kernel in kernels:
        kernel_indices = [i for i, params in enumerate(results['params'])
                          if params['kernel'] == kernel]
        C_values = [results['params'][i]['C'] for i in kernel_indices]
        gamma_values = [results['params'][i]['gamma'] for i in kernel_indices
                        if 'gamma' in results['params'][i]]
        scores = results['mean_test_score'][kernel_indices]
        # Linear kernel has no meaningful gamma axis
        if kernel == 'linear':
            plot_linear_kernel(C_values, scores, kernel)
        else:
            plot_3D_grid(C_values, gamma_values, scores, kernel)
    return best_model


def plot_3D_grid(C_values, gamma_values, scores, kernel):
    """Plot a 3D hyperparameter surface (for RBF / polynomial kernels).

    :param C_values: list of C values
    :param gamma_values: list of gamma values
    :param scores: corresponding cross-validation scores
    :param kernel: kernel name
    """
    # Arrange data on a grid
    C_unique = np.unique(C_values)
    gamma_unique = np.unique(gamma_values)
    C_grid, gamma_grid = np.meshgrid(C_unique, gamma_unique)
    # Build the Z axis (cross-validation scores)
    Z = np.zeros_like(C_grid)
    for i, c in enumerate(C_unique):
        for j, gamma in enumerate(gamma_unique):
            indices = [k for k, val in enumerate(C_values)
                       if val == c and gamma_values[k] == gamma]
            if indices:
                Z[j, i] = scores[indices[0]]
    # Log-scale C and gamma for readability
    log_C_grid = np.log10(C_grid)
    log_gamma_grid = np.log10(gamma_grid)
    # Draw the surface with a color gradient
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    surface = ax.plot_surface(
        log_C_grid, log_gamma_grid, Z,
        cmap='viridis', edgecolor='k', alpha=0.8
    )
    # Colorbar
    cbar = fig.colorbar(surface, pad=0.1, shrink=0.5, aspect=10)
    cbar.set_label('Mean Accuracy', fontsize=12)
    # Axes and title
    ax.set_title(f'3D Hyperparameter Grid ({kernel} kernel)', fontsize=16)
    ax.set_xlabel('Log10(C)', fontsize=12)
    ax.set_ylabel('Log10(Gamma)', fontsize=12)
    ax.set_zlabel('Mean Accuracy', fontsize=12)
    # Show the figure
    plt.show()


def plot_linear_kernel(C_values, scores, kernel):
    """Plot the hyperparameter curve for the linear kernel (C only).

    :param C_values: list of C values
    :param scores: corresponding cross-validation scores
    :param kernel: kernel name
    """
    # Log-scale C
    C_values = np.log10(C_values)
    # 2D line plot
    plt.figure(figsize=(8, 6))
    plt.plot(C_values, scores, marker='o', label='Mean Accuracy')
    plt.xlabel('Log10(C)', fontsize=12)
    plt.ylabel('Mean Accuracy', fontsize=12)
    plt.title(f'Hyperparameter Tuning ({kernel} kernel)', fontsize=16)
    plt.grid(True)
    plt.legend()
    plt.show()


# Classify superpixels and fill results into the label array
def classify_and_fill(segments, superpixel_features, model, label_array):
    """Predict a class per superpixel and write it into the label array.

    :param segments: superpixel segmentation map
    :param superpixel_features: dict {segment_id: mean spectral feature}
    :param model: fitted classifier
    :param label_array: output array, filled in place
    :return: the filled label array
    """
    for segment, feature in superpixel_features.items():
        # Predict the class from the mean hyperspectral feature
        label = model.predict([feature])[0]
        # Fill all pixels belonging to this segment
        label_array[segments == segment] = label
    return label_array


def save_model(model, model_path, model_type='SVM'):
    """Save a model to the given path.

    :param model: trained model object
    :param model_path: destination path
    :param model_type: model type (for the log message)
    :return: the saved path
    """
    # Guard: os.makedirs('') raises when model_path has no directory component
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, model_path)
    print(f"{model_type} model saved to: {model_path}")
    return model_path


def load_model(model_path):
    """Load a model (works for all model types).

    :param model_path: model path
    :return: the loaded model
    """
    return joblib.load(model_path)


def predict_and_save(df, model_path, model_type='SVM', ProcessMethods1='SS', ProcessMethods2='SG'):
    """Predict with a saved model (works for all model types).

    :param df: dataframe with reflectance and shape features
    :param model_path: path to the saved model
    :param model_type: model type (optional, for special handling)
    :param ProcessMethods1: first preprocessing method, default 'SS'
        (when 'SS', scaler_params.pkl is loaded next to the model)
    :param ProcessMethods2: second preprocessing method, default 'SG'
    :return: dataframe with an added 'Predictions' column
    """
    model = load_model(model_path)
    # Locate the contour column, if present
    contour_col_idx = None
    if 'contour' in df.columns:
        contour_col_idx = df.columns.get_loc('contour')
    # Select numeric columns, skipping the first column (likely class/ID)
    # and the contour column
    numeric_cols = []
    for i in range(1, df.shape[1]):
        if i != contour_col_idx:
            col_name = df.columns[i]
            # Keep numeric dtypes only
            if df[col_name].dtype in ['int64', 'float64']:
                numeric_cols.append(col_name)
    # Extract the features
    x = df[numeric_cols]
    # Preprocess (two chained methods)
    Procesed_features = Procesed(x, ProcessMethods1, ProcessMethods2, model_path)
    # Ensure a numpy array for prediction
    if isinstance(Procesed_features, pd.DataFrame):
        Procesed_features = Procesed_features.values
    # Predict
    predictions = model.predict(Procesed_features)
    df['Predictions'] = predictions
    return df


def SVM(X_train, X_test, y_train, y_test, kernel='linear', C=1, gamma=1e-3):
    """Train an SVC, cross-validate on the training set, and report
    train/test metrics. Returns the fitted classifier."""
    clf = svm.SVC(C=C, kernel=kernel, gamma=gamma)
    # Cross-validation
    cross_validate_model(clf, X_train, y_train)
    # Fit
    clf.fit(X_train, y_train.ravel())
    # Training-set evaluation
    y_train_pred = clf.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    # Test-set evaluation
    y_test_pred = clf.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return clf


# ==================== Per-model training functions (return the model) ====================
def train_LogisticRegression(X_train, X_test, y_train, y_test, penalty='l2', C=1.0,
                             solver='lbfgs', max_iter=200):
    """Train a logistic-regression model and return it."""
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=max_iter,
                               multi_class='multinomial', random_state=1)
    cross_validate_model(model, X_train, y_train)
    model.fit(X_train, y_train.ravel())
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_PLS_DA(X_train, X_test, y_train, y_test, n_components=40):
    """Train a PLS-DA model and return it."""
    from sklearn.cross_decomposition import PLSRegression
    y_train_encoded = pd.get_dummies(y_train)
    # Column order of get_dummies == sorted unique class labels; used to map
    # argmax indices back to the original labels. Previously the raw argmax
    # index was compared against y_test, which is wrong for labels not
    # numbered 0..k-1 (e.g. labels starting at 1).
    class_labels = y_train_encoded.columns.to_numpy()
    model = PLSRegression(n_components=n_components)
    model.fit(X_train, y_train_encoded)
    y_train_pred = class_labels[np.argmax(model.predict(X_train), axis=1)]
    train_metrics = evaluate_model(np.asarray(y_train), y_train_pred, dataset_name="Train")
    y_test_pred = class_labels[np.argmax(model.predict(X_test), axis=1)]
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_RF(X_train, X_test, y_train, y_test, n_estimators=200, max_depth=15, n_jobs=-1):
    """Train a random-forest model and return it."""
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   random_state=1, n_jobs=n_jobs)
    # n_jobs is now accepted by cross_validate_model (was a TypeError before)
    cross_validate_model(model, X_train, y_train, n_jobs=n_jobs)
    model.fit(X_train, y_train.ravel())
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_XGBoost(X_train, X_test, y_train, y_test, n_estimators=100,
                  learning_rate=0.1, max_depth=3):
    """Train an XGBoost model and return it."""
    import xgboost as xgb
    # NOTE(review): gpu_id is deprecated in newer xgboost releases
    # (replaced by device='cuda'); kept for compatibility with the
    # version this project pins — confirm before upgrading.
    model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=1,
        gpu_id=0
    )
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_LightGBM(X_train, X_test, y_train, y_test, n_estimators=100,
                   learning_rate=0.1, max_depth=-1, num_leaves=31):
    """Train a LightGBM model and return it."""
    import lightgbm as lgb
    model = lgb.LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        random_state=1
    )
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_CatBoost(X_train, X_test, y_train, y_test, iterations=500,
                   learning_rate=0.1, depth=6):
    """Train a CatBoost model and return it."""
    import catboost as cb
    model = cb.CatBoostClassifier(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        random_seed=1,
        verbose=0
    )
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_AdaBoost(X_train, X_test, y_train, y_test, n_estimators=50, learning_rate=1.0):
    """Train an AdaBoost model and return it."""
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    base = DecisionTreeClassifier(max_depth=1)
    # scikit-learn 1.2 renamed base_estimator -> estimator and removed the
    # old name in 1.4; try the new keyword first, fall back for old versions.
    try:
        model = AdaBoostClassifier(
            estimator=base,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=1
        )
    except TypeError:
        model = AdaBoostClassifier(
            base_estimator=base,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=1
        )
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


def train_KNN(X_train, X_test, y_train, y_test, n_neighbors=5, weights='uniform',
              algorithm='auto'):
    """Train a KNN model and return it."""
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                                 algorithm=algorithm)
    cross_validate_model(model, X_train, y_train)
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    y_test_pred = model.predict(X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return model


# ==================== Unified train-and-save entry point ====================
def train_and_save_model(model_name, X_train, X_test, y_train, y_test,
                         model_save_dir, **kwargs):
    """Train the named model and save it.

    :param model_name: one of 'SVM', 'LogisticRegression', 'PLS_DA', 'RF',
        'XGBoost', 'LightGBM', 'CatBoost', 'AdaBoost', 'KNN'
    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param model_save_dir: directory to save the model into
    :param kwargs: model-specific hyperparameters
    :return: (trained model, saved path)
    :raises ValueError: if model_name is not supported
    """
    model_trainers = {
        'SVM': SVM,
        'LogisticRegression': train_LogisticRegression,
        'PLS_DA': train_PLS_DA,
        'RF': train_RF,
        'XGBoost': train_XGBoost,
        'LightGBM': train_LightGBM,
        'CatBoost': train_CatBoost,
        'AdaBoost': train_AdaBoost,
        'KNN': train_KNN
    }
    if model_name not in model_trainers:
        raise ValueError(f"Unsupported model: {model_name}. Supported models: {list(model_trainers.keys())}")
    print(f"\n{'='*60}")
    print(f"Training {model_name} model...")
    print(f"{'='*60}")
    # Train
    trainer = model_trainers[model_name]
    model = trainer(X_train, X_test, y_train, y_test, **kwargs)
    # Save
    os.makedirs(model_save_dir, exist_ok=True)
    model_path = os.path.join(model_save_dir, f"{model_name.lower()}.m")
    save_model(model, model_path, model_type=model_name)
    return model, model_path


# ==================== Model-specific prediction front-end ====================
def predict_with_model(df, model_path, model_type='SVM', ProcessMethods1='SS',
                       ProcessMethods2='SG'):
    """Predict with the given model.

    :param df: dataframe with reflectance and shape features
    :param model_path: model path
    :param model_type: model type
    :param ProcessMethods1: first preprocessing method, default 'SS'
        (when 'SS', scaler_params.pkl is loaded next to the model)
    :param ProcessMethods2: second preprocessing method, default 'SG'
    :return: dataframe with prediction results
    """
    return predict_and_save(df, model_path, model_type=model_type,
                            ProcessMethods1=ProcessMethods1,
                            ProcessMethods2=ProcessMethods2)


# Main entry point for training
if __name__ == "__main__":
    # Load the dataset
    data = pd.read_csv(r"E:\plastic\plastic\output\20251113\数据增强\all.csv")
    df = pd.DataFrame(data)
    # x = df.iloc[:, 1:]
    # x = df.iloc[:, np.r_[1:94, 119:]].values
    cols_to_remove = df.columns[87:110]
    # Drop those columns
    df_filtered = df.drop(columns=cols_to_remove)
    # Extract features (column names preserved)
    x = df_filtered.iloc[:, 1:].values
    y = df.iloc[:, 0]
    # Example: no augmentation (original approach)
    # X_train, X_test, y_train, y_test = SpectralQualitativeAnalysis(x, y, 'None', 'None', 'None', 'random', use_smote=True)
    # Example: spectral data augmentation (recommended)
    # Augmentation parameters
    augmentation_params = {
        'augmentation_factor': 2,        # 2 augmented samples per original sample
        'noise_level': 0.01,             # noise level: 1%
        'offset_range': 0.02,            # offset range: ±0.02
        'multiplier_range': (0.9, 1.1),  # multiplicative factor range: 0.9-1.1
        'slope_range': (0, 0.1)          # slope range: 0-0.1
    }
    X_train, X_test, y_train, y_test = SpectralQualitativeAnalysis(
        x, y, 'D1', 'None', 'None', 'random',
        use_smote=True,
        use_augmentation=False,  # augmentation disabled for this run
        augmentation_params=augmentation_params,
        random_state=42
    )
    # # Grid-search the SVM and visualize each kernel in 3D
    # param_grid = {
    #     'C': np.logspace(-3, 3, 13),      # 13 values between 10^-3 and 10^3
    #     'gamma': np.logspace(-4, 1, 13),  # 13 values between 10^-4 and 10^1
    #     'kernel': ['rbf']                 # RBF kernel only
    # }
    # clf = SVM_with_kernels_visualization(X_train, X_test, y_train, y_test, param_grid)
    # joblib.dump(clf, "./classification_model/model_save/pre_salinas_MODEL.m")
    # clf1 = joblib.load("./classification_model/model_save/pre_salinas_MODEL.m")
    # Example 1: train and save an SVM (legacy style, still supported)
    # clf = SVM(X_train, X_test, y_train, y_test)
    # save_model(clf, r"D:\WQ\plastic\classification_model\modelsave\svm.m", model_type='SVM')
    # Example 2: unified train-and-save function (recommended)
    save_dir = r"E:\plastic\plastic\output\20251113\一阶导数"
    # Train and save one or more models
    models_to_train = ['CatBoost']  # 'SVM', 'RF', 'XGBoost', 'LogisticRegression'
    for model_name in models_to_train:
        model, model_path = train_and_save_model(
            model_name=model_name,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            model_save_dir=save_dir
        )
        print(f"{model_name} model saved at: {model_path}")
    # Example 3: load a model and predict
    # model_path = r"D:\WQ\plastic\classification_model\modelsave\svm.m"
    # loaded_model = load_model(model_path)
    # # Use the same preprocessing methods as at training time
    # # ProcessMethods1='SS' automatically loads scaler_params.pkl
    # # ProcessMethods2='SG' applies a Savitzky-Golay filter
    # predictions_df = predict_with_model(df, model_path, model_type='SVM', ProcessMethods1='SS', ProcessMethods2='SG')
    # print(f"Predictions completed. Results shape: {predictions_df.shape}")