import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import sklearn.svm as svm
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb


# Fix the random seed for reproducibility.
def set_random_seed(seed=42):
    """Seed NumPy's global RNG.

    NOTE(review): this seeds only NumPy. The estimators below are seeded
    separately through their own ``random_state`` / ``random_seed``
    arguments where those are passed.

    Parameters
    ----------
    seed : int, default 42
        Seed value forwarded to ``np.random.seed``.
    """
    np.random.seed(seed)


set_random_seed()


# Cross-validation (multi-core capable).
def cross_validate_model(model, X, y, cv=5, n_jobs=-1):
    """Run k-fold cross-validation and print the mean ± std accuracy.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible estimator.
    X, y : array-like
        Features and labels.
    cv : int, default 5
        Number of folds.
    n_jobs : int, default -1
        Parallel workers; -1 uses all cores.

    Returns
    -------
    numpy.ndarray
        Per-fold scores as returned by ``cross_val_score``.
    """
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    print(f"Cross-validation accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
    return scores


# Confusion matrix and classification report.
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return summary metrics.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels.
    dataset_name : str, default "Test"
        Label used in the printed report header.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``precision``, ``recall``, ``f1_score``
        (all weighted-averaged) and ``confusion_matrix``.
    """
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))

    # Confusion matrix is returned alongside the scalar metrics.
    cm = confusion_matrix(y_true, y_pred)

    # Weighted averages so multi-class / imbalanced label sets are handled.
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm,
    }
# 1. Bayesian hyperparameter optimisation for SVM.
def optimize_SVM(X_train, y_train, X_test, y_test):
    """Tune an SVC with Bayesian search and evaluate on train/test splits.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    search_space = {
        'C': (0.01, 10.0, 'uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': (1e-4, 1e-1, 'log-uniform'),
    }
    search = BayesSearchCV(
        SVC(),
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 copes with class imbalance
    )
    search.fit(X_train, y_train)

    # Evaluate the refitted best estimator on both splits.
    champion = search.best_estimator_
    train_metrics = evaluate_model(
        y_train, champion.predict(X_train), dataset_name="Train"
    )
    test_metrics = evaluate_model(
        y_test, champion.predict(X_test), dataset_name="Test"
    )
    return search.best_params_, train_metrics, test_metrics


# 2. Bayesian hyperparameter optimisation for KNN.
def optimize_KNN(X_train, y_train, X_test, y_test):
    """Tune a KNeighborsClassifier with Bayesian search and evaluate it.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    search_space = {
        'n_neighbors': (1, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }
    search = BayesSearchCV(
        KNeighborsClassifier(),
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 copes with class imbalance
    )
    search.fit(X_train, y_train)

    # Evaluate the refitted best estimator on both splits.
    champion = search.best_estimator_
    train_metrics = evaluate_model(
        y_train, champion.predict(X_train), dataset_name="Train"
    )
    test_metrics = evaluate_model(
        y_test, champion.predict(X_test), dataset_name="Test"
    )
    return search.best_params_, train_metrics, test_metrics
# 3. Bayesian hyperparameter optimisation for XGBoost.
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    """Tune an XGBClassifier with Bayesian search and evaluate it.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    param_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(1e-4, 1.0, prior='log-uniform'),
        'subsample': Real(0.1, 1.0),
        'colsample_bytree': Real(0.1, 1.0),
    }
    # NOTE(review): `tree_method='gpu_hist'` / `gpu_id` are the legacy GPU
    # knobs, deprecated in XGBoost >= 2.0 in favour of
    # `tree_method='hist', device='cuda'`; this also hard-requires a GPU.
    # Kept as-is to avoid changing runtime behaviour — confirm the installed
    # XGBoost version and hardware before deploying.
    model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    optimizer = BayesSearchCV(
        model,
        param_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 as the selection criterion
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_

    # Evaluate the refitted best estimator on both splits.
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics


# 4. Bayesian hyperparameter optimisation for Random Forest.
def optimize_RF(X_train, y_train, X_test, y_test):
    """Tune a RandomForestClassifier with Bayesian search and evaluate it.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    param_space = {
        'n_estimators': (50, 500),
        'max_depth': (3, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        # FIX: 'auto' was removed from max_features in scikit-learn 1.3
        # (it was an alias of 'sqrt' for classifiers); sampling it raised
        # InvalidParameterError mid-search. The effective space is unchanged.
        'max_features': ['sqrt', 'log2'],
    }
    model = RandomForestClassifier(random_state=42)
    optimizer = BayesSearchCV(
        model,
        param_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 as the selection criterion
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_

    # Evaluate the refitted best estimator on both splits.
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics
# 5. Bayesian hyperparameter optimisation for CatBoost.
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    """Tune a CatBoostClassifier with Bayesian search and evaluate it.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    param_space = {
        'iterations': (50, 500),
        'learning_rate': (0.01, 0.3, 'uniform'),
        'depth': (3, 10),
        'l2_leaf_reg': (1, 10, 'uniform'),
        # FIX: (0, 1, 'uniform') has integer bounds, so skopt inferred an
        # Integer dimension and only ever tried 0 or 1. Float bounds make
        # this the intended continuous Real dimension.
        'bagging_temperature': (0.0, 1.0, 'uniform'),
    }
    # NOTE(review): task_type='GPU' hard-requires a GPU — confirm hardware.
    model = cb.CatBoostClassifier(task_type='GPU', random_seed=42, verbose=0)
    optimizer = BayesSearchCV(
        model,
        param_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 as the selection criterion
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_

    # Evaluate the refitted best estimator on both splits.
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics


# 6. Bayesian hyperparameter optimisation for Logistic Regression.
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    """Tune a LogisticRegression model with Bayesian search and evaluate it.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    param_space = {
        'C': (1e-5, 1e5, 'log-uniform'),
        'penalty': ['l1', 'l2'],
        # FIX: the original space included 'lbfgs' and 'liblinear', but
        # lbfgs does not support the l1 penalty and liblinear does not
        # support multi_class='multinomial' — sampled combinations raised
        # mid-search and aborted the optimisation. 'saga' supports l1, l2
        # and multinomial, keeping the full penalty space valid.
        'solver': ['saga'],
    }
    # max_iter raised because saga frequently needs more than the default
    # 100 iterations to converge on standardised data.
    model = LogisticRegression(
        multi_class='multinomial', random_state=42, max_iter=1000
    )
    optimizer = BayesSearchCV(
        model,
        param_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 as the selection criterion
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_

    # Evaluate the refitted best estimator on both splits.
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics
# 7. Bayesian hyperparameter optimisation for a feed-forward neural network.
def optimize_ANN(X_train, y_train, X_test, y_test):
    """Tune an MLPClassifier with Bayesian search and evaluate it.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)`` where the metric
        dicts come from ``evaluate_model``.
    """
    search_space = {
        'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'sgd'],
        'alpha': (1e-5, 1e-1, 'log-uniform'),
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }
    search = BayesSearchCV(
        MLPClassifier(max_iter=500, random_state=42),
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',  # weighted F1 copes with class imbalance
    )
    search.fit(X_train, y_train)

    # Evaluate the refitted best estimator on both splits.
    champion = search.best_estimator_
    train_metrics = evaluate_model(
        y_train, champion.predict(X_train), dataset_name="Train"
    )
    test_metrics = evaluate_model(
        y_test, champion.predict(X_test), dataset_name="Test"
    )
    return search.best_params_, train_metrics, test_metrics