"""Hyper-parameter search utilities for seven multi-class classifiers.

Each ``optimize_*`` function runs a cross-validated hyper-parameter search
(Randomized or Grid) on the training data, prints the best parameters and a
classification report for both splits, and returns
``(best_params, train_metrics, test_metrics)``.
"""

import gc
import os

import numpy as np
from scipy.stats import loguniform, randint

import catboost as cb
import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,  # kept: may be used by callers importing from this module
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#: Single source of truth for every random seed in this module.
SEED = 42


def set_random_seed(seed=SEED):
    """Seed NumPy's global RNG.

    Parameters
    ----------
    seed : int
        Seed value applied to ``np.random`` and exported as
        ``PYTHONHASHSEED``.

    NOTE(review): setting ``PYTHONHASHSEED`` after the interpreter has
    started does not affect the current process's string hashing; it only
    influences child processes that inherit the environment.
    """
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


set_random_seed()


def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return weighted summary metrics.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted class labels.
    dataset_name : str
        Label used in the printed report header ("Train"/"Test").

    Returns
    -------
    dict
        Keys: ``accuracy``, ``precision``, ``recall``, ``f1_score``
        (all weighted-averaged) and ``confusion_matrix``.
    """
    print(f"\n{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm,
    }


def _fit_search_and_report(optimizer, model_name, X_train, y_train, X_test, y_test):
    """Fit a hyper-parameter search and evaluate its best estimator.

    Shared tail of every ``optimize_*`` function: fits the search object,
    prints the winning parameters, evaluates the refit best estimator on
    both splits, and frees transient memory.

    Returns
    -------
    tuple
        ``(best_params, train_metrics, test_metrics)``.
    """
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best {model_name} Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), "Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), "Test")
    # Drop references to per-fold clones created during the search.
    gc.collect()
    return best_params, train_metrics, test_metrics


def optimize_XGBoost(X_train, y_train, X_test, y_test):
    """Randomized search over an XGBoost multi-class classifier."""
    param_dist = {
        'max_depth': randint(3, 10),              # maximum tree depth
        'learning_rate': loguniform(1e-3, 0.2),   # per-tree contribution shrinkage
        'subsample': [0.6, 0.8, 1.0],             # row subsampling per iteration
        'colsample_bytree': [0.6, 0.8, 1.0],      # feature subsampling per tree
        'n_estimators': randint(100, 300),        # number of boosting rounds
        'min_child_weight': randint(1, 10),       # min sum of instance weight per leaf
        'gamma': [0, 0.1, 0.2],                   # min loss reduction to split
    }
    # NOTE(review): `gpu_hist`/`gpu_id`/`use_label_encoder` are deprecated in
    # XGBoost >= 2.0 (use `device='cuda'`, `tree_method='hist'`); kept here to
    # preserve behavior on the currently pinned version — confirm before upgrading.
    model = XGBClassifier(
        tree_method='gpu_hist',
        gpu_id=0,
        use_label_encoder=False,
        eval_metric='mlogloss',
        objective='multi:softmax',
        num_class=len(np.unique(y_train)),
    )
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,  # NOTE(review): parallel CV with a GPU model may contend for the device
        verbose=1,
        random_state=SEED,  # makes the sampled candidates reproducible
    )
    return _fit_search_and_report(optimizer, "XGBoost", X_train, y_train, X_test, y_test)


def optimize_LightGBM(X_train, y_train, X_test, y_test):
    """Randomized search over a LightGBM multi-class classifier."""
    param_dist = {
        'num_leaves': randint(20, 50),            # tree complexity
        'learning_rate': loguniform(1e-3, 0.2),   # per-tree contribution shrinkage
        'subsample': [0.6, 0.8, 1.0],             # row subsampling per iteration
        'colsample_bytree': [0.6, 0.8, 1.0],      # feature subsampling per tree
        'n_estimators': randint(100, 300),        # number of boosting rounds
        'min_child_samples': randint(10, 100),    # min samples per leaf
        'max_depth': [None, 3, 5, 7],             # maximum tree depth
    }
    model = lgb.LGBMClassifier(
        device_type='gpu',
        objective='multiclass',
        num_class=len(np.unique(y_train)),
    )
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED,
    )
    return _fit_search_and_report(optimizer, "LightGBM", X_train, y_train, X_test, y_test)


def optimize_CatBoost(X_train, y_train, X_test, y_test):
    """Randomized search over a CatBoost multi-class classifier."""
    param_dist = {
        'depth': randint(4, 8),                   # tree depth
        'learning_rate': loguniform(1e-3, 0.2),   # per-tree contribution shrinkage
        'l2_leaf_reg': randint(1, 10),            # L2 regularization coefficient
        'iterations': randint(100, 300),          # number of boosting rounds
        'border_count': [32, 64, 128],            # number of split candidates per feature
    }
    model = cb.CatBoostClassifier(
        task_type='GPU',
        verbose=0,
        loss_function='MultiClass',
    )
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED,
    )
    return _fit_search_and_report(optimizer, "CatBoost", X_train, y_train, X_test, y_test)


def optimize_SVM(X_train, y_train, X_test, y_test):
    """Randomized search over an SVC classifier."""
    param_dist = {
        'C': loguniform(1e-2, 10),       # regularization strength (inverse)
        'kernel': ['linear', 'rbf'],     # kernel function
        'gamma': loguniform(1e-4, 1e-1), # kernel coefficient
    }
    model = SVC(probability=True)
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED,
    )
    return _fit_search_and_report(optimizer, "SVM", X_train, y_train, X_test, y_test)


def optimize_KNN(X_train, y_train, X_test, y_test):
    """Exhaustive grid search over a KNN classifier."""
    param_grid = {
        'n_neighbors': list(range(3, 20, 2)),  # number of neighbors
        'weights': ['uniform', 'distance'],    # neighbor weighting scheme
        'p': [1, 2],                           # Minkowski distance order (1=Manhattan, 2=Euclidean)
    }
    model = KNeighborsClassifier(algorithm='brute')
    optimizer = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
    )
    return _fit_search_and_report(optimizer, "KNN", X_train, y_train, X_test, y_test)


def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    """Randomized search over a logistic-regression classifier."""
    param_grid = {
        'C': loguniform(1e-4, 1e2),      # inverse regularization strength
        'penalty': ['l2', None],         # regularization type
        'solver': ['lbfgs', 'sag', 'saga'],  # optimization algorithm
    }
    model = LogisticRegression(max_iter=1000, random_state=SEED)
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED,
    )
    return _fit_search_and_report(
        optimizer, "Logistic Regression", X_train, y_train, X_test, y_test
    )


def optimize_RF(X_train, y_train, X_test, y_test):
    """Randomized search over a random-forest classifier."""
    param_dist = {
        'n_estimators': randint(100, 300),     # number of trees
        'max_depth': [None, 3, 5, 7],          # maximum tree depth
        'min_samples_split': randint(2, 10),   # min samples to split an internal node
        'min_samples_leaf': randint(1, 10),    # min samples per leaf
        'bootstrap': [True, False],            # whether to bootstrap-sample rows
        'criterion': ['gini', 'entropy'],      # split quality criterion
    }
    model = RandomForestClassifier(random_state=SEED)
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=SEED,
    )
    return _fit_search_and_report(
        optimizer, "Random Forest", X_train, y_train, X_test, y_test
    )