236 lines
8.7 KiB
Python
236 lines
8.7 KiB
Python
import numpy as np
|
||
import pandas as pd
|
||
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
|
||
import sklearn.svm as svm
|
||
from sklearn.cross_decomposition import PLSRegression
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
|
||
from skopt import BayesSearchCV
|
||
from skopt.space import Real, Integer
|
||
from xgboost import XGBClassifier
|
||
import lightgbm as lgb
|
||
import catboost as cb
|
||
from sklearn.linear_model import LogisticRegression
|
||
from sklearn.model_selection import StratifiedKFold
|
||
from sklearn.neural_network import MLPClassifier
|
||
# Pin the global NumPy RNG so experiment runs are reproducible.
def set_random_seed(seed=42):
    """Seed NumPy's global random number generator with *seed* (default 42)."""
    np.random.seed(seed)


# Seed immediately at import time so everything below is deterministic.
set_random_seed()
|
||
|
||
# Cross-validation with multi-core support.
def cross_validate_model(model, X, y, cv=5, n_jobs=-1):
    """Run k-fold cross-validation, print a summary, and return the scores.

    Args:
        model: any scikit-learn estimator.
        X, y: feature matrix and labels.
        cv: number of folds (default 5).
        n_jobs: worker processes; -1 uses every available core.

    Returns:
        Array of per-fold scores from ``cross_val_score``.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    mean_acc = fold_scores.mean()
    std_acc = fold_scores.std()
    print(f"Cross-validation accuracy: {mean_acc:.4f} ± {std_acc:.4f}")
    return fold_scores
|
||
|
||
# Confusion matrix and classification report.
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return summary metrics as a dict.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        dataset_name: tag used in the printed header (e.g. "Train"/"Test").

    Returns:
        Dict with accuracy, weighted precision/recall/F1, and the
        confusion matrix.
    """
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))

    # Weighted averages account for class imbalance across labels.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }
    return metrics
|
||
|
||
# Model imports (SVM, KNN, Random Forest, XGBoost, LightGBM, CatBoost, Logistic Regression)
|
||
from sklearn.svm import SVC
|
||
from sklearn.neighbors import KNeighborsClassifier
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
import xgboost as xgb
|
||
import lightgbm as lgb
|
||
import catboost as cb
|
||
from sklearn.linear_model import LogisticRegression
|
||
from sklearn.model_selection import train_test_split
|
||
|
||
# 1. SVM Bayesian optimization
def optimize_SVM(X_train, y_train, X_test, y_test):
    """Tune an SVC with BayesSearchCV and evaluate the best estimator.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    search_space = {
        'C': (0.01, 10.0, 'uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': (1e-4, 1e-1, 'log-uniform'),
    }

    # f1_weighted scoring copes better with class imbalance than accuracy.
    search = BayesSearchCV(
        SVC(),
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    # Evaluate the refit best estimator on both splits.
    champion = search.best_estimator_
    train_metrics = evaluate_model(y_train, champion.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, champion.predict(X_test), dataset_name="Test")

    return search.best_params_, train_metrics, test_metrics
|
||
|
||
# 2. KNN Bayesian optimization
def optimize_KNN(X_train, y_train, X_test, y_test):
    """Tune a KNeighborsClassifier with BayesSearchCV and evaluate it.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    search_space = {
        'n_neighbors': (1, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }

    # f1_weighted scoring copes better with class imbalance than accuracy.
    tuner = BayesSearchCV(
        KNeighborsClassifier(),
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',
    )
    tuner.fit(X_train, y_train)

    # Score the refit best estimator on train and test splits.
    winner = tuner.best_estimator_
    pred_train = winner.predict(X_train)
    pred_test = winner.predict(X_test)

    metrics_train = evaluate_model(y_train, pred_train, dataset_name="Train")
    metrics_test = evaluate_model(y_test, pred_test, dataset_name="Test")

    return tuner.best_params_, metrics_train, metrics_test
|
||
|
||
# 3. XGBoost Bayesian optimization
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    """Tune an XGBClassifier with BayesSearchCV and evaluate the best model.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    param_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(1e-4, 1.0, prior='log-uniform'),
        'subsample': Real(0.1, 1.0),
        'colsample_bytree': Real(0.1, 1.0)
    }

    # random_state=42 added for reproducibility, consistent with the RF
    # (random_state=42) and CatBoost (random_seed=42) optimizers in this file.
    # NOTE(review): tree_method='gpu_hist' / gpu_id are deprecated in
    # XGBoost >= 2.0 (use device='cuda' with tree_method='hist') and this
    # hard-requires a GPU — confirm the target environment.
    model = XGBClassifier(tree_method='gpu_hist', gpu_id=0, random_state=42)
    # f1_weighted scoring copes better with class imbalance than accuracy.
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')

    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")

    return best_params, train_metrics, test_metrics
|
||
|
||
# 4. Random Forest Bayesian optimization
def optimize_RF(X_train, y_train, X_test, y_test):
    """Tune a RandomForestClassifier with BayesSearchCV and evaluate it.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    # FIX: 'auto' was removed from max_features in scikit-learn 1.3 (it had
    # been deprecated since 1.1) and made BayesSearchCV raise during fit.
    # For classifiers 'auto' was identical to 'sqrt', so dropping it does not
    # shrink the effective search space.
    param_space = {
        'n_estimators': (50, 500),
        'max_depth': (3, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        'max_features': ['sqrt', 'log2']
    }

    model = RandomForestClassifier(random_state=42)
    # f1_weighted scoring copes better with class imbalance than accuracy.
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_

    # Train and evaluate the model refit with the best hyper-parameters.
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")

    return best_params, train_metrics, test_metrics
|
||
|
||
# 5. CatBoost Bayesian optimization
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    """Tune a CatBoostClassifier with BayesSearchCV and evaluate it.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    search_space = {
        'iterations': (50, 500),
        'learning_rate': (0.01, 0.3, 'uniform'),
        'depth': (3, 10),
        'l2_leaf_reg': (1, 10, 'uniform'),
        'bagging_temperature': (0, 1, 'uniform'),
    }

    # verbose=0 silences CatBoost's per-iteration training log.
    clf = cb.CatBoostClassifier(task_type='GPU', random_seed=42, verbose=0)
    # f1_weighted scoring copes better with class imbalance than accuracy.
    tuner = BayesSearchCV(
        clf,
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',
    )
    tuner.fit(X_train, y_train)

    # Evaluate the refit best estimator on both splits.
    champion = tuner.best_estimator_
    metrics_train = evaluate_model(y_train, champion.predict(X_train), dataset_name="Train")
    metrics_test = evaluate_model(y_test, champion.predict(X_test), dataset_name="Test")

    return tuner.best_params_, metrics_train, metrics_test
|
||
|
||
# 6. Logistic Regression Bayesian optimization
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    """Tune a multinomial LogisticRegression with BayesSearchCV.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    # FIX: the previous single search space sampled invalid combinations —
    # 'lbfgs' supports only the l2 penalty, and 'liblinear' does not support
    # multi_class='multinomial' — which made scikit-learn raise during fit.
    # Splitting into compatible sub-spaces keeps BayesSearchCV from ever
    # drawing an invalid (penalty, solver) pair.
    param_space = [
        {
            'C': (1e-5, 1e5, 'log-uniform'),
            'penalty': ['l2'],
            'solver': ['lbfgs', 'saga'],
        },
        {
            'C': (1e-5, 1e5, 'log-uniform'),
            'penalty': ['l1'],
            'solver': ['saga'],  # saga is the only multinomial solver with l1
        },
    ]

    model = LogisticRegression(multi_class='multinomial', random_state=42)
    # f1_weighted scoring copes better with class imbalance than accuracy.
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_

    # Train and evaluate the model refit with the best hyper-parameters.
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")

    return best_params, train_metrics, test_metrics
|
||
|
||
# 7. Neural Network (ANN) Bayesian optimization
def optimize_ANN(X_train, y_train, X_test, y_test):
    """Tune an MLPClassifier with BayesSearchCV and evaluate the best model.

    Returns:
        (best_params, train_metrics, test_metrics)
    """
    search_space = {
        'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'sgd'],
        'alpha': (1e-5, 1e-1, 'log-uniform'),
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }

    # max_iter raised to 500 so the optimizer has room to converge.
    net = MLPClassifier(max_iter=500, random_state=42)
    # f1_weighted scoring copes better with class imbalance than accuracy.
    tuner = BayesSearchCV(
        net,
        search_space,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        verbose=0,
        scoring='f1_weighted',
    )
    tuner.fit(X_train, y_train)

    # Evaluate the refit best estimator on both splits.
    champion = tuner.best_estimator_
    pred_train = champion.predict(X_train)
    pred_test = champion.predict(X_test)

    metrics_train = evaluate_model(y_train, pred_train, dataset_name="Train")
    metrics_test = evaluate_model(y_test, pred_test, dataset_name="Test")

    return tuner.best_params_, metrics_train, metrics_test
|