import numpy as np
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             precision_score, recall_score, f1_score)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from xgboost import XGBClassifier
import catboost as cb

# Fix the random seed for reproducibility (this seeds NumPy only; the
# models below pin their own seeds via random_state/random_seed)
def set_random_seed(seed=42):
    np.random.seed(seed)

set_random_seed()

# Cross-validation (multi-core)
def cross_validate_model(model, X, y, cv=5, n_jobs=-1):
    """Run cross-validation in parallel and report mean accuracy."""
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    print(f"Cross-validation accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
    return scores
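
# Example call (hypothetical variable names; cross_validate_model is not
# invoked elsewhere in this file):
#   scores = cross_validate_model(SVC(), X_train, y_train, cv=5)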

# Confusion matrix and classification report
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return a dict of performance metrics."""
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Return several metrics, including the weighted F1 score
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm
    }
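
# Shape of the returned structure (hypothetical labels, for illustration only):
#   metrics = evaluate_model([0, 1, 1, 0], [0, 1, 0, 0], dataset_name="Demo")
#   metrics["f1_score"]           # weighted F1
#   metrics["confusion_matrix"]   # 2x2 ndarray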

# 1. SVM with Bayesian optimization
def optimize_SVM(X_train, y_train, X_test, y_test):
    param_space = {
        'C': Real(0.01, 10.0, prior='uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': Real(1e-4, 1e-1, prior='log-uniform')
    }
    model = SVC()
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics

# 2. KNN with Bayesian optimization
def optimize_KNN(X_train, y_train, X_test, y_test):
    param_space = {
        'n_neighbors': Integer(1, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }
    model = KNeighborsClassifier()
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics

# 3. XGBoost with Bayesian optimization
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    param_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(1e-4, 1.0, prior='log-uniform'),
        'subsample': Real(0.1, 1.0),
        'colsample_bytree': Real(0.1, 1.0)
    }
    # Assumes a GPU-enabled XGBoost 1.x build; on XGBoost >= 2.0 use
    # tree_method='hist', device='cuda' instead of gpu_hist/gpu_id
    model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics

# 4. Random Forest with Bayesian optimization
def optimize_RF(X_train, y_train, X_test, y_test):
    param_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 15),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 20),
        # 'auto' was removed in scikit-learn 1.3 (it was an alias for 'sqrt')
        'max_features': ['sqrt', 'log2']
    }
    model = RandomForestClassifier(random_state=42)
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics

# 5. CatBoost with Bayesian optimization
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    param_space = {
        'iterations': Integer(50, 500),
        'learning_rate': Real(0.01, 0.3, prior='uniform'),
        'depth': Integer(3, 10),
        'l2_leaf_reg': Real(1.0, 10.0, prior='uniform'),
        # Real, not the tuple (0, 1, 'uniform'): integer bounds would make
        # skopt sample only the endpoints 0 and 1
        'bagging_temperature': Real(0.0, 1.0, prior='uniform')
    }
    # Assumes a CUDA device is available; with a single GPU, consider
    # n_jobs=1 in BayesSearchCV to avoid contention between parallel fits
    model = cb.CatBoostClassifier(task_type='GPU', random_seed=42, verbose=0)
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics

# 6. Logistic Regression with Bayesian optimization
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    param_space = {
        'C': Real(1e-5, 1e5, prior='log-uniform'),
        'penalty': ['l1', 'l2'],
        # saga is the only solver compatible with both penalties here:
        # lbfgs does not support l1, and liblinear does not support the
        # multinomial loss, so sampling them would make fits fail
        'solver': ['saga']
    }
    # max_iter raised from the default 100 so saga can converge
    model = LogisticRegression(multi_class='multinomial', max_iter=5000,
                               random_state=42)
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics

# 7. Neural network (MLP) with Bayesian optimization
def optimize_ANN(X_train, y_train, X_test, y_test):
    param_space = {
        'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'sgd'],
        'alpha': Real(1e-5, 1e-1, prior='log-uniform'),
        'learning_rate': ['constant', 'invscaling', 'adaptive']
    }
    model = MLPClassifier(max_iter=500, random_state=42)
    # Weighted F1 as the selection metric, to account for class imbalance
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the model refit with the best hyperparameters
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, dataset_name="Train")
    test_metrics = evaluate_model(y_test, y_test_pred, dataset_name="Test")
    return best_params, train_metrics, test_metrics
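

# Minimal usage sketch (an addition for illustration, not part of the original
# pipeline): build a synthetic multi-class problem, split it, and run one of
# the optimizers above. KNN is used here because it needs no GPU; any other
# optimize_* function can be swapped in the same way.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=500, n_features=20, n_classes=3,
                               n_informative=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)
    best_params, train_metrics, test_metrics = optimize_KNN(
        X_train, y_train, X_test, y_test)
    print("Best hyperparameters:", best_params)
    print("Test weighted F1:", test_metrics["f1_score"])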