Initial commit
This commit is contained in:
235
classification_model/Classification/ClassicClsHY.py
Normal file
235
classification_model/Classification/ClassicClsHY.py
Normal file
@ -0,0 +1,235 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
|
||||
import sklearn.svm as svm
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
|
||||
from skopt import BayesSearchCV
|
||||
from skopt.space import Real, Integer
|
||||
from xgboost import XGBClassifier
|
||||
import lightgbm as lgb
|
||||
import catboost as cb
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
# Pin the global NumPy RNG so runs are reproducible.
def set_random_seed(seed=42):
    """Seed NumPy's global random number generator.

    Args:
        seed: integer seed value (default 42).
    """
    np.random.seed(seed)


# Seed once at import time so everything below is deterministic.
set_random_seed()
|
||||
# K-fold cross-validation with multi-core support.
def cross_validate_model(model, X, y, cv=5, n_jobs=-1):
    """Run parallel k-fold cross-validation and print the mean accuracy.

    Args:
        model: scikit-learn estimator implementing fit/predict.
        X: feature matrix.
        y: target labels.
        cv: number of folds (default 5).
        n_jobs: parallel workers; -1 uses every core.

    Returns:
        Array of per-fold scores.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    mean, std = fold_scores.mean(), fold_scores.std()
    print(f"Cross-validation accuracy: {mean:.4f} ± {std:.4f}")
    return fold_scores
|
||||
# Confusion matrix and classification report.
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return summary metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        dataset_name: split name used in the printed header.

    Returns:
        dict with accuracy, weighted precision/recall/F1, and the
        confusion matrix.
    """
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))

    # Weighted averages so multi-class imbalance is accounted for.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }
    return metrics
||||
|
||||
# 逻辑回归模型 (Logistic Regression)
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
import xgboost as xgb
|
||||
import lightgbm as lgb
|
||||
import catboost as cb
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
# 1. Bayesian hyper-parameter optimization for SVM.
def optimize_SVM(X_train, y_train, X_test, y_test):
    """Tune an SVC via Bayesian search and evaluate on both splits.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.
    """
    search_space = {
        'C': (0.01, 10.0, 'uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': (1e-4, 1e-1, 'log-uniform'),
    }

    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        SVC(), search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return search.best_params_, train_metrics, test_metrics
|
||||
# 2. Bayesian hyper-parameter optimization for KNN.
def optimize_KNN(X_train, y_train, X_test, y_test):
    """Tune a KNeighborsClassifier via Bayesian search and evaluate it.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.
    """
    search_space = {
        'n_neighbors': (1, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }

    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        KNeighborsClassifier(), search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return search.best_params_, train_metrics, test_metrics
|
||||
# 3. Bayesian hyper-parameter optimization for XGBoost.
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    """Tune an XGBClassifier via Bayesian search and evaluate it.

    Args:
        X_train, y_train: training data (labels assumed 0..n_classes-1).
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.
    """
    search_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(1e-4, 1.0, prior='log-uniform'),
        'subsample': Real(0.1, 1.0),
        'colsample_bytree': Real(0.1, 1.0),
    }

    # NOTE(review): 'gpu_hist'/'gpu_id' are the pre-2.0 XGBoost GPU flags
    # (newer releases use tree_method='hist', device='cuda') — confirm the
    # installed version, and that a GPU is actually present.
    estimator = XGBClassifier(tree_method='gpu_hist', gpu_id=0)

    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        estimator, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return search.best_params_, train_metrics, test_metrics
|
||||
# 4. Bayesian hyper-parameter optimization for Random Forest.
def optimize_RF(X_train, y_train, X_test, y_test):
    """Tune a RandomForestClassifier via Bayesian search and evaluate it.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.
    """
    search_space = {
        'n_estimators': (50, 500),
        'max_depth': (3, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        # FIX: max_features='auto' was removed for classifiers in
        # scikit-learn 1.3 (it was an alias for 'sqrt'); keeping it made
        # the search raise on modern sklearn. Valid string options remain.
        'max_features': ['sqrt', 'log2'],
    }

    model = RandomForestClassifier(random_state=42)
    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        model, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    best_params = search.best_params_

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return best_params, train_metrics, test_metrics
|
||||
# 5. Bayesian hyper-parameter optimization for CatBoost.
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    """Tune a CatBoostClassifier via Bayesian search and evaluate it.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.
    """
    search_space = {
        'iterations': (50, 500),
        'learning_rate': (0.01, 0.3, 'uniform'),
        'depth': (3, 10),
        'l2_leaf_reg': (1, 10, 'uniform'),
        'bagging_temperature': (0, 1, 'uniform'),
    }

    # NOTE(review): task_type='GPU' combined with BayesSearchCV's
    # n_jobs=-1 may oversubscribe a single GPU — confirm on target host.
    estimator = cb.CatBoostClassifier(task_type='GPU', random_seed=42, verbose=0)

    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        estimator, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return search.best_params_, train_metrics, test_metrics
|
||||
# 6. Bayesian hyper-parameter optimization for Logistic Regression.
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    """Tune a multinomial LogisticRegression via Bayesian search.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.

    FIX: the original flat search space paired every penalty with every
    solver, producing invalid candidates — 'lbfgs' does not support
    penalty='l1', and 'liblinear' is incompatible with
    multi_class='multinomial' — so many search iterations raised.
    The space is now a list of mutually valid sub-spaces.
    """
    search_spaces = [
        # saga supports both l1 and l2 under the multinomial loss.
        {
            'C': (1e-5, 1e5, 'log-uniform'),
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
        },
        # lbfgs is l2-only.
        {
            'C': (1e-5, 1e5, 'log-uniform'),
            'penalty': ['l2'],
            'solver': ['lbfgs'],
        },
    ]

    # max_iter raised from the default 100: saga frequently fails to
    # converge within 100 iterations on unscaled data.
    model = LogisticRegression(multi_class='multinomial', random_state=42,
                               max_iter=1000)
    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        model, search_spaces,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    best_params = search.best_params_

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return best_params, train_metrics, test_metrics
|
||||
# 7. Bayesian hyper-parameter optimization for a neural network (MLP).
def optimize_ANN(X_train, y_train, X_test, y_test):
    """Tune an MLPClassifier via Bayesian search and evaluate it.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out evaluation data.

    Returns:
        (best_params, train_metrics, test_metrics) — metric dicts come
        from evaluate_model.
    """
    search_space = {
        'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'sgd'],
        'alpha': (1e-5, 1e-1, 'log-uniform'),
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }

    estimator = MLPClassifier(max_iter=500, random_state=42)
    # f1_weighted scoring copes with class imbalance better than accuracy.
    search = BayesSearchCV(
        estimator, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)

    # BayesSearchCV refits the best estimator on the full training set.
    best_model = search.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")

    return search.best_params_, train_metrics, test_metrics
Reference in New Issue
Block a user