import gc
import os

import numpy as np
import lightgbm as lgb
import catboost as cb
from scipy.stats import loguniform, randint
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


# Fix the random seed for reproducibility.
def set_random_seed(seed=42):
    np.random.seed(seed)
    # NOTE: PYTHONHASHSEED only takes effect if set before the interpreter
    # starts, so this line mainly covers subprocesses launched later.
    os.environ['PYTHONHASHSEED'] = str(seed)


set_random_seed()


# Performance evaluation: print a classification report and return key metrics.
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    print(f"\n{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))

    cm = confusion_matrix(y_true, y_pred)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm,
    }

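# Usage sketch for evaluate_model (toy labels, not from the original script):
#
#   metrics = evaluate_model([0, 1, 1], [0, 1, 0], "Demo")
#   metrics["accuracy"]   # -> 2/3: two of the three labels match
#   metrics["f1_score"]   # per-class F1 averaged with support weights

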
# Tune XGBoost.
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    param_dist = {
        'max_depth': randint(3, 10),             # maximum tree depth
        'learning_rate': loguniform(1e-3, 0.2),  # how much each tree contributes to the final result
        'subsample': [0.6, 0.8, 1.0],            # fraction of rows sampled for each boosting round
        'colsample_bytree': [0.6, 0.8, 1.0],     # fraction of features sampled when building each tree
        'n_estimators': randint(100, 300),       # number of trees
        'min_child_weight': randint(1, 10),      # minimum sum of instance weights in a leaf
        'gamma': [0, 0.1, 0.2]                   # minimum loss reduction required to split a node
    }

    # tree_method='gpu_hist', gpu_id and use_label_encoder target XGBoost 1.x;
    # on XGBoost >= 2.0 use tree_method='hist' with device='cuda' instead.
    model = XGBClassifier(
        tree_method='gpu_hist',
        gpu_id=0,
        use_label_encoder=False,
        eval_metric='mlogloss',
        objective='multi:softmax',
        num_class=len(np.unique(y_train))
    )

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best XGBoost Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()
    return best_params, train_metrics, test_metrics


# Tune LightGBM.
def optimize_LightGBM(X_train, y_train, X_test, y_test):
    param_dist = {
        'num_leaves': randint(20, 50),           # controls tree complexity
        'learning_rate': loguniform(1e-3, 0.2),  # how much each tree contributes to the final result
        'subsample': [0.6, 0.8, 1.0],            # fraction of rows sampled each iteration
        'colsample_bytree': [0.6, 0.8, 1.0],     # fraction of features sampled when building each tree
        'n_estimators': randint(100, 300),       # number of trees
        'min_child_samples': randint(10, 100),   # minimum number of samples in a leaf
        'max_depth': [-1, 3, 5, 7]               # maximum tree depth; LightGBM uses -1, not None, for "no limit"
    }

    model = lgb.LGBMClassifier(
        device_type='gpu',  # requires a GPU-enabled LightGBM build
        objective='multiclass',
        num_class=len(np.unique(y_train))
    )

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best LightGBM Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()
    return best_params, train_metrics, test_metrics


# Tune CatBoost.
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    param_dist = {
        'depth': randint(4, 8),                  # tree depth
        'learning_rate': loguniform(1e-3, 0.2),  # how much each tree contributes to the final result
        'l2_leaf_reg': randint(1, 10),           # L2 regularization coefficient
        'iterations': randint(100, 300),         # number of trees
        'border_count': [32, 64, 128]            # number of split candidates per feature
    }

    model = cb.CatBoostClassifier(
        task_type='GPU',
        verbose=0,
        loss_function='MultiClass'
    )

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best CatBoost Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    # CatBoost returns MultiClass predictions as an (n, 1) column vector;
    # flatten so the metric helpers receive 1-D label arrays.
    y_train_pred = best_model.predict(X_train).ravel()
    y_test_pred = best_model.predict(X_test).ravel()

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()
    return best_params, train_metrics, test_metrics


# Tune SVM.
def optimize_SVM(X_train, y_train, X_test, y_test):
    param_dist = {
        'C': loguniform(1e-2, 10),       # penalty (regularization) parameter
        'kernel': ['linear', 'rbf'],     # kernel function
        'gamma': loguniform(1e-4, 1e-1)  # kernel coefficient (ignored when kernel='linear')
    }

    # probability=True enables predict_proba but slows training; it is not
    # strictly needed for the label predictions used below.
    model = SVC(probability=True)

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best SVM Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    return best_params, train_metrics, test_metrics


# Tune KNN.
def optimize_KNN(X_train, y_train, X_test, y_test):
    param_grid = {
        'n_neighbors': list(range(3, 20, 2)),  # number of neighbors
        'weights': ['uniform', 'distance'],    # neighbor weighting scheme
        'p': [1, 2]                            # distance metric (1 = Manhattan, 2 = Euclidean)
    }

    model = KNeighborsClassifier(algorithm='brute')

    optimizer = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best KNN Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    return best_params, train_metrics, test_metrics


# Tune LogisticRegression.
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    param_dist = {
        'C': loguniform(1e-4, 1e2),         # inverse of regularization strength
        'penalty': ['l2', None],            # regularization type (None requires scikit-learn >= 1.2; older versions use 'none')
        'solver': ['lbfgs', 'sag', 'saga']  # optimization algorithm
    }

    model = LogisticRegression(max_iter=1000, random_state=42)

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best Logistic Regression Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    return best_params, train_metrics, test_metrics


# Tune RandomForest.
def optimize_RF(X_train, y_train, X_test, y_test):
    param_dist = {
        'n_estimators': randint(100, 300),    # number of trees
        'max_depth': [None, 3, 5, 7],         # maximum tree depth (None = unlimited)
        'min_samples_split': randint(2, 10),  # minimum samples required to split an internal node
        'min_samples_leaf': randint(1, 10),   # minimum samples required at a leaf node
        'bootstrap': [True, False],           # whether to use bootstrap sampling
        'criterion': ['gini', 'entropy']      # split quality criterion
    }

    model = RandomForestClassifier(random_state=42)

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best Random Forest Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    return best_params, train_metrics, test_metrics
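

# --- Usage sketch (not part of the original script) ---
# Drives the tuners on a synthetic multi-class dataset so the file can be run
# end to end. make_classification and the split settings below are
# illustrative assumptions; substitute your own features and labels.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        n_classes=3,
        random_state=42,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # CPU-only tuners; the GPU-backed ones (optimize_XGBoost, optimize_LightGBM,
    # optimize_CatBoost) can be added to this list on a machine with CUDA set up.
    for name, tuner in [
        ("RandomForest", optimize_RF),
        ("KNN", optimize_KNN),
        ("LogisticRegression", optimize_LogisticRegression),
    ]:
        print(f"\n===== {name} =====")
        best_params, train_metrics, test_metrics = tuner(X_train, y_train, X_test, y_test)
        print(f"{name} weighted F1 (test): {test_metrics['f1_score']:.4f}")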