初始提交
This commit is contained in:
306
classification_model/Classification/ClassicCls_网格搜索.py
Normal file
306
classification_model/Classification/ClassicCls_网格搜索.py
Normal file
@ -0,0 +1,306 @@
|
||||
import gc
import os
import random

import numpy as np
from scipy.stats import loguniform, randint

import catboost as cb
import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
|
||||
|
||||
|
||||
# Fix random seeds for reproducibility.
def set_random_seed(seed=42):
    """Seed the random number generators this script relies on.

    Args:
        seed: integer seed applied to Python's ``random`` module and
            NumPy's global RNG. Defaults to 42.
    """
    # Original version only seeded NumPy; the stdlib RNG was left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): setting PYTHONHASHSEED after interpreter startup does NOT
    # affect str-hash randomization of the current process; it only propagates
    # to child processes that inherit the environment. Kept for that purpose.
    os.environ['PYTHONHASHSEED'] = str(seed)
|
||||
|
||||
|
||||
# Apply the default seed (42) once at module import time so every
# optimizer below starts from the same global RNG state.
set_random_seed()
|
||||
|
||||
|
||||
# Model performance evaluation helper.
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return summary metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        dataset_name: label used in the printed report header.

    Returns:
        dict with weighted precision/recall/f1, accuracy, and the
        confusion matrix.
    """
    print(f"\n{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }
    return metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for XGBoost.
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    """Tune an XGBoost classifier with randomized search and evaluate it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    param_dist = {
        'max_depth': randint(3, 10),              # maximum tree depth
        'learning_rate': loguniform(1e-3, 0.2),   # per-tree contribution to the ensemble
        'subsample': [0.6, 0.8, 1.0],             # row subsampling per boosting round
        'colsample_bytree': [0.6, 0.8, 1.0],      # feature subsampling per tree
        'n_estimators': randint(100, 300),        # number of trees
        'min_child_weight': randint(1, 10),       # min sum of instance weight in a leaf
        'gamma': [0, 0.1, 0.2]                    # min loss reduction required to split
    }

    # NOTE(review): tree_method='gpu_hist'/gpu_id and use_label_encoder are
    # deprecated in recent xgboost releases (>=2.0 uses device='cuda' and the
    # label-encoder flag was removed); kept for the pinned version — confirm.
    model = XGBClassifier(
        tree_method='gpu_hist',
        gpu_id=0,
        use_label_encoder=False,
        eval_metric='mlogloss',
        objective='multi:softmax',
        num_class=len(np.unique(y_train)),
        random_state=42  # fix: reproducible tree construction
    )

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # fix: parameter sampling was not seeded
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best XGBoost Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()  # release fitted CV estimators before the next search
    return best_params, train_metrics, test_metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for LightGBM.
def optimize_LightGBM(X_train, y_train, X_test, y_test):
    """Tune a LightGBM classifier with randomized search and evaluate it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    param_dist = {
        'num_leaves': randint(20, 50),            # controls tree complexity
        'learning_rate': loguniform(1e-3, 0.2),   # per-tree contribution to the ensemble
        'subsample': [0.6, 0.8, 1.0],             # row subsampling per iteration
        'colsample_bytree': [0.6, 0.8, 1.0],      # feature subsampling per tree
        'n_estimators': randint(100, 300),        # number of trees
        'min_child_samples': randint(10, 100),    # min samples per leaf
        'max_depth': [None, 3, 5, 7]              # maximum tree depth (None = unlimited)
    }

    model = lgb.LGBMClassifier(
        device_type='gpu',
        objective='multiclass',
        num_class=len(np.unique(y_train)),
        random_state=42  # fix: reproducible tree construction
    )

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # fix: parameter sampling was not seeded
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best LightGBM Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()  # release fitted CV estimators before the next search
    return best_params, train_metrics, test_metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for CatBoost.
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    """Tune a CatBoost classifier with randomized search and evaluate it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    param_dist = {
        'depth': randint(4, 8),                   # tree depth
        'learning_rate': loguniform(1e-3, 0.2),   # per-tree contribution to the ensemble
        'l2_leaf_reg': randint(1, 10),            # L2 regularization coefficient
        'iterations': randint(100, 300),          # number of trees
        'border_count': [32, 64, 128]             # number of feature split candidates
    }

    # NOTE(review): task_type='GPU' combined with n_jobs=-1 in the CV search
    # can oversubscribe a single GPU across parallel fits — confirm on target
    # hardware.
    model = cb.CatBoostClassifier(
        task_type='GPU',
        verbose=0,
        loss_function='MultiClass',
        random_seed=42  # fix: reproducible tree construction
    )

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # fix: parameter sampling was not seeded
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best CatBoost Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()  # release fitted CV estimators before the next search
    return best_params, train_metrics, test_metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for SVM.
def optimize_SVM(X_train, y_train, X_test, y_test):
    """Tune an SVM classifier with randomized search and evaluate it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    param_dist = {
        'C': loguniform(1e-2, 10),        # regularization strength (inverse)
        'kernel': ['linear', 'rbf'],      # kernel function
        'gamma': loguniform(1e-4, 1e-1)   # kernel coefficient (ignored for 'linear')
    }

    # fix: probability=True runs internal CV (Platt scaling), which is
    # non-deterministic without random_state.
    model = SVC(probability=True, random_state=42)

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # fix: parameter sampling was not seeded
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best SVM Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()  # consistency: boosting optimizers free CV estimators here too
    return best_params, train_metrics, test_metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for KNN.
def optimize_KNN(X_train, y_train, X_test, y_test):
    """Grid-search a brute-force KNN classifier and evaluate it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    search_space = {
        'n_neighbors': list(range(3, 20, 2)),  # odd neighbourhood sizes 3..19
        'weights': ['uniform', 'distance'],    # vote weighting scheme
        'p': [1, 2]                            # Minkowski power (1=Manhattan, 2=Euclidean)
    }

    knn = KNeighborsClassifier(algorithm='brute')

    searcher = GridSearchCV(
        knn,
        param_grid=search_space,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
    )
    searcher.fit(X_train, y_train)

    best_params = searcher.best_params_
    print(f"Best KNN Hyperparameters: {best_params}")
    best_model = searcher.best_estimator_

    train_pred = best_model.predict(X_train)
    test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, train_pred, "Train")
    test_metrics = evaluate_model(y_test, test_pred, "Test")

    return best_params, train_metrics, test_metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for LogisticRegression.
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    """Tune a logistic-regression classifier with randomized search.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    # Renamed from param_grid: these are distributions for RandomizedSearchCV,
    # matching the naming used by the other optimizers in this file.
    param_dist = {
        'C': loguniform(1e-4, 1e2),        # inverse regularization strength
                                           # (ignored when penalty is None)
        'penalty': ['l2', None],           # NOTE(review): penalty=None requires
                                           # sklearn >= 1.2 ('none' before) — confirm
        'solver': ['lbfgs', 'sag', 'saga']  # optimization algorithm
    }

    model = LogisticRegression(max_iter=1000, random_state=42)

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # fix: parameter sampling was not seeded
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best Logistic Regression Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    return best_params, train_metrics, test_metrics
|
||||
|
||||
|
||||
# Hyperparameter optimization for RandomForest.
def optimize_RF(X_train, y_train, X_test, y_test):
    """Tune a random-forest classifier with randomized search and evaluate it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (best_params, train_metrics, test_metrics); the metric dicts come
        from ``evaluate_model``.
    """
    param_dist = {
        'n_estimators': randint(100, 300),       # number of trees
        'max_depth': [None, 3, 5, 7],            # maximum tree depth (None = unlimited)
        'min_samples_split': randint(2, 10),     # min samples to split an internal node
        'min_samples_leaf': randint(1, 10),      # min samples per leaf
        'bootstrap': [True, False],              # bootstrap sampling of rows
        'criterion': ['gini', 'entropy']         # split quality criterion
    }

    model = RandomForestClassifier(random_state=42)

    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # fix: parameter sampling was not seeded
    )
    optimizer.fit(X_train, y_train)

    best_params = optimizer.best_params_
    print(f"Best Random Forest Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    gc.collect()  # consistency: boosting optimizers free CV estimators here too
    return best_params, train_metrics, test_metrics
|
||||
Reference in New Issue
Block a user