# micro_plastic/classification_model/Classification/ClassicCls_网格搜索.py
import gc
import os

import numpy as np
from scipy.stats import loguniform, randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold, train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
import catboost as cb


# Fix random seeds for reproducibility (seeds NumPy and Python hashing only;
# the model libraries take their own random_state/seed parameters)
def set_random_seed(seed=42):
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


set_random_seed()


# Performance evaluation: print a classification report and return the
# weighted-average metrics plus the confusion matrix
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    print(f"\n{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": cm
    }


# Tune XGBoost with randomized search
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    param_dist = {
        'max_depth': randint(3, 10),             # maximum depth of each tree
        'learning_rate': loguniform(1e-3, 0.2),  # shrinks each tree's contribution to the final prediction
        'subsample': [0.6, 0.8, 1.0],            # fraction of samples drawn per boosting round
        'colsample_bytree': [0.6, 0.8, 1.0],     # fraction of features sampled when building each tree
        'n_estimators': randint(100, 300),       # number of trees
        'min_child_weight': randint(1, 10),      # minimum sum of instance weights in a child node
        'gamma': [0, 0.1, 0.2]                   # minimum loss reduction required to make a split
    }
    model = XGBClassifier(
        tree_method='gpu_hist',  # XGBoost 1.x GPU syntax; on 2.x use tree_method='hist', device='cuda'
        gpu_id=0,
        use_label_encoder=False,  # only needed on older XGBoost releases
        eval_metric='mlogloss',
        objective='multi:softmax',
        num_class=len(np.unique(y_train))
    )
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # make the sampled search reproducible
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best XGBoost Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    gc.collect()
    return best_params, train_metrics, test_metrics


# Tune LightGBM with randomized search
def optimize_LightGBM(X_train, y_train, X_test, y_test):
    param_dist = {
        'num_leaves': randint(20, 50),           # controls tree complexity
        'learning_rate': loguniform(1e-3, 0.2),  # shrinks each tree's contribution to the final prediction
        'subsample': [0.6, 0.8, 1.0],            # fraction of samples drawn per boosting round
        'colsample_bytree': [0.6, 0.8, 1.0],     # fraction of features sampled when building each tree
        'n_estimators': randint(100, 300),       # number of trees
        'min_child_samples': randint(10, 100),   # minimum number of samples per leaf
        'max_depth': [-1, 3, 5, 7]               # maximum tree depth (-1 = no limit; LightGBM does not accept None)
    }
    model = lgb.LGBMClassifier(
        device_type='gpu',  # requires a GPU-enabled LightGBM build
        objective='multiclass',
        num_class=len(np.unique(y_train))
    )
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # make the sampled search reproducible
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best LightGBM Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    gc.collect()
    return best_params, train_metrics, test_metrics


# Tune CatBoost with randomized search
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    param_dist = {
        'depth': randint(4, 8),                  # tree depth
        'learning_rate': loguniform(1e-3, 0.2),  # shrinks each tree's contribution to the final prediction
        'l2_leaf_reg': randint(1, 10),           # L2 regularization coefficient
        'iterations': randint(100, 300),         # number of trees
        'border_count': [32, 64, 128]            # number of splits for numerical features
    }
    model = cb.CatBoostClassifier(
        task_type='GPU',  # requires a CUDA device
        verbose=0,
        loss_function='MultiClass'
    )
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # make the sampled search reproducible
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best CatBoost Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    gc.collect()
    return best_params, train_metrics, test_metrics


# Tune SVM with randomized search
def optimize_SVM(X_train, y_train, X_test, y_test):
    param_dist = {
        'C': loguniform(1e-2, 10),       # regularization (penalty) parameter
        'kernel': ['linear', 'rbf'],     # kernel function
        'gamma': loguniform(1e-4, 1e-1)  # kernel coefficient (ignored when kernel='linear')
    }
    model = SVC(probability=True)
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # make the sampled search reproducible
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best SVM Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    return best_params, train_metrics, test_metrics


# Tune KNN with an exhaustive grid search (the grid is small enough)
def optimize_KNN(X_train, y_train, X_test, y_test):
    param_grid = {
        'n_neighbors': list(range(3, 20, 2)),  # number of neighbors
        'weights': ['uniform', 'distance'],    # weight function
        'p': [1, 2]                            # Minkowski power (1 = Manhattan, 2 = Euclidean)
    }
    model = KNeighborsClassifier(algorithm='brute')
    optimizer = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best KNN Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    return best_params, train_metrics, test_metrics


# Tune LogisticRegression with randomized search
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    param_dist = {
        'C': loguniform(1e-4, 1e2),       # inverse regularization strength (ignored when penalty is None)
        'penalty': ['l2', None],          # regularization type (None requires scikit-learn >= 1.2; 'none' on older versions)
        'solver': ['lbfgs', 'sag', 'saga']  # optimization algorithm
    }
    model = LogisticRegression(max_iter=1000, random_state=42)
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # make the sampled search reproducible
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best Logistic Regression Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    return best_params, train_metrics, test_metrics


# Tune RandomForest with randomized search
def optimize_RF(X_train, y_train, X_test, y_test):
    param_dist = {
        'n_estimators': randint(100, 300),     # number of trees
        'max_depth': [None, 3, 5, 7],          # maximum tree depth (None = unlimited)
        'min_samples_split': randint(2, 10),   # minimum samples required to split an internal node
        'min_samples_leaf': randint(1, 10),    # minimum samples required at a leaf node
        'bootstrap': [True, False],            # whether to use bootstrap sampling
        'criterion': ['gini', 'entropy']       # split quality criterion
    }
    model = RandomForestClassifier(random_state=42)
    optimizer = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=StratifiedKFold(n_splits=3),
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
        random_state=42  # make the sampled search reproducible
    )
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    print(f"Best Random Forest Hyperparameters: {best_params}")
    best_model = optimizer.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    train_metrics = evaluate_model(y_train, y_train_pred, "Train")
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    return best_params, train_metrics, test_metrics
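

# Usage sketch: a minimal __main__ driver showing how the optimizers above are
# intended to be called. The make_classification data and the 80/20 stratified
# split below are placeholder assumptions, not the project's real micro-plastic
# dataset; the GPU-configured boosters (XGBoost/LightGBM/CatBoost) additionally
# assume a CUDA device is available.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    # Placeholder data: 1000 samples, 20 features, 4 classes (assumption)
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                               n_classes=4, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    for name, optimize in [
        ("XGBoost", optimize_XGBoost),
        ("LightGBM", optimize_LightGBM),
        ("CatBoost", optimize_CatBoost),
        ("SVM", optimize_SVM),
        ("KNN", optimize_KNN),
        ("LogisticRegression", optimize_LogisticRegression),
        ("RandomForest", optimize_RF),
    ]:
        best_params, train_metrics, test_metrics = optimize(
            X_train, y_train, X_test, y_test)
        print(f"{name}: test weighted F1 = {test_metrics['f1_score']:.4f}")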