初始提交

This commit is contained in:
2026-02-25 09:42:51 +08:00
parent c25276c481
commit d84d886f35
182 changed files with 18438 additions and 0 deletions

View File

@ -0,0 +1,235 @@
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
import sklearn.svm as svm
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from xgboost import XGBClassifier
import lightgbm as lgb
import catboost as cb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
# Fix global random seeds for reproducibility.
def set_random_seed(seed=42):
    """Seed the global RNGs so experiment runs are reproducible.

    Seeds NumPy's legacy global RNG (which scikit-learn falls back to when
    no explicit random_state is given) and, generalizing the original,
    Python's stdlib ``random`` module as well.

    Parameters
    ----------
    seed : int, default 42
        Seed applied to every global RNG.
    """
    import random  # local import: leaves the file's top-level imports untouched
    random.seed(seed)
    np.random.seed(seed)


# Seed immediately at import time, as the original script did.
set_random_seed()
# Cross-validation (multi-core capable).
def cross_validate_model(model, X, y, cv=5, n_jobs=-1):
    """Run parallel k-fold cross-validation and print mean ± std accuracy.

    Returns the array of per-fold scores.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
    mean, std = fold_scores.mean(), fold_scores.std()
    print(f"Cross-validation accuracy: {mean:.4f} ± {std:.4f}")
    return fold_scores
# Confusion matrix and classification report.
def evaluate_model(y_true, y_pred, dataset_name="Test"):
    """Print a classification report and return a dict of weighted metrics.

    The returned dict carries accuracy, weighted precision/recall/F1 and
    the raw confusion matrix under "confusion_matrix".
    """
    print(f"{dataset_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    # Assemble every metric in one place; 'weighted' averaging accounts
    # for class imbalance.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }
    return metrics
# 逻辑回归模型 (Logistic Regression)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# 1. SVM Bayesian optimization
def optimize_SVM(X_train, y_train, X_test, y_test):
    """Tune an SVC with Bayesian search and evaluate on both splits.

    Returns (best_params, train_metrics, test_metrics).
    """
    search_space = {
        'C': (0.01, 10.0, 'uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': (1e-4, 1e-1, 'log-uniform'),
    }
    # Weighted F1 as the tuning objective, consistent with the other optimizers.
    search = BayesSearchCV(
        SVC(), search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)
    # Evaluate the refit best estimator on train and test.
    tuned = search.best_estimator_
    train_metrics = evaluate_model(y_train, tuned.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, tuned.predict(X_test), dataset_name="Test")
    return search.best_params_, train_metrics, test_metrics
# 2. KNN Bayesian optimization
def optimize_KNN(X_train, y_train, X_test, y_test):
    """Tune a KNeighborsClassifier with Bayesian search and evaluate on both splits.

    Returns (best_params, train_metrics, test_metrics).
    """
    search_space = {
        'n_neighbors': (1, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }
    # Weighted F1 as the tuning objective.
    search = BayesSearchCV(
        KNeighborsClassifier(), search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)
    tuned = search.best_estimator_
    train_metrics = evaluate_model(y_train, tuned.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, tuned.predict(X_test), dataset_name="Test")
    return search.best_params_, train_metrics, test_metrics
# 3. XGBoost Bayesian optimization
def optimize_XGBoost(X_train, y_train, X_test, y_test):
    """Tune an XGBClassifier (GPU histogram method) with Bayesian search.

    Returns (best_params, train_metrics, test_metrics).
    """
    search_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(1e-4, 1.0, prior='log-uniform'),
        'subsample': Real(0.1, 1.0),
        'colsample_bytree': Real(0.1, 1.0),
    }
    # NOTE(review): 'gpu_hist'/'gpu_id' are deprecated in XGBoost >= 2.0
    # (replaced by tree_method='hist', device='cuda') — confirm the pinned
    # xgboost version before changing. Also hard-requires a CUDA device.
    estimator = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    # Weighted F1 as the tuning objective.
    search = BayesSearchCV(
        estimator, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)
    tuned = search.best_estimator_
    train_metrics = evaluate_model(y_train, tuned.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, tuned.predict(X_test), dataset_name="Test")
    return search.best_params_, train_metrics, test_metrics
# 4. Random Forest Bayesian optimization
def optimize_RF(X_train, y_train, X_test, y_test):
    """Tune a RandomForestClassifier with Bayesian search and evaluate on both splits.

    Returns (best_params, train_metrics, test_metrics), where the metric
    dicts come from evaluate_model().
    """
    param_space = {
        'n_estimators': (50, 500),
        'max_depth': (3, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        # 'auto' was removed in scikit-learn 1.3 (it was an alias of 'sqrt'
        # for classifiers) and now raises an error when sampled; dropping it
        # fixes the crash without shrinking the effective search space.
        'max_features': ['sqrt', 'log2'],
    }
    model = RandomForestClassifier(random_state=42)
    # Weighted F1 as the tuning objective.
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the refit best estimator on train and test.
    best_model = optimizer.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")
    return best_params, train_metrics, test_metrics
# 5. CatBoost Bayesian optimization
def optimize_CatBoost(X_train, y_train, X_test, y_test):
    """Tune a CatBoostClassifier with Bayesian search and evaluate on both splits.

    Returns (best_params, train_metrics, test_metrics).
    """
    search_space = {
        'iterations': (50, 500),
        'learning_rate': (0.01, 0.3, 'uniform'),
        'depth': (3, 10),
        'l2_leaf_reg': (1, 10, 'uniform'),
        'bagging_temperature': (0, 1, 'uniform'),
    }
    # NOTE(review): task_type='GPU' hard-requires a CUDA device — confirm
    # the training hosts have one, otherwise fitting fails at runtime.
    estimator = cb.CatBoostClassifier(task_type='GPU', random_seed=42, verbose=0)
    # Weighted F1 as the tuning objective.
    search = BayesSearchCV(
        estimator, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)
    tuned = search.best_estimator_
    train_metrics = evaluate_model(y_train, tuned.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, tuned.predict(X_test), dataset_name="Test")
    return search.best_params_, train_metrics, test_metrics
# 6. Logistic Regression Bayesian optimization
def optimize_LogisticRegression(X_train, y_train, X_test, y_test):
    """Tune a LogisticRegression with Bayesian search and evaluate on both splits.

    Returns (best_params, train_metrics, test_metrics).
    """
    param_space = {
        'C': (1e-5, 1e5, 'log-uniform'),
        'penalty': ['l1', 'l2'],
        # Only 'saga' supports BOTH l1 and l2 together with multinomial loss.
        # The original space also offered 'lbfgs' (no l1 support) and
        # 'liblinear' (no multinomial support), so the search raised a
        # ValueError whenever an incompatible combination was sampled.
        'solver': ['saga'],
    }
    # NOTE(review): multi_class is deprecated in scikit-learn >= 1.5 (saga
    # already uses multinomial loss for multiclass targets) — kept here for
    # compatibility with the version this project pins; verify before removal.
    model = LogisticRegression(multi_class='multinomial', random_state=42)
    # Weighted F1 as the tuning objective.
    optimizer = BayesSearchCV(model, param_space, n_iter=50, cv=5, n_jobs=-1,
                              verbose=0, scoring='f1_weighted')
    optimizer.fit(X_train, y_train)
    best_params = optimizer.best_params_
    # Evaluate the refit best estimator on train and test.
    best_model = optimizer.best_estimator_
    train_metrics = evaluate_model(y_train, best_model.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, best_model.predict(X_test), dataset_name="Test")
    return best_params, train_metrics, test_metrics
# 7. Neural Network (ANN) Bayesian optimization
def optimize_ANN(X_train, y_train, X_test, y_test):
    """Tune an MLPClassifier with Bayesian search and evaluate on both splits.

    Returns (best_params, train_metrics, test_metrics).
    """
    search_space = {
        'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'sgd'],
        'alpha': (1e-5, 1e-1, 'log-uniform'),
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }
    # max_iter raised from the sklearn default so deeper nets can converge.
    estimator = MLPClassifier(max_iter=500, random_state=42)
    # Weighted F1 as the tuning objective.
    search = BayesSearchCV(
        estimator, search_space,
        n_iter=50, cv=5, n_jobs=-1, verbose=0, scoring='f1_weighted',
    )
    search.fit(X_train, y_train)
    tuned = search.best_estimator_
    train_metrics = evaluate_model(y_train, tuned.predict(X_train), dataset_name="Train")
    test_metrics = evaluate_model(y_test, tuned.predict(X_test), dataset_name="Test")
    return search.best_params_, train_metrics, test_metrics