# File listing metadata:
# 2026-02-25 09:42:51 +08:00 | 112 lines | 4.5 KiB | Python

import lightgbm as lgb
import numpy as np
from scipy.linalg import qr
from progress.bar import Bar
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
class SPA_acc:
    """Successive-projections variable selection scored by LightGBM accuracy.

    Candidate variable chains are built with column-pivoted QR
    decompositions, every chain prefix is scored by training a LightGBM
    multiclass model, and the best-scoring prefix is returned together with
    an accuracy "scree" curve over its variables ranked by gain importance.

    Args:
        device: Value passed to LightGBM as the ``'device'`` parameter.
            Defaults to ``'gpu'``; use ``'cpu'`` on machines without a
            GPU-enabled LightGBM build.
        num_boost_round: Number of boosting rounds for every model trained.
    """

    def __init__(self, device='gpu', num_boost_round=100):
        self.device = device
        self.num_boost_round = num_boost_round

    def _lgb_params(self, ycal):
        """Return the LightGBM training parameters for the labels ``ycal``."""
        return {
            'objective': 'multiclass',   # multiclass objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',   # multiclass cross-entropy loss
            'device': self.device,
            'verbosity': -1,
            'num_class': len(np.unique(ycal)),  # number of distinct labels
        }

    def _fit(self, X, y):
        """Train and return a LightGBM booster on ``X`` with labels ``y``."""
        train_data = lgb.Dataset(X, label=y)
        return lgb.train(
            self._lgb_params(y),
            train_data,
            num_boost_round=self.num_boost_round,
        )

    def _projections_qr(self, X, k, M):
        """Return the first ``M`` pivot columns of a QR chain starting at ``k``.

        Column ``k`` is rescaled so that its squared norm exceeds that of
        every other column, which makes column-pivoted QR select it first;
        the remaining pivots follow in the order chosen by the pivoting.

        Args:
            X: 2-D data (DataFrame or array-like), one variable per column.
            k: Positional index of the column that starts the chain.
            M: Number of pivot indices to return.

        Returns:
            1-D integer array with the first ``M`` column positions.
        """
        # Work on a float NumPy copy: indexing is then purely positional
        # (independent of DataFrame column labels) and the caller's data is
        # never modified.
        X_projected = np.array(X, dtype=float)
        norms = np.sum(X_projected ** 2, axis=0)
        # An all-zero column cannot be emphasised; skipping it avoids a 0/0
        # division that would put NaNs into the matrix handed to qr().
        if norms[k] > 0:
            X_projected[:, k] *= 2 * np.amax(norms) / norms[k]
        # Only the pivot order is needed, so request the economic factors.
        _, _, order = qr(X_projected, mode='economic', pivoting=True)
        return order[:M]

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Train on the selected calibration columns and score accuracy.

        The model is evaluated on ``(Xval, yval)`` when both are given and
        on the calibration data otherwise.

        Note:
            Predictions are the argmax over the class-probability columns,
            so the accuracy is only meaningful when labels are the integers
            ``0 .. n_classes - 1``.

        Args:
            Xcal: Calibration DataFrame.
            ycal: Calibration labels.
            var_sel: Positional column indices to use.
            Xval: Optional validation DataFrame.
            yval: Optional validation labels.

        Returns:
            Tuple ``(yhat, accuracy)`` of predicted class indices and the
            accuracy on the evaluation set.
        """
        model = self._fit(Xcal.iloc[:, var_sel], ycal)
        if Xval is not None and yval is not None:
            X_eval, y_eval = Xval, yval
        else:
            X_eval, y_eval = Xcal, ycal
        probabilities = model.predict(X_eval.iloc[:, var_sel])
        yhat = np.argmax(probabilities, axis=1)  # most probable class
        return yhat, accuracy_score(y_eval, yhat)

    def _plot_scree(self, accuracy_scree, save_path=None):
        """Plot accuracy against the number of variables; save or show it."""
        n_vars = np.arange(1, len(accuracy_scree) + 1)
        best = int(np.argmax(accuracy_scree))

        fig = plt.figure()
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.title(f'Final number of selected variables: {len(accuracy_scree)} (Accuracy={accuracy_scree.max():.4f})', fontsize=16)
        # Plot against 1..n so the x-axis matches its label.
        plt.plot(n_vars, accuracy_scree, label='Accuracy Scree Plot')
        plt.scatter(n_vars[best], accuracy_scree[best], color='r', marker='s', label='Selected Point')
        plt.grid(True)
        plt.legend()
        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
            # Release the figure so repeated calls do not accumulate figures.
            plt.close(fig)
        else:
            plt.show()

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Select variables by successive projections and LightGBM accuracy.

        Args:
            Xcal: Calibration DataFrame (samples x variables).
            ycal: Calibration labels.
            m_min: Smallest subset size to evaluate.
            m_max: Largest subset size; defaults to ``min(N - 1, K)``.
            Xval: Optional validation DataFrame.
            yval: Optional validation labels.
            autoscaling: If truthy, divide centred columns by their sample
                standard deviation before building the projection chains.
            save_path: If given, the scree plot is saved there instead of
                being shown.

        Returns:
            Tuple ``(var_sel_phase2, ACCURACY_scree)``: the selected column
            positions and the accuracy obtained when adding them one by one
            in decreasing order of gain importance.

        Raises:
            ValueError: If ``1 <= m_min <= m_max <= K`` does not hold.
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max
        if not 1 <= m_min <= m_max <= K:
            raise ValueError(
                f"Need 1 <= m_min <= m_max <= {K} (number of variables); "
                f"got m_min={m_min}, m_max={m_max}"
            )

        # Centre (and optionally scale) once, as a plain float array.
        Xcaln = (Xcal - Xcal.mean()).to_numpy(dtype=float)
        if autoscaling:
            scale = np.asarray(Xcal.std(ddof=1, axis=0), dtype=float)
            # Constant columns have zero spread; leave them at zero rather
            # than turning them into NaN.
            Xcaln = Xcaln / np.where(scale == 0, 1.0, scale)

        # Phase 1: one projection chain per starting variable.
        SEL = np.zeros((m_max, K), dtype=int)
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()

        # Phase 2: score every chain prefix of length m_min..m_max.
        ACCURACY = np.full((m_max + 1, K), -np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    _, accuracy = self._validation(Xcal, ycal, SEL[:m, k], Xval, yval)
                    ACCURACY[m, k] = accuracy
                    bar.next()

        m_sel = np.argmax(ACCURACY, axis=0)
        k_sel = np.argmax(np.max(ACCURACY, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel]

        # Final model on the winning subset, used to rank its variables.
        model = self._fit(Xcal.iloc[:, var_sel_phase2], ycal)
        relev = model.feature_importance(importance_type='gain')
        index_decreasing_relev = np.argsort(-relev)

        ACCURACY_scree = np.empty(len(var_sel_phase2))
        for n_used in range(1, len(var_sel_phase2) + 1):
            var_sel = var_sel_phase2[index_decreasing_relev[:n_used]]
            _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
            ACCURACY_scree[n_used - 1] = accuracy

        self._plot_scree(ACCURACY_scree, save_path)
        return var_sel_phase2, ACCURACY_scree

    def __repr__(self):
        return (
            f"{type(self).__name__}(device={self.device!r}, "
            f"num_boost_round={self.num_boost_round!r})"
        )