# 112 lines, 4.5 KiB, Python
import lightgbm as lgb
|
|
import numpy as np
|
|
from scipy.linalg import qr
|
|
from progress.bar import Bar
|
|
from matplotlib import pyplot as plt
|
|
from sklearn.metrics import accuracy_score
|
|
|
|
|
|
class SPA_acc:
    """Successive Projections Algorithm (SPA) for variable selection.

    Candidate variable chains are generated with a pivoted-QR formulation of
    SPA, then scored by the classification accuracy of a LightGBM multiclass
    model; finally the retained variables are ranked by gain importance and a
    scree curve of accuracy vs. subset size is produced.
    """

    def _lgb_params(self, ycal):
        """Build the shared LightGBM parameter set for the multiclass task.

        Parameters
        ----------
        ycal : array-like
            Class labels; only used to count the distinct classes.

        Returns
        -------
        dict
            Parameters for ``lgb.train``.
        """
        return {
            'objective': 'multiclass',          # multiclass objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',          # multiclass cross-entropy
            'device': 'gpu',                    # NOTE: requires a GPU-enabled LightGBM build
            'verbosity': -1,
            'num_class': len(np.unique(ycal))   # number of classes
        }

    def _projections_qr(self, X, k, M):
        """Select M variables by successive projections starting from column k.

        Implemented via a column-pivoted QR decomposition: column ``k`` is
        rescaled so that its norm dominates every other column, which forces
        the pivoting to pick it first; the remaining pivots then follow the
        SPA ordering of maximal residual norms.

        Parameters
        ----------
        X : pandas.DataFrame
            Calibration matrix (samples x variables), typically autoscaled.
        k : int
            Index of the starting variable.
        M : int
            Number of variables to select.

        Returns
        -------
        numpy.ndarray
            1-D array with the indices of the M selected columns.
        """
        X_projected = X.copy()

        # Squared Euclidean norm of every column.
        norms = np.sum((X ** 2), axis=0)
        norm_max = np.amax(norms)

        # Inflate column k: its new squared norm is 4 * norm_max**2 / norms[k]
        # >= 4 * norm_max, so QR column pivoting is guaranteed to pick it first.
        X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]

        # Only the pivot permutation is needed; 'economic' mode skips building
        # the full Q factor (the pivot order is identical to full mode).
        _, _, order = qr(X_projected.to_numpy(), mode='economic', pivoting=True)
        return order[:M]

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Train a LightGBM classifier on the selected variables and score it.

        Parameters
        ----------
        Xcal : pandas.DataFrame
            Calibration predictors.
        ycal : array-like
            Calibration labels.
        var_sel : array-like of int
            Column indices of the selected variables.
        Xval, yval : optional
            Validation predictors/labels; when either is absent the model is
            evaluated on the calibration set itself.

        Returns
        -------
        tuple
            ``(yhat, accuracy)`` — predicted class labels and accuracy score.
        """
        train_data = lgb.Dataset(Xcal.iloc[:, var_sel], label=ycal)

        # LightGBM model training
        model = lgb.train(self._lgb_params(ycal), train_data,
                          num_boost_round=100)

        # Predict on the validation set when provided, otherwise fall back to
        # the calibration set.
        if Xval is not None and yval is not None:
            X_eval, y_true = Xval, yval
        else:
            X_eval, y_true = Xcal, ycal

        # Per-class probabilities -> most probable class per sample.
        yhat = np.argmax(model.predict(X_eval.iloc[:, var_sel]), axis=1)
        accuracy = accuracy_score(y_true, yhat)

        return yhat, accuracy

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Run the full SPA variable-selection pipeline.

        Parameters
        ----------
        Xcal : pandas.DataFrame
            Calibration predictors (N samples x K variables).
        ycal : array-like
            Calibration labels.
        m_min, m_max : int, optional
            Minimum/maximum subset size to evaluate; ``m_max`` defaults to
            ``min(N - 1, K)``.
        Xval, yval : optional
            Validation data used for scoring; falls back to the calibration
            set when absent.
        autoscaling : int, optional
            Truthy -> autoscale (mean-center and divide by sample std).
        save_path : str, optional
            When given, the scree plot is saved there instead of shown.

        Returns
        -------
        tuple
            ``(var_sel_phase2, ACCURACY_scree)`` — the selected column
            indices and the accuracy scree curve over importance-ranked
            subsets.
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max

        # Mean-center and optionally autoscale. Zero-variance columns would
        # yield NaN/inf under division, so their std is replaced by 1.
        if autoscaling:
            normalization_factor = Xcal.std(ddof=1, axis=0).replace(0, 1.0)
        else:
            normalization_factor = np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor

        # Phase 1: one SPA candidate chain per starting variable.
        SEL = np.zeros((m_max, K))
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()

        # Phase 2: score every (subset size, chain) pair; -inf marks cells
        # that are never evaluated (m < m_min and row 0).
        ACCURACY = np.full((m_max + 1, K), -np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    var_sel = SEL[:m, k].astype(int)
                    _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
                    ACCURACY[m, k] = accuracy
                    bar.next()

        # Best subset size per chain, then the best chain overall.
        m_sel = np.argmax(ACCURACY, axis=0)
        k_sel = np.argmax(np.max(ACCURACY, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)

        # Final LightGBM training on the winning subset, to rank the retained
        # variables by gain importance.
        Xcal2 = Xcal.iloc[:, var_sel_phase2]
        train_data = lgb.Dataset(Xcal2, label=ycal)
        model = lgb.train(self._lgb_params(ycal), train_data,
                          num_boost_round=100)
        relev = model.feature_importance(importance_type='gain')

        # Scree curve: accuracy when keeping only the i+1 most relevant
        # variables.
        index_decreasing_relev = np.argsort(-relev)
        ACCURACY_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
            ACCURACY_scree[i] = accuracy

        # Plot the accuracy scree curve.
        plt.figure()
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.title(f'Final number of selected variables: {len(var_sel_phase2)} (Accuracy={ACCURACY_scree.max():.4f})', fontsize=16)
        plt.plot(ACCURACY_scree, label='Accuracy Scree Plot')
        plt.scatter(np.argmax(ACCURACY_scree), ACCURACY_scree.max(), color='r', marker='s', label='Selected Point')
        plt.grid(True)
        plt.legend()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
        else:
            plt.show()

        return var_sel_phase2, ACCURACY_scree

    def __repr__(self):
        return "SPA()"
|