import lightgbm as lgb
import numpy as np
from scipy.linalg import qr
from progress.bar import Bar
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score


class SPA_acc:
    """Successive Projections Algorithm scored by LightGBM accuracy.

    Candidate variable chains are built with column-pivoted QR
    decompositions; every chain prefix is scored by the classification
    accuracy of a LightGBM multiclass model, and the best-scoring prefix
    is returned together with an accuracy "scree" curve.

    Inputs named ``Xcal`` / ``Xval`` are accessed through ``.iloc``,
    ``.std`` and ``.mean``, so they must be pandas DataFrames.

    NOTE(review): predictions are ``argmax`` indices over the class
    probabilities and are compared directly with the labels, so labels are
    presumably integer-coded ``0 .. n_classes - 1`` -- confirm with callers.
    """

    # Class-level defaults keep the original hard-coded behaviour for any
    # instance that was created without going through ``__init__``.
    device = 'gpu'
    num_boost_round = 100

    def __init__(self, device=None, num_boost_round=None):
        """Optionally override the LightGBM device and boosting rounds.

        Args:
            device: Value for LightGBM's ``'device'`` parameter. ``None``
                keeps the class default (``'gpu'``).
            num_boost_round: Boosting rounds per trained model. ``None``
                keeps the class default (``100``).
        """
        if device is not None:
            self.device = device
        if num_boost_round is not None:
            self.num_boost_round = num_boost_round

    def _lgb_params(self, ycal):
        """Return the LightGBM parameter dict for the labels ``ycal``."""
        return {
            'objective': 'multiclass',      # multiclass objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',      # multiclass cross-entropy
            'device': self.device,
            'verbosity': -1,
            'num_class': len(np.unique(ycal)),
        }

    def _fit(self, Xcal, ycal, var_sel):
        """Train a LightGBM model on the columns ``var_sel`` of ``Xcal``."""
        train_data = lgb.Dataset(Xcal.iloc[:, var_sel], label=ycal)
        return lgb.train(
            self._lgb_params(ycal),
            train_data,
            num_boost_round=self.num_boost_round,
        )

    def _projections_qr(self, X, k, M):
        """Return the first ``M`` pivot columns of a QR chain started at ``k``.

        Column ``k`` is rescaled so that its norm exceeds every other
        column's, which forces the pivoted QR to select it first; the
        remaining pivots then follow the usual largest-residual order.

        Args:
            X: 2-D data (DataFrame or array-like), samples by variables.
            k: Positional index of the column that starts the chain.
            M: Number of pivot indices to return.

        Returns:
            1-D integer array with the positional indices of the first
            ``M`` pivot columns.
        """
        # Work on a NumPy copy: ``k`` is a position, and indexing a pandas
        # Series with ``[k]`` would be a *label* lookup instead.
        data = np.array(X, dtype=float)
        norms = np.sum(data ** 2, axis=0)
        norm_max = np.amax(norms)
        data[:, k] = data[:, k] * 2 * norm_max / norms[k]
        # Only the pivot order is needed, so skip building Q (mode='r').
        _, order = qr(data, mode='r', pivoting=True)
        return order[:M]

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Train on the calibration set and score the selected variables.

        Args:
            Xcal: Calibration DataFrame.
            ycal: Calibration labels.
            var_sel: Positional column indices to use.
            Xval: Optional validation DataFrame.
            yval: Optional validation labels.

        Returns:
            Tuple ``(yhat, accuracy)``: predicted class indices and their
            accuracy. The validation set is used only when both ``Xval`` and
            ``yval`` are given; otherwise the model is scored on the
            calibration data it was trained on.
        """
        model = self._fit(Xcal, ycal, var_sel)

        if Xval is not None and yval is not None:
            X_eval, y_eval = Xval, yval
        else:
            X_eval, y_eval = Xcal, ycal

        proba = model.predict(X_eval.iloc[:, var_sel])
        yhat = np.argmax(proba, axis=1)  # most probable class per sample
        return yhat, accuracy_score(y_eval, yhat)

    def _plot_scree(self, accuracy_scree, n_selected, save_path=None):
        """Plot accuracy against the number of variables included."""
        # x runs from 1: entry i of the curve uses i + 1 variables.
        n_included = np.arange(1, len(accuracy_scree) + 1)
        best = int(np.argmax(accuracy_scree))

        fig = plt.figure()
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.title(f'Final number of selected variables: {n_selected} (Accuracy={accuracy_scree.max():.4f})', fontsize=16)
        plt.plot(n_included, accuracy_scree, label='Accuracy Scree Plot')
        plt.scatter(n_included[best], accuracy_scree[best], color='r', marker='s', label='Selected Point')
        plt.grid(True)
        plt.legend()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
            # Release the figure so repeated calls do not accumulate them.
            plt.close(fig)
        else:
            plt.show()

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Select variables with SPA, scoring subsets by LightGBM accuracy.

        Args:
            Xcal: Calibration DataFrame, samples by variables.
            ycal: Calibration labels.
            m_min: Smallest subset size to evaluate (at least 1).
            m_max: Largest subset size; defaults to ``min(N - 1, K)``.
            Xval: Optional validation DataFrame.
            yval: Optional validation labels.
            autoscaling: If truthy, divide each centred column by its sample
                standard deviation before the projections.
            save_path: If given, the scree plot is saved there instead of
                being shown.

        Returns:
            Tuple ``(var_sel_phase2, ACCURACY_scree)``: positional indices of
            the selected variables, and the accuracy obtained when adding
            them one at a time in decreasing order of gain importance.

        Raises:
            ValueError: If ``1 <= m_min <= m_max <= K`` does not hold.
        """
        N, K = Xcal.shape
        if m_max is None:
            m_max = min(N - 1, K)
        if not 1 <= m_min <= m_max <= K:
            raise ValueError(
                f"need 1 <= m_min <= m_max <= K, got m_min={m_min}, "
                f"m_max={m_max}, K={K}"
            )

        # Phase 1: one projection chain per starting variable.
        normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor

        SEL = np.zeros((m_max, K), dtype=int)
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()

        # Phase 2: score every chain prefix. Unevaluated sizes stay at -inf
        # so they can never win the argmax below.
        ACCURACY = np.full((m_max + 1, K), -np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    _, accuracy = self._validation(Xcal, ycal, SEL[:m, k], Xval, yval)
                    ACCURACY[m, k] = accuracy
                    bar.next()

        m_sel = np.argmax(ACCURACY, axis=0)           # best size per chain
        k_sel = np.argmax(np.max(ACCURACY, axis=0))   # best chain overall
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel]

        # Final LightGBM training: rank the selected variables by gain.
        model = self._fit(Xcal, ycal, var_sel_phase2)
        relev = model.feature_importance(importance_type='gain')
        index_decreasing_relev = np.argsort(-relev)

        ACCURACY_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, ACCURACY_scree[i] = self._validation(Xcal, ycal, var_sel, Xval, yval)

        self._plot_scree(ACCURACY_scree, len(var_sel_phase2), save_path)
        return var_sel_phase2, ACCURACY_scree

    def __repr__(self):
        return "SPA()"