Files
HSI/Feature_Selection_method/Spa.py

116 lines
4.5 KiB
Python

import scipy.stats
import numpy as np
from scipy.linalg import qr, inv, pinv
import scipy.stats
from progress.bar import Bar
from matplotlib import pyplot as plt
class SPA:
    """Successive Projections Algorithm (SPA) for variable selection in MLR.

    The selection runs in three phases:

    1. For every starting column, build a chain of minimally collinear
       columns with a column-pivoted QR factorisation.
    2. Evaluate every chain prefix by its PRESS (sum of squared prediction
       errors) and keep the best one.
    3. Rank the kept variables by relevance and drop those that do not
       improve PRESS significantly according to an F-test.

    Inputs may be pandas objects or anything ``numpy.asarray`` accepts;
    columns are always addressed by position, never by label.
    """

    @staticmethod
    def _as_vector(y):
        """Return *y* as a 1-D float array.

        Accepts a Series, a 1-D sequence, or a single-column 2-D object.

        Raises:
            ValueError: if *y* holds more than one response column.
        """
        vec = np.asarray(y, dtype=float)
        if vec.ndim == 2 and vec.shape[1] == 1:
            vec = vec[:, 0]
        if vec.ndim != 1:
            raise ValueError("y must be one-dimensional (a single response variable)")
        return vec

    def _projections_qr(self, X, k, M):
        """Return the positions of the first *M* columns of the chain starting at *k*.

        Args:
            X: 2-D data (samples x variables), DataFrame or array.
            k: positional index of the starting column.
            M: chain length to return.

        Returns:
            1-D integer array of column positions; element 0 is always *k*.
        """
        projected = np.array(X, dtype=float)  # private copy, caller's data untouched
        norms = np.sum(projected ** 2, axis=0)
        # Inflate column k so that its norm exceeds every other column's;
        # the pivoted QR then necessarily picks it first.
        projected[:, k] = projected[:, k] * 2 * np.amax(norms) / norms[k]
        # Only the pivot order is needed, so skip building Q.
        _, order = qr(projected, mode='r', pivoting=True)
        return order[:M]

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Predict with an MLR model restricted to the columns in *var_sel*.

        With a non-empty validation set the model is fitted on the
        calibration data and evaluated on the validation data; otherwise
        leave-one-out cross-validation is run on the calibration data.

        Args:
            Xcal, ycal: calibration predictors and response.
            var_sel: positional indices of the columns to use.
            Xval, yval: optional validation predictors and response.

        Returns:
            ``(yhat, e)``: 1-D arrays of predictions and residuals
            (reference minus prediction).

        Raises:
            ValueError: if *Xval* is non-empty but *yval* is missing.
        """
        cols = np.asarray(var_sel, dtype=int)
        y_cal = self._as_vector(ycal)
        x_cal = np.asarray(Xcal, dtype=float)[:, cols]
        n_cal = x_cal.shape[0]
        # Leading column of ones models the intercept.
        design = np.column_stack([np.ones(n_cal), x_cal])

        x_val = None if Xval is None else np.asarray(Xval, dtype=float)
        n_val = 0 if x_val is None else x_val.shape[0]

        if n_val > 0:
            if yval is None:
                raise ValueError("yval is required when Xval is given")
            coef = np.linalg.lstsq(design, y_cal, rcond=None)[0]
            yhat = np.column_stack([np.ones(n_val), x_val[:, cols]]) @ coef
            return yhat, self._as_vector(yval) - yhat

        # Leave-one-out: refit without sample i, then predict sample i.
        yhat = np.empty(n_cal)
        for i in range(n_cal):
            coef = np.linalg.lstsq(
                np.delete(design, i, axis=0), np.delete(y_cal, i), rcond=None
            )[0]
            yhat[i] = design[i] @ coef
        return yhat, y_cal - yhat

    @staticmethod
    def _critical_size(PRESS_scree, dof, m_min=1, alpha=0.25):
        """Return how many ranked variables to keep according to the F-test.

        ``PRESS_scree[i]`` is the PRESS of the model built from the ``i + 1``
        most relevant variables. The result is the smallest model size whose
        PRESS is below ``min(PRESS_scree) * F_crit``, but never less than
        *m_min* and never more than ``len(PRESS_scree)``.
        """
        press = np.asarray(PRESS_scree, dtype=float)
        fcrit = scipy.stats.f.ppf(1 - alpha, dof, dof)
        below = np.flatnonzero(press < press.min() * fcrit)
        # A minimum PRESS of exactly 0 leaves nothing strictly below the
        # threshold; fall back to the best model in that case.
        first = int(below[0]) if below.size else int(np.argmin(press))
        # `first` is a 0-based index, so the model size is first + 1.
        return min(max(m_min, first + 1), press.size)

    def _plot_scree(self, RMSEP_scree, n_selected, save_path=None):
        """Draw the RMSE scree plot and mark the selected model size.

        The figure is saved to *save_path* when given (and then closed),
        otherwise it is shown interactively.
        """
        # Scope the font settings to this figure instead of changing the
        # process-wide matplotlib configuration.
        style = {'font.sans-serif': ['Times New Roman'], 'axes.unicode_minus': False}
        with plt.rc_context(style):
            fig = plt.figure()
            plt.xlabel('Number of variables included in the model', fontsize=14)
            plt.ylabel('RMSE', fontsize=14)
            plt.title(f'Final number of selected variables: {n_selected} (RMSE={RMSEP_scree[n_selected - 1]:.4f})', fontsize=16)
            # x runs from 1 so that it really is the number of variables.
            sizes = np.arange(1, len(RMSEP_scree) + 1)
            plt.plot(sizes, RMSEP_scree, label='RMSEP Scree Plot')
            plt.scatter(n_selected, RMSEP_scree[n_selected - 1], color='r', marker='s', label='Selected Point')
            plt.grid(True)
            plt.legend()
            if save_path:
                plt.savefig(save_path, bbox_inches='tight', dpi=300)
                print(f"图像已保存至: {save_path}")
                plt.close(fig)  # do not accumulate open figures across calls
            else:
                plt.show()

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Select variables with the Successive Projections Algorithm.

        Args:
            Xcal: calibration predictors (samples x variables).
            ycal: calibration response (single column).
            m_min: minimum number of variables to select (at least 1).
            m_max: maximum number of variables; defaults to ``min(N - 1, K)``.
            Xval, yval: optional validation set; leave-one-out
                cross-validation on the calibration set is used without it.
            autoscaling: truthy to scale columns to unit standard deviation
                before the projections (columns are always mean-centred).
            save_path: where to save the scree plot; shown on screen if falsy.

        Returns:
            ``(var_sel, var_sel_phase2)``: positional column indices of the
            final selection (ordered by decreasing relevance) and of the
            best chain found before the F-test pruning.

        Raises:
            ValueError: on an invalid ``m_min``/``m_max`` combination or when
                a column of *Xcal* is constant or non-finite.
        """
        X = np.asarray(Xcal, dtype=float)
        y = self._as_vector(ycal)
        N, K = X.shape
        # NOTE(review): under leave-one-out each fit only sees N - 1 samples,
        # so m = N - 1 variables plus an intercept is underdetermined;
        # consider capping the default at N - 2 in that case.
        m_max = min(N - 1, K) if m_max is None else m_max
        if not 1 <= m_min <= m_max <= K:
            raise ValueError(
                f"need 1 <= m_min <= m_max <= {K}, got m_min={m_min}, m_max={m_max}"
            )

        spread = X.std(axis=0, ddof=1)
        if not np.all(spread > 0):  # also catches NaN
            raise ValueError(
                "Xcal contains constant or non-finite columns; remove them before running SPA"
            )
        scale = spread if autoscaling else np.ones(K)
        Xn = (X - X.mean(axis=0)) / scale

        # Phase 1: one candidate chain per starting column.
        SEL = np.empty((m_max, K), dtype=int)
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xn, k, m_max)
                bar.next()

        # Phase 2: PRESS of every chain prefix; unevaluated sizes stay inf.
        PRESS = np.full((m_max + 1, K), np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    _, e = self._validation(X, y, SEL[:m, k], Xval, yval)
                    PRESS[m, k] = e @ e
                    bar.next()

        m_sel = np.argmin(PRESS, axis=0)
        k_sel = np.argmin(np.min(PRESS, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel]

        # Phase 3: rank by |coefficient * column std| and prune with an F-test.
        design = np.column_stack([np.ones(N), X[:, var_sel_phase2]])
        b = np.linalg.lstsq(design, y, rcond=None)[0]
        relev = np.abs(b * design.std(axis=0, ddof=1))[1:]  # drop the intercept
        index_decreasing_relev = np.argsort(-relev)

        PRESS_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            subset = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, e = self._validation(X, y, subset, Xval, yval)
            PRESS_scree[i] = e @ e
        n_obs = len(e)
        RMSEP_scree = np.sqrt(PRESS_scree / n_obs)

        n_selected = self._critical_size(PRESS_scree, n_obs, m_min=m_min, alpha=0.25)
        var_sel = var_sel_phase2[index_decreasing_relev[:n_selected]]

        self._plot_scree(RMSEP_scree, n_selected, save_path)
        return var_sel, var_sel_phase2

    def __repr__(self):
        """Return the constructor-style representation of the selector."""
        return "SPA()"