初始提交

This commit is contained in:
2026-02-25 09:42:51 +08:00
parent c25276c481
commit d84d886f35
182 changed files with 18438 additions and 0 deletions

View File

@ -0,0 +1,176 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
def PC_Cross_Validation(X, y, pc, cv):
    '''
    Evaluate PLS models with 1..pc latent components via K-fold cross-validation.

    X : spectral matrix (DataFrame), shape n x m
    y : concentration values (Series)
    pc: maximum number of latent components to try
    cv: number of cross-validation folds
    return :
        RMSECV: RMSECV for each component count
        rindex: index of the best component count (0-based, i.e. best count - 1)
    '''
    # KFold with shuffle=False is deterministic, so the folds can be
    # materialized once and reused for every component count.
    folds = list(KFold(n_splits=cv).split(X))
    RMSECV = []
    for n_comp in range(1, pc + 1):
        fold_errors = []
        for train_idx, test_idx in folds:
            model = PLSRegression(n_components=n_comp)
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            predictions = model.predict(X.iloc[test_idx])
            fold_errors.append(np.sqrt(mean_squared_error(y.iloc[test_idx], predictions)))
        RMSECV.append(np.mean(fold_errors))
    rindex = np.argmin(RMSECV)
    return RMSECV, rindex
def Cross_Validation(X, y, pc, cv):
    '''
    Mean RMSE of a single PLS model under K-fold cross-validation.

    X : spectral matrix (DataFrame), shape n x m
    y : concentration values (Series)
    pc: number of latent components to use
    cv: number of cross-validation folds
    return :
        mean RMSE over the cv folds (RMSECV)
    '''
    fold_rmse = []
    for train_idx, test_idx in KFold(n_splits=cv).split(X):
        model = PLSRegression(n_components=pc)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        predictions = model.predict(X.iloc[test_idx])
        fold_rmse.append(np.sqrt(mean_squared_error(y.iloc[test_idx], predictions)))
    return np.mean(fold_rmse)
def CARS_Cloud(X, y, N=50, f=20, cv=10, save_fig=False, save_path=None):
    '''
    Competitive Adaptive Reweighted Sampling (CARS) wavelength selection.

    X : spectral matrix, shape (m samples, n wavelengths) — must support
        numpy-style indexing (ndarray expected; see note below)
    y : concentration values (indexable by sample, e.g. ndarray)
    N : number of Monte Carlo iterations
    f : maximum number of PLS latent components
    cv : number of cross-validation folds
    save_fig : whether to save the diagnostic figure
    save_path : figure path; must be provided when save_fig is True
    return :
        OptWave : indices of the selected wavelengths
    '''
    p = 0.8  # fraction of samples drawn into each calibration subset
    m, n = X.shape
    # Parameters of the exponentially decaying retention ratio r_i = u * exp(-k * i)
    u = np.power((n / 2), (1 / (N - 1)))
    k = (1 / (N - 1)) * np.log(n / 2)
    cal_num = np.round(m * p)
    b2 = np.arange(n)  # current wavelength ranking (starts as identity)
    x = X  # NOTE(review): no ndarray conversion happens here despite the original comment;
    y = y  # X and y are used as-is, so a DataFrame/Series input would fail below — confirm callers
    D = np.vstack((np.array(b2).reshape(1, -1), x))  # row 0 carries the wavelength indices
    WaveData = []
    WaveNum = []
    RMSECV = []
    r = []
    for i in range(1, N + 1):
        r.append(u * np.exp(-1 * k * i))
        wave_num = int(np.round(r[i - 1] * n))  # wavelengths retained this iteration
        WaveNum = np.hstack((WaveNum, wave_num))
        cal_index = np.random.choice(np.arange(m), size=int(cal_num), replace=False)
        wave_index = b2[:wave_num].reshape(1, -1)[0]  # top-ranked wavelengths survive
        # Use np.ix_ to index rows and columns simultaneously
        xcal = x[np.ix_(cal_index, wave_index)]  # calibration subset (rows x surviving columns)
        ycal = y[cal_index]  # matching targets
        # Flatten ycal to one dimension
        ycal = ycal.ravel()
        x = x[:, wave_index]  # shrink x to the surviving wavelengths
        D = D[:, wave_index]  # keep D's index row aligned with x
        d = D[0, :].reshape(1, -1)  # global indices of this iteration's wavelengths
        wnum = n - wave_num
        if wnum > 0:
            # pad with -1 so every iteration's record has a fixed length n
            d = np.hstack((d, np.full((1, wnum), -1)))
        if len(WaveData) == 0:
            WaveData = d
        else:
            WaveData = np.vstack((WaveData, d.reshape(1, -1)))
        if wave_num < f:
            f = wave_num  # cannot use more latent components than variables
        pls = PLSRegression(n_components=f)
        pls.fit(xcal, ycal)
        beta = pls.coef_
        # Handle the coef_ orientation difference across sklearn versions
        if beta.shape[0] == 1:  # newer sklearn: shape (1, n_features)
            b = np.abs(beta[0])
            coeff = beta[0, b2]
        else:  # older sklearn: shape (n_features, 1)
            b = np.abs(beta[:, 0])
            coeff = beta[b2, 0]
        b2 = np.argsort(-b, axis=0)  # re-rank wavelengths by |coefficient|, descending
        coef = copy.deepcopy(beta)
        coeff = coeff[b2, :].reshape(len(b2), -1) if False else coef[b2, :].reshape(len(b2), -1)  # NOTE(review): overwrites the coeff computed above; coeff is never used afterwards
        rmsecv, rindex = PC_Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), f, cv)
        RMSECV.append(Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), rindex + 1, cv))
    # Rebuild, per iteration, a length-n indicator/record of which wavelengths survived
    WAVE = []
    for i in range(WaveData.shape[0]):
        wd = WaveData[i, :]
        WD = np.ones((len(wd)))
        for j in range(len(wd)):
            ind = np.where(wd == j)
            if len(ind[0]) == 0:
                WD[j] = 0  # wavelength j was dropped by iteration i
            else:
                WD[j] = wd[ind[0]]
        if len(WAVE) == 0:
            WAVE = copy.deepcopy(WD)
        else:
            WAVE = np.vstack((WAVE, WD.reshape(1, -1)))
    MinIndex = np.argmin(RMSECV)  # iteration with the lowest RMSECV
    Optimal = WAVE[MinIndex, :]
    boindex = np.where(Optimal != 0)
    OptWave = boindex[0]  # selected wavelength indices
    plt.figure(figsize=(12, 10))
    # Use the Times New Roman font for the diagnostic plots
    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    fonts = 20
    plt.subplot(211)
    plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
    plt.ylabel('Number of Selected Wavelengths', fontsize=fonts)
    plt.title('Optimal Iteration: ' + str(MinIndex), fontsize=fonts)
    plt.plot(np.arange(N), WaveNum)
    plt.subplot(212)
    plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
    plt.ylabel('RMSECV', fontsize=fonts)
    plt.plot(np.arange(N), RMSECV)
    # Save the figure if requested
    if save_fig:
        plt.savefig(save_path)  # NOTE(review): crashes if save_path is None — validate upstream
        print(f"The figure has been saved as {save_path}")
    # plt.show()
    return OptWave

View File

@ -0,0 +1,59 @@
from deap import base, creator, tools, algorithms
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
def GA(X, y, n_generations=20, population_size=50, crossover_prob=0.7, mutation_prob=0.2):
    """
    Genetic-algorithm feature selection; returns the indices of the chosen features.

    Parameters:
        X (ndarray): feature matrix
        y (ndarray): labels
        n_generations (int): number of generations to evolve
        population_size (int): individuals per generation
        crossover_prob (float): crossover probability
        mutation_prob (float): mutation probability
    Returns:
        list: indices of the selected features
    """
    # Fitness (maximize CV accuracy) and individual (bit vector) types
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", lambda: np.random.randint(0, 2))
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def evaluate(individual):
        # Fitness = mean 5-fold CV accuracy of a random forest on the chosen columns
        mask = [i for i, bit in enumerate(individual) if bit == 1]
        if not mask:
            return 0,  # no features selected -> worst possible fitness
        clf = RandomForestClassifier(random_state=42)
        return cross_val_score(clf, X[:, mask], y, cv=5).mean(),

    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)
    # Evolve the population with the canonical simple EA loop
    final_pop, _ = algorithms.eaSimple(
        toolbox.population(n=population_size), toolbox,
        cxpb=crossover_prob, mutpb=mutation_prob,
        ngen=n_generations, verbose=False)
    # Decode the fittest individual back into column indices
    champion = tools.selBest(final_pop, k=1)[0]
    return [i for i, bit in enumerate(champion) if bit == 1]

View File

@ -0,0 +1,41 @@
"""
-*- coding: utf-8 -*-
@Time :2022/04/12 17:10
@Author : Pengyou FU
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
@github : https://github.com/FuSiry/OpenSA
@WeChat : Fu_siry
@LicenseApache-2.0 license
"""
from sklearn import linear_model
import numpy as np
def Lar(X, y, nums=40):
    """
    Select important wavelengths with LARS (Least Angle Regression).

    Parameters:
        X : np.ndarray, predictor matrix (input data)
        y : np.ndarray, labels (target values)
        nums : int, number of feature points to keep (default 40)
    Returns:
        np.ndarray, sorted indices of the selected wavelengths
    """
    # Fit a LARS model and rank features by |coefficient|
    model = linear_model.Lars()
    model.fit(X, y)
    importance = np.abs(model.coef_)
    # Take the nums largest coefficients, then sort the indices ascending
    # so the output order is stable
    top_indices = np.argsort(importance)[-nums:][::-1]
    return np.sort(top_indices)

View File

@ -0,0 +1,49 @@
import pymrmr
import pandas as pd
class MRMRFeatureSelection:
    """Thin wrapper around pymrmr for mRMR (min-redundancy max-relevance) selection."""

    def __init__(self, X, y):
        """
        :param X: feature matrix (DataFrame), one column per feature.
        :param y: target variable (Series), aligned with X.
        """
        self.X = X
        self.y = y
        self.selected_features = None  # populated by select_features()

    def select_features(self, k=18, method='MIQ'):
        """
        Run mRMR feature selection.

        :param k: number of features to select.
        :param method: mRMR criterion, 'MIQ' or 'MRMR'.
        :return: list of selected features.
        """
        # pymrmr expects a single frame with the target as the first column
        combined = pd.concat([self.y, self.X], axis=1)
        self.selected_features = pymrmr.mRMR(combined, method, k)
        return self.selected_features

    def get_selected_features(self):
        """Return the features from the last select_features() call, or None."""
        return self.selected_features

    def get_selected_feature_names(self):
        """Return the selected feature column names, or None if selection has not run."""
        if self.selected_features is None:
            return None
        return self.selected_features

View File

@ -0,0 +1,24 @@
"""
-*- coding: utf-8 -*-
@Time :2022/04/12 17:10
@Author : Pengyou FU
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
@github : https://github.com/FuSiry/OpenSA
@WeChat : Fu_siry
@LicenseApache-2.0 license
"""
from sklearn.decomposition import PCA
def Pca(X, nums=20):
    """
    Reduce spectral data with principal component analysis.

    :param X: raw spectrum data, shape (n_samples, n_features)
    :param nums: number of principal components to retain
    :return: X_reduction, spectral data after dimensionality reduction
    """
    model = PCA(n_components=nums)  # keep the first `nums` components
    model.fit(X)
    return model.transform(X)

View File

@ -0,0 +1,88 @@
import numpy as np
from sklearn.neighbors import NearestNeighbors
class ReliefF:
    """ReliefF feature scoring based on nearest same-class / other-class neighbors."""

    def __init__(self, n_neighbors=20, n_features_to_keep=20):
        """
        :param n_neighbors: neighbors examined per sample.
        :param n_features_to_keep: number of top-scoring features to return.
        """
        self.n_neighbors = n_neighbors
        self.n_features_to_keep = n_features_to_keep
        self.feature_scores = None  # accumulated score per feature
        self.top_features = None  # indices of the highest-scoring features

    def fit(self, X, y):
        """
        Score every feature on (X, y) and return the indices of the best ones.

        :param X: feature matrix, shape (n_samples, n_features).
        :param y: class labels.
        :return: indices of the selected features.
        """
        n_samples, n_features = X.shape
        self.feature_scores = np.zeros(n_features)
        # Ask for k+1 neighbors because each sample is its own nearest neighbor
        knn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X)
        _, neighbor_idx = knn.kneighbors(X)
        scale = self.n_neighbors * n_samples  # normalization for each contribution
        for i in range(n_samples):
            label = y[i]
            # Split neighbors into same-class (hits) and other-class (misses),
            # skipping index 0 which is the sample itself
            hits = [X[j] for j in neighbor_idx[i][1:] if y[j] == label]
            misses = [X[j] for j in neighbor_idx[i][1:] if y[j] != label]
            # Penalize features that differ among hits, reward those that
            # differ among misses
            for f in range(n_features):
                for hit in hits:
                    self.feature_scores[f] -= (X[i, f] - hit[f]) ** 2 / scale
                for miss in misses:
                    self.feature_scores[f] += (X[i, f] - miss[f]) ** 2 / scale
        self.top_features = np.argsort(self.feature_scores)[-self.n_features_to_keep:]
        return self.top_features

    def fit_transform(self, X, y):
        """Fit and return the selected feature indices in a single call."""
        return self.fit(X, y)
def multi_scale_relieff_stratified(X, y, segment_size=100, n_subsegments=20, n_features_per_subsegment=5):
    """
    Stratified multi-scale ReliefF selection that covers every wavelength segment.

    :param X: feature matrix, shape (n_samples, n_features).
    :param y: class labels.
    :param segment_size: width (in features) of each wavelength segment.
    :param n_subsegments: number of sub-regions per segment.
    :param n_features_per_subsegment: features selected per sub-region.
    :return: sorted, de-duplicated global indices of the selected features.
    """
    selected_features = []
    # Walk the spectrum one segment at a time
    for i in range(0, X.shape[1], segment_size):
        segment_X = X[:, i:i + segment_size]
        # Guard against a zero step when segment_size < n_subsegments
        subsegment_size = max(1, segment_size // n_subsegments)
        # Iterate over the ACTUAL segment width: the final segment may be
        # narrower than segment_size, and the original loop ran past its end,
        # producing empty sub-segments
        for j in range(0, segment_X.shape[1], subsegment_size):
            subsegment_X = segment_X[:, j:j + subsegment_size]
            relief = ReliefF(n_neighbors=10, n_features_to_keep=n_features_per_subsegment)
            subsegment_selected = relief.fit_transform(subsegment_X, y)
            # Convert sub-segment-local indices back to global wavelength indices
            selected_features.extend(subsegment_selected + i + j)
    # De-duplicate (np.unique also sorts)
    return np.unique(selected_features)

View File

@ -0,0 +1,116 @@
import scipy.stats
import numpy as np
from scipy.linalg import qr, inv, pinv
import scipy.stats
from progress.bar import Bar
from matplotlib import pyplot as plt
class SPA:
    """Successive Projections Algorithm (SPA) for variable selection.

    Phase 1 builds candidate variable chains with QR projections, phase 2
    scores each chain by regression PRESS, phase 3 prunes the winning chain
    with an F-test on the scree of PRESS values.
    """

    def _projections_qr(self, X, k, M):
        """Return the first M column indices of the projection chain seeded at column k.

        Column k is inflated so the column-pivoted QR picks it first; the pivot
        order then gives the successive-projection chain.
        """
        X_projected = X.copy()
        norms = np.sum((X ** 2), axis=0)
        norm_max = np.amax(norms)
        # Boost column k so it becomes the first pivot
        X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]
        _, __, order = qr(X_projected.to_numpy(), 0, pivoting=True)
        return order[:M].T

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Validate an MLR model restricted to the columns in var_sel.

        Uses the separate validation set when given, otherwise leave-one-out
        on the calibration set. Returns (predictions, residuals).
        """
        N = Xcal.shape[0]
        NV = Xval.shape[0] if Xval is not None else 0
        yhat, e = None, None
        if NV > 0:
            # Fit on the full calibration set, predict the validation set.
            # A column of ones provides the intercept term.
            Xcal_ones = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel].to_numpy()])
            b = np.linalg.lstsq(Xcal_ones, ycal, rcond=None)[0]
            Xval_ones = np.hstack([np.ones((NV, 1)), Xval.iloc[:, var_sel].to_numpy()])
            yhat = Xval_ones.dot(b)
            e = yval - yhat
        else:
            # Leave-one-out cross-validation on the calibration set
            yhat = np.zeros((N, 1))
            for i in range(N):
                cal = np.hstack([np.arange(i), np.arange(i + 1, N)])
                X = Xcal.iloc[cal, var_sel]
                y = ycal.iloc[cal]
                X_ones = np.hstack([np.ones((N - 1, 1)), X.to_numpy()])
                b = np.linalg.lstsq(X_ones, y, rcond=None)[0]
                xtest = Xcal.iloc[i, var_sel].to_numpy()
                yhat[i] = np.hstack([1, xtest]).dot(b)
            e = ycal.to_numpy() - yhat
        return yhat, e

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Run SPA and plot the RMSE scree curve.

        Xcal/Xval must be DataFrames (indexed with .iloc internally).
        Returns (var_sel, var_sel_phase2): the final pruned selection and the
        full phase-2 chain.
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max
        # Optional autoscaling by the column standard deviation
        normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor
        # Phase 1: one projection chain per starting column
        SEL = np.zeros((m_max, K))
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()
        # Phase 2: PRESS for every (chain length, starting column) pair
        PRESS = np.full((m_max + 1, K), np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    var_sel = SEL[:m, k].astype(int)
                    _, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
                    PRESS[m, k] = e.T @ e
                    bar.next()
        # Best chain length per start, then the best start overall
        m_sel = np.argmin(PRESS, axis=0)
        k_sel = np.argmin(np.min(PRESS, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)
        # Phase 3: rank the winning chain's variables by MLR relevance |b|*std
        Xcal2 = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel_phase2].to_numpy()])
        b = np.linalg.lstsq(Xcal2, ycal, rcond=None)[0]
        std_deviation = Xcal2.std(ddof=1, axis=0)
        relev = np.abs(b * std_deviation)[1:]  # drop the intercept column
        index_decreasing_relev = np.argsort(-relev)
        # PRESS scree over nested prefixes of decreasing relevance
        PRESS_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
            PRESS_scree[i] = np.conj(e).T @ e
        RMSEP_scree = np.sqrt(PRESS_scree / len(e))  # `e` is from the last loop iteration
        # F-test: smallest subset whose PRESS is statistically indistinguishable
        # from the minimum at significance alpha
        alpha = 0.25
        dof = len(e)
        fcrit = scipy.stats.f.ppf(1 - alpha, dof, dof)
        PRESS_crit = np.min(PRESS_scree) * fcrit
        i_crit = np.min(np.nonzero(PRESS_scree < PRESS_crit))
        i_crit = max(m_min, i_crit)
        var_sel = var_sel_phase2[index_decreasing_relev[:i_crit]]
        # Plot the scree curve
        plt.figure()
        # Use the Times New Roman font
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
        # Title, axis labels and grid
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('RMSE', fontsize=14)
        plt.title(f'Final number of selected variables: {len(var_sel)} (RMSE={RMSEP_scree[i_crit]:.4f})', fontsize=16)
        # Draw the RMSEP curve and mark the chosen point
        plt.plot(RMSEP_scree, label='RMSEP Scree Plot')
        plt.scatter(i_crit, RMSEP_scree[i_crit], color='r', marker='s', label='Selected Point')
        # Grid and legend
        plt.grid(True)
        plt.legend()
        # Save or show the figure
        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
        else:
            plt.show()
        return var_sel, var_sel_phase2

    def __repr__(self):
        return "SPA()"

View File

@ -0,0 +1,111 @@
import lightgbm as lgb
import numpy as np
from scipy.linalg import qr
from progress.bar import Bar
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
class SPA_acc:
    """SPA variant scored by LightGBM classification accuracy instead of PRESS.

    Phase 1 builds candidate variable chains with QR projections; phase 2
    scores each chain by classification accuracy; a final scree over
    gain-ranked prefixes is plotted for inspection.
    """

    def _projections_qr(self, X, k, M):
        """Return the first M column indices of the projection chain seeded at column k."""
        X_projected = X.copy()
        norms = np.sum((X ** 2), axis=0)
        norm_max = np.amax(norms)
        # Boost column k so the pivoted QR selects it first
        X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]
        _, __, order = qr(X_projected.to_numpy(), 0, pivoting=True)
        return order[:M].T

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Train LightGBM on the selected columns and return (predictions, accuracy).

        Accuracy is computed on the validation set when provided, otherwise on
        the training set itself (an optimistic estimate).
        """
        train_data = lgb.Dataset(Xcal.iloc[:, var_sel], label=ycal)
        params = {
            'objective': 'multiclass',  # multi-class objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',  # multi-class cross-entropy loss
            'device': 'gpu',  # NOTE(review): requires a GPU-enabled LightGBM build — confirm
            'verbosity': -1,
            'num_class': len(np.unique(ycal))  # number of classes
        }
        # Train the LightGBM model
        model = lgb.train(params, train_data, num_boost_round=100)
        # Predict on the validation set when available, else the training set
        if Xval is not None and yval is not None:
            yhat = model.predict(Xval.iloc[:, var_sel])
            yhat = np.argmax(yhat, axis=1)  # most probable class per sample
            accuracy = accuracy_score(yval, yhat)
        else:
            yhat = model.predict(Xcal.iloc[:, var_sel])
            yhat = np.argmax(yhat, axis=1)  # most probable class per sample
            accuracy = accuracy_score(ycal, yhat)
        return yhat, accuracy

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Run accuracy-driven SPA and plot the accuracy scree curve.

        Xcal/Xval must be DataFrames (indexed with .iloc internally).
        Returns (var_sel_phase2, ACCURACY_scree).
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max
        # Optional autoscaling by the column standard deviation
        normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor
        # Phase 1: one projection chain per starting column
        SEL = np.zeros((m_max, K))
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()
        # Phase 2: accuracy for every (chain length, starting column) pair
        ACCURACY = np.full((m_max + 1, K), -np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    var_sel = SEL[:m, k].astype(int)
                    _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
                    ACCURACY[m, k] = accuracy
                    bar.next()
        # Best chain length per start, then the best start overall
        m_sel = np.argmax(ACCURACY, axis=0)
        k_sel = np.argmax(np.max(ACCURACY, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)
        # Final LightGBM Training on the winning subset, to rank by gain
        Xcal2 = Xcal.iloc[:, var_sel_phase2]
        train_data = lgb.Dataset(Xcal2, label=ycal)
        params = {
            'objective': 'multiclass',  # multi-class objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',  # multi-class cross-entropy loss
            'device': 'gpu',  # NOTE(review): requires a GPU-enabled LightGBM build — confirm
            'verbosity': -1,
            'num_class': len(np.unique(ycal))  # number of classes
        }
        model = lgb.train(params, train_data, num_boost_round=100)
        relev = model.feature_importance(importance_type='gain')
        index_decreasing_relev = np.argsort(-relev)
        # Accuracy scree over nested prefixes of decreasing gain
        ACCURACY_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
            ACCURACY_scree[i] = accuracy
        # Plot the scree curve
        plt.figure()
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.title(f'Final number of selected variables: {len(var_sel_phase2)} (Accuracy={ACCURACY_scree.max():.4f})', fontsize=16)
        plt.plot(ACCURACY_scree, label='Accuracy Scree Plot')
        plt.scatter(np.argmax(ACCURACY_scree), ACCURACY_scree.max(), color='r', marker='s', label='Selected Point')
        plt.grid(True)
        plt.legend()
        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
        else:
            plt.show()
        return var_sel_phase2, ACCURACY_scree

    def __repr__(self):
        return "SPA()"

View File

@ -0,0 +1,92 @@
"""
-*- coding: utf-8 -*-
@Time :2022/04/12 17:10
@Author : Pengyou FU
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
@github : https://github.com/FuSiry/OpenSA
@WeChat : Fu_siry
@LicenseApache-2.0 license
"""
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score
from numpy.linalg import matrix_rank as rank
import numpy as np
class UVE:
    """Uninformative Variable Elimination based on PLS coefficient stability."""

    def __init__(self, x, y, ncomp=20, nrep=500, testSize=0.2):
        """
        Set up the UVE model.

        Parameters:
            x : np.ndarray, predictor matrix (input data)
            y : np.ndarray, labels (target values)
            ncomp : int, maximum number of PLS latent variables (default 20)
            nrep : int, number of resampling repetitions (default 500)
            testSize : float, test fraction of each random split (default 0.2)
        """
        self.x = x
        self.y = y
        self.ncomp = min(ncomp, rank(x))  # latent variables cannot exceed the matrix rank
        self.nrep = nrep
        self.testSize = testSize
        self.criteria = None  # stability criterion per variable
        self.featureIndex = None  # variables sorted by |criterion|, descending
        self.featureR2 = np.full(self.x.shape[1], np.nan)  # R² per nested subset size
        self.selFeature = None  # finally selected feature indices

    def calcCriteria(self):
        """Compute the stability criterion meanCoef / stdCoef for every variable."""
        coefs = np.zeros((self.nrep, self.x.shape[1]))  # PLS coefficients per repetition
        splitter = ShuffleSplit(n_splits=self.nrep, test_size=self.testSize)
        # Refit PLS on each random training subset and record the coefficients
        for step, (train, test) in enumerate(splitter.split(self.x, self.y)):
            xtrain, ytrain = self.x[train], self.y[train]
            model = PLSRegression(n_components=min(self.ncomp, rank(xtrain)))
            model.fit(xtrain, ytrain)
            coefs[step, :] = model.coef_.flatten()
        meanCoef = np.mean(coefs, axis=0)
        stdCoef = np.std(coefs, axis=0)
        # Guarded division: variables with zero std get criterion 0
        self.criteria = np.divide(meanCoef, stdCoef, out=np.zeros_like(meanCoef), where=stdCoef != 0)

    def evalCriteria(self, cv=3):
        """Cross-validate R² for nested subsets ranked by |criterion|."""
        self.featureIndex = np.argsort(-np.abs(self.criteria))
        # Grow the subset one variable at a time and score each prefix
        for i in range(self.x.shape[1]):
            subset = self.x[:, self.featureIndex[:i + 1]]
            # Plain least squares while the subset is small, PLS afterwards
            if i < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(subset)))
            scores = cross_val_score(regModel, subset, self.y, cv=cv, scoring='r2')
            self.featureR2[i] = np.mean(scores)

    def cutFeature(self, *args):
        """Keep the subset with the best R²; optionally slice the given datasets too."""
        # nanargmax ignores any NaN entries left in featureR2
        best = np.nanargmax(self.featureR2)
        self.selFeature = self.featureIndex[:best + 1]
        if len(args) != 0:
            # Return each passed dataset restricted to the selected columns
            # (datasets whose width does not match x are passed through unchanged)
            return [argi[:, self.selFeature] if argi.shape[1] == self.x.shape[1] else argi
                    for argi in args]
        # Otherwise return the selected column indices
        return self.selFeature

View File

@ -0,0 +1,93 @@
import pandas as pd
import numpy as np
from classification_model.WaveSelect.Lar import Lar
from classification_model.WaveSelect.Spa import SPA
from classification_model.WaveSelect.Spa_acc import SPA_acc
from classification_model.WaveSelect.Uve import UVE
from classification_model.WaveSelect.Cars import CARS_Cloud
from classification_model.WaveSelect.Pca import Pca
from classification_model.WaveSelect.GA import GA
from classification_model.WaveSelect.ReliefF import ReliefF
from sklearn.model_selection import train_test_split
# from WaveSelect.MRMR import MRMRFeatureSelection
import os
import matplotlib.pyplot as plt
def SpctrumFeatureSelcet(method, X, y, name='', result_dir='', column_names=None):
    """
    :param method: wavelength selection / dimensionality reduction method:
        "None", "Cars", "Lars", "Uve", "Spa", "Spa_acc", "GA", "Pca", "ReliefF".
    :param X: spectral data, pandas DataFrame or numpy array (n_samples, n_features).
    :param y: labels for the spectra (n_samples,).
    :param name: base file name for result figures.
    :param result_dir: directory where result figures are saved.
    :param column_names: column names, required only when X is a numpy array.
    :return:
        - X_Feature: data after selection/reduction (n_samples, n_features).
        - y: the matching labels.
        - selected_columns: the chosen column names (indices for "Cars").
        On an unknown method, returns (None, None, None).
    """
    # Normalize the input to a DataFrame (the unnecessary `global X_Feature`
    # from the original has been removed)
    if isinstance(X, np.ndarray):
        if column_names is None:
            column_names = [f"{i}" for i in range(X.shape[1])]  # default column names
        X_df = pd.DataFrame(X, columns=column_names)
    else:
        X_df = X
    # Dispatch to the requested feature-selection method
    if method == "None":
        X_Feature = X_df
        selected_columns = X_df.columns
    elif method == "Cars":
        save_path = os.path.join(result_dir, f"{name}_cars.png")
        Featuresecletidx = CARS_Cloud(X_df.values, y, N=50, f=20, cv=10, save_fig=True, save_path=save_path)
        Featuresecletidx = Featuresecletidx.astype(int)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        # NOTE(review): this branch returns positional indices while the others
        # return column names — kept as-is for backward compatibility
        selected_columns = Featuresecletidx
    elif method == "Lars":
        Featuresecletidx = Lar(X_df.values, y)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Uve":
        uve = UVE(X_df.values, y, 20)
        uve.calcCriteria()
        uve.evalCriteria(cv=5)
        Featuresecletidx = uve.cutFeature()  # indices of the selected features
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Spa":
        save_path = os.path.join(result_dir, f"{name}_spa.png")
        Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.3)
        Featuresecletidx, var_sel_phase2 = SPA().spa(
            Xcal, ycal, m_min=2, m_max=50, Xval=Xval, yval=yval, autoscaling=1, save_path=save_path)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Spa_acc":
        save_path = os.path.join(result_dir, f"{name}_spa_acc.png")
        Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.3)
        Featuresecletidx, var_sel_phase2 = SPA_acc().spa(
            Xcal, ycal, m_min=2, m_max=50, Xval=Xval, yval=yval, autoscaling=1, save_path=save_path)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "GA":
        Featuresecletidx = GA(X_df.values, y, 10)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Pca":
        X_Feature = Pca(X_df.values)
        selected_columns = [f"PC{i+1}" for i in range(X_Feature.shape[1])]
    elif method == "ReliefF":
        relieff = ReliefF(n_neighbors=20, n_features_to_keep=20)
        Featuresecletidx = relieff.fit(X_df.values, y)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    else:
        print("没有这个波长筛选方法!")
        # BUGFIX: the original returned only (None, None) here while every
        # other path returns three values, crashing any 3-way unpacking caller
        return None, None, None
    return X_Feature, y, selected_columns  # selected data, labels, column names

View File

@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from WaveSelect.Lar import Lar
from WaveSelect.Spa import SPA
from WaveSelect.Uve import UVE
from WaveSelect.Cars import CARS_Cloud
from WaveSelect.GA import GA
from WaveSelect.ReliefF import ReliefF
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
import os
def IntegratedWaveSelect(methods, X, y, strategy="voting", column_names=None, name='', result_dir=''):
    """
    Ensemble wavelength selection: run several selection methods and combine
    their chosen feature indices.

    :param methods: list of method names ("Cars", "Lars", "Uve", "Spa", "GA", "ReliefF").
    :param X: spectral data, pandas DataFrame or numpy array (n_samples, n_features).
    :param y: labels (n_samples,).
    :param strategy: combination strategy: "voting", "weighted" or "bagging".
    :param column_names: column names, required only when X is a numpy array.
    :param name: base file name for result figures.
    :param result_dir: directory where result figures are saved.
    :return: (X_Feature, y, selected_columns).
    :raises ValueError: if strategy is not one of the supported names.
    """
    # Ensure X is a DataFrame
    if isinstance(X, np.ndarray):
        if column_names is None:
            column_names = [f"{i}" for i in range(X.shape[1])]
        X_df = pd.DataFrame(X, columns=column_names)
    else:
        X_df = X
    feature_indices_list = []
    for method in tqdm(methods, desc="Processing Feature Selection Methods"):
        print(f"Applying method: {method}")  # debug info
        if method == "Cars":
            save_path = os.path.join(result_dir, f"{name}_cars.png")
            # BUGFIX: pass the ndarray — CARS_Cloud indexes with np.ix_/slicing,
            # which fails on a raw DataFrame input
            Featuresecletidx = CARS_Cloud(X_df.values, y, N=50, f=20, cv=10, save_path=save_path)
            Featuresecletidx = Featuresecletidx.astype(int)
            feature_indices_list.append(Featuresecletidx)
        elif method == "Lars":
            Featuresecletidx = Lar(X_df.values, y)
            feature_indices_list.append(Featuresecletidx)
        elif method == "Uve":
            uve = UVE(X_df.values, y, 20)
            uve.calcCriteria()
            uve.evalCriteria(cv=5)
            Featuresecletidx = uve.cutFeature()
            feature_indices_list.append(Featuresecletidx)
        elif method == "Spa":
            save_path = os.path.join(result_dir, f"{name}_spa.png")
            Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.2)
            # BUGFIX: SPA.spa needs DataFrames (it uses .iloc internally) and
            # returns (var_sel, var_sel_phase2); the original passed ndarrays
            # and appended the whole tuple, which breaks the combination step
            Featuresecletidx, _ = SPA().spa(
                Xcal=Xcal, ycal=ycal, m_min=4, m_max=32, Xval=Xval, yval=yval, autoscaling=1, save_path=save_path)
            feature_indices_list.append(Featuresecletidx)
        elif method == "GA":
            Featuresecletidx = GA(X_df.values, y, 10)
            feature_indices_list.append(Featuresecletidx)
        elif method == "ReliefF":
            relieff = ReliefF(n_neighbors=20, n_features_to_keep=20)
            Featuresecletidx = relieff.fit(X_df.values, y)
            feature_indices_list.append(Featuresecletidx)
        else:
            print(f"No such method: {method}")
            continue
        print(f"Selected indices by {method}: {Featuresecletidx}")  # debug info
    print("Feature indices list after all methods:", feature_indices_list)  # debug info
    if strategy == "voting":
        # Keep indices chosen by a strict majority of the methods
        if feature_indices_list:
            all_indices = np.concatenate(feature_indices_list)
            print("All indices concatenated:", all_indices)  # debug info
            counter = Counter(all_indices)
            print("Counter result:", counter)  # debug info
            selected_features = [
                idx for idx, count in tqdm(counter.items(), desc="Voting Selection")
                if count > len(methods) / 2
            ]
            print("Selected features after voting:", selected_features)  # debug info
        else:
            print("No features selected by any method.")  # info
            selected_features = []
    elif strategy == "weighted":
        # Equal weights per method; keep indices scoring at least the mean count
        weights = {method: 1 for method in methods}
        weighted_counts = Counter()
        for method, indices in zip(methods, feature_indices_list):
            for idx in indices:
                weighted_counts[idx] += weights[method]
        print("Weighted counts:", weighted_counts)  # debug info
        selected_features = [
            idx for idx, count in tqdm(weighted_counts.items(), desc="Weighted Selection")
            if count >= np.mean(list(weighted_counts.values()))
        ]
        print("Selected features after weighted strategy:", selected_features)  # debug info
    elif strategy == "bagging":
        # Union of selections over several random half-samples of the data
        num_iterations = 5
        selected_features = set()
        for _ in tqdm(range(num_iterations), desc="Bagging Iterations"):
            X_sample, _, y_sample, _ = train_test_split(X_df, y, test_size=0.5)
            sub_feature_indices_list = []
            for method in methods:
                if method == "Spa":
                    Xcal, Xval, ycal, yval = train_test_split(X_sample, y_sample, test_size=0.2)
                    # BUGFIX: pass DataFrames and unpack the (var_sel, phase2)
                    # tuple instead of collecting unhashable arrays
                    sub_feature_indices, _ = SPA().spa(Xcal=Xcal, ycal=ycal, m_min=4, m_max=32, Xval=Xval,
                                                       yval=yval, autoscaling=1)
                elif method == "Cars":
                    sub_feature_indices = CARS_Cloud(X_sample.values, y_sample)
                else:
                    continue
                sub_feature_indices_list.append(sub_feature_indices)
            for sub_indices in sub_feature_indices_list:
                selected_features.update(sub_indices)
        selected_features = list(selected_features)
        print("Selected features after bagging strategy:", selected_features)  # debug info
    else:
        raise ValueError("Invalid strategy. Choose from 'voting', 'weighted', or 'bagging'.")
    selected_features = list(map(int, selected_features))  # ensure plain int indices
    X_Feature = X_df.iloc[:, selected_features]
    selected_columns = X_df.columns[selected_features]
    # Close figures opened by the selection methods to free resources
    plt.close()
    return X_Feature, y, selected_columns