增加模块;增加主调用命令
This commit is contained in:
176
Feature_Selection_method/Cars.py
Normal file
176
Feature_Selection_method/Cars.py
Normal file
@ -0,0 +1,176 @@
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import copy
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
|
||||
def PC_Cross_Validation(X, y, pc, cv):
    """Pick the optimal number of PLS components via K-fold cross-validation.

    Args:
        X: spectral matrix (DataFrame), shape (n, m).
        y: concentration values (Series, chemical reference values).
        pc: maximum number of principal components to try.
        cv: number of cross-validation folds.

    Returns:
        RMSECV: list of RMSECV values, one per component count (1..pc).
        rindex: index of the best component count (argmin of RMSECV).
    """
    splitter = KFold(n_splits=cv)
    RMSECV = []
    # Try every component count from 1 to pc and cross-validate each.
    for n_comp in range(1, pc + 1):
        fold_errors = []
        for tr_idx, te_idx in splitter.split(X):
            model = PLSRegression(n_components=n_comp)
            model.fit(X.iloc[tr_idx], y.iloc[tr_idx])
            predictions = model.predict(X.iloc[te_idx])
            fold_errors.append(np.sqrt(mean_squared_error(y.iloc[te_idx], predictions)))
        RMSECV.append(np.mean(fold_errors))
    rindex = np.argmin(RMSECV)
    return RMSECV, rindex
|
||||
|
||||
|
||||
def Cross_Validation(X, y, pc, cv):
    """Compute the mean K-fold RMSECV of a PLS model with a fixed component count.

    Args:
        X: spectral matrix (DataFrame), shape (n, m).
        y: concentration values (Series, chemical reference values).
        pc: number of PLS components to use.
        cv: number of cross-validation folds.

    Returns:
        Mean RMSE across the cv folds.
    """
    splitter = KFold(n_splits=cv)
    fold_errors = []
    for tr_idx, te_idx in splitter.split(X):
        model = PLSRegression(n_components=pc)
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        predictions = model.predict(X.iloc[te_idx])
        fold_errors.append(np.sqrt(mean_squared_error(y.iloc[te_idx], predictions)))
    return np.mean(fold_errors)
|
||||
|
||||
|
||||
def CARS_Cloud(X, y, N=50, f=20, cv=10, save_fig=False, save_path=None):
    '''
    Competitive Adaptive Reweighted Sampling (CARS) wavelength selection.

    X : spectral matrix (DataFrame or ndarray), shape (m samples, n wavelengths)
    y : concentration values (Series or ndarray)
    N : number of Monte Carlo sampling iterations
    f : maximum number of PLS components / retained features
    cv : number of cross-validation folds
    save_fig : whether to save the diagnostic figure
    save_path : file path used when save_fig is True
    return :
        OptWave : indices of the selected wavelengths
    '''
    p = 0.8  # fraction of samples drawn into each calibration subset
    m, n = X.shape
    # u and k parameterize the exponentially decaying retention ratio r_i = u * exp(-k*i).
    u = np.power((n / 2), (1 / (N - 1)))
    k = (1 / (N - 1)) * np.log(n / 2)
    cal_num = np.round(m * p)  # calibration subset size
    b2 = np.arange(n)  # current wavelength ranking, re-sorted by |coefficient| each round
    x = X  # NOTE(review): original comment claimed a DataFrame->ndarray conversion, but none happens; the np.ix_/slicing below needs ndarrays — confirm callers pass ndarrays
    y = y  # NOTE(review): same — no Series->ndarray conversion actually occurs here
    # D carries the original wavelength ids in row 0 so they survive column pruning.
    D = np.vstack((np.array(b2).reshape(1, -1), x))
    WaveData = []  # per-iteration record of which wavelengths survived
    WaveNum = []   # number of retained wavelengths per iteration
    RMSECV = []    # cross-validated RMSE per iteration
    r = []         # retention ratio per iteration

    for i in range(1, N + 1):
        # Exponentially shrinking fraction of wavelengths kept this round.
        r.append(u * np.exp(-1 * k * i))
        wave_num = int(np.round(r[i - 1] * n))
        WaveNum = np.hstack((WaveNum, wave_num))
        # Monte Carlo sampling: random calibration subset without replacement.
        cal_index = np.random.choice(np.arange(m), size=int(cal_num), replace=False)
        wave_index = b2[:wave_num].reshape(1, -1)[0]  # keep the top-ranked wavelengths

        # Use np.ix_ for simultaneous row (sample) and column (wavelength) selection.
        xcal = x[np.ix_(cal_index, wave_index)]
        ycal = y[cal_index]  # matching target values

        # Flatten ycal to a 1-D array.
        ycal = ycal.ravel()

        x = x[:, wave_index]  # prune x to the surviving wavelengths
        D = D[:, wave_index]  # prune D in lockstep so row 0 keeps original ids
        d = D[0, :].reshape(1, -1)
        wnum = n - wave_num
        if wnum > 0:
            # Pad with -1 so every iteration's record has constant length n.
            d = np.hstack((d, np.full((1, wnum), -1)))
        if len(WaveData) == 0:
            WaveData = d
        else:
            WaveData = np.vstack((WaveData, d.reshape(1, -1)))

        if wave_num < f:
            f = wave_num  # cannot use more PLS components than remaining wavelengths

        pls = PLSRegression(n_components=f)
        pls.fit(xcal, ycal)
        beta = pls.coef_

        # Handle both sklearn coef_ layouts.
        if beta.shape[0] == 1:  # newer sklearn: shape (1, p)
            b = np.abs(beta[0])  # coefficients from the single row
            coeff = beta[0, b2]
        else:  # older sklearn: shape (p, 1)
            b = np.abs(beta[:, 0])  # coefficients from the single column
            coeff = beta[b2, 0]

        # Re-rank wavelengths by absolute regression coefficient, descending.
        b2 = np.argsort(-b, axis=0)
        coef = copy.deepcopy(beta)
        coeff = coef[b2, :].reshape(len(b2), -1)
        # NOTE(review): 'coeff' is assigned twice above and never read afterwards — looks like dead code; confirm before removing.
        rmsecv, rindex = PC_Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), f, cv)
        RMSECV.append(Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), rindex + 1, cv))

    # Rebuild a presence matrix: WAVE[i, j] is nonzero iff wavelength j survived iteration i.
    WAVE = []
    for i in range(WaveData.shape[0]):
        wd = WaveData[i, :]
        WD = np.ones((len(wd)))
        for j in range(len(wd)):
            ind = np.where(wd == j)
            if len(ind[0]) == 0:
                WD[j] = 0  # wavelength j was dropped in this iteration
            else:
                WD[j] = wd[ind[0]]  # presumably stores the wavelength id itself (== j) — verify
        if len(WAVE) == 0:
            WAVE = copy.deepcopy(WD)
        else:
            WAVE = np.vstack((WAVE, WD.reshape(1, -1)))
    # NOTE(review): wavelength 0 yields WD[0] == 0 even when retained, so index 0 can
    # never appear in OptWave (the != 0 test below drops it) — confirm this is intended.

    # The iteration with the minimal RMSECV defines the final wavelength subset.
    MinIndex = np.argmin(RMSECV)
    Optimal = WAVE[MinIndex, :]
    boindex = np.where(Optimal != 0)
    OptWave = boindex[0]

    plt.figure(figsize=(12, 10))
    # Use Times New Roman for all figure text.
    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    fonts = 20

    plt.subplot(211)
    plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
    plt.ylabel('Number of Selected Wavelengths', fontsize=fonts)
    plt.title('Optimal Iteration: ' + str(MinIndex), fontsize=fonts)
    plt.plot(np.arange(N), WaveNum)

    plt.subplot(212)
    plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
    plt.ylabel('RMSECV', fontsize=fonts)
    plt.plot(np.arange(N), RMSECV)

    # Optionally save the figure to disk.
    if save_fig:
        plt.savefig(save_path)  # write the figure to file
        print(f"The figure has been saved as {save_path}")

    # plt.show()

    return OptWave
|
||||
59
Feature_Selection_method/GA.py
Normal file
59
Feature_Selection_method/GA.py
Normal file
@ -0,0 +1,59 @@
|
||||
from deap import base, creator, tools, algorithms
|
||||
import numpy as np
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
|
||||
def GA(X, y, n_generations=20, population_size=50, crossover_prob=0.7, mutation_prob=0.2):
    """
    Genetic-algorithm feature selection; returns the chosen feature indices.

    Args:
        X (ndarray): feature matrix
        y (ndarray): labels
        n_generations (int): number of GA generations
        population_size (int): population size
        crossover_prob (float): crossover probability
        mutation_prob (float): mutation probability

    Returns:
        list: indices of the selected features
    """
    # Fitness/individual types are registered in deap's global `creator` namespace.
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", lambda: np.random.randint(0, 2))
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def evaluate(individual):
        # Fitness = mean 5-fold CV score of a random forest on the chosen columns.
        chosen = [pos for pos, bit in enumerate(individual) if bit == 1]
        if not chosen:
            return 0,  # no features selected -> zero fitness
        subset = X[:, chosen]
        model = RandomForestClassifier(random_state=42)
        return cross_val_score(model, subset, y, cv=5).mean(),

    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # Initialize the population and run the standard evolutionary loop.
    population = toolbox.population(n=population_size)
    result_population, _ = algorithms.eaSimple(
        population, toolbox,
        cxpb=crossover_prob, mutpb=mutation_prob,
        ngen=n_generations, verbose=False,
    )

    # Decode the fittest individual back into feature indices.
    champion = tools.selBest(result_population, k=1)[0]
    return [pos for pos, bit in enumerate(champion) if bit == 1]
|
||||
31
Feature_Selection_method/Lar.py
Normal file
31
Feature_Selection_method/Lar.py
Normal file
@ -0,0 +1,31 @@
|
||||
from sklearn import linear_model
|
||||
import numpy as np
|
||||
|
||||
def Lar(X, y, nums=40):
    """
    Select important feature wavelengths with LARS (Least Angle Regression).

    Args:
        X : np.ndarray, predictor matrix (input data)
        y : np.ndarray, target values
        nums : int, number of features to select (default 40)

    Returns:
        np.ndarray, sorted indices of the selected feature wavelengths
    """
    # Fit a LARS model (local name avoids shadowing the linear_model.Lars class).
    model = linear_model.Lars()
    model.fit(X, y)

    # Absolute regression coefficients serve as feature importances.
    importance = np.abs(model.coef_)

    # Take the `nums` most important features (most important first)...
    top_features = np.argsort(importance)[-nums:][::-1]

    # ...then sort the indices so the returned order is stable.
    return np.sort(top_features)
|
||||
88
Feature_Selection_method/ReliefF.py
Normal file
88
Feature_Selection_method/ReliefF.py
Normal file
@ -0,0 +1,88 @@
|
||||
|
||||
import numpy as np
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
class ReliefF:
    """ReliefF feature scoring using nearest same-class and other-class neighbors."""

    def __init__(self, n_neighbors=20, n_features_to_keep=20):
        """
        Configure the ReliefF parameters.
        :param n_neighbors: number of neighbors examined per sample.
        :param n_features_to_keep: number of top-scoring features to keep.
        """
        self.n_neighbors = n_neighbors
        self.n_features_to_keep = n_features_to_keep
        self.feature_scores = None  # per-feature score, filled by fit()
        self.top_features = None    # indices of the highest-scoring features

    def fit(self, X, y):
        """
        Score every feature on (X, y) and return the selected feature indices.
        :param X: input feature matrix.
        :param y: class labels.
        :return: indices of the selected features.
        """
        n_samples, n_features = X.shape  # samples x features

        self.feature_scores = np.zeros(n_features)  # start all scores at 0

        # k+1 neighbors because each sample is its own nearest neighbor.
        knn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X)
        _, neighbor_idx = knn.kneighbors(X)

        denom = self.n_neighbors * n_samples  # shared normalization factor
        # Walk every sample and update the feature scores.
        for i in range(n_samples):
            label = y[i]  # class of the current sample

            # Partition the neighbors into same-class (hits) and other-class (misses).
            same_class = []
            other_class = []
            for j in neighbor_idx[i][1:]:  # neighbor_idx[i][0] is the sample itself
                if y[j] == label:
                    same_class.append(X[j])
                else:
                    other_class.append(X[j])

            # Hits pull a differing feature's score down; misses push it up.
            for feat in range(n_features):
                for neighbor in same_class:
                    self.feature_scores[feat] -= (X[i, feat] - neighbor[feat]) ** 2 / denom
                for neighbor in other_class:
                    self.feature_scores[feat] += (X[i, feat] - neighbor[feat]) ** 2 / denom

        # Keep the n_features_to_keep highest-scoring feature indices.
        self.top_features = np.argsort(self.feature_scores)[-self.n_features_to_keep:]

        return self.top_features

    def fit_transform(self, X, y):
        """Fit and return the selected feature indices in one call."""
        return self.fit(X, y)
|
||||
|
||||
def multi_scale_relieff_stratified(X, y, segment_size=100, n_subsegments=20, n_features_per_subsegment=5):
    """
    Stratified multi-scale ReliefF selection that covers every wavelength band.
    :param X: input feature matrix.
    :param y: class labels.
    :param segment_size: width of each wavelength segment.
    :param n_subsegments: number of sub-regions per segment.
    :param n_features_per_subsegment: features kept per sub-region.
    :return: sorted unique indices of the selected features.
    """
    subsegment_size = segment_size // n_subsegments  # width of a single sub-region
    picked = []

    # Sweep the spectrum segment by segment...
    for seg_start in range(0, X.shape[1], segment_size):
        segment = X[:, seg_start:seg_start + segment_size]

        # ...and run ReliefF independently inside each sub-region.
        for sub_start in range(0, segment_size, subsegment_size):
            block = segment[:, sub_start:sub_start + subsegment_size]
            selector = ReliefF(n_neighbors=10, n_features_to_keep=n_features_per_subsegment)
            local_idx = selector.fit_transform(block, y)

            # Shift local indices back to absolute wavelength positions.
            picked.extend(local_idx + seg_start + sub_start)

    # De-duplicate (np.unique also sorts the result).
    return np.unique(picked)
|
||||
116
Feature_Selection_method/Spa.py
Normal file
116
Feature_Selection_method/Spa.py
Normal file
@ -0,0 +1,116 @@
|
||||
import scipy.stats
|
||||
import numpy as np
|
||||
from scipy.linalg import qr, inv, pinv
|
||||
import scipy.stats
|
||||
from progress.bar import Bar
|
||||
from matplotlib import pyplot as plt
|
||||
class SPA:
    """Successive Projections Algorithm (SPA) for spectral variable selection."""

    def _projections_qr(self, X, k, M):
        """Rank variables by successive projections starting from column k.

        Implemented via a pivoted QR factorization: column k is inflated so the
        pivoting picks it first, and the pivot order then reproduces the
        successive-projection sequence. Returns the first M pivot indices.
        """
        X_projected = X.copy()
        norms = np.sum((X ** 2), axis=0)
        norm_max = np.amax(norms)
        # Scale column k above every other column so QR pivots on it first.
        X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]
        _, __, order = qr(X_projected.to_numpy(), 0, pivoting=True)
        return order[:M].T

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Fit MLR on the selected variables; return predictions and errors.

        With a validation set (Xval/yval): fit once on Xcal and predict Xval.
        Without one: leave-one-out cross-validation over Xcal.
        """
        N = Xcal.shape[0]
        NV = Xval.shape[0] if Xval is not None else 0

        yhat, e = None, None
        if NV > 0:
            # Single least-squares fit (with intercept column) on the calibration set.
            Xcal_ones = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel].to_numpy()])
            b = np.linalg.lstsq(Xcal_ones, ycal, rcond=None)[0]
            Xval_ones = np.hstack([np.ones((NV, 1)), Xval.iloc[:, var_sel].to_numpy()])
            yhat = Xval_ones.dot(b)
            e = yval - yhat
        else:
            # Leave-one-out: refit with sample i held out, predict sample i.
            yhat = np.zeros((N, 1))
            for i in range(N):
                cal = np.hstack([np.arange(i), np.arange(i + 1, N)])
                X = Xcal.iloc[cal, var_sel]
                y = ycal.iloc[cal]
                X_ones = np.hstack([np.ones((N - 1, 1)), X.to_numpy()])
                b = np.linalg.lstsq(X_ones, y, rcond=None)[0]
                xtest = Xcal.iloc[i, var_sel].to_numpy()
                yhat[i] = np.hstack([1, xtest]).dot(b)
            e = ycal.to_numpy() - yhat
        return yhat, e

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Run the full SPA pipeline and plot the RMSE scree curve.

        Xcal/ycal: calibration data (DataFrame/Series).
        m_min/m_max: minimum/maximum subset sizes to evaluate.
        Xval/yval: optional validation set (otherwise leave-one-out).
        autoscaling: nonzero -> autoscale columns by their std before projection.
        save_path: if given, save the plot there instead of showing it.
        Returns (var_sel, var_sel_phase2): final and phase-2 variable indices.
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max

        # Phase 0: mean-center (and optionally autoscale) the calibration matrix.
        normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor

        # Phase 1: one projection chain per starting variable.
        SEL = np.zeros((m_max, K))
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()

        # Phase 2: PRESS for every (subset size, starting variable) pair.
        PRESS = np.full((m_max + 1, K), np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    var_sel = SEL[:m, k].astype(int)
                    _, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
                    PRESS[m, k] = e.T @ e
                    bar.next()

        # Best chain = the (size, start) pair with minimal PRESS.
        m_sel = np.argmin(PRESS, axis=0)
        k_sel = np.argmin(np.min(PRESS, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)

        # Phase 3: rank the phase-2 variables by |coefficient * column std|.
        Xcal2 = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel_phase2].to_numpy()])
        b = np.linalg.lstsq(Xcal2, ycal, rcond=None)[0]
        std_deviation = Xcal2.std(ddof=1, axis=0)
        relev = np.abs(b * std_deviation)[1:]  # drop the intercept term

        # Scree curve: PRESS when keeping the i+1 most relevant variables.
        index_decreasing_relev = np.argsort(-relev)
        PRESS_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
            PRESS_scree[i] = np.conj(e).T @ e

        # F-test criterion: smallest subset not significantly worse than the best.
        RMSEP_scree = np.sqrt(PRESS_scree / len(e))
        alpha = 0.25
        dof = len(e)
        fcrit = scipy.stats.f.ppf(1 - alpha, dof, dof)
        PRESS_crit = np.min(PRESS_scree) * fcrit
        i_crit = np.min(np.nonzero(PRESS_scree < PRESS_crit))
        i_crit = max(m_min, i_crit)
        var_sel = var_sel_phase2[index_decreasing_relev[:i_crit]]

        # Plot the scree curve.
        plt.figure()

        # Use Times New Roman for figure text.
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

        # Title, axis labels.
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('RMSE', fontsize=14)
        plt.title(f'Final number of selected variables: {len(var_sel)} (RMSE={RMSEP_scree[i_crit]:.4f})', fontsize=16)

        # RMSEP curve plus a marker on the chosen subset size.
        plt.plot(RMSEP_scree, label='RMSEP Scree Plot')
        plt.scatter(i_crit, RMSEP_scree[i_crit], color='r', marker='s', label='Selected Point')

        # Grid and legend.
        plt.grid(True)
        plt.legend()

        # Save to file when a path is given, otherwise display interactively.
        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
        else:
            plt.show()
        return var_sel, var_sel_phase2

    def __repr__(self):
        return "SPA()"
|
||||
82
Feature_Selection_method/Uve.py
Normal file
82
Feature_Selection_method/Uve.py
Normal file
@ -0,0 +1,82 @@
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.model_selection import ShuffleSplit, cross_val_score
|
||||
from numpy.linalg import matrix_rank as rank
|
||||
import numpy as np
|
||||
|
||||
|
||||
class UVE:
    """Uninformative Variable Elimination (UVE) for PLS-based feature selection."""

    def __init__(self, x, y, ncomp=20, nrep=500, testSize=0.2):
        """
        Initialize the UVE model.

        Parameters:
            x : np.ndarray, predictor matrix (input data)
            y : np.ndarray, target values
            ncomp : int, maximum number of PLS latent variables, default 20
            nrep : int, number of resampling repetitions, default 500
            testSize : float, test fraction for each random split, default 0.2
        """
        self.x = x
        self.y = y
        self.ncomp = min(ncomp, rank(x))  # latent variables cannot exceed the matrix rank
        self.nrep = nrep
        self.testSize = testSize

        self.criteria = None  # stability criterion per variable (mean/std of coefficients)
        self.featureIndex = None  # variables sorted by |criterion|, descending
        self.featureR2 = np.full(self.x.shape[1], np.nan)  # R^2 per subset size
        self.selFeature = None  # finally selected feature indices

    def calcCriteria(self):
        """Compute each variable's stability criterion (meanCoef / stdCoef)."""
        PLSCoef = np.zeros((self.nrep, self.x.shape[1]))  # coefficients per repetition
        ss = ShuffleSplit(n_splits=self.nrep, test_size=self.testSize)

        # Fit a PLS model on every random training split and record its coefficients.
        for step, (train, test) in enumerate(ss.split(self.x, self.y)):
            xtrain, ytrain = self.x[train], self.y[train]
            plsModel = PLSRegression(n_components=min(self.ncomp, rank(xtrain)))
            plsModel.fit(xtrain, ytrain)
            PLSCoef[step, :] = plsModel.coef_.flatten()

        # np.divide with a `where` mask avoids division by zero for constant coefficients.
        meanCoef = np.mean(PLSCoef, axis=0)
        stdCoef = np.std(PLSCoef, axis=0)
        self.criteria = np.divide(meanCoef, stdCoef, out=np.zeros_like(meanCoef), where=stdCoef != 0)

    def evalCriteria(self, cv=3):
        """Cross-validate R^2 for nested variable subsets ordered by the criterion."""
        # Sort variables by absolute criterion, most stable first.
        self.featureIndex = np.argsort(-np.abs(self.criteria))

        # Grow the subset one variable at a time and score each size.
        for i in range(self.x.shape[1]):
            xi = self.x[:, self.featureIndex[:i + 1]]  # first i+1 ranked variables

            # Small subsets use plain OLS; once i reaches ncomp switch to PLS.
            if i < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))

            # Cross-validate and record the mean R^2 for this subset size.
            cvScore = cross_val_score(regModel, xi, self.y, cv=cv, scoring='r2')
            self.featureR2[i] = np.mean(cvScore)

    def cutFeature(self, *args):
        """Keep the subset with maximal R^2 and return the selected column indices.

        Any arrays passed in *args whose column count matches self.x are
        returned restricted to the selected columns instead.
        """
        # nanargmax skips subset sizes whose R^2 was never evaluated.
        cuti = np.nanargmax(self.featureR2)
        self.selFeature = self.featureIndex[:cuti + 1]  # optimal feature indices

        # If extra datasets were provided, return them filtered to the selection.
        if len(args) != 0:
            returnx = list(args)
            for i, argi in enumerate(args):
                if argi.shape[1] == self.x.shape[1]:
                    returnx[i] = argi[:, self.selFeature]
            return returnx

        # Otherwise return the selected column indices themselves.
        return self.selFeature
|
||||
728
Feature_Selection_method/batch_feature_selection.py
Normal file
728
Feature_Selection_method/batch_feature_selection.py
Normal file
@ -0,0 +1,728 @@
|
||||
"""
|
||||
批量特征选择工具
|
||||
支持对多个CSV文件或数据集进行批量特征选择
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Tuple, Union
|
||||
import argparse
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
import warnings
|
||||
|
||||
# 导入特征选择模块
|
||||
from feture_select import (
|
||||
FeatureSelectionConfig,
|
||||
select_features_from_csv,
|
||||
select_features_from_data
|
||||
)
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
|
||||
def parse_column_range(column_range: Union[str, int, List[Union[str, int]]], total_columns: int) -> List[int]:
    """
    Parse a column specification into a list of column indices.

    Supports a single index, a comma-separated string mixing "start:end"
    (half-open) ranges, "a-b" (inclusive) ranges and single indices, or a
    list/tuple of ints and such strings. Negative indices count from the
    end. Duplicates are dropped while preserving first-seen order.

    Args:
        column_range: column spec, e.g. "0:5", "2,4,6-8", [0, 1, 2], or 3
        total_columns: total number of columns in the table

    Returns:
        De-duplicated list of 0-based column indices in first-seen order.

    Raises:
        ValueError: on out-of-range indices or an unsupported spec type.
    """
    def _norm(idx: int) -> int:
        # Wrap a negative index and validate the resulting position.
        if idx < 0:
            idx = total_columns + idx
        if idx >= total_columns or idx < 0:
            raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
        return idx

    def _expand(part: str) -> List[int]:
        # Expand one textual piece: "a:b" (half-open), "a-b" (inclusive), or "i".
        part = part.strip()
        if ':' in part:
            start_s, end_s = part.split(':')
            start = int(start_s.strip()) if start_s.strip() else 0
            end = int(end_s.strip()) if end_s.strip() else total_columns
            if start < 0:
                start = total_columns + start
            if end < 0:
                end = total_columns + end
            if start >= total_columns or end > total_columns:
                raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
            return list(range(start, end))
        if '-' in part and not part.startswith('-'):
            # BUG FIX: the documented "6-8" form was never actually parsed
            # (it fell through to int() and raised). A leading '-' still
            # means a negative single index, not a range.
            lo_s, hi_s = part.split('-', 1)
            return [_norm(i) for i in range(int(lo_s), int(hi_s) + 1)]
        return [_norm(int(part))]

    collected: List[int] = []
    if isinstance(column_range, (int, np.integer)):
        # Single index; negatives now wrap like everywhere else (previously raised).
        collected.append(_norm(int(column_range)))
    elif isinstance(column_range, str):
        for piece in column_range.split(','):
            collected.extend(_expand(piece))
    elif isinstance(column_range, (list, tuple)):
        for item in column_range:
            if isinstance(item, str):
                for piece in item.split(','):
                    collected.extend(_expand(piece))
            else:
                collected.append(_norm(int(item)))
    else:
        raise ValueError(f"Unsupported column range format: {type(column_range)}")

    # BUG FIX: list(set(...)) discarded order; dict.fromkeys keeps first-seen order.
    return list(dict.fromkeys(collected))
|
||||
|
||||
|
||||
def convert_column_indices_to_names(df: pd.DataFrame, column_indices: List[int]) -> List[str]:
    """
    Map positional column indices to their column names.

    Args:
        df: source DataFrame
        column_indices: 0-based column positions

    Returns:
        Column names in the same order as the given indices.
    """
    columns = df.columns
    return [columns[position] for position in column_indices]
|
||||
|
||||
|
||||
def resolve_spectral_columns(df: pd.DataFrame, spectral_columns: Union[str, List[Union[str, int]], None]) -> List[str]:
    """
    Resolve a spectral-column spec to concrete column names.

    Args:
        df: source DataFrame
        spectral_columns: None (all columns), "auto" (numeric-column detection),
            or any column-range spec understood by ``parse_column_range``

    Returns:
        List of column names to treat as spectral bands.
    """
    if spectral_columns is None:
        # Default: every column (the caller removes the label column later).
        return df.columns.tolist()

    elif isinstance(spectral_columns, str) and spectral_columns == "auto":
        # Auto-detect spectral columns: numeric columns where at least 80% of
        # the values parse as numbers.
        potential_spectral_cols = []
        for col in df.columns:
            if pd.api.types.is_numeric_dtype(df[col]):
                try:
                    values = pd.to_numeric(df[col], errors='coerce')
                    if values.notna().sum() > len(df) * 0.8:  # at least 80% numeric
                        potential_spectral_cols.append(col)
                # BUG FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit and hid real bugs; only the
                # conversion errors pd.to_numeric can raise are expected here.
                except (ValueError, TypeError):
                    continue
        return potential_spectral_cols

    else:
        # Interpret as a column-range spec; fall back to auto-detection on error.
        try:
            column_indices = parse_column_range(spectral_columns, len(df.columns))
            return convert_column_indices_to_names(df, column_indices)
        except ValueError as e:
            print(f"解析光谱列时出错: {e}")
            print(f"将使用自动检测模式")
            return resolve_spectral_columns(df, "auto")
|
||||
|
||||
|
||||
def find_csv_files(directory: Union[str, Path], pattern: str = "*.csv") -> List[Path]:
    """
    Find every CSV file in a directory, sorted for reproducible ordering.

    Args:
        directory: directory to search
        pattern: glob pattern to match (default "*.csv")

    Returns:
        Sorted list of matching file paths.

    Raises:
        FileNotFoundError: if the directory does not exist.
    """
    directory = Path(directory)
    if not directory.exists():
        raise FileNotFoundError(f"目录不存在: {directory}")

    # Sort so repeated runs always process files in the same order.
    csv_files = sorted(directory.glob(pattern))

    print(f"在目录 {directory} 中找到 {len(csv_files)} 个CSV文件")
    return csv_files
|
||||
|
||||
|
||||
def create_batch_configs(csv_files: List[Path],
                         base_config: FeatureSelectionConfig,
                         output_base_dir: Union[str, Path]) -> List[Tuple[Path, FeatureSelectionConfig]]:
    """
    Build one FeatureSelectionConfig per CSV file.

    Args:
        csv_files: CSV files to configure
        base_config: base configuration cloned per file
        output_base_dir: root directory; each file gets its own subdirectory

    Returns:
        List of (file path, per-file config) tuples. Files whose columns
        cannot be resolved are skipped with a warning.
    """
    configs = []
    output_base_dir = Path(output_base_dir)

    for csv_file in csv_files:
        try:
            # Peek at the first rows only, to learn the column layout cheaply.
            df = pd.read_csv(csv_file, nrows=5)

            # Resolve the label column.
            if isinstance(base_config.label_column, str):
                if base_config.label_column not in df.columns:
                    # Named label column missing -> fall back to the first column.
                    print(f"警告: 文件 {csv_file.name} 中不存在标签列 '{base_config.label_column}',将尝试使用第一列")
                    resolved_label_column = df.columns[0]
                else:
                    resolved_label_column = base_config.label_column
            else:
                # Label column given as a positional index.
                try:
                    resolved_label_column = df.columns[base_config.label_column]
                except IndexError:
                    print(f"警告: 文件 {csv_file.name} 中的列索引 {base_config.label_column} 超出范围,将使用第一列")
                    resolved_label_column = df.columns[0]

            # Resolve which columns hold spectral data.
            resolved_spectral_columns = resolve_spectral_columns(df, base_config.spectral_columns)

            # The label column must not double as a spectral column.
            if resolved_label_column in resolved_spectral_columns:
                resolved_spectral_columns.remove(resolved_label_column)

            if len(resolved_spectral_columns) == 0:
                print(f"警告: 文件 {csv_file.name} 中没有找到有效的光谱列")
                continue

            print(f"文件 {csv_file.name}: 标签列='{resolved_label_column}', 光谱列数={len(resolved_spectral_columns)}")

        except Exception as e:
            # Unreadable/odd files are skipped rather than aborting the batch.
            print(f"读取文件 {csv_file.name} 时出错: {e},跳过此文件")
            continue

        # Each input file writes into its own subdirectory named after the file stem.
        file_stem = csv_file.stem
        file_output_dir = output_base_dir / file_stem
        file_output_dir.mkdir(parents=True, exist_ok=True)

        # Clone the base config with per-file paths, columns, and naming.
        config = FeatureSelectionConfig(
            method=base_config.method,
            method_params=base_config.method_params.copy(),
            csv_file_path=str(csv_file),
            label_column=resolved_label_column,
            spectral_columns=resolved_spectral_columns,
            output_csv=base_config.output_csv,
            output_dir=str(file_output_dir),
            output_filename=f"{file_stem}_selected_features",
            save_plots=base_config.save_plots,
            plot_name_prefix=f"{file_stem}_{base_config.method}",
            plot_dir=str(file_output_dir) if base_config.plot_dir else None
        )

        configs.append((csv_file, config))

    return configs
|
||||
|
||||
|
||||
def process_single_file(csv_file: Path, config: FeatureSelectionConfig) -> Dict:
    """
    Run feature selection for one CSV file and report the outcome.

    Args:
        csv_file: CSV file to process
        config: per-file feature-selection configuration

    Returns:
        Result dict with success flag, selected columns, timing, and any error.
    """
    outcome = {
        'file': str(csv_file),
        'file_name': csv_file.name,
        'success': False,
        'error': None,
        'n_selected_features': 0,
        'selected_columns': [],
        'processing_time': 0,
        'output_dir': config.output_dir
    }

    started = time.time()

    try:
        print(f"开始处理文件: {csv_file.name}")

        # Run the actual feature selection.
        X_selected, y, selected_columns = select_features_from_csv(config)

        # Record the successful outcome.
        outcome['success'] = True
        outcome['n_selected_features'] = X_selected.shape[1]
        if hasattr(selected_columns, 'tolist'):
            outcome['selected_columns'] = selected_columns.tolist()
        else:
            outcome['selected_columns'] = list(selected_columns)
        outcome['n_samples'] = X_selected.shape[0]

        print(f"文件 {csv_file.name} 处理完成,选择特征数: {outcome['n_selected_features']}")

    except Exception as e:
        outcome['error'] = str(e)
        print(f"文件 {csv_file.name} 处理失败: {e}")

    finally:
        outcome['processing_time'] = time.time() - started

    return outcome
|
||||
|
||||
|
||||
def batch_feature_selection(csv_files: List[Path],
                            base_config: FeatureSelectionConfig,
                            output_base_dir: Union[str, Path],
                            max_workers: Optional[int] = None,
                            parallel: bool = False) -> List[Dict]:
    """
    Run feature selection over many CSV files, serially or in parallel.

    Args:
        csv_files: CSV files to process
        base_config: base configuration cloned per file
        output_base_dir: root directory for per-file outputs
        max_workers: maximum parallel workers (None lets the pool decide)
        parallel: process files concurrently when True

    Returns:
        One result dict per file. NOTE: in parallel mode results arrive in
        completion order, not input order.
    """
    # Build per-file configs (unreadable files are already skipped here).
    file_configs = create_batch_configs(csv_files, base_config, output_base_dir)

    results = []

    if parallel and len(file_configs) > 1:
        # Parallel path: one process per file.
        print(f"开始并行处理 {len(file_configs)} 个文件 (最大并行数: {max_workers or 'auto'})")

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Submit every file as its own task; remember which future maps to which file.
            future_to_config = {
                executor.submit(process_single_file, csv_file, config): (csv_file, config)
                for csv_file, config in file_configs
            }

            # Collect results as they finish; a crashed worker becomes a failure record.
            for future in as_completed(future_to_config):
                csv_file, config = future_to_config[future]
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    print(f"并行处理失败 {csv_file.name}: {e}")
                    results.append({
                        'file': str(csv_file),
                        'file_name': csv_file.name,
                        'success': False,
                        'error': str(e),
                        'processing_time': 0
                    })

    else:
        # Serial path: process files one by one in listed order.
        print(f"开始串行处理 {len(file_configs)} 个文件")

        for csv_file, config in file_configs:
            result = process_single_file(csv_file, config)
            results.append(result)

    return results
|
||||
|
||||
|
||||
def save_batch_results(results: List[Dict], output_file: Union[str, Path]):
    """
    Persist batch-processing results as a UTF-8 CSV file.

    Args:
        results: per-file result dicts
        output_file: destination CSV path (parent dirs are created as needed)
    """
    target = Path(output_file)
    target.parent.mkdir(parents=True, exist_ok=True)

    # One CSV row per result dict.
    pd.DataFrame(results).to_csv(target, index=False, encoding='utf-8')

    print(f"批量处理结果已保存到: {target}")
|
||||
|
||||
|
||||
def print_batch_summary(results: List[Dict]):
    """
    Print a human-readable summary of a batch feature-selection run.

    Args:
        results: per-file result dicts as produced by ``process_single_file``
            (keys used: 'success', 'processing_time', 'n_selected_features',
            'file_name', 'error')
    """
    total_files = len(results)
    successful_files = sum(1 for r in results if r['success'])
    failed_files = total_files - successful_files

    total_time = sum(r['processing_time'] for r in results)
    avg_time = total_time / total_files if total_files > 0 else 0

    print("\n" + "="*60)
    print("批量特征选择处理摘要")
    print("="*60)
    print(f"总文件数: {total_files}")
    print(f"成功处理: {successful_files}")
    print(f"失败处理: {failed_files}")
    # BUG FIX: these two lines previously read `print(".2f")` — mangled
    # f-strings that printed the literal text ".2f" instead of the timings.
    print(f"总处理时间: {total_time:.2f} 秒")
    print(f"平均处理时间: {avg_time:.2f} 秒")

    if successful_files > 0:
        selected_features = [r['n_selected_features'] for r in results if r['success']]
        print(f"平均选择的特征数: {np.mean(selected_features):.1f} ± {np.std(selected_features):.1f}")

    if failed_files > 0:
        print(f"\n失败的文件:")
        for result in results:
            if not result['success']:
                print(f"  - {result['file_name']}: {result['error']}")

    print("="*60)
|
||||
|
||||
|
||||
def create_example_batch_config() -> FeatureSelectionConfig:
    """Build an example batch configuration preset for the CARS method.

    Returns:
        A FeatureSelectionConfig with CARS selected and its default
        parameters filled in. csv_file_path, label_column and
        spectral_columns are intentionally left unset; they are assigned
        per file during batch processing.
    """
    return FeatureSelectionConfig(
        # BUG FIX: the dispatcher matches the exact casing "Cars"; the old
        # value "CARS" would raise "不支持的特征选择方法" at execution time.
        # Available: Cars, Lars, Uve, Spa, GA, ReliefF, RandomFrog, SiPLS.
        method="Cars",
        method_params={
            'N': 50,   # CARS sampling runs
            'f': 20,   # max PLS components
            'cv': 10   # cross-validation folds
        },
        output_csv=True,
        save_plots=True,
        plot_name_prefix="batch_fs"
    )
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for batch feature selection.

    Returns 0 when at least one file was processed successfully, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='批量特征选择工具')

    # Required positional arguments.
    parser.add_argument('input_dir', help='包含CSV文件的输入目录')
    parser.add_argument('output_dir', help='输出目录')

    # Optional arguments.
    # BUG FIX: the default was 'CARS', which is not among the declared
    # choices and is rejected by the case-sensitive "Cars" dispatcher
    # downstream (argparse does not validate defaults against choices).
    parser.add_argument('--method', default='Cars',
                        choices=['Cars', 'Lars', 'Uve', 'Spa', 'GA', 'ReliefF', 'RandomFrog', 'SiPLS'],
                        help='特征选择方法 (默认: Cars)')
    parser.add_argument('--label_column', required=True,
                        help='标签列名或列索引 (例如: "concentration" 或 0)')
    parser.add_argument('--spectral_columns', required=True,
                        help='光谱列配置,支持: 列名列表 "col1 col2 col3", 列号范围 "1:10", 混合 "2,4,6-8", 或 "auto" 自动检测')
    parser.add_argument('--parallel', action='store_true', help='启用并行处理')
    parser.add_argument('--max_workers', type=int, help='最大并行工作数')
    parser.add_argument('--no_csv_output', action='store_true', help='不输出CSV文件')
    parser.add_argument('--no_plots', action='store_true', help='不生成可视化图')
    parser.add_argument('--results_file', default='batch_results.csv', help='结果文件路径')

    args = parser.parse_args()

    try:
        # Parse the spectral-column specification.
        if args.spectral_columns == "auto":
            spectral_columns = "auto"
        elif ':' in str(args.spectral_columns) or ',' in str(args.spectral_columns):
            # Range syntax: keep the string; downstream code parses it.
            spectral_columns = args.spectral_columns
        else:
            # Otherwise assume a whitespace-separated list of column names.
            spectral_columns = args.spectral_columns.split()

        # Coerce the label column to an int when it looks numeric.
        try:
            label_column = int(args.label_column)
        except ValueError:
            # Not numeric: treat as a column name.
            label_column = args.label_column

        # Build the base configuration shared by every input file.
        base_config = FeatureSelectionConfig(
            method=args.method,
            method_params={},  # use the per-method defaults
            label_column=label_column,
            spectral_columns=spectral_columns,
            output_csv=not args.no_csv_output,
            save_plots=not args.no_plots,
            plot_name_prefix=f"batch_{args.method}"
        )

        # Discover input CSV files.
        csv_files = find_csv_files(args.input_dir)
        if not csv_files:
            print("未找到CSV文件")
            return 1

        # Run the batch feature selection.
        results = batch_feature_selection(
            csv_files=csv_files,
            base_config=base_config,
            output_base_dir=args.output_dir,
            max_workers=args.max_workers,
            parallel=args.parallel
        )

        # Persist results.
        results_file = Path(args.output_dir) / args.results_file
        save_batch_results(results, results_file)

        # Print the summary.
        print_batch_summary(results)

        successful = sum(1 for r in results if r['success'])
        return 0 if successful > 0 else 1

    except Exception as e:
        print(f"批量处理失败: {e}")
        import traceback
        traceback.print_exc()
        return 1
|
||||
|
||||
|
||||
|
||||
def example_usage():
    """
    Display usage examples for the batch feature-selection tool.

    Prints a guide covering column-range syntax, command-line invocations,
    a Python API example, the supported methods, and per-method parameter
    configuration examples. Output-only; returns None.
    """
    print("=" * 80)
    print("批量特征选择工具 - 使用指南")
    print("=" * 80)

    print("\n1. 列范围选择功能:")
    print(" 支持多种列选择方式:")
    print(" - 列号范围: '1:10' 表示列1到列10")
    print(" - 混合选择: '2,4,6-8' 表示列2,4,6,7,8")
    print(" - 自动检测: 'auto' 自动选择数值列作为光谱列")
    print(" - 列名列表: 'wavelength_400 wavelength_410 wavelength_420'")

    print("\n2. 命令行使用示例:")
    print(" # 使用列号范围")
    print(" python batch_feature_selection.py input_dir output_dir --label_column 0 --spectral_columns 1:50")
    print("")
    print(" # 使用混合范围")
    print(" python batch_feature_selection.py input_dir output_dir --label_column concentration --spectral_columns 2,4,6-8")
    print("")
    print(" # 自动检测光谱列")
    print(" python batch_feature_selection.py input_dir output_dir --label_column Label --spectral_columns auto")

    print("\n3. Python代码使用示例:")
    print("""
from batch_feature_selection import batch_feature_selection, create_example_batch_config, find_csv_files

# 查找CSV文件
csv_files = find_csv_files('your/data/directory')

# 创建配置
base_config = create_example_batch_config()
base_config.label_column = 'concentration' # 标签列名
base_config.spectral_columns = "5:25" # 列5到25作为光谱列

# 执行批量处理
results = batch_feature_selection(
csv_files=csv_files,
base_config=base_config,
output_base_dir='output/directory',
parallel=True
)
""")

    print("\n4. 支持的特征选择方法:")
    methods = ['CARS', 'Lars', 'Uve', 'Spa', 'GA', 'ReliefF', 'RandomFrog', 'SiPLS']
    for method in methods:
        print(f" - {method}")

    print("\n5. 方法参数配置示例:")
    print("""
# CARS方法
config.method_params = {'N': 50, 'f': 20, 'cv': 10}

# UVE方法
config.method_params = {'ncomp': 20, 'cv': 5}

# SPA方法
config.method_params = {'m_min': 2, 'm_max': 50, 'autoscaling': 1}
""")

    print("=" * 80)
|
||||
|
||||
# Module-level driver: runs every available feature-selection method over a
# fixed data directory as soon as this module is imported/executed.
# NOTE(review): this executes at import time because the ``if __name__`` guard
# below is commented out — confirm that is intentional.

# Discover input CSV files (hard-coded local data directory).
csv_files = find_csv_files("E:\code\spectronon\single_classsfication\data")

# Every available feature-selection method with its parameters.
methods_config = [
    {
        'method': 'Cars',
        'method_params': {'N': 50, 'f': 20, 'cv': 10},
        'description': 'Competitive Adaptive Reweighted Sampling'
    },
    {
        'method': 'Uve',
        'method_params': {'ncomp': 20, 'cv': 5},
        'description': 'Uninformative Variable Elimination'
    },
    {
        'method': 'Spa',
        'method_params': {'m_min': 2, 'm_max': 50, 'autoscaling': 1},
        'description': 'Successive Projections Algorithm'
    },
    {
        'method': 'GA',
        'method_params': {'population_size': 10},
        'description': 'Genetic Algorithm'
    },
    {
        'method': 'ReliefF',
        'method_params': {'n_neighbors': 20, 'n_features_to_keep': 20},
        'description': 'ReliefF Algorithm'
    },
    {
        'method': 'RandomFrog',
        'method_params': {'n_frogs': 50, 'n_memeplexes': 5, 'n_evolution_steps': 10, 'n_shuffle_iterations': 10, 'cv': 5},
        'description': 'Random Frog Leaping Algorithm'
    },
    {
        'method': 'SiPLS',
        'method_params': {'n_intervals_list': [10, 15, 20]},
        'description': 'Synergy Interval Partial Least Squares'
    }
]

print("=" * 80)
print("开始批量特征选择 - 使用所有可用方法")
print(f"找到 {len(csv_files)} 个CSV文件待处理")
print(f"将使用 {len(methods_config)} 种特征选择方法")
print("=" * 80)

all_results = {}

# Run one batch of feature selection per method.
for i, method_cfg in enumerate(methods_config, 1):
    method_name = method_cfg['method']
    description = method_cfg['description']

    print(f"\n{'='*60}")
    print(f"方法 {i}/{len(methods_config)}: {method_name}")
    print(f"描述: {description}")
    print(f"{'='*60}")

    try:
        # Build this method's configuration on top of the example preset.
        method_config = create_example_batch_config()
        method_config.method = method_name
        method_config.method_params = method_cfg['method_params']
        method_config.label_column = 'Label'  # label column name
        method_config.spectral_columns = "1:"  # columns 1..end are spectra
        method_config.plot_name_prefix = f"{method_name.lower()}_batch_fs"

        # Execute the batch run for this method.
        method_results = batch_feature_selection(
            csv_files=csv_files,
            base_config=method_config,
            output_base_dir=f'E:\\code\\spectronon\\single_classsfication\\Feature_Selection_method\\directory\\{method_name.lower()}_results',
            parallel=True
        )

        all_results[method_name] = {
            'results': method_results,
            'description': description,
            'config': method_cfg
        }

        print(f"✅ {method_name} 方法处理完成")

    except Exception as e:
        # Record the failure but keep running the remaining methods.
        print(f"❌ {method_name} 方法处理失败: {str(e)}")
        all_results[method_name] = {
            'error': str(e),
            'description': description,
            'config': method_cfg
        }

# Print the aggregated summary across all methods.
print(f"\n{'='*80}")
print("批量特征选择处理完成汇总")
print(f"{'='*80}")

successful_methods = []
failed_methods = []

for method_name, result in all_results.items():
    if 'error' in result:
        failed_methods.append(f"{method_name}: {result['error']}")
        print(f"❌ {method_name}: 失败 - {result['error']}")
    else:
        successful_methods.append(method_name)
        print(f"✅ {method_name}: 成功")

print(f"\n总计: {len(successful_methods)}/{len(methods_config)} 种方法成功处理")
print(f"成功的方法: {', '.join(successful_methods)}")

if failed_methods:
    print(f"失败的方法: {len(failed_methods)} 种")
    for failed in failed_methods:
        print(f" - {failed}")

print(f"\n结果文件保存在: E:\\code\\spectronon\\single_classsfication\\Feature_Selection_method\\directory\\")
print("每个方法都有独立的子目录存储结果")
# If run directly as a script, show the usage guide (currently disabled).
# if __name__ == "__main__":
#     import sys
#     if len(sys.argv) == 1:
#         example_usage()
#     else:
#         # Run the main function for batch processing.
#         exit(main())
|
||||
|
||||
|
||||
594
Feature_Selection_method/feture_select.py
Normal file
594
Feature_Selection_method/feture_select.py
Normal file
@ -0,0 +1,594 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from Feature_Selection_method.Lar import Lar
|
||||
from Feature_Selection_method.Spa import SPA
|
||||
from Feature_Selection_method.Uve import UVE
|
||||
from Feature_Selection_method.Cars import CARS_Cloud
|
||||
from Feature_Selection_method.GA import GA
|
||||
from Feature_Selection_method.ReliefF import ReliefF
|
||||
from Feature_Selection_method.random_fog import shuffled_frog_leaping_selection
|
||||
from Feature_Selection_method.sipls import sipls_feature_selection
|
||||
from sklearn.model_selection import train_test_split
|
||||
import os
|
||||
import matplotlib.pyplot as plt
|
||||
from typing import Optional, Union, List, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
def _get_x_axis_values(feature_names: List[str]) -> Tuple[Optional[np.ndarray], str]:
|
||||
"""
|
||||
从特征名称中提取x轴数值(通常是波长)
|
||||
|
||||
Args:
|
||||
feature_names: 特征名称列表
|
||||
|
||||
Returns:
|
||||
(x_values, x_label): x轴数值数组和标签,如果无法提取则返回(None, "")
|
||||
"""
|
||||
if not feature_names:
|
||||
return None, ""
|
||||
|
||||
# 尝试从列名中提取数值
|
||||
x_values = []
|
||||
for name in feature_names:
|
||||
try:
|
||||
# 尝试将列名转换为浮点数
|
||||
if isinstance(name, (int, float)):
|
||||
x_values.append(float(name))
|
||||
elif isinstance(name, str):
|
||||
# 尝试提取字符串中的数值
|
||||
# 处理类似 "400.5", "Band_400", "Wavelength_400.5nm" 的格式
|
||||
import re
|
||||
# 查找浮点数模式
|
||||
match = re.search(r'(\d+\.?\d*)', str(name))
|
||||
if match:
|
||||
x_values.append(float(match.group(1)))
|
||||
else:
|
||||
# 如果找不到数值,返回None
|
||||
return None, ""
|
||||
else:
|
||||
return None, ""
|
||||
except (ValueError, TypeError):
|
||||
return None, ""
|
||||
|
||||
# 检查是否所有值都是唯一的(避免重复的波长)
|
||||
if len(set(x_values)) != len(x_values):
|
||||
return None, ""
|
||||
|
||||
# 检查波长范围是否合理(假设是nm单位,范围在200-2500nm之间)
|
||||
x_array = np.array(x_values)
|
||||
if np.min(x_array) < 200 or np.max(x_array) > 2500:
|
||||
return None, ""
|
||||
|
||||
# 确定标签
|
||||
x_label = "Wavelength (nm)"
|
||||
|
||||
return x_array, x_label
|
||||
|
||||
|
||||
def plot_feature_selection_results(X: Union[pd.DataFrame, np.ndarray],
                                   selected_indices: Union[List[int], np.ndarray],
                                   method_name: str,
                                   save_path: Optional[str] = None,
                                   figsize: Tuple[int, int] = (12, 6)) -> plt.Figure:
    """
    Plot a visualization of feature-selection results: the mean spectrum with
    the selected feature positions highlighted in red.

    Args:
        X: Feature matrix (n_samples, n_features), DataFrame or ndarray.
        selected_indices: Positional indices of the selected features.
        method_name: Name of the selection method (used in the title).
        save_path: Where to save the figure; None disables saving.
        figsize: Figure size in inches.

    Returns:
        The matplotlib Figure object.
    """
    # Normalise to a numpy array and obtain feature names for the x axis.
    if isinstance(X, pd.DataFrame):
        X_array = X.values
        feature_names = X.columns.tolist()
    else:
        X_array = X
        feature_names = [f"Feature_{i}" for i in range(X.shape[1])]

    # Mean spectrum across all samples.
    mean_spectrum = np.mean(X_array, axis=0)
    n_features = X_array.shape[1]

    # Build the x axis - prefer wavelength values over raw indices.
    x_values, x_label = _get_x_axis_values(feature_names)
    if x_values is None:
        # Fall back to feature indices when no wavelengths can be extracted.
        x_values = np.arange(n_features)
        x_label = "Feature Index"

    # Create the figure.
    fig, ax = plt.subplots(figsize=figsize)

    # Plot the mean spectrum curve.
    ax.plot(x_values, mean_spectrum, 'b-', linewidth=1.5, alpha=0.8, label='Mean Spectrum')

    # Mark the selected feature points on top of the curve.
    if len(selected_indices) > 0:
        # Ensure selected_indices is a valid integer numpy array.
        selected_indices = np.asarray(selected_indices, dtype=int)

        # Keep only indices inside the valid range.
        valid_indices = selected_indices[(selected_indices >= 0) & (selected_indices < len(x_values))]

        if len(valid_indices) > 0:
            selected_x = x_values[valid_indices]
            selected_y = mean_spectrum[valid_indices]

            ax.scatter(selected_x, selected_y, color='red', s=60, alpha=0.9,
                       edgecolors='darkred', linewidth=1.5, label='Selected Features', zorder=5)

    # Annotate how many features were selected.
    ax.text(0.02, 0.98, f'Selected: {len(selected_indices)}/{n_features} features',
            transform=ax.transAxes, fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    # Title and axis labels.
    ax.set_title(f'Feature Selection Results - {method_name}', fontsize=14, fontweight='bold')
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Intensity', fontsize=12)

    # Grid and legend.
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper right', fontsize=10)

    # Layout.
    plt.tight_layout()

    # Save when a path was requested.
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Visualization saved to: {save_path}")

    return fig
|
||||
|
||||
|
||||
@dataclass
class FeatureSelectionConfig:
    """Configuration for spectral feature selection.

    Groups CSV-input settings, the selection method with its parameters,
    and output/visualization options. ``__post_init__`` validates the CSV
    settings and fills in the per-method default parameters.
    """
    # CSV input configuration
    csv_file_path: Optional[str] = None
    label_column: Optional[str] = None
    spectral_columns: Optional[List[str]] = None

    # Feature-selection method configuration
    method: str = "None"
    method_params: dict = field(default_factory=dict)

    # Output configuration
    output_csv: bool = False
    output_dir: str = ""
    output_filename: str = "selected_features"

    # Visualization configuration
    save_plots: bool = True
    plot_name_prefix: str = ""
    plot_dir: Optional[str] = None  # plot directory; falls back to output_dir when None

    def __post_init__(self):
        """Validate the CSV settings, then apply per-method defaults."""
        if self.csv_file_path and not os.path.exists(self.csv_file_path):
            raise FileNotFoundError(f"CSV文件不存在: {self.csv_file_path}")

        if self.csv_file_path and not self.label_column:
            raise ValueError("指定CSV文件时必须提供标签列名(label_column)")

        if self.csv_file_path and not self.spectral_columns:
            raise ValueError("指定CSV文件时必须提供光谱列名列表(spectral_columns)")

        self._set_default_method_params()

    def _set_default_method_params(self):
        """Populate method_params with defaults for the configured method.

        Uses setdefault semantics: values the caller already supplied are
        never overwritten. Unknown methods get no defaults here (the
        dispatcher reports them later).
        """
        per_method_defaults = {
            "Cars": {'N': 50, 'f': 20, 'cv': 10},
            "Uve": {'ncomp': 20, 'cv': 5},
            "Spa": {'m_min': 2, 'm_max': 50, 'autoscaling': 1},
            "GA": {'population_size': 10},
            "ReliefF": {'n_neighbors': 20, 'n_features_to_keep': 20},
            "RandomFrog": {'n_frogs': 50, 'n_memeplexes': 5,
                           'n_evolution_steps': 10, 'n_shuffle_iterations': 10,
                           'cv': 5},
            "SiPLS": {'n_intervals_list': [10, 15, 20],
                      'n_combinations_list': [2, 3, 4],
                      'max_components': 15, 'cv_folds': 5},
        }
        for key, value in per_method_defaults.get(self.method, {}).items():
            self.method_params.setdefault(key, value)
|
||||
|
||||
|
||||
class SpectrumFeatureSelector:
    """Spectral feature selector driven by a FeatureSelectionConfig.

    Loads data (optionally from CSV), dispatches to the configured method,
    and optionally writes the selected features to CSV and renders a plot.
    """

    def __init__(self, config: FeatureSelectionConfig):
        self.config = config

    def load_csv_data(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """Load features and labels from the configured CSV file.

        Returns:
            (X, y): spectral columns as a DataFrame and labels as an array.

        Raises:
            ValueError: when no CSV path is configured or columns are missing.
        """
        if not self.config.csv_file_path:
            raise ValueError("未指定CSV文件路径")

        df = pd.read_csv(self.config.csv_file_path)

        # Validate that the configured columns exist in the file.
        if self.config.label_column not in df.columns:
            raise ValueError(f"标签列 '{self.config.label_column}' 不存在于CSV文件中")

        missing_cols = [col for col in self.config.spectral_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"以下光谱列不存在于CSV文件中: {missing_cols}")

        # Extract features and labels.
        X = df[self.config.spectral_columns]
        y = df[self.config.label_column].values

        return X, y

    def save_selected_features_csv(self, X_selected: pd.DataFrame, y: np.ndarray,
                                   selected_columns: Union[List[str], np.ndarray]):
        """Write the selected features plus labels to a CSV file (if enabled)."""
        if not self.config.output_csv:
            return

        os.makedirs(self.config.output_dir, exist_ok=True)

        # Index arrays get synthetic names; name lists are used as-is.
        if isinstance(selected_columns, np.ndarray):
            selected_col_names = [f"feature_{i}" for i in selected_columns]
        else:
            selected_col_names = selected_columns

        result_df = pd.DataFrame(X_selected.values, columns=selected_col_names)
        result_df[self.config.label_column] = y

        output_path = os.path.join(self.config.output_dir,
                                   f"{self.config.output_filename}.csv")
        result_df.to_csv(output_path, index=False)
        print(f"Selected features saved to: {output_path}")

    def plot_feature_selection(self, X: pd.DataFrame,
                               selected_indices: Union[List[int], np.ndarray]) -> Optional[plt.Figure]:
        """Render and save the feature-selection plot, if plotting is enabled.

        Returns:
            The Figure, or None when plotting is disabled or no directory
            is configured.
        """
        if not self.config.save_plots:
            return None

        # Resolve the save directory (plot_dir overrides output_dir).
        plot_dir = self.config.plot_dir if self.config.plot_dir else self.config.output_dir
        if not plot_dir:
            return None

        os.makedirs(plot_dir, exist_ok=True)

        # Build the output file name.
        filename = f"{self.config.plot_name_prefix}_{self.config.method}_feature_selection.png"
        save_path = os.path.join(plot_dir, filename)

        # Draw and save the figure.
        fig = plot_feature_selection_results(
            X=X,
            selected_indices=selected_indices,
            method_name=self.config.method,
            save_path=save_path
        )

        return fig

    def _convert_to_indices(self, X: pd.DataFrame, selected_columns) -> List[int]:
        """
        Convert ``selected_columns`` into positional indices of DataFrame X.

        Args:
            X: The original DataFrame.
            selected_columns: Selected columns - index array, name list,
                pandas Index, or numeric list.

        Returns:
            List of integer indices; empty on failure (with a warning printed).
        """
        try:
            # Normalise pandas Index/Series to a plain list first.
            if hasattr(selected_columns, 'tolist'):  # pandas Index or Series
                selected_columns = selected_columns.tolist()

            if isinstance(selected_columns, np.ndarray):
                # numpy arrays are already positional indices.
                return selected_columns.tolist()
            elif isinstance(selected_columns, list) and len(selected_columns) > 0:
                if isinstance(selected_columns[0], str):
                    # Name list: map each name to its column position.
                    indices = []
                    for col in selected_columns:
                        try:
                            # Exact match first.
                            idx = X.columns.get_loc(col)
                            indices.append(idx)
                        except KeyError:
                            # Fall back to approximate numeric matching to
                            # tolerate floating-point drift in column names.
                            try:
                                target_value = float(col)
                                # Find the numerically closest column name.
                                best_match = None
                                best_diff = float('inf')
                                best_idx = None

                                for i, col_name in enumerate(X.columns):
                                    try:
                                        col_value = float(col_name)
                                        diff = abs(col_value - target_value)
                                        if diff < best_diff:
                                            best_diff = diff
                                            best_match = col_name
                                            best_idx = i
                                    except (ValueError, TypeError):
                                        continue

                                if best_match is not None and best_diff < 1.0:  # accept up to 1.0 of drift
                                    print(f"Approximate match: '{col}' -> '{best_match}' (diff: {best_diff:.3f})")
                                    indices.append(best_idx)
                                else:
                                    print(f"Warning: No suitable match found for column '{col}' in DataFrame columns")
                                    continue
                            except (ValueError, TypeError):
                                print(f"Warning: Cannot parse column name '{col}' as numeric")
                                continue
                    return indices
                else:
                    # Numeric list: use the values directly as indices.
                    return [int(idx) for idx in selected_columns]
            else:
                return []
        except Exception as e:
            print(f"Error converting selected_columns to indices: {e}")
            return []

    def select_features(self, X: Optional[pd.DataFrame] = None, y: Optional[np.ndarray] = None,
                        column_names: Optional[List[str]] = None) -> Tuple[pd.DataFrame, np.ndarray, Union[List[str], np.ndarray]]:
        """
        Run feature selection.

        Args:
            X: Feature data; loaded from the configured CSV when None.
            y: Labels; loaded from the configured CSV when None.
            column_names: Column names, used when X is a numpy array.

        Returns:
            X_selected: The selected feature data.
            y: The labels.
            selected_columns: The selected column names or indices.
        """
        # Load from CSV when data was not supplied directly.
        if X is None or y is None:
            X, y = self.load_csv_data()

        # Ensure X is a DataFrame.
        if isinstance(X, np.ndarray):
            if column_names is not None:
                X = pd.DataFrame(X, columns=column_names)
            else:
                X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])

        # Run the core feature selection.
        X_selected, y_selected, selected_columns = SpctrumFeatureSelcet(
            method=self.config.method,
            X=X,
            y=y,
            name=self.config.plot_name_prefix,
            result_dir=self.config.output_dir if self.config.save_plots else '',
            column_names=None,  # already a DataFrame, no names needed
            # BUG FIX: the configured method parameters were previously not
            # forwarded, so SpctrumFeatureSelcet always fell back to its
            # built-in defaults and config.method_params was silently ignored.
            method_params=self.config.method_params
        )

        # Optionally persist the selected features as CSV.
        self.save_selected_features_csv(X_selected, y_selected, selected_columns)

        # Optionally render the visualization.
        if self.config.save_plots:
            # Map selected_columns (labels of X_selected) back to positional
            # indices in the original X for plotting.
            selected_indices = self._convert_to_indices(X, selected_columns)

            if len(selected_indices) > 0:
                self.plot_feature_selection(X, selected_indices)
            else:
                print(f"Warning: No valid indices found for plotting. selected_columns: {selected_columns}")
                print(f"Available columns in X: {list(X.columns[:5])}...")  # first 5 names, for debugging

        return X_selected, y_selected, selected_columns
|
||||
|
||||
|
||||
def SpctrumFeatureSelcet(method, X, y, name='', result_dir='', column_names=None, method_params=None):
    """
    Core feature-selection dispatcher (business logic preserved).

    :param method: wavelength-selection/reduction method, one of:
        None, Cars, Lars, Uve, Spa, GA, ReliefF, RandomFrog, SiPLS.
    :param X: spectral data, pandas DataFrame or numpy array (n_samples, n_features).
    :param y: labels corresponding to the spectra (n_samples,).
    :param name: file-name prefix for result images.
    :param result_dir: directory in which to save result figures ('' disables saving).
    :param column_names: column names, required when X is a numpy array.
    :param method_params: method-specific parameter dictionary.
    :return:
        - X_Feature: the selected/reduced data (n_samples, n_selected).
        - y: the (unchanged) labels.
        - selected_columns: selected column labels — except for the Cars
          branch, which returns raw integer indices (callers handle both).
    """
    if method_params is None:
        method_params = {}

    # BUG FIX: removed the original ``global X_Feature`` declaration — the
    # name is assigned locally in every branch before use, and the global
    # needlessly leaked state into the module namespace (thread-unsafe).

    # Coerce numpy input to a DataFrame so column-based selection works.
    if isinstance(X, np.ndarray):
        if column_names is None:
            column_names = [f"{i}" for i in range(X.shape[1])]  # default names
        X_df = pd.DataFrame(X, columns=column_names)
    else:
        X_df = X

    # Dispatch to the requested selection method.
    if method == "None":
        X_Feature = X_df
        selected_columns = X_df.columns
    elif method == "Cars":
        save_path = os.path.join(result_dir, f"{name}_cars.png") if result_dir else None
        # Run CARS_Cloud with the configured parameters.
        N = method_params.get('N', 50)
        f = method_params.get('f', 20)
        cv = method_params.get('cv', 10)

        Featuresecletidx = CARS_Cloud(X_df.values, y, N=N, f=f, cv=cv,
                                      save_fig=bool(save_path), save_path=save_path)
        Featuresecletidx = Featuresecletidx.astype(int)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        # NOTE(review): this branch returns raw indices while the others
        # return column labels; downstream _convert_to_indices handles both.
        selected_columns = Featuresecletidx

    elif method == "Lars":
        Featuresecletidx = Lar(X_df.values, y)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Uve":
        ncomp = method_params.get('ncomp', 20)
        cv = method_params.get('cv', 5)

        uve = UVE(X_df.values, y, ncomp)
        uve.calcCriteria()
        uve.evalCriteria(cv=cv)
        Featuresecletidx = uve.cutFeature()  # indices of the retained features
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Spa":
        save_path = os.path.join(result_dir, f"{name}_spa.png") if result_dir else None

        # SPA requires a calibration/validation split.
        Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.3)

        m_min = method_params.get('m_min', 2)
        m_max = method_params.get('m_max', 50)
        autoscaling = method_params.get('autoscaling', 1)

        Featuresecletidx, var_sel_phase2 = SPA().spa(
            Xcal, ycal, m_min=m_min, m_max=m_max, Xval=Xval, yval=yval,
            autoscaling=autoscaling, save_path=save_path)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "GA":
        population_size = method_params.get('population_size', 10)
        Featuresecletidx = GA(X_df.values, y, population_size)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "ReliefF":
        n_neighbors = method_params.get('n_neighbors', 20)
        n_features_to_keep = method_params.get('n_features_to_keep', 20)

        relieff = ReliefF(n_neighbors=n_neighbors, n_features_to_keep=n_features_to_keep)
        Featuresecletidx = relieff.fit(X_df.values, y)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "RandomFrog":
        n_frogs = method_params.get('n_frogs', 50)
        n_memeplexes = method_params.get('n_memeplexes', 5)
        n_evolution_steps = method_params.get('n_evolution_steps', 10)
        n_shuffle_iterations = method_params.get('n_shuffle_iterations', 10)
        cv = method_params.get('cv', 5)

        Featuresecletidx = shuffled_frog_leaping_selection(
            X_df.values, y,
            n_frogs=n_frogs,
            n_memeplexes=n_memeplexes,
            n_evolution_steps=n_evolution_steps,
            n_shuffle_iterations=n_shuffle_iterations,
            cv=cv
        )
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "SiPLS":
        n_intervals_list = method_params.get('n_intervals_list', [10, 15, 20])
        n_combinations_list = method_params.get('n_combinations_list', [2, 3, 4])
        max_components = method_params.get('max_components', 15)
        cv_folds = method_params.get('cv_folds', 5)

        result = sipls_feature_selection(
            X_df.values, y,
            n_intervals_list=n_intervals_list,
            n_combinations_list=n_combinations_list,
            max_components=max_components,
            cv_folds=cv_folds
        )

        if result and 'selected_wavelengths' in result:
            Featuresecletidx = result['selected_wavelengths']
            X_Feature = X_df.iloc[:, Featuresecletidx]
            selected_columns = X_df.columns[Featuresecletidx]
        else:
            raise ValueError("SiPLS算法未能找到有效的特征选择结果")

    else:
        raise ValueError(f"不支持的特征选择方法: {method}。支持的方法包括: None, Cars, Lars, Uve, Spa, GA, ReliefF, RandomFrog, SiPLS")

    return X_Feature, y, selected_columns  # selected data, labels, column labels/indices
|
||||
|
||||
|
||||
# 便捷函数,用于向后兼容和简化使用
|
||||
def select_features_from_csv(config: FeatureSelectionConfig) -> Tuple[pd.DataFrame, np.ndarray, Union[List[str], np.ndarray]]:
    """
    Primary interface: run feature selection driven by a CSV-based config.

    Args:
        config: Feature-selection configuration (must reference a CSV file).

    Returns:
        X_selected: The selected feature data.
        y: The label data.
        selected_columns: The selected column names or indices.
    """
    return SpectrumFeatureSelector(config).select_features()
|
||||
|
||||
|
||||
def select_features_from_data(X: pd.DataFrame, y: np.ndarray, method: str,
                              method_params: Optional[dict] = None,
                              name: str = '', result_dir: str = '',
                              column_names: Optional[List[str]] = None) -> Tuple[pd.DataFrame, np.ndarray, Union[List[str], np.ndarray]]:
    """
    Convenience wrapper: run feature selection directly on in-memory data.

    Args:
        X: Feature data.
        y: Label data.
        method: Feature-selection method name.
        method_params: Method-specific parameters.
        name: Output file-name prefix.
        result_dir: Output directory for result plots ('' disables plotting).
        column_names: Column names (for numpy-array input).

    Returns:
        X_selected: The selected feature data.
        y: The label data.
        selected_columns: The selected column names or indices.
    """
    config = FeatureSelectionConfig(
        method=method,
        method_params=method_params or {},
        output_csv=False,  # direct data input: no CSV output
        # BUG FIX: output_dir was never set, so even with save_plots=True the
        # plotting step found no target directory and silently skipped saving.
        output_dir=result_dir,
        save_plots=bool(result_dir),
        plot_name_prefix=name
    )

    selector = SpectrumFeatureSelector(config)
    return selector.select_features(X=X, y=y, column_names=column_names)
|
||||
292
Feature_Selection_method/random_fog.py
Normal file
292
Feature_Selection_method/random_fog.py
Normal file
@ -0,0 +1,292 @@
|
||||
import numpy as np
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.base import clone
|
||||
import copy
|
||||
|
||||
|
||||
class ShuffledFrogLeaping:
    """
    Shuffled Frog Leaping Algorithm (SFLA) for feature selection.

    Algorithm outline:
    1. Divide the frog population into several memeplexes (sub-groups).
    2. Run a local evolutionary search inside each memeplex.
    3. Periodically shuffle all frogs to exchange information globally.
    4. Repeat until the stopping condition is met.
    """

    def __init__(self, n_frogs=50, n_memeplexes=5, n_evolution_steps=10,
                 n_shuffle_iterations=10, classifier=None, cv=5):
        """
        Initialize the SFLA parameters.

        Parameters:
            n_frogs: size of the frog population
            n_memeplexes: number of memeplexes
            n_evolution_steps: evolution steps per memeplex per iteration
            n_shuffle_iterations: number of global shuffle iterations
            classifier: estimator used to score feature subsets
                        (defaults to a small RandomForestClassifier)
            cv: number of cross-validation folds
        """
        self.n_frogs = n_frogs
        self.n_memeplexes = n_memeplexes
        self.n_evolution_steps = n_evolution_steps
        self.n_shuffle_iterations = n_shuffle_iterations
        self.classifier = classifier or RandomForestClassifier(random_state=42, n_estimators=50)
        self.cv = cv

        # Internal algorithm state
        self.n_features = None
        self.frogs = None  # population: each frog is a binary selection vector
        self.fitness_values = None
        self.best_frog = None
        self.best_fitness = -np.inf
        self.selected_features = None

    def _initialize_population(self):
        """Randomly initialize the frog population as binary vectors."""
        self.frogs = []
        for _ in range(self.n_frogs):
            # 1 = feature selected, 0 = feature not selected
            frog = np.random.randint(0, 2, self.n_features)
            self.frogs.append(frog)
        self.frogs = np.array(self.frogs)

    def _evaluate_fitness(self, X, y):
        """Evaluate the fitness of every frog and track the global best."""
        self.fitness_values = []
        for frog in self.frogs:
            fitness = self._calculate_fitness(frog, X, y)
            self.fitness_values.append(fitness)

            # Update the global best solution
            if fitness > self.best_fitness:
                self.best_fitness = fitness
                self.best_frog = frog.copy()

        self.fitness_values = np.array(self.fitness_values)

    def _calculate_fitness(self, frog, X, y):
        """Fitness of a single frog = mean cross-validated score of its subset."""
        selected_features = np.where(frog == 1)[0]

        # An empty subset gets the worst possible fitness
        if len(selected_features) == 0:
            return 0.0

        # Cross-validate on the selected feature columns
        X_selected = X[:, selected_features]

        try:
            scores = cross_val_score(clone(self.classifier), X_selected, y, cv=self.cv)
            return np.mean(scores)
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit. Only estimator/CV failures
            # should be mapped to a low fitness.
            return 0.0

    def _divide_into_memeplexes(self):
        """Sort frogs by fitness (descending) and partition into memeplexes."""
        # Sort by fitness, best first
        sorted_indices = np.argsort(self.fitness_values)[::-1]
        self.frogs = self.frogs[sorted_indices]
        self.fitness_values = self.fitness_values[sorted_indices]

        # Partition into equal-sized groups
        memeplexes = []
        frogs_per_memeplex = self.n_frogs // self.n_memeplexes

        for i in range(self.n_memeplexes):
            start_idx = i * frogs_per_memeplex
            if i == self.n_memeplexes - 1:
                # Last memeplex absorbs any remainder frogs
                end_idx = self.n_frogs
            else:
                end_idx = (i + 1) * frogs_per_memeplex

            memeplex = {
                'frogs': self.frogs[start_idx:end_idx].copy(),
                'fitness': self.fitness_values[start_idx:end_idx].copy()
            }
            memeplexes.append(memeplex)

        return memeplexes

    def _evolve_memeplex(self, memeplex, X, y):
        """Evolve a single memeplex by repeatedly improving its worst frog."""
        frogs = memeplex['frogs']
        fitness = memeplex['fitness']

        # Locate the best and worst frogs of the group
        best_idx = np.argmax(fitness)
        worst_idx = np.argmin(fitness)

        best_frog = frogs[best_idx]
        worst_frog = frogs[worst_idx]

        # Try to improve the worst frog
        for step in range(self.n_evolution_steps):
            # Candidate: worst_frog + rand() * (best_frog - worst_frog)
            rand = np.random.random(self.n_features)
            new_frog = worst_frog + rand * (best_frog - worst_frog)

            # Binarize: values above 0.5 become 1, otherwise 0
            new_frog = (new_frog > 0.5).astype(int)

            # Guarantee at least one selected feature
            if np.sum(new_frog) == 0:
                new_frog[np.random.randint(self.n_features)] = 1

            # Score the candidate
            new_fitness = self._calculate_fitness(new_frog, X, y)

            # Replace the worst frog if the candidate improves on it
            if new_fitness > fitness[worst_idx]:
                frogs[worst_idx] = new_frog
                fitness[worst_idx] = new_fitness

                # The replacement may also be the new group best
                if new_fitness > fitness[best_idx]:
                    best_idx = worst_idx
                    best_frog = new_frog

                # Re-locate the worst frog for the next step
                worst_idx = np.argmin(fitness)
                worst_frog = frogs[worst_idx]
            else:
                # No improvement: inject a random frog instead
                new_frog = np.random.randint(0, 2, self.n_features)
                if np.sum(new_frog) == 0:
                    new_frog[np.random.randint(self.n_features)] = 1

                new_fitness = self._calculate_fitness(new_frog, X, y)

                if new_fitness > fitness[worst_idx]:
                    frogs[worst_idx] = new_frog
                    fitness[worst_idx] = new_fitness

        return frogs, fitness

    def fit(self, X, y):
        """
        Run the SFLA feature-selection search.

        Parameters:
            X: feature matrix (n_samples, n_features)
            y: label vector (n_samples,)

        Returns:
            selected_features: list of selected feature indices
        """
        self.n_features = X.shape[1]

        # Initialize the population
        self._initialize_population()

        # Initial evaluation
        self._evaluate_fitness(X, y)

        # Main shuffle loop
        for iteration in range(self.n_shuffle_iterations):
            # Partition frogs into memeplexes
            memeplexes = self._divide_into_memeplexes()

            # Evolve each memeplex independently
            evolved_frogs = []
            evolved_fitness = []

            for memeplex in memeplexes:
                evolved_frog, evolved_fit = self._evolve_memeplex(memeplex, X, y)
                evolved_frogs.extend(evolved_frog)
                evolved_fitness.extend(evolved_fit)

            # Merge the evolved groups back into one population
            self.frogs = np.array(evolved_frogs)
            self.fitness_values = np.array(evolved_fitness)

            # Re-evaluate all frogs (keeps global best consistent)
            self._evaluate_fitness(X, y)

        # Return the best solution found
        self.selected_features = np.where(self.best_frog == 1)[0]
        return self.selected_features.tolist()

    def get_feature_importance(self):
        """Return summary statistics of the completed selection run."""
        if self.selected_features is None:
            raise ValueError("请先运行 fit 方法")

        n_selected = len(self.selected_features)
        selection_ratio = n_selected / self.n_features

        return {
            'selected_features': self.selected_features,
            'n_selected': n_selected,
            'n_total': self.n_features,
            'selection_ratio': selection_ratio,
            'best_fitness': self.best_fitness
        }
|
||||
|
||||
|
||||
def shuffled_frog_leaping_selection(X, y, n_frogs=50, n_memeplexes=5,
                                    n_evolution_steps=10, n_shuffle_iterations=10,
                                    classifier=None, cv=5):
    """
    Run feature selection with the Shuffled Frog Leaping Algorithm.

    Parameters:
        X: feature matrix (n_samples, n_features)
        y: label vector (n_samples,)
        n_frogs: size of the frog population
        n_memeplexes: number of memeplexes
        n_evolution_steps: evolution steps per memeplex
        n_shuffle_iterations: number of shuffle iterations
        classifier: estimator used to score feature subsets
        cv: number of cross-validation folds

    Returns:
        selected_features: list of selected feature indices
    """
    # Collect the configuration and delegate to the class implementation.
    params = dict(
        n_frogs=n_frogs,
        n_memeplexes=n_memeplexes,
        n_evolution_steps=n_evolution_steps,
        n_shuffle_iterations=n_shuffle_iterations,
        classifier=classifier,
        cv=cv,
    )
    selector = ShuffledFrogLeaping(**params)
    return selector.fit(X, y)
|
||||
|
||||
|
||||
# Usage example
if __name__ == "__main__":
    # Build a synthetic classification dataset
    from sklearn.datasets import make_classification

    X, y = make_classification(
        n_samples=200,
        n_features=50,
        n_informative=10,
        n_redundant=10,
        n_clusters_per_class=1,
        random_state=42
    )

    print("原始特征数量:", X.shape[1])

    # Run SFLA feature selection
    selected_features = shuffled_frog_leaping_selection(
        X, y,
        n_frogs=30,
        n_memeplexes=3,
        n_evolution_steps=5,
        n_shuffle_iterations=5
    )

    print("选择的特征数量:", len(selected_features))
    print("选择的特征索引:", selected_features)

    # Report the selection ratio
    selection_ratio = len(selected_features) / X.shape[1]
    # BUGFIX: was `print(".2f")`, which printed the literal text ".2f"
    # instead of the formatted selection ratio.
    print(f"特征选择率: {selection_ratio:.2f}")
|
||||
271
Feature_Selection_method/sipls.py
Normal file
271
Feature_Selection_method/sipls.py
Normal file
@ -0,0 +1,271 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import KFold
|
||||
from itertools import combinations
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def synergy_interval_pls(X, y, n_intervals=20, n_combinations=2, max_components=15, cv_folds=5):
    """
    Synergy Interval PLS (SiPLS) feature selection.

    Splits the spectrum into equal-width intervals, evaluates every
    combination of `n_combinations` intervals with cross-validated PLS,
    and returns the wavelengths of the best combination.

    Parameters:
        X: spectral matrix (n_samples, n_wavelengths)
        y: concentration/property vector (n_samples,)
        n_intervals: number of equal-width intervals to split the spectrum into
        n_combinations: number of intervals per combination (typically 2-4)
        max_components: maximum number of PLS components to try
        cv_folds: number of cross-validation folds

    Returns:
        selected_wavelengths: wavelength indices of the best combination
        best_rmsecv: RMSECV of the best combination
        best_n_components: optimal number of PLS components
    """

    n_samples, n_wavelengths = X.shape

    # Split the spectrum into equal-width intervals
    interval_size = n_wavelengths // n_intervals
    intervals = []

    for i in range(n_intervals):
        start_idx = i * interval_size
        if i == n_intervals - 1:
            # The last interval absorbs any remaining wavelengths
            end_idx = n_wavelengths
        else:
            end_idx = (i + 1) * interval_size

        intervals.append((start_idx, end_idx))

    print(f"将 {n_wavelengths} 个波长分成 {n_intervals} 个区间:")
    for i, (start, end) in enumerate(intervals):
        print(f" 区间 {i+1}: 波长 {start}-{end-1} (宽度: {end-start})")

    # Enumerate every possible interval combination
    interval_combinations = list(combinations(range(n_intervals), n_combinations))

    print(f"\n总共 {len(interval_combinations)} 个 {n_combinations} 区间的组合")

    best_rmsecv = float('inf')
    best_intervals = None
    best_n_components = None
    results = []

    # Evaluate each combination
    for combo_idx, combo in enumerate(interval_combinations):
        if (combo_idx + 1) % 50 == 0:
            print(f"正在处理组合 {combo_idx + 1}/{len(interval_combinations)}")

        # Concatenate the wavelengths of the chosen intervals
        selected_wavelengths = []
        for interval_idx in combo:
            start_idx, end_idx = intervals[interval_idx]
            selected_wavelengths.extend(range(start_idx, end_idx))

        X_selected = X[:, selected_wavelengths]

        # Cross-validate over a range of PLS component counts
        kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
        rmse_results = []

        for n_comp in range(1, min(max_components + 1, X_selected.shape[1] + 1)):
            rmse_scores = []

            for train_idx, test_idx in kf.split(X_selected):
                X_train, X_test = X_selected[train_idx], X_selected[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]

                pls = PLSRegression(n_components=n_comp)
                pls.fit(X_train, y_train)
                y_pred = pls.predict(X_test)

                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                rmse_scores.append(rmse)

            mean_rmse = np.mean(rmse_scores)
            rmse_results.append(mean_rmse)

        # Best component count and RMSECV for this combination
        min_rmse_idx = np.argmin(rmse_results)
        min_rmse = rmse_results[min_rmse_idx]
        best_comp = min_rmse_idx + 1

        results.append({
            'intervals': combo,
            'rmsecv': min_rmse,
            'n_components': best_comp,
            'wavelengths': selected_wavelengths
        })

        # Update the global best
        if min_rmse < best_rmsecv:
            best_rmsecv = min_rmse
            best_intervals = combo
            best_n_components = best_comp

    # Resolve the wavelengths of the best interval combination first,
    # so the summary below reports the *best* combination.
    selected_wavelengths = []
    for interval_idx in best_intervals:
        start_idx, end_idx = intervals[interval_idx]
        selected_wavelengths.extend(range(start_idx, end_idx))

    print("最优结果:")
    print(f" 区间组合: {best_intervals}")
    print(f" RMSECV: {best_rmsecv:.6f}")
    print(f" 主成分数: {best_n_components}")
    # BUGFIX: previously reported len(results[-1]['wavelengths']) — the
    # wavelength count of the *last evaluated* combination, not the best one.
    print(f" 选择的波长数: {len(selected_wavelengths)}")

    return selected_wavelengths, best_rmsecv, best_n_components
|
||||
|
||||
|
||||
def sipls_feature_selection(X, y, n_intervals_list=None, n_combinations_list=None,
                            max_components=15, cv_folds=5):
    """
    Advanced SiPLS feature selection sweeping several parameter settings.

    Parameters:
        X: spectral matrix (n_samples, n_wavelengths)
        y: concentration/property vector (n_samples,)
        n_intervals_list: interval counts to try (default [10, 15, 20])
        n_combinations_list: combination sizes to try (default [2, 3, 4])
        max_components: maximum number of PLS components
        cv_folds: number of cross-validation folds

    Returns:
        best_result: dict describing the best configuration found,
                     or None if every configuration failed
    """
    # BUGFIX: the defaults were mutable list literals shared across calls
    # (classic mutable-default-argument pitfall); use None sentinels instead.
    if n_intervals_list is None:
        n_intervals_list = [10, 15, 20]
    if n_combinations_list is None:
        n_combinations_list = [2, 3, 4]

    best_overall_rmsecv = float('inf')
    best_overall_result = None

    print("=== SiPLS 特征选择 ===")
    print(f"数据形状: {X.shape}")
    print(f"尝试的参数组合: {len(n_intervals_list)} × {len(n_combinations_list)} = {len(n_intervals_list) * len(n_combinations_list)}")

    for n_intervals in n_intervals_list:
        for n_combinations in n_combinations_list:
            print(f"\n--- 测试参数: 区间数={n_intervals}, 组合数={n_combinations} ---")

            try:
                selected_wavelengths, rmsecv, n_components = synergy_interval_pls(
                    X, y,
                    n_intervals=n_intervals,
                    n_combinations=n_combinations,
                    max_components=max_components,
                    cv_folds=cv_folds
                )

                # Keep the configuration with the lowest RMSECV
                if rmsecv < best_overall_rmsecv:
                    best_overall_rmsecv = rmsecv
                    best_overall_result = {
                        'selected_wavelengths': selected_wavelengths,
                        'rmsecv': rmsecv,
                        'n_components': n_components,
                        'n_intervals': n_intervals,
                        'n_combinations': n_combinations,
                        'selection_ratio': len(selected_wavelengths) / X.shape[1]
                    }

            except Exception as e:
                # A single failed configuration should not abort the sweep
                print(f"参数组合 (区间数={n_intervals}, 组合数={n_combinations}) 处理失败: {str(e)}")
                continue

    if best_overall_result:
        print("=== 最终最优结果 ===")
        print(f"区间数: {best_overall_result['n_intervals']}")
        print(f"组合数: {best_overall_result['n_combinations']}")
        print(f"RMSECV: {best_overall_result['rmsecv']:.6f}")
        print(f"主成分数: {best_overall_result['n_components']}")
        print(f"选择的波长数: {len(best_overall_result['selected_wavelengths'])}")
        print(f"选择率: {best_overall_result['selection_ratio']:.3f}")

    return best_overall_result
|
||||
|
||||
|
||||
def plot_sipls_results(X, selected_wavelengths, title="SiPLS Selected Wavelengths"):
    """
    Plot the mean spectrum and highlight the SiPLS-selected wavelengths.

    Parameters:
        X: original spectral matrix (n_samples, n_wavelengths)
        selected_wavelengths: indices of the selected wavelengths
        title: figure title

    Returns:
        the current matplotlib Figure object
    """
    total = X.shape[1]
    axis_idx = np.arange(total)

    # Boolean mask marking the selected wavelength positions
    mask = np.zeros(total, dtype=bool)
    mask[selected_wavelengths] = True

    plt.figure(figsize=(12, 6))

    # Mean spectrum as the backdrop
    avg_spectrum = X.mean(axis=0)
    plt.plot(axis_idx, avg_spectrum, 'b-', alpha=0.7, label='Mean Spectrum')

    # Overlay the selected wavelengths in red
    plt.scatter(axis_idx[mask], avg_spectrum[mask],
                color='red', s=50, alpha=0.8, label='Selected Wavelengths')

    plt.xlabel('Wavelength Index')
    plt.ylabel('Intensity')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)

    return plt.gcf()
|
||||
|
||||
|
||||
# Usage example
if __name__ == "__main__":
    # Generate synthetic spectral data
    np.random.seed(42)
    n_samples = 100
    n_wavelengths = 1000

    # Simulated spectra built from Gaussian peaks
    wavelengths = np.linspace(400, 2500, n_wavelengths)
    X = np.zeros((n_samples, n_wavelengths))

    # Characteristic peak positions
    peak_positions = [500, 800, 1200, 1800, 2200]  # nm
    peak_indices = [np.argmin(np.abs(wavelengths - pos)) for pos in peak_positions]

    for i in range(n_samples):
        for peak_idx in peak_indices:
            # Add a Gaussian peak with random intensity
            gaussian = np.exp(-0.5 * ((np.arange(n_wavelengths) - peak_idx) / 50)**2)
            X[i] += gaussian * np.random.uniform(0.5, 1.5)

        # Add measurement noise
        X[i] += np.random.normal(0, 0.1, n_wavelengths)

    # Simulated target values correlated with some of the peaks
    y = (X[:, peak_indices[0]] + X[:, peak_indices[2]] + X[:, peak_indices[4]]) / 3
    y += np.random.normal(0, 0.05, n_samples)  # add noise

    print("模拟数据生成完成")
    print(f"数据形状: {X.shape}")
    # BUGFIX: was `print(".3f")`, which printed the literal text ".3f"
    # instead of a formatted summary of y.
    print(f"y 范围: {y.min():.3f} ~ {y.max():.3f}")

    # Run SiPLS feature selection
    result = sipls_feature_selection(
        X, y,
        n_intervals_list=[10, 15],
        n_combinations_list=[2, 3],
        max_components=10,
        cv_folds=5
    )

    if result:
        print(f"\n选择的波长索引: {result['selected_wavelengths'][:10]}...")  # show first 10 only

        # Plot the selection result
        fig = plot_sipls_results(X, result['selected_wavelengths'])
        plt.show()
|
||||
Reference in New Issue
Block a user