Add modules; add main invocation command

2026-01-07 16:36:47 +08:00
commit 2d4b170a45
109 changed files with 55763 additions and 0 deletions


@ -0,0 +1,176 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
def PC_Cross_Validation(X, y, pc, cv):
'''
X : spectral matrix (DataFrame), n x m
y : concentration vector (Series) of reference chemical values
pc: maximum number of latent components to test
cv: number of cross-validation folds
return :
RMSECV: RMSECV for each number of components
rindex: optimal number of components (index of the minimum RMSECV)
'''
kf = KFold(n_splits=cv)
RMSECV = []
for i in range(pc):
RMSE = []
for train_index, test_index in kf.split(X):
x_train, x_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
pls = PLSRegression(n_components=i + 1)
pls.fit(x_train, y_train)
y_predict = pls.predict(x_test)
RMSE.append(np.sqrt(mean_squared_error(y_test, y_predict)))
RMSE_mean = np.mean(RMSE)
RMSECV.append(RMSE_mean)
rindex = np.argmin(RMSECV)
return RMSECV, rindex
def Cross_Validation(X, y, pc, cv):
'''
X : spectral matrix (DataFrame), n x m
y : concentration vector (Series) of reference chemical values
pc: number of latent components to use
cv: number of cross-validation folds
return :
RMSECV: mean RMSECV over the folds
'''
kf = KFold(n_splits=cv)
RMSE = []
for train_index, test_index in kf.split(X):
x_train, x_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
pls = PLSRegression(n_components=pc)
pls.fit(x_train, y_train)
y_predict = pls.predict(x_test)
RMSE.append(np.sqrt(mean_squared_error(y_test, y_predict)))
RMSE_mean = np.mean(RMSE)
return RMSE_mean
def CARS_Cloud(X, y, N=50, f=20, cv=10, save_fig=False, save_path=None):
'''
X : spectral matrix (DataFrame or ndarray)
y : concentration vector (Series or ndarray)
N : number of Monte Carlo iterations
f : maximum number of PLS latent components
cv : number of cross-validation folds
save_fig : whether to save the figure
save_path : path for the saved figure
return :
OptWave : indices of the selected wavelengths
'''
p = 0.8
m, n = X.shape
u = np.power((n / 2), (1 / (N - 1)))
k = (1 / (N - 1)) * np.log(n / 2)
cal_num = np.round(m * p)
b2 = np.arange(n)
x = np.asarray(X)  # ensure a numpy array (converts a DataFrame)
y = np.asarray(y)  # ensure a numpy array (converts a Series)
D = np.vstack((np.array(b2).reshape(1, -1), x))
WaveData = []
WaveNum = []
RMSECV = []
r = []
for i in range(1, N + 1):
r.append(u * np.exp(-1 * k * i))
wave_num = int(np.round(r[i - 1] * n))
WaveNum = np.hstack((WaveNum, wave_num))
cal_index = np.random.choice(np.arange(m), size=int(cal_num), replace=False)
wave_index = b2[:wave_num].reshape(1, -1)[0]
# np.ix_ indexes the calibration rows and retained wavelength columns jointly
xcal = x[np.ix_(cal_index, wave_index)]
ycal = y[cal_index]
ycal = ycal.ravel()  # flatten the targets to 1-D
x = x[:, wave_index]  # keep only the retained wavelengths in x
D = D[:, wave_index]  # and in the bookkeeping matrix D
d = D[0, :].reshape(1, -1)
wnum = n - wave_num
if wnum > 0:
d = np.hstack((d, np.full((1, wnum), -1)))
if len(WaveData) == 0:
WaveData = d
else:
WaveData = np.vstack((WaveData, d.reshape(1, -1)))
if wave_num < f:
f = wave_num
pls = PLSRegression(n_components=f)
pls.fit(xcal, ycal)
beta = pls.coef_
# sklearn >= 1.1 stores coef_ as (n_targets, n_features); older releases
# used (n_features, n_targets). Handle both layouts.
if beta.shape[0] == 1:
b = np.abs(beta[0])  # new layout: the single target row
else:
b = np.abs(beta[:, 0])  # old layout: the single target column
# rank the retained wavelengths by |coefficient|; this ordering drives
# the wavelength subset of the next iteration
b2 = np.argsort(-b)
rmsecv, rindex = PC_Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), f, cv)
RMSECV.append(Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), rindex + 1, cv))
WAVE = []
for i in range(WaveData.shape[0]):
wd = WaveData[i, :]
WD = np.ones((len(wd)))
for j in range(len(wd)):
ind = np.where(wd == j)
if len(ind[0]) == 0:
WD[j] = 0
else:
WD[j] = wd[ind[0]]
if len(WAVE) == 0:
WAVE = copy.deepcopy(WD)
else:
WAVE = np.vstack((WAVE, WD.reshape(1, -1)))
MinIndex = np.argmin(RMSECV)
Optimal = WAVE[MinIndex, :]
boindex = np.where(Optimal != 0)
OptWave = boindex[0]
plt.figure(figsize=(12, 10))
# use Times New Roman for the plot text
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
fonts = 20
plt.subplot(211)
plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
plt.ylabel('Number of Selected Wavelengths', fontsize=fonts)
plt.title('Optimal Iteration: ' + str(MinIndex), fontsize=fonts)
plt.plot(np.arange(N), WaveNum)
plt.subplot(212)
plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
plt.ylabel('RMSECV', fontsize=fonts)
plt.plot(np.arange(N), RMSECV)
# optionally save the figure
if save_fig:
plt.savefig(save_path)  # write the figure to disk
print(f"The figure has been saved as {save_path}")
# plt.show()
return OptWave
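A minimal smoke test for CARS_Cloud (a hedged sketch: the synthetic data and parameter values are illustrative assumptions; the import path follows the Feature_Selection_method package layout used by feture_select.py below):

# Hypothetical usage sketch for CARS_Cloud on random data.
import numpy as np
from Feature_Selection_method.Cars import CARS_Cloud  # assumed package layout

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(60, 200))                  # 60 samples, 200 bands
y_demo = X_demo[:, 10] + 0.1 * rng.normal(size=60)   # target tied to band 10
picked = CARS_Cloud(X_demo, y_demo, N=20, f=5, cv=5) # small run for speed
print("selected wavelength indices:", picked)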


@ -0,0 +1,59 @@
from deap import base, creator, tools, algorithms
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
def GA(X, y, n_generations=20, population_size=50, crossover_prob=0.7, mutation_prob=0.2):
"""
使用遗传算法进行特征选择,返回选择的特征索引。
参数:
X (ndarray): 特征矩阵
y (ndarray): 标签
n_generations (int): 迭代次数
population_size (int): 种群大小
crossover_prob (float): 交叉概率
mutation_prob (float): 变异概率
返回:
list: 选择的特征索引
"""
# define the fitness (maximised) and the individual container
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_bool", lambda: np.random.randint(0, 2))
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# fitness: mean CV accuracy of a random forest on the selected columns
def evaluate(individual):
selected_features = [index for index, val in enumerate(individual) if val == 1]
if not selected_features:
return 0,  # no features selected -> zero fitness
X_selected = X[:, selected_features]
clf = RandomForestClassifier(random_state=42)
score = cross_val_score(clf, X_selected, y, cv=5).mean()  # 5-fold cross-validation
return score,
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
# initialise the population
population = toolbox.population(n=population_size)
# run the genetic algorithm
result_population, _ = algorithms.eaSimple(population, toolbox, cxpb=crossover_prob,
mutpb=mutation_prob, ngen=n_generations,
verbose=False)
# pick the best individual from the final population
best_individual = tools.selBest(result_population, k=1)[0]
selected_features = [index for index, val in enumerate(best_individual) if val == 1]
return selected_features
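A hedged usage sketch (the synthetic data and reduced parameter values are illustrative assumptions):

# Hypothetical usage sketch for GA-based feature selection.
from sklearn.datasets import make_classification
from Feature_Selection_method.GA import GA  # assumed package layout

X_demo, y_demo = make_classification(n_samples=120, n_features=30,
                                     n_informative=5, random_state=0)
idx = GA(X_demo, y_demo, n_generations=5, population_size=20)  # small run
print("selected feature indices:", idx)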


@ -0,0 +1,31 @@
from sklearn import linear_model
import numpy as np
def Lar(X, y, nums=40):
"""
使用 LARSLeast Angle Regression选择重要的特征波长。
参数:
X : np.ndarray预测变量矩阵输入数据
y : np.ndarray标签目标值
nums : int选择的特征点数量默认为 40
返回:
np.ndarray选择的特征波长索引
"""
# initialise the LARS model
Lars = linear_model.Lars()
# fit the model
Lars.fit(X, y)
# absolute regression coefficients act as importance scores
corflist = np.abs(Lars.coef_)
# keep the nums most important features (largest coefficients)
SpectrumList = np.argsort(corflist)[-nums:][::-1]
# sort the selected indices so the output order is stable
SpectrumList = np.sort(SpectrumList)
return SpectrumList
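A hedged usage sketch (data and parameters are illustrative assumptions):

# Hypothetical usage sketch for Lar.
import numpy as np
from Feature_Selection_method.Lar import Lar  # assumed package layout

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(50, 120))
y_demo = X_demo[:, 3] - 2.0 * X_demo[:, 40] + 0.1 * rng.normal(size=50)
bands = Lar(X_demo, y_demo, nums=10)
print("selected band indices:", bands)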


@ -0,0 +1,88 @@
import numpy as np
from sklearn.neighbors import NearestNeighbors
class ReliefF:
def __init__(self, n_neighbors=20, n_features_to_keep=20):
"""
初始化 ReliefF 算法参数。
:param n_neighbors: 每个样本的近邻数量。
:param n_features_to_keep: 每次保留的特征数量。
"""
self.n_neighbors = n_neighbors
self.n_features_to_keep = n_features_to_keep
self.feature_scores = None # 用于存储每个特征的评分
self.top_features = None # 用于存储评分最高的特征索引
def fit(self, X, y):
"""
根据给定的数据 X 和标签 y 计算特征评分。
:param X: 输入特征矩阵。
:param y: 类别标签。
:return: 返回选择的特征索引。
"""
m, n = X.shape # m 是样本数n 是特征数
self.feature_scores = np.zeros(n) # 初始化特征评分为 0
# 寻找每个样本的 n_neighbors 个近邻
nbrs = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X)
distances, indices = nbrs.kneighbors(X)
# visit every sample and update the feature scores
for i in range(m):
y_i = y[i]  # class label of the current sample
# split the neighbours into same-class (hits) and other-class (misses)
hit_neighbors = []
miss_neighbors = []
for j in indices[i][1:]:  # indices[i][0] is the sample itself; skip it
if y[j] == y_i:
hit_neighbors.append(X[j])
else:
miss_neighbors.append(X[j])
# update the score of every feature
for f in range(n):
for hit in hit_neighbors:
self.feature_scores[f] -= (X[i, f] - hit[f]) ** 2 / (self.n_neighbors * m)
for miss in miss_neighbors:
self.feature_scores[f] += (X[i, f] - miss[f]) ** 2 / (self.n_neighbors * m)
# keep the indices of the n_features_to_keep highest-scoring features
self.top_features = np.argsort(self.feature_scores)[-self.n_features_to_keep:]
return self.top_features  # indices of the selected features
def fit_transform(self, X, y):
"""Fit and transform in one step; returns the selected feature indices."""
return self.fit(X, y)
def multi_scale_relieff_stratified(X, y, segment_size=100, n_subsegments=20, n_features_per_subsegment=5):
"""
分层多尺度特征选择,确保每个波长段都能被覆盖。
:param X: 输入特征矩阵。
:param y: 类别标签。
:param segment_size: 每个波长段的大小。
:param n_subsegments: 每个段内的子区域数量。
:param n_features_per_subsegment: 每个子区域选择的特征数量。
:return: 分层选择的特征索引。
"""
selected_features = []
# iterate over the wavelength segments
for i in range(0, X.shape[1], segment_size):
segment_X = X[:, i:i + segment_size]
subsegment_size = segment_size // n_subsegments  # width of a sub-region
# run ReliefF inside every sub-region
for j in range(0, segment_size, subsegment_size):
subsegment_X = segment_X[:, j:j + subsegment_size]
relief = ReliefF(n_neighbors=10, n_features_to_keep=n_features_per_subsegment)
subsegment_selected = relief.fit_transform(subsegment_X, y)
# map the local indices back to global indices
selected_features.extend(subsegment_selected + i + j)
# return the de-duplicated feature indices
return np.unique(selected_features)
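A hedged usage sketch (synthetic data; the segment sizes below are illustrative assumptions chosen so the sub-regions keep only part of their features):

# Hypothetical usage sketch for stratified multi-scale ReliefF.
from sklearn.datasets import make_classification
from Feature_Selection_method.ReliefF import multi_scale_relieff_stratified  # assumed layout

X_demo, y_demo = make_classification(n_samples=120, n_features=200,
                                     n_informative=10, random_state=0)
sel = multi_scale_relieff_stratified(X_demo, y_demo, segment_size=100,
                                     n_subsegments=10, n_features_per_subsegment=2)
print("selected feature count:", len(sel))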


@ -0,0 +1,116 @@
import numpy as np
import scipy.stats
from scipy.linalg import qr, inv, pinv
from progress.bar import Bar
from matplotlib import pyplot as plt
class SPA:
def _projections_qr(self, X, k, M):
X_projected = X.copy()
norms = np.sum((X ** 2), axis=0)
norm_max = np.amax(norms)
X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]
_, __, order = qr(X_projected.to_numpy(), 0, pivoting=True)
return order[:M].T
def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
N = Xcal.shape[0]
NV = Xval.shape[0] if Xval is not None else 0
yhat, e = None, None
if NV > 0:
Xcal_ones = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel].to_numpy()])
b = np.linalg.lstsq(Xcal_ones, ycal, rcond=None)[0]
Xval_ones = np.hstack([np.ones((NV, 1)), Xval.iloc[:, var_sel].to_numpy()])
yhat = Xval_ones.dot(b)
e = yval - yhat
else:
yhat = np.zeros((N, 1))
for i in range(N):
cal = np.hstack([np.arange(i), np.arange(i + 1, N)])
X = Xcal.iloc[cal, var_sel]
y = ycal.iloc[cal]
X_ones = np.hstack([np.ones((N - 1, 1)), X.to_numpy()])
b = np.linalg.lstsq(X_ones, y, rcond=None)[0]
xtest = Xcal.iloc[i, var_sel].to_numpy()
yhat[i] = np.hstack([1, xtest]).dot(b)
e = ycal.to_numpy() - yhat
return yhat, e
def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
N, K = Xcal.shape
m_max = min(N - 1, K) if m_max is None else m_max
normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
Xcaln = (Xcal - Xcal.mean()) / normalization_factor
SEL = np.zeros((m_max, K))
with Bar('Projections :', max=K) as bar:
for k in range(K):
SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
bar.next()
PRESS = np.full((m_max + 1, K), np.inf)
with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
for k in range(K):
for m in range(m_min, m_max + 1):
var_sel = SEL[:m, k].astype(int)
_, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
PRESS[m, k] = e.T @ e
bar.next()
m_sel = np.argmin(PRESS, axis=0)
k_sel = np.argmin(np.min(PRESS, axis=0))
var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)
Xcal2 = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel_phase2].to_numpy()])
b = np.linalg.lstsq(Xcal2, ycal, rcond=None)[0]
std_deviation = Xcal2.std(ddof=1, axis=0)
relev = np.abs(b * std_deviation)[1:]
index_decreasing_relev = np.argsort(-relev)
PRESS_scree = np.empty(len(var_sel_phase2))
for i in range(len(var_sel_phase2)):
var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
_, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
PRESS_scree[i] = np.conj(e).T @ e
RMSEP_scree = np.sqrt(PRESS_scree / len(e))
alpha = 0.25
dof = len(e)
fcrit = scipy.stats.f.ppf(1 - alpha, dof, dof)
PRESS_crit = np.min(PRESS_scree) * fcrit
i_crit = np.min(np.nonzero(PRESS_scree < PRESS_crit))
i_crit = max(m_min, i_crit)
var_sel = var_sel_phase2[index_decreasing_relev[:i_crit]]
# plotting
plt.figure()
# use Times New Roman for the plot text
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
# title, axis labels and grid
plt.xlabel('Number of variables included in the model', fontsize=14)
plt.ylabel('RMSE', fontsize=14)
plt.title(f'Final number of selected variables: {len(var_sel)} (RMSE={RMSEP_scree[i_crit]:.4f})', fontsize=16)
# RMSEP curve
plt.plot(RMSEP_scree, label='RMSEP Scree Plot')
plt.scatter(i_crit, RMSEP_scree[i_crit], color='r', marker='s', label='Selected Point')
# grid and legend
plt.grid(True)
plt.legend()
# save or show the figure
if save_path:
plt.savefig(save_path, bbox_inches='tight', dpi=300)
print(f"Figure saved to: {save_path}")
else:
plt.show()
return var_sel, var_sel_phase2
def __repr__(self):
return "SPA()"


@ -0,0 +1,82 @@
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score
from numpy.linalg import matrix_rank as rank
import numpy as np
class UVE:
def __init__(self, x, y, ncomp=20, nrep=500, testSize=0.2):
"""
初始化 UVE 模型。
参数:
x : np.ndarray预测变量矩阵输入数据
y : np.ndarray标签目标值
ncomp : intPLS 中的最大潜变量数量,默认为 20
nrep : int重复次数默认为 500
testSize : float训练集中划分的测试集比例默认为 0.2
"""
self.x = x
self.y = y
self.ncomp = min(ncomp, rank(x))  # cap the latent variables at the matrix rank
self.nrep = nrep
self.testSize = testSize
self.criteria = None  # standardised coefficients
self.featureIndex = None  # feature ranking indices
self.featureR2 = np.full(self.x.shape[1], np.nan)  # R^2 values
self.selFeature = None  # finally selected feature indices
def calcCriteria(self):
"""计算每个变量的标准化系数 (meanCoef / stdCoef)。"""
PLSCoef = np.zeros((self.nrep, self.x.shape[1])) # 存储每次迭代的 PLS 系数
ss = ShuffleSplit(n_splits=self.nrep, test_size=self.testSize)
# 遍历每次划分的数据集,计算 PLS 系数
for step, (train, test) in enumerate(ss.split(self.x, self.y)):
xtrain, ytrain = self.x[train], self.y[train]
plsModel = PLSRegression(n_components=min(self.ncomp, rank(xtrain)))
plsModel.fit(xtrain, ytrain)
PLSCoef[step, :] = plsModel.coef_.flatten()
# np.divide avoids division-by-zero when a coefficient has no spread
meanCoef = np.mean(PLSCoef, axis=0)
stdCoef = np.std(PLSCoef, axis=0)
self.criteria = np.divide(meanCoef, stdCoef, out=np.zeros_like(meanCoef), where=stdCoef != 0)
def evalCriteria(self, cv=3):
"""Evaluate R^2 for nested variable subsets ranked by the criterion."""
# rank the features by the absolute value of the criterion, descending
self.featureIndex = np.argsort(-np.abs(self.criteria))
# grow the subset one feature at a time and score each subset
for i in range(self.x.shape[1]):
xi = self.x[:, self.featureIndex[:i + 1]]  # the first i+1 features
# choose the regression model by subset size
if i < self.ncomp:
regModel = LinearRegression()
else:
regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))
# cross-validate and store the R^2
cvScore = cross_val_score(regModel, xi, self.y, cv=cv, scoring='r2')
self.featureR2[i] = np.mean(cvScore)
def cutFeature(self, *args):
"""根据 R² 最大值选择特征,并返回所选特征的索引(列号)。"""
# 找到 R² 最大值对应的索引位置
cuti = np.nanargmax(self.featureR2) # 使用 nanargmax 以避免 NaN 的影响
self.selFeature = self.featureIndex[:cuti + 1] # 最优特征索引
# 如果传入其他数据集,返回筛选后的数据
if len(args) != 0:
returnx = list(args)
for i, argi in enumerate(args):
if argi.shape[1] == self.x.shape[1]:
returnx[i] = argi[:, self.selFeature]
return returnx
# return the indices (column numbers) of the selected features
return self.selFeature
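A hedged usage sketch (small synthetic problem; nrep is reduced from the default for speed):

# Hypothetical usage sketch for UVE.
import numpy as np
from Feature_Selection_method.Uve import UVE  # assumed package layout

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(60, 25))
y_demo = X_demo[:, 2] + 0.1 * rng.normal(size=60)
uve = UVE(X_demo, y_demo, ncomp=5, nrep=50)
uve.calcCriteria()
uve.evalCriteria(cv=3)
sel = uve.cutFeature()
print("selected columns:", sel)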


@ -0,0 +1,728 @@
"""
批量特征选择工具
支持对多个CSV文件或数据集进行批量特征选择
"""
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Union
import argparse
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
import warnings
# feature-selection module
from feture_select import (
FeatureSelectionConfig,
select_features_from_csv,
select_features_from_data
)
warnings.filterwarnings('ignore')
def parse_column_range(column_range: Union[str, int, List[Union[str, int]]], total_columns: int) -> List[int]:
"""
解析列范围字符串,返回列索引列表
Args:
column_range: 列范围,如 "0:5", "2,4,6-8", [0,1,2] 或单个索引
total_columns: 总列数
Returns:
列索引列表
"""
if isinstance(column_range, (int, np.integer)):
# a single column index
if column_range >= total_columns or column_range < 0:
raise ValueError(f"Column index {column_range} out of range [0, {total_columns-1}]")
return [column_range]
elif isinstance(column_range, str):
# parse a range string
columns = []
# multiple ranges are separated by commas
for part in column_range.split(','):
part = part.strip()
if ':' in part:
# colon range such as "0:5" (half-open, like Python slicing)
start, end = part.split(':')
start = int(start.strip()) if start.strip() else 0
end = int(end.strip()) if end.strip() else total_columns
if start < 0:
start = total_columns + start
if end < 0:
end = total_columns + end
if start >= total_columns or end > total_columns:
raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
columns.extend(range(start, end))
else:
# a single index
idx = int(part.strip())
if idx < 0:
idx = total_columns + idx
if idx >= total_columns or idx < 0:
raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
columns.append(idx)
return list(set(columns))  # de-duplicate
elif isinstance(column_range, (list, tuple)):
# an explicit list of column indices
columns = []
for idx in column_range:
if isinstance(idx, str):
if ':' in idx:
# a range string inside the list
start, end = idx.split(':')
start = int(start.strip()) if start.strip() else 0
end = int(end.strip()) if end.strip() else total_columns
if start < 0:
start = total_columns + start
if end < 0:
end = total_columns + end
if start >= total_columns or end > total_columns:
raise ValueError(f"Range {start}:{end} out of column range [0, {total_columns-1}]")
columns.extend(range(start, end))
else:
idx_int = int(idx.strip())
if idx_int < 0:
idx_int = total_columns + idx_int
if idx_int >= total_columns or idx_int < 0:
raise ValueError(f"Column index {idx_int} out of range [0, {total_columns-1}]")
columns.append(idx_int)
else:
if idx < 0:
idx = total_columns + idx
if idx >= total_columns or idx < 0:
raise ValueError(f"Column index {idx} out of range [0, {total_columns-1}]")
columns.append(idx)
return list(set(columns))  # de-duplicate
else:
raise ValueError(f"Unsupported column range format: {type(column_range)}")
def convert_column_indices_to_names(df: pd.DataFrame, column_indices: List[int]) -> List[str]:
"""
将列索引转换为列名
Args:
df: DataFrame
column_indices: 列索引列表
Returns:
列名列表
"""
return [df.columns[i] for i in column_indices]
def resolve_spectral_columns(df: pd.DataFrame, spectral_columns: Union[str, List[Union[str, int]], None]) -> List[str]:
"""
解析光谱列配置,支持列名和列号范围
Args:
df: DataFrame
spectral_columns: 光谱列配置
Returns:
光谱列名列表
"""
if spectral_columns is None:
# 默认使用除标签列外的所有列
return df.columns.tolist()
elif isinstance(spectral_columns, str) and spectral_columns == "auto":
# auto-detect spectral columns (normally the numeric ones)
potential_spectral_cols = []
for col in df.columns:
if pd.api.types.is_numeric_dtype(df[col]):
# check that the column is essentially numeric (a spectral band)
try:
values = pd.to_numeric(df[col], errors='coerce')
if values.notna().sum() > len(df) * 0.8:  # at least 80% numeric
potential_spectral_cols.append(col)
except:
continue
return potential_spectral_cols
else:
# parse the index range
try:
column_indices = parse_column_range(spectral_columns, len(df.columns))
return convert_column_indices_to_names(df, column_indices)
except ValueError as e:
print(f"Failed to parse spectral columns: {e}")
print("Falling back to auto-detection")
return resolve_spectral_columns(df, "auto")
def find_csv_files(directory: Union[str, Path], pattern: str = "*.csv") -> List[Path]:
"""
在目录中查找所有CSV文件
Args:
directory: 搜索目录
pattern: 文件匹配模式
Returns:
CSV文件路径列表
"""
directory = Path(directory)
if not directory.exists():
raise FileNotFoundError(f"目录不存在: {directory}")
csv_files = list(directory.glob(pattern))
csv_files.sort() # 排序以保证顺序一致性
print(f"在目录 {directory} 中找到 {len(csv_files)} 个CSV文件")
return csv_files
def create_batch_configs(csv_files: List[Path],
base_config: FeatureSelectionConfig,
output_base_dir: Union[str, Path]) -> List[Tuple[Path, FeatureSelectionConfig]]:
"""
为每个CSV文件创建配置
Args:
csv_files: CSV文件列表
base_config: 基础配置
output_base_dir: 输出基础目录
Returns:
(文件路径, 配置) 元组列表
"""
configs = []
output_base_dir = Path(output_base_dir)
for csv_file in csv_files:
try:
# read the head of the CSV to discover its columns
df = pd.read_csv(csv_file, nrows=5)  # the first 5 rows are enough for column info
# resolve the label column
if isinstance(base_config.label_column, str):
if base_config.label_column not in df.columns:
print(f"Warning: label column '{base_config.label_column}' not found in {csv_file.name}; trying the first column")
resolved_label_column = df.columns[0]
else:
resolved_label_column = base_config.label_column
else:
# the label column was given as an index
try:
resolved_label_column = df.columns[base_config.label_column]
except IndexError:
print(f"Warning: column index {base_config.label_column} out of range in {csv_file.name}; using the first column")
resolved_label_column = df.columns[0]
# resolve the spectral columns
resolved_spectral_columns = resolve_spectral_columns(df, base_config.spectral_columns)
# make sure the label column is not among the spectral columns
if resolved_label_column in resolved_spectral_columns:
resolved_spectral_columns.remove(resolved_label_column)
if len(resolved_spectral_columns) == 0:
print(f"Warning: no usable spectral columns found in {csv_file.name}")
continue
print(f"File {csv_file.name}: label column='{resolved_label_column}', spectral columns={len(resolved_spectral_columns)}")
except Exception as e:
print(f"Error reading {csv_file.name}: {e}; skipping this file")
continue
# give every file its own output directory
file_stem = csv_file.stem
file_output_dir = output_base_dir / file_stem
file_output_dir.mkdir(parents=True, exist_ok=True)
# copy the base config and override the per-file fields
config = FeatureSelectionConfig(
method=base_config.method,
method_params=base_config.method_params.copy(),
csv_file_path=str(csv_file),
label_column=resolved_label_column,
spectral_columns=resolved_spectral_columns,
output_csv=base_config.output_csv,
output_dir=str(file_output_dir),
output_filename=f"{file_stem}_selected_features",
save_plots=base_config.save_plots,
plot_name_prefix=f"{file_stem}_{base_config.method}",
plot_dir=str(file_output_dir) if base_config.plot_dir else None
)
configs.append((csv_file, config))
return configs
def process_single_file(csv_file: Path, config: FeatureSelectionConfig) -> Dict:
"""
Run feature selection for a single CSV file.
Args:
csv_file: CSV file path
config: feature-selection configuration
Returns:
Result dictionary
"""
result = {
'file': str(csv_file),
'file_name': csv_file.name,
'success': False,
'error': None,
'n_selected_features': 0,
'selected_columns': [],
'processing_time': 0,
'output_dir': config.output_dir
}
start_time = time.time()
try:
print(f"Processing file: {csv_file.name}")
# run the feature selection
X_selected, y, selected_columns = select_features_from_csv(config)
# record the outcome
result['success'] = True
result['n_selected_features'] = X_selected.shape[1]
result['selected_columns'] = selected_columns.tolist() if hasattr(selected_columns, 'tolist') else list(selected_columns)
result['n_samples'] = X_selected.shape[0]
print(f"Finished {csv_file.name}; selected features: {result['n_selected_features']}")
except Exception as e:
result['error'] = str(e)
print(f"Failed on {csv_file.name}: {e}")
finally:
result['processing_time'] = time.time() - start_time
return result
def batch_feature_selection(csv_files: List[Path],
base_config: FeatureSelectionConfig,
output_base_dir: Union[str, Path],
max_workers: Optional[int] = None,
parallel: bool = False) -> List[Dict]:
"""
Run feature selection over a batch of files.
Args:
csv_files: list of CSV files
base_config: base configuration
output_base_dir: base output directory
max_workers: maximum number of parallel workers
parallel: whether to process in parallel
Returns:
List of result dictionaries
"""
# build the per-file configurations
file_configs = create_batch_configs(csv_files, base_config, output_base_dir)
results = []
if parallel and len(file_configs) > 1:
# parallel processing
print(f"Processing {len(file_configs)} files in parallel (max workers: {max_workers or 'auto'})")
with ProcessPoolExecutor(max_workers=max_workers) as executor:
# submit all tasks
future_to_config = {
executor.submit(process_single_file, csv_file, config): (csv_file, config)
for csv_file, config in file_configs
}
# collect the results
for future in as_completed(future_to_config):
csv_file, config = future_to_config[future]
try:
result = future.result()
results.append(result)
except Exception as e:
print(f"并行处理失败 {csv_file.name}: {e}")
results.append({
'file': str(csv_file),
'file_name': csv_file.name,
'success': False,
'error': str(e),
'processing_time': 0
})
else:
# serial processing
print(f"Processing {len(file_configs)} files serially")
for csv_file, config in file_configs:
result = process_single_file(csv_file, config)
results.append(result)
return results
def save_batch_results(results: List[Dict], output_file: Union[str, Path]):
"""
Save the batch results to a file.
Args:
results: list of result dictionaries
output_file: output file path
"""
output_file = Path(output_file)
output_file.parent.mkdir(parents=True, exist_ok=True)
# convert to a DataFrame
results_df = pd.DataFrame(results)
# write as CSV
results_df.to_csv(output_file, index=False, encoding='utf-8')
print(f"Batch results saved to: {output_file}")
def print_batch_summary(results: List[Dict]):
"""
Print a summary of the batch run.
Args:
results: list of result dictionaries
"""
total_files = len(results)
successful_files = sum(1 for r in results if r['success'])
failed_files = total_files - successful_files
total_time = sum(r['processing_time'] for r in results)
avg_time = total_time / total_files if total_files > 0 else 0
print("\n" + "="*60)
print("Batch feature-selection summary")
print("="*60)
print(f"Total files: {total_files}")
print(f"Succeeded: {successful_files}")
print(f"Failed: {failed_files}")
print(f"Total processing time: {total_time:.2f} s")
print(f"Average time per file: {avg_time:.2f} s")
if successful_files > 0:
selected_features = [r['n_selected_features'] for r in results if r['success']]
print(f"Mean selected features: {np.mean(selected_features):.1f} ± {np.std(selected_features):.1f}")
if failed_files > 0:
print("\nFailed files:")
for result in results:
if not result['success']:
print(f" - {result['file_name']}: {result['error']}")
print("="*60)
def create_example_batch_config() -> FeatureSelectionConfig:
"""
Create an example batch configuration.
Returns:
Example configuration object
"""
return FeatureSelectionConfig(
method="Cars",  # one of: Cars, Lars, Uve, Spa, GA, ReliefF, RandomFrog, SiPLS
method_params={
'N': 50,  # CARS parameters
'f': 20,
'cv': 10
},
# Note: csv_file_path, label_column and spectral_columns are set per file
output_csv=True,
save_plots=True,
plot_name_prefix="batch_fs"
)
def main():
"""主函数"""
parser = argparse.ArgumentParser(description='批量特征选择工具')
# 必需参数
parser.add_argument('input_dir', help='包含CSV文件的输入目录')
parser.add_argument('output_dir', help='输出目录')
# 可选参数
parser.add_argument('--method', default='CARS',
choices=['Cars', 'Lars', 'Uve', 'Spa', 'GA', 'ReliefF', 'RandomFrog', 'SiPLS'],
help='特征选择方法 (默认: CARS)')
parser.add_argument('--label_column', required=True,
help='标签列名或列索引 (例如: "concentration" 或 0)')
parser.add_argument('--spectral_columns', required=True,
help='光谱列配置,支持: 列名列表 "col1 col2 col3", 列号范围 "1:10", 混合 "2,4,6-8", 或 "auto" 自动检测')
parser.add_argument('--parallel', action='store_true', help='启用并行处理')
parser.add_argument('--max_workers', type=int, help='最大并行工作数')
parser.add_argument('--no_csv_output', action='store_true', help='不输出CSV文件')
parser.add_argument('--no_plots', action='store_true', help='不生成可视化图')
parser.add_argument('--results_file', default='batch_results.csv', help='结果文件路径')
args = parser.parse_args()
try:
# parse the spectral-column argument
if args.spectral_columns == "auto":
spectral_columns = "auto"
elif ':' in str(args.spectral_columns) or ',' in str(args.spectral_columns):
# range syntax present; keep the string for later parsing
spectral_columns = args.spectral_columns
else:
# otherwise treat it as a space-separated list of column names
spectral_columns = args.spectral_columns.split()
# coerce the label column to the right type
try:
# numeric -> integer index
label_column = int(args.label_column)
except ValueError:
# otherwise treat it as a column name
label_column = args.label_column
# build the base configuration
base_config = FeatureSelectionConfig(
method=args.method,
method_params={},  # use the per-method defaults
label_column=label_column,
spectral_columns=spectral_columns,
output_csv=not args.no_csv_output,
save_plots=not args.no_plots,
plot_name_prefix=f"batch_{args.method}"
)
# find the CSV files
csv_files = find_csv_files(args.input_dir)
if not csv_files:
print("No CSV files found")
return 1
# run the batch feature selection
results = batch_feature_selection(
csv_files=csv_files,
base_config=base_config,
output_base_dir=args.output_dir,
max_workers=args.max_workers,
parallel=args.parallel
)
# save the results
results_file = Path(args.output_dir) / args.results_file
save_batch_results(results, results_file)
# print the summary
print_batch_summary(results)
successful = sum(1 for r in results if r['success'])
return 0 if successful > 0 else 1
except Exception as e:
print(f"批量处理失败: {e}")
import traceback
traceback.print_exc()
return 1
def example_usage():
"""
Print a usage guide.
"""
print("=" * 80)
print("Batch feature-selection tool - usage guide")
print("=" * 80)
print("\n1. Column-range selection:")
print(" Several column-selection styles are supported:")
print(" - index range: '1:10' selects columns 1 through 9 (half-open)")
print(" - mixed spec: '2,4,7:9' selects columns 2, 4, 7 and 8")
print(" - auto-detect: 'auto' picks the numeric columns as spectral columns")
print(" - name list: 'wavelength_400 wavelength_410 wavelength_420'")
print("\n2. Command-line examples:")
print(" # index range")
print(" python batch_feature_selection.py input_dir output_dir --label_column 0 --spectral_columns 1:50")
print("")
print(" # mixed spec")
print(" python batch_feature_selection.py input_dir output_dir --label_column concentration --spectral_columns 2,4,7:9")
print("")
print(" # auto-detect the spectral columns")
print(" python batch_feature_selection.py input_dir output_dir --label_column Label --spectral_columns auto")
print("\n3. Python代码使用示例:")
print("""
from batch_feature_selection import batch_feature_selection, create_example_batch_config, find_csv_files
# 查找CSV文件
csv_files = find_csv_files('your/data/directory')
# 创建配置
base_config = create_example_batch_config()
base_config.label_column = 'concentration' # 标签列名
base_config.spectral_columns = "5:25" # 列5到25作为光谱列
# 执行批量处理
results = batch_feature_selection(
csv_files=csv_files,
base_config=base_config,
output_base_dir='output/directory',
parallel=True
)
""")
print("\n4. 支持的特征选择方法:")
methods = ['CARS', 'Lars', 'Uve', 'Spa', 'GA', 'ReliefF', 'RandomFrog', 'SiPLS']
for method in methods:
print(f" - {method}")
print("\n5. 方法参数配置示例:")
print("""
# CARS方法
config.method_params = {'N': 50, 'f': 20, 'cv': 10}
# UVE方法
config.method_params = {'ncomp': 20, 'cv': 5}
# SPA方法
config.method_params = {'m_min': 2, 'm_max': 50, 'autoscaling': 1}
""")
print("=" * 80)
# find the CSV files (raw string: backslashes in Windows paths must not be escapes)
csv_files = find_csv_files(r"E:\code\spectronon\single_classsfication\data")
# all available feature-selection methods and their parameters
methods_config = [
{
'method': 'Cars',
'method_params': {'N': 50, 'f': 20, 'cv': 10},
'description': 'Competitive Adaptive Reweighted Sampling'
},
{
'method': 'Uve',
'method_params': {'ncomp': 20, 'cv': 5},
'description': 'Uninformative Variable Elimination'
},
{
'method': 'Spa',
'method_params': {'m_min': 2, 'm_max': 50, 'autoscaling': 1},
'description': 'Successive Projections Algorithm'
},
{
'method': 'GA',
'method_params': {'population_size': 10},
'description': 'Genetic Algorithm'
},
{
'method': 'ReliefF',
'method_params': {'n_neighbors': 20, 'n_features_to_keep': 20},
'description': 'ReliefF Algorithm'
},
{
'method': 'RandomFrog',
'method_params': {'n_frogs': 50, 'n_memeplexes': 5, 'n_evolution_steps': 10, 'n_shuffle_iterations': 10, 'cv': 5},
'description': 'Random Frog Leaping Algorithm'
},
{
'method': 'SiPLS',
'method_params': {'n_intervals_list': [10, 15, 20]},
'description': 'Synergy Interval Partial Least Squares'
}
]
print("=" * 80)
print("开始批量特征选择 - 使用所有可用方法")
print(f"找到 {len(csv_files)} 个CSV文件待处理")
print(f"将使用 {len(methods_config)} 种特征选择方法")
print("=" * 80)
all_results = {}
# run the batch once per method
for i, method_cfg in enumerate(methods_config, 1):
method_name = method_cfg['method']
description = method_cfg['description']
print(f"\n{'='*60}")
print(f"Method {i}/{len(methods_config)}: {method_name}")
print(f"Description: {description}")
print(f"{'='*60}")
try:
# build the configuration for this method
method_config = create_example_batch_config()
method_config.method = method_name
method_config.method_params = method_cfg['method_params']
method_config.label_column = 'Label'  # label column name
method_config.spectral_columns = "1:"  # column 1 to the end as spectral columns
method_config.plot_name_prefix = f"{method_name.lower()}_batch_fs"
# run the batch
method_results = batch_feature_selection(
csv_files=csv_files,
base_config=method_config,
output_base_dir=f'E:\\code\\spectronon\\single_classsfication\\Feature_Selection_method\\directory\\{method_name.lower()}_results',
parallel=True
)
all_results[method_name] = {
'results': method_results,
'description': description,
'config': method_cfg
}
print(f"{method_name} 方法处理完成")
except Exception as e:
print(f"{method_name} 方法处理失败: {str(e)}")
all_results[method_name] = {
'error': str(e),
'description': description,
'config': method_cfg
}
# print the overall summary
print(f"\n{'='*80}")
print("Batch feature selection finished - summary")
print(f"{'='*80}")
successful_methods = []
failed_methods = []
for method_name, result in all_results.items():
if 'error' in result:
failed_methods.append(f"{method_name}: {result['error']}")
print(f"{method_name}: 失败 - {result['error']}")
else:
successful_methods.append(method_name)
print(f"{method_name}: 成功")
print(f"\n总计: {len(successful_methods)}/{len(methods_config)} 种方法成功处理")
print(f"成功的方法: {', '.join(successful_methods)}")
if failed_methods:
print(f"失败的方法: {len(failed_methods)}")
for failed in failed_methods:
print(f" - {failed}")
print(f"\n结果文件保存在: E:\\code\\spectronon\\single_classsfication\\Feature_Selection_method\\directory\\")
print("每个方法都有独立的子目录存储结果")
# If this script is run directly, show the usage guide
# if __name__ == "__main__":
# import sys
# if len(sys.argv) == 1:
# example_usage()
# else:
# # 运行主函数进行批量处理
# exit(main())
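For a one-off run without the batch wrapper, a minimal hedged sketch of the single-file API (the path, column names and output directory below are placeholders, not values from this repository; the CSV must exist because FeatureSelectionConfig validates the path):

# Hypothetical single-file usage sketch.
from feture_select import FeatureSelectionConfig, select_features_from_csv

cfg = FeatureSelectionConfig(
    method="Cars",
    method_params={"N": 50, "f": 20, "cv": 10},
    csv_file_path="data/sample.csv",          # placeholder path
    label_column="Label",                     # placeholder label column
    spectral_columns=["400", "410", "420"],   # placeholder band columns
    output_csv=True,
    output_dir="out",
)
X_sel, y, cols = select_features_from_csv(cfg)
print("selected columns:", cols)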


@ -0,0 +1,594 @@
import pandas as pd
import numpy as np
from Feature_Selection_method.Lar import Lar
from Feature_Selection_method.Spa import SPA
from Feature_Selection_method.Uve import UVE
from Feature_Selection_method.Cars import CARS_Cloud
from Feature_Selection_method.GA import GA
from Feature_Selection_method.ReliefF import ReliefF
from Feature_Selection_method.random_fog import shuffled_frog_leaping_selection
from Feature_Selection_method.sipls import sipls_feature_selection
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
from typing import Optional, Union, List, Tuple
from dataclasses import dataclass, field
def _get_x_axis_values(feature_names: List[str]) -> Tuple[Optional[np.ndarray], str]:
"""
Extract x-axis values (usually wavelengths) from the feature names.
Args:
feature_names: list of feature names
Returns:
(x_values, x_label): the x-axis values and label, or (None, "") when extraction fails
"""
if not feature_names:
return None, ""
# try to extract numeric values from the column names
x_values = []
for name in feature_names:
try:
# try converting the name to a float directly
if isinstance(name, (int, float)):
x_values.append(float(name))
elif isinstance(name, str):
# otherwise pull the number out of the string,
# handling formats like "400.5", "Band_400", "Wavelength_400.5nm"
import re
# look for a floating-point pattern
match = re.search(r'(\d+\.?\d*)', str(name))
if match:
x_values.append(float(match.group(1)))
else:
# no number found -> give up
return None, ""
else:
return None, ""
except (ValueError, TypeError):
return None, ""
# all values must be unique (no duplicated wavelengths)
if len(set(x_values)) != len(x_values):
return None, ""
# sanity-check the range (assuming nm, roughly 200-2500 nm)
x_array = np.array(x_values)
if np.min(x_array) < 200 or np.max(x_array) > 2500:
return None, ""
# label the axis
x_label = "Wavelength (nm)"
return x_array, x_label
def plot_feature_selection_results(X: Union[pd.DataFrame, np.ndarray],
selected_indices: Union[List[int], np.ndarray],
method_name: str,
save_path: Optional[str] = None,
figsize: Tuple[int, int] = (12, 6)) -> plt.Figure:
"""
Plot the feature-selection result.
Args:
X: feature matrix (n_samples, n_features)
selected_indices: indices of the selected features
method_name: name of the feature-selection method
save_path: where to save the figure (None = do not save)
figsize: figure size
Returns:
matplotlib Figure object
"""
# coerce to a numpy array
if isinstance(X, pd.DataFrame):
X_array = X.values
feature_names = X.columns.tolist()
else:
X_array = X
feature_names = [f"Feature_{i}" for i in range(X.shape[1])]
# average spectrum
mean_spectrum = np.mean(X_array, axis=0)
n_features = X_array.shape[1]
# build the x axis, preferring wavelengths over bare indices
x_values, x_label = _get_x_axis_values(feature_names)
if x_values is None:
# fall back to feature indices when wavelengths cannot be extracted
x_values = np.arange(n_features)
x_label = "Feature Index"
# create the figure
fig, ax = plt.subplots(figsize=figsize)
# plot the mean spectrum
ax.plot(x_values, mean_spectrum, 'b-', linewidth=1.5, alpha=0.8, label='Mean Spectrum')
# mark the selected features
if len(selected_indices) > 0:
# make sure selected_indices is an integer array
selected_indices = np.asarray(selected_indices, dtype=int)
# drop out-of-range indices
valid_indices = selected_indices[(selected_indices >= 0) & (selected_indices < len(x_values))]
if len(valid_indices) > 0:
selected_x = x_values[valid_indices]
selected_y = mean_spectrum[valid_indices]
ax.scatter(selected_x, selected_y, color='red', s=60, alpha=0.9,
edgecolors='darkred', linewidth=1.5, label='Selected Features', zorder=5)
# annotate how many features were selected
ax.text(0.02, 0.98, f'Selected: {len(selected_indices)}/{n_features} features',
transform=ax.transAxes, fontsize=10, verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
# title and axis labels
ax.set_title(f'Feature Selection Results - {method_name}', fontsize=14, fontweight='bold')
ax.set_xlabel(x_label, fontsize=12)
ax.set_ylabel('Intensity', fontsize=12)
# grid and legend
ax.grid(True, alpha=0.3)
ax.legend(loc='upper right', fontsize=10)
# tidy the layout
plt.tight_layout()
# save the figure
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches='tight')
print(f"Visualization saved to: {save_path}")
return fig
@dataclass
class FeatureSelectionConfig:
"""特征选择配置类"""
# CSV input
csv_file_path: Optional[str] = None
label_column: Optional[str] = None
spectral_columns: Optional[List[str]] = None
# feature-selection method
method: str = "None"
method_params: dict = field(default_factory=dict)
# output
output_csv: bool = False
output_dir: str = ""
output_filename: str = "selected_features"
# visualisation
save_plots: bool = True
plot_name_prefix: str = ""
plot_dir: Optional[str] = None  # plot directory; falls back to output_dir when None
def __post_init__(self):
"""参数校验和默认值设置"""
if self.csv_file_path and not os.path.exists(self.csv_file_path):
raise FileNotFoundError(f"CSV文件不存在: {self.csv_file_path}")
if self.csv_file_path and not self.label_column:
raise ValueError("指定CSV文件时必须提供标签列名(label_column)")
if self.csv_file_path and not self.spectral_columns:
raise ValueError("指定CSV文件时必须提供光谱列名列表(spectral_columns)")
# 设置默认的方法参数
self._set_default_method_params()
def _set_default_method_params(self):
"""根据方法设置默认参数"""
if self.method == "Cars":
self.method_params.setdefault('N', 50)
self.method_params.setdefault('f', 20)
self.method_params.setdefault('cv', 10)
elif self.method == "Uve":
self.method_params.setdefault('ncomp', 20)
self.method_params.setdefault('cv', 5)
elif self.method == "Spa":
self.method_params.setdefault('m_min', 2)
self.method_params.setdefault('m_max', 50)
self.method_params.setdefault('autoscaling', 1)
elif self.method == "GA":
self.method_params.setdefault('population_size', 10)
elif self.method == "ReliefF":
self.method_params.setdefault('n_neighbors', 20)
self.method_params.setdefault('n_features_to_keep', 20)
elif self.method == "RandomFrog":
self.method_params.setdefault('n_frogs', 50)
self.method_params.setdefault('n_memeplexes', 5)
self.method_params.setdefault('n_evolution_steps', 10)
self.method_params.setdefault('n_shuffle_iterations', 10)
self.method_params.setdefault('cv', 5)
elif self.method == "SiPLS":
self.method_params.setdefault('n_intervals_list', [10, 15, 20])
self.method_params.setdefault('n_combinations_list', [2, 3, 4])
self.method_params.setdefault('max_components', 15)
self.method_params.setdefault('cv_folds', 5)
class SpectrumFeatureSelector:
"""光谱特征选择器"""
def __init__(self, config: FeatureSelectionConfig):
self.config = config
def load_csv_data(self) -> Tuple[pd.DataFrame, np.ndarray]:
"""从CSV文件加载数据"""
if not self.config.csv_file_path:
raise ValueError("未指定CSV文件路径")
df = pd.read_csv(self.config.csv_file_path)
# 验证列是否存在
if self.config.label_column not in df.columns:
raise ValueError(f"标签列 '{self.config.label_column}' 不存在于CSV文件中")
missing_cols = [col for col in self.config.spectral_columns if col not in df.columns]
if missing_cols:
raise ValueError(f"以下光谱列不存在于CSV文件中: {missing_cols}")
# 提取特征和标签
X = df[self.config.spectral_columns]
y = df[self.config.label_column].values
return X, y
def save_selected_features_csv(self, X_selected: pd.DataFrame, y: np.ndarray,
selected_columns: Union[List[str], np.ndarray]):
"""保存选定的特征到CSV文件"""
if not self.config.output_csv:
return
os.makedirs(self.config.output_dir, exist_ok=True)
# 创建结果DataFrame
if isinstance(selected_columns, np.ndarray):
selected_col_names = [f"feature_{i}" for i in selected_columns]
else:
selected_col_names = selected_columns
result_df = pd.DataFrame(X_selected.values, columns=selected_col_names)
result_df[self.config.label_column] = y
output_path = os.path.join(self.config.output_dir,
f"{self.config.output_filename}.csv")
result_df.to_csv(output_path, index=False)
print(f"Selected features saved to: {output_path}")
def plot_feature_selection(self, X: pd.DataFrame,
selected_indices: Union[List[int], np.ndarray]) -> Optional[plt.Figure]:
"""绘制特征选择结果可视化"""
if not self.config.save_plots:
return None
# 确定保存目录
plot_dir = self.config.plot_dir if self.config.plot_dir else self.config.output_dir
if not plot_dir:
return None
os.makedirs(plot_dir, exist_ok=True)
# 生成文件名
filename = f"{self.config.plot_name_prefix}_{self.config.method}_feature_selection.png"
save_path = os.path.join(plot_dir, filename)
# 绘制可视化图
fig = plot_feature_selection_results(
X=X,
selected_indices=selected_indices,
method_name=self.config.method,
save_path=save_path
)
return fig
def _convert_to_indices(self, X: pd.DataFrame, selected_columns) -> List[int]:
"""
Convert selected_columns into index positions of the original DataFrame X.
Args:
X: the original DataFrame
selected_columns: the selection; an index array, a list of column names, etc.
Returns:
List of indices
"""
try:
# unwrap pandas Index/Series objects
if hasattr(selected_columns, 'tolist'): # pandas Index or Series
selected_columns = selected_columns.tolist()
if isinstance(selected_columns, np.ndarray):
# a numpy array already holds positional indices
return selected_columns.tolist()
elif isinstance(selected_columns, list) and len(selected_columns) > 0:
if isinstance(selected_columns[0], str):
# a list of column names: map each to its position
indices = []
for col in selected_columns:
try:
# exact match first
idx = X.columns.get_loc(col)
indices.append(idx)
except KeyError:
# on a miss, try an approximate numeric match (handles float precision)
try:
target_value = float(col)
# 找到最接近的列名
best_match = None
best_diff = float('inf')
best_idx = None
for i, col_name in enumerate(X.columns):
try:
col_value = float(col_name)
diff = abs(col_value - target_value)
if diff < best_diff:
best_diff = diff
best_match = col_name
best_idx = i
except (ValueError, TypeError):
continue
if best_match is not None and best_diff < 1.0:  # tolerate differences below 1.0
print(f"Approximate match: '{col}' -> '{best_match}' (diff: {best_diff:.3f})")
indices.append(best_idx)
else:
print(f"Warning: No suitable match found for column '{col}' in DataFrame columns")
continue
except (ValueError, TypeError):
print(f"Warning: Cannot parse column name '{col}' as numeric")
continue
return indices
else:
# a list of numbers: use them directly as indices
return [int(idx) for idx in selected_columns]
else:
return []
except Exception as e:
print(f"Error converting selected_columns to indices: {e}")
return []
def select_features(self, X: Optional[pd.DataFrame] = None, y: Optional[np.ndarray] = None,
column_names: Optional[List[str]] = None) -> Tuple[pd.DataFrame, np.ndarray, Union[List[str], np.ndarray]]:
"""
Run the feature selection.
Args:
X: feature data; loaded from the CSV file when None
y: labels; loaded from the CSV file when None
column_names: column names, for numpy-array input
Returns:
X_selected: the selected feature data
y: the labels
selected_columns: the selected column names or indices
"""
# load from the CSV file when no data was passed in
if X is None or y is None:
X, y = self.load_csv_data()
# make sure X is a DataFrame
if isinstance(X, np.ndarray):
if column_names is not None:
X = pd.DataFrame(X, columns=column_names)
else:
X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
# run the feature selection (method_params was previously not forwarded,
# so configured parameters were silently ignored; pass it through)
X_selected, y_selected, selected_columns = SpctrumFeatureSelcet(
method=self.config.method,
X=X,
y=y,
name=self.config.plot_name_prefix,
result_dir=self.config.output_dir if self.config.save_plots else '',
column_names=None,  # X is already a DataFrame
method_params=self.config.method_params
)
# save the result to CSV (when configured)
self.save_selected_features_csv(X_selected, y_selected, selected_columns)
# draw the visualisation (when configured)
if self.config.save_plots:
# map selected_columns back to index positions in the original X;
# selected_columns refers to X_selected, so locate those columns in X
selected_indices = self._convert_to_indices(X, selected_columns)
if len(selected_indices) > 0:
self.plot_feature_selection(X, selected_indices)
else:
print(f"Warning: No valid indices found for plotting. selected_columns: {selected_columns}")
print(f"Available columns in X: {list(X.columns[:5])}...") # 显示前5个列名用于调试
return X_selected, y_selected, selected_columns
def SpctrumFeatureSelcet(method, X, y, name='', result_dir='', column_names=None, method_params=None):
"""
Core feature-selection dispatcher (original business logic unchanged).
:param method: wavelength-selection / dimensionality-reduction method; one of Cars, Lars, Uve, Spa, GA, ReliefF, RandomFrog, SiPLS.
:param X: spectra, a pandas DataFrame or numpy array (n_samples, n_features).
:param y: labels for the spectra (n_samples,).
:param name: file name for the result figure.
:param result_dir: directory where results are saved.
:param column_names: column names, required when X is a numpy array.
:param method_params: dict of method-specific parameters.
:return:
- X_Feature: data after selection / reduction (n_samples, n_features).
- y: the matching labels.
- selected_columns: the selected column names or indices.
"""
if method_params is None:
method_params = {}
global X_Feature
# convert the input to a DataFrame when necessary
if isinstance(X, np.ndarray):
if column_names is None:
column_names = [f"{i}" for i in range(X.shape[1])]  # default column names
X_df = pd.DataFrame(X, columns=column_names)
else:
X_df = X
# dispatch on the chosen method
if method == "None":
X_Feature = X_df
selected_columns = X_df.columns
elif method == "Cars":
save_path = os.path.join(result_dir, f"{name}_cars.png") if result_dir else None
# call CARS_Cloud with the configured parameters
N = method_params.get('N', 50)
f = method_params.get('f', 20)
cv = method_params.get('cv', 10)
Featuresecletidx = CARS_Cloud(X_df.values, y, N=N, f=f, cv=cv,
save_fig=bool(save_path), save_path=save_path)
Featuresecletidx = Featuresecletidx.astype(int)
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = Featuresecletidx
elif method == "Lars":
Featuresecletidx = Lar(X_df.values, y)
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
elif method == "Uve":
ncomp = method_params.get('ncomp', 20)
cv = method_params.get('cv', 5)
uve = UVE(X_df.values, y, ncomp)
uve.calcCriteria()
uve.evalCriteria(cv=cv)
Featuresecletidx = uve.cutFeature()  # indices of the selected features
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
elif method == "Spa":
save_path = os.path.join(result_dir, f"{name}_spa.png") if result_dir else None
Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.3)
m_min = method_params.get('m_min', 2)
m_max = method_params.get('m_max', 50)
autoscaling = method_params.get('autoscaling', 1)
Featuresecletidx, var_sel_phase2 = SPA().spa(
Xcal, ycal, m_min=m_min, m_max=m_max, Xval=Xval, yval=yval,
autoscaling=autoscaling, save_path=save_path)
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
elif method == "GA":
population_size = method_params.get('population_size', 10)
Featuresecletidx = GA(X_df.values, y, population_size=population_size)  # keyword avoids passing it as n_generations
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
elif method == "ReliefF":
n_neighbors = method_params.get('n_neighbors', 20)
n_features_to_keep = method_params.get('n_features_to_keep', 20)
relieff = ReliefF(n_neighbors=n_neighbors, n_features_to_keep=n_features_to_keep)
Featuresecletidx = relieff.fit(X_df.values, y)
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
elif method == "RandomFrog":
n_frogs = method_params.get('n_frogs', 50)
n_memeplexes = method_params.get('n_memeplexes', 5)
n_evolution_steps = method_params.get('n_evolution_steps', 10)
n_shuffle_iterations = method_params.get('n_shuffle_iterations', 10)
cv = method_params.get('cv', 5)
Featuresecletidx = shuffled_frog_leaping_selection(
X_df.values, y,
n_frogs=n_frogs,
n_memeplexes=n_memeplexes,
n_evolution_steps=n_evolution_steps,
n_shuffle_iterations=n_shuffle_iterations,
cv=cv
)
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
elif method == "SiPLS":
n_intervals_list = method_params.get('n_intervals_list', [10, 15, 20])
n_combinations_list = method_params.get('n_combinations_list', [2, 3, 4])
max_components = method_params.get('max_components', 15)
cv_folds = method_params.get('cv_folds', 5)
result = sipls_feature_selection(
X_df.values, y,
n_intervals_list=n_intervals_list,
n_combinations_list=n_combinations_list,
max_components=max_components,
cv_folds=cv_folds
)
if result and 'selected_wavelengths' in result:
Featuresecletidx = result['selected_wavelengths']
X_Feature = X_df.iloc[:, Featuresecletidx]
selected_columns = X_df.columns[Featuresecletidx]
else:
raise ValueError("SiPLS算法未能找到有效的特征选择结果")
else:
raise ValueError(f"不支持的特征选择方法: {method}。支持的方法包括: None, Cars, Lars, Uve, Spa, GA, ReliefF, RandomFrog, SiPLS")
return X_Feature, y, selected_columns  # selected data, labels and column names
# convenience wrappers for backwards compatibility and simpler use
def select_features_from_csv(config: FeatureSelectionConfig) -> Tuple[pd.DataFrame, np.ndarray, Union[List[str], np.ndarray]]:
"""
Main entry point for feature selection from a CSV file.
Args:
config: feature-selection configuration object
Returns:
X_selected: the selected feature data
y: the labels
selected_columns: the selected column names or indices
"""
selector = SpectrumFeatureSelector(config)
return selector.select_features()
def select_features_from_data(X: pd.DataFrame, y: np.ndarray, method: str,
method_params: Optional[dict] = None,
name: str = '', result_dir: str = '',
column_names: Optional[List[str]] = None) -> Tuple[pd.DataFrame, np.ndarray, Union[List[str], np.ndarray]]:
"""
Convenience function for feature selection on in-memory data.
Args:
X: feature data
y: labels
method: feature-selection method
method_params: method parameters
name: output file-name prefix
result_dir: output directory
column_names: column names
Returns:
X_selected: the selected feature data
y: the labels
selected_columns: the selected column names or indices
"""
config = FeatureSelectionConfig(
method=method,
method_params=method_params or {},
output_csv=False,  # no CSV output for in-memory input
save_plots=bool(result_dir),
plot_name_prefix=name
)
selector = SpectrumFeatureSelector(config)
return selector.select_features(X=X, y=y, column_names=column_names)
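A hedged end-to-end sketch of the in-memory entry point (synthetic data; the ReliefF parameters are illustrative assumptions):

# Hypothetical usage sketch for select_features_from_data.
import pandas as pd
from sklearn.datasets import make_classification
from feture_select import select_features_from_data  # module name as committed

X, y = make_classification(n_samples=80, n_features=40, n_informative=8, random_state=0)
X_sel, y_out, cols = select_features_from_data(
    pd.DataFrame(X), y, method="ReliefF",
    method_params={"n_neighbors": 10, "n_features_to_keep": 15})
print(X_sel.shape, list(cols))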


@ -0,0 +1,292 @@
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
import copy
class ShuffledFrogLeaping:
"""
Shuffled Frog Leaping Algorithm (SFLA) for feature selection.
Outline:
1. Split the frog population into several memeplexes.
2. Run a local search / evolution inside each memeplex.
3. Periodically shuffle all frogs for global information exchange.
4. Repeat until the stopping condition is met.
"""
def __init__(self, n_frogs=50, n_memeplexes=5, n_evolution_steps=10,
n_shuffle_iterations=10, classifier=None, cv=5):
"""
Initialise the SFLA parameters.
Parameters:
n_frogs: population size
n_memeplexes: number of memeplexes
n_evolution_steps: evolution steps per memeplex
n_shuffle_iterations: number of shuffle iterations
classifier: classifier used to score feature subsets
cv: number of cross-validation folds
"""
self.n_frogs = n_frogs
self.n_memeplexes = n_memeplexes
self.n_evolution_steps = n_evolution_steps
self.n_shuffle_iterations = n_shuffle_iterations
self.classifier = classifier or RandomForestClassifier(random_state=42, n_estimators=50)
self.cv = cv
# internal state
self.n_features = None
self.frogs = None  # population; each frog is a binary vector
self.fitness_values = None
self.best_frog = None
self.best_fitness = -np.inf
self.selected_features = None
def _initialize_population(self):
"""初始化青蛙种群"""
self.frogs = []
for _ in range(self.n_frogs):
# 随机初始化二进制向量1表示选择该特征0表示不选择
frog = np.random.randint(0, 2, self.n_features)
self.frogs.append(frog)
self.frogs = np.array(self.frogs)
def _evaluate_fitness(self, X, y):
"""评估所有青蛙的适应度"""
self.fitness_values = []
for frog in self.frogs:
fitness = self._calculate_fitness(frog, X, y)
self.fitness_values.append(fitness)
# 更新全局最优
if fitness > self.best_fitness:
self.best_fitness = fitness
self.best_frog = frog.copy()
self.fitness_values = np.array(self.fitness_values)
def _calculate_fitness(self, frog, X, y):
"""计算单个青蛙的适应度"""
selected_features = np.where(frog == 1)[0]
# 如果没有选择任何特征,返回最低适应度
if len(selected_features) == 0:
return 0.0
# 使用选择的特征进行交叉验证
X_selected = X[:, selected_features]
try:
scores = cross_val_score(clone(self.classifier), X_selected, y, cv=self.cv)
return np.mean(scores)
except:
# 如果交叉验证失败,返回低适应度
return 0.0
def _divide_into_memeplexes(self):
"""将青蛙按适应度排序并分成小组"""
# 按适应度降序排序
sorted_indices = np.argsort(self.fitness_values)[::-1]
self.frogs = self.frogs[sorted_indices]
self.fitness_values = self.fitness_values[sorted_indices]
# 分成小组
memeplexes = []
frogs_per_memeplex = self.n_frogs // self.n_memeplexes
for i in range(self.n_memeplexes):
start_idx = i * frogs_per_memeplex
if i == self.n_memeplexes - 1:
# the last memeplex takes all remaining frogs
end_idx = self.n_frogs
else:
end_idx = (i + 1) * frogs_per_memeplex
memeplex = {
'frogs': self.frogs[start_idx:end_idx].copy(),
'fitness': self.fitness_values[start_idx:end_idx].copy()
}
memeplexes.append(memeplex)
return memeplexes
def _evolve_memeplex(self, memeplex, X, y):
"""进化单个小组"""
frogs = memeplex['frogs']
fitness = memeplex['fitness']
# 找出小组中的最好和最坏青蛙
best_idx = np.argmax(fitness)
worst_idx = np.argmin(fitness)
best_frog = frogs[best_idx]
worst_frog = frogs[worst_idx]
# 对最坏的青蛙进行进化
for step in range(self.n_evolution_steps):
# 生成新的青蛙: worst_frog + rand() * (best_frog - worst_frog)
rand = np.random.random(self.n_features)
new_frog = worst_frog + rand * (best_frog - worst_frog)
# 二进制化大于0.5的为1否则为0
new_frog = (new_frog > 0.5).astype(int)
# 确保至少选择一个特征
if np.sum(new_frog) == 0:
new_frog[np.random.randint(self.n_features)] = 1
# score the candidate
new_fitness = self._calculate_fitness(new_frog, X, y)
# replace the worst frog when the candidate improves on it
if new_fitness > fitness[worst_idx]:
frogs[worst_idx] = new_frog
fitness[worst_idx] = new_fitness
# update the memeplex best if needed
if new_fitness > fitness[best_idx]:
best_idx = worst_idx
best_frog = new_frog
# re-locate the worst frog
worst_idx = np.argmin(fitness)
worst_frog = frogs[worst_idx]
else:
# no improvement: try a completely random frog instead
new_frog = np.random.randint(0, 2, self.n_features)
if np.sum(new_frog) == 0:
new_frog[np.random.randint(self.n_features)] = 1
new_fitness = self._calculate_fitness(new_frog, X, y)
if new_fitness > fitness[worst_idx]:
frogs[worst_idx] = new_frog
fitness[worst_idx] = new_fitness
return frogs, fitness
def fit(self, X, y):
"""
Run SFLA feature selection.
Parameters:
X: feature matrix (n_samples, n_features)
y: label vector (n_samples,)
Returns:
selected_features: list of selected feature indices
"""
self.n_features = X.shape[1]
# initialise the population
self._initialize_population()
# initial scoring
self._evaluate_fitness(X, y)
# main loop
for iteration in range(self.n_shuffle_iterations):
# split the frogs into memeplexes
memeplexes = self._divide_into_memeplexes()
# evolve each memeplex
evolved_frogs = []
evolved_fitness = []
for memeplex in memeplexes:
evolved_frog, evolved_fit = self._evolve_memeplex(memeplex, X, y)
evolved_frogs.extend(evolved_frog)
evolved_fitness.extend(evolved_fit)
# rebuild the population from the evolved memeplexes
self.frogs = np.array(evolved_frogs)
self.fitness_values = np.array(evolved_fitness)
# re-score everything (keeps the global best consistent)
self._evaluate_fitness(X, y)
# return the best solution found
self.selected_features = np.where(self.best_frog == 1)[0]
return self.selected_features.tolist()
def get_feature_importance(self):
"""获取特征选择结果的统计信息"""
if self.selected_features is None:
raise ValueError("请先运行 fit 方法")
n_selected = len(self.selected_features)
selection_ratio = n_selected / self.n_features
return {
'selected_features': self.selected_features,
'n_selected': n_selected,
'n_total': self.n_features,
'selection_ratio': selection_ratio,
'best_fitness': self.best_fitness
}
def shuffled_frog_leaping_selection(X, y, n_frogs=50, n_memeplexes=5,
n_evolution_steps=10, n_shuffle_iterations=10,
classifier=None, cv=5):
"""
Feature selection with the shuffled frog leaping algorithm.
Parameters:
X: feature matrix (n_samples, n_features)
y: label vector (n_samples,)
n_frogs: population size
n_memeplexes: number of memeplexes
n_evolution_steps: evolution steps per memeplex
n_shuffle_iterations: number of shuffle iterations
classifier: classifier used to score feature subsets
cv: number of cross-validation folds
Returns:
selected_features: list of selected feature indices
"""
sfla = ShuffledFrogLeaping(
n_frogs=n_frogs,
n_memeplexes=n_memeplexes,
n_evolution_steps=n_evolution_steps,
n_shuffle_iterations=n_shuffle_iterations,
classifier=classifier,
cv=cv
)
return sfla.fit(X, y)
# usage example
if __name__ == "__main__":
# generate demo data
from sklearn.datasets import make_classification
X, y = make_classification(
n_samples=200,
n_features=50,
n_informative=10,
n_redundant=10,
n_clusters_per_class=1,
random_state=42
)
print("原始特征数量:", X.shape[1])
# 使用随机蛙跳算法进行特征选择
selected_features = shuffled_frog_leaping_selection(
X, y,
n_frogs=30,
n_memeplexes=3,
n_evolution_steps=5,
n_shuffle_iterations=5
)
print("选择的特征数量:", len(selected_features))
print("选择的特征索引:", selected_features)
# 计算选择率
selection_ratio = len(selected_features) / X.shape[1]
print(".2f")


@ -0,0 +1,271 @@
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from itertools import combinations
import matplotlib.pyplot as plt
def synergy_interval_pls(X, y, n_intervals=20, n_combinations=2, max_components=15, cv_folds=5):
"""
Synergy Interval PLS (SiPLS) feature selection.
Parameters:
X: spectral matrix (n_samples, n_wavelengths)
y: concentration / property vector (n_samples,)
n_intervals: number of equal-width intervals to split the spectrum into
n_combinations: number of intervals per combination (typically 2-4)
max_components: maximum number of PLS components
cv_folds: number of cross-validation folds
Returns:
selected_wavelengths: wavelength indices of the best interval combination
best_rmsecv: RMSECV of the best combination
best_n_components: optimal number of PLS components
"""
n_samples, n_wavelengths = X.shape
# split the spectrum into equal-width intervals
interval_size = n_wavelengths // n_intervals
intervals = []
for i in range(n_intervals):
start_idx = i * interval_size
if i == n_intervals - 1:
# the last interval takes the remaining wavelengths
end_idx = n_wavelengths
else:
end_idx = (i + 1) * interval_size
intervals.append((start_idx, end_idx))
print(f"Split {n_wavelengths} wavelengths into {n_intervals} intervals:")
for i, (start, end) in enumerate(intervals):
print(f" interval {i+1}: wavelengths {start}-{end-1} (width: {end-start})")
# enumerate all interval combinations
interval_combinations = list(combinations(range(n_intervals), n_combinations))
print(f"\n{len(interval_combinations)} combinations of {n_combinations} intervals in total")
best_rmsecv = float('inf')
best_intervals = None
best_n_components = None
results = []
# evaluate every combination
for combo_idx, combo in enumerate(interval_combinations):
if (combo_idx + 1) % 50 == 0:
print(f"Processing combination {combo_idx + 1}/{len(interval_combinations)}")
# concatenate the spectra of the chosen intervals
selected_wavelengths = []
for interval_idx in combo:
start_idx, end_idx = intervals[interval_idx]
selected_wavelengths.extend(range(start_idx, end_idx))
X_selected = X[:, selected_wavelengths]
# cross-validate over the candidate numbers of components
kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
rmse_results = []
for n_comp in range(1, min(max_components + 1, X_selected.shape[1] + 1)):
rmse_scores = []
for train_idx, test_idx in kf.split(X_selected):
X_train, X_test = X_selected[train_idx], X_selected[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
pls = PLSRegression(n_components=n_comp)
pls.fit(X_train, y_train)
y_pred = pls.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_scores.append(rmse)
mean_rmse = np.mean(rmse_scores)
rmse_results.append(mean_rmse)
# best number of components and RMSE for this combination
min_rmse_idx = np.argmin(rmse_results)
min_rmse = rmse_results[min_rmse_idx]
best_comp = min_rmse_idx + 1
results.append({
'intervals': combo,
'rmsecv': min_rmse,
'n_components': best_comp,
'wavelengths': selected_wavelengths
})
# update the global best
if min_rmse < best_rmsecv:
best_rmsecv = min_rmse
best_intervals = combo
best_n_components = best_comp
# rebuild the wavelength indices of the best interval combination
# (the original summary printed the wavelength count of the last
# combination evaluated, not of the best one)
selected_wavelengths = []
for interval_idx in best_intervals:
start_idx, end_idx = intervals[interval_idx]
selected_wavelengths.extend(range(start_idx, end_idx))
print("Best result:")
print(f" interval combination: {best_intervals}")
print(f" RMSECV: {best_rmsecv:.6f}")
print(f" number of components: {best_n_components}")
print(f" selected wavelengths: {len(selected_wavelengths)}")
return selected_wavelengths, best_rmsecv, best_n_components
def sipls_feature_selection(X, y, n_intervals_list=[10, 15, 20], n_combinations_list=[2, 3, 4],
max_components=15, cv_folds=5):
"""
Higher-level SiPLS selection that tries several parameter combinations.
Parameters:
X: spectral matrix (n_samples, n_wavelengths)
y: concentration / property vector (n_samples,)
n_intervals_list: interval counts to try
n_combinations_list: combination sizes to try
max_components: maximum number of PLS components
cv_folds: number of cross-validation folds
Returns:
best_result: dict describing the best result found
"""
best_overall_rmsecv = float('inf')
best_overall_result = None
print("=== SiPLS 特征选择 ===")
print(f"数据形状: {X.shape}")
print(f"尝试的参数组合: {len(n_intervals_list)} × {len(n_combinations_list)} = {len(n_intervals_list) * len(n_combinations_list)}")
for n_intervals in n_intervals_list:
for n_combinations in n_combinations_list:
print(f"\n--- 测试参数: 区间数={n_intervals}, 组合数={n_combinations} ---")
try:
selected_wavelengths, rmsecv, n_components = synergy_interval_pls(
X, y,
n_intervals=n_intervals,
n_combinations=n_combinations,
max_components=max_components,
cv_folds=cv_folds
)
if rmsecv < best_overall_rmsecv:
best_overall_rmsecv = rmsecv
best_overall_result = {
'selected_wavelengths': selected_wavelengths,
'rmsecv': rmsecv,
'n_components': n_components,
'n_intervals': n_intervals,
'n_combinations': n_combinations,
'selection_ratio': len(selected_wavelengths) / X.shape[1]
}
except Exception as e:
print(f"参数组合 (区间数={n_intervals}, 组合数={n_combinations}) 处理失败: {str(e)}")
continue
if best_overall_result:
print("=== Final best result ===")
print(f"Intervals: {best_overall_result['n_intervals']}")
print(f"Combination size: {best_overall_result['n_combinations']}")
print(f"RMSECV: {best_overall_result['rmsecv']:.6f}")
print(f"Components: {best_overall_result['n_components']}")
print(f"Selected wavelengths: {len(best_overall_result['selected_wavelengths'])}")
print(f"Selection ratio: {best_overall_result['selection_ratio']:.3f}")
return best_overall_result
def plot_sipls_results(X, selected_wavelengths, title="SiPLS Selected Wavelengths"):
"""
Plot the SiPLS selection result.
Parameters:
X: original spectral matrix
selected_wavelengths: selected wavelength indices
title: plot title
"""
n_wavelengths = X.shape[1]
wavelength_indices = np.arange(n_wavelengths)
# selection mask
selection_mask = np.zeros(n_wavelengths, dtype=bool)
selection_mask[selected_wavelengths] = True
plt.figure(figsize=(12, 6))
# mean spectrum
mean_spectrum = np.mean(X, axis=0)
plt.plot(wavelength_indices, mean_spectrum, 'b-', alpha=0.7, label='Mean Spectrum')
# highlight the selected wavelengths
plt.scatter(wavelength_indices[selection_mask], mean_spectrum[selection_mask],
color='red', s=50, alpha=0.8, label='Selected Wavelengths')
plt.xlabel('Wavelength Index')
plt.ylabel('Intensity')
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
return plt.gcf()
# usage example
if __name__ == "__main__":
# generate synthetic spectra
np.random.seed(42)
n_samples = 100
n_wavelengths = 1000
# simulated spectra built from Gaussian peaks
wavelengths = np.linspace(400, 2500, n_wavelengths)
X = np.zeros((n_samples, n_wavelengths))
# add a few characteristic peaks
peak_positions = [500, 800, 1200, 1800, 2200]  # nm
peak_indices = [np.argmin(np.abs(wavelengths - pos)) for pos in peak_positions]
for i in range(n_samples):
for peak_idx in peak_indices:
# add a Gaussian peak
gaussian = np.exp(-0.5 * ((np.arange(n_wavelengths) - peak_idx) / 50)**2)
X[i] += gaussian * np.random.uniform(0.5, 1.5)
# add noise
X[i] += np.random.normal(0, 0.1, n_wavelengths)
# simulated concentrations correlated with some of the peaks
y = (X[:, peak_indices[0]] + X[:, peak_indices[2]] + X[:, peak_indices[4]]) / 3
y += np.random.normal(0, 0.05, n_samples)  # add noise
print("Synthetic data generated")
print(f"Data shape: {X.shape}")
print(f"y range: {y.min():.3f} to {y.max():.3f}")  # the bare print(".3f") was a mangled f-string; this is a plausible reconstruction
# run SiPLS feature selection
result = sipls_feature_selection(
X, y,
n_intervals_list=[10, 15],
n_combinations_list=[2, 3],
max_components=10,
cv_folds=5
)
if result:
print(f"\n选择的波长索引: {result['selected_wavelengths'][:10]}...") # 只显示前10个
# 绘制结果
fig = plot_sipls_results(X, result['selected_wavelengths'])
plt.show()