import numpy as np
from sklearn.neighbors import NearestNeighbors


class ReliefF:
    """Simplified ReliefF feature scorer based on k nearest neighbours.

    For every sample, its nearest neighbours (regardless of class) are split
    into "hits" (same label) and "misses" (different label). A feature's
    score is decreased by its squared difference to each hit and increased
    by its squared difference to each miss, so features that separate the
    classes locally end up with high scores.
    """

    def __init__(self, n_neighbors=20, n_features_to_keep=20):
        """Store the algorithm parameters.

        :param n_neighbors: number of nearest neighbours examined per sample.
        :param n_features_to_keep: number of top-scoring features to select.
        """
        self.n_neighbors = n_neighbors
        self.n_features_to_keep = n_features_to_keep
        self.feature_scores = None  # per-feature scores, set by fit()
        self.top_features = None  # indices of the best features, set by fit()

    def fit(self, X, y):
        """Score every feature of ``X`` against the labels ``y``.

        :param X: 2-D feature matrix of shape (n_samples, n_features).
        :param y: class labels, one per sample.
        :return: indices of the selected features, ordered by ascending
            score (the best feature is last).
        :raises ValueError: if ``X`` and ``y`` disagree on the number of
            samples, fewer than two samples are given, or ``n_neighbors``
            is smaller than 1.
        """
        X = np.asarray(X, dtype=float)
        # Positional labels: avoids label-based lookups on e.g. a pandas
        # Series whose index is not 0..m-1.
        y = np.asarray(y).reshape(-1)

        m, n = X.shape  # m samples, n features
        if y.shape[0] != m:
            raise ValueError(
                f"X has {m} samples but y has {y.shape[0]} labels"
            )
        if m < 2:
            raise ValueError("ReliefF requires at least 2 samples")
        if self.n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")

        # A sample has at most m - 1 neighbours other than itself; asking
        # scikit-learn for more would raise.
        k = min(int(self.n_neighbors), m - 1)

        nbrs = NearestNeighbors(n_neighbors=k + 1).fit(X)
        _, indices = nbrs.kneighbors(X)
        # Column 0 is the query sample itself (distance 0). If an exact
        # duplicate takes that slot instead, the sample appears later as its
        # own neighbour and contributes a zero difference, so the scores are
        # unaffected either way.
        neighbor_idx = indices[:, 1:]

        # -1 for hits (same class), +1 for misses (different class).
        signs = np.where(y[neighbor_idx] == y[:, None], -1.0, 1.0)

        scores = np.zeros(n)
        for i in range(m):
            # (k, n) squared per-feature differences to the k neighbours.
            sq_diff = (X[neighbor_idx[i]] - X[i]) ** 2
            scores += signs[i] @ sq_diff
        scores /= k * m
        self.feature_scores = scores

        # Explicit start index: a plain ``[-keep:]`` would return *all*
        # features when keep == 0.
        keep = max(0, min(int(self.n_features_to_keep), n))
        self.top_features = np.argsort(scores)[n - keep:]
        return self.top_features

    def fit_transform(self, X, y):
        """Fit on ``X``/``y`` and return the selected feature indices.

        Note that, despite the name, this returns feature *indices* (the
        result of :meth:`fit`), not a column-reduced copy of ``X``.
        """
        return self.fit(X, y)


def multi_scale_relieff_stratified(X, y, segment_size=100, n_subsegments=20,
                                   n_features_per_subsegment=5,
                                   n_neighbors=10):
    """Stratified multi-scale feature selection covering every segment.

    The feature axis is cut into consecutive segments of ``segment_size``
    columns; each segment is cut into sub-blocks of
    ``segment_size // n_subsegments`` columns (at least 1). ReliefF is run
    independently inside every sub-block so that each region of the feature
    axis contributes selected features.

    :param X: 2-D feature matrix of shape (n_samples, n_features).
    :param y: class labels, one per sample.
    :param segment_size: number of columns per segment.
    :param n_subsegments: number of sub-blocks each full segment is split into.
    :param n_features_per_subsegment: features selected in each sub-block.
    :param n_neighbors: neighbours used by ReliefF inside each sub-block.
    :return: sorted, de-duplicated array of selected global feature indices.
    :raises ValueError: if ``segment_size`` or ``n_subsegments`` is below 1.
    """
    if segment_size < 1:
        raise ValueError("segment_size must be at least 1")
    if n_subsegments < 1:
        raise ValueError("n_subsegments must be at least 1")

    X = np.asarray(X)
    n_total = X.shape[1]
    # Never let the step drop to 0 (segment_size < n_subsegments).
    subsegment_size = max(1, segment_size // n_subsegments)

    selected_features = []
    for seg_start in range(0, n_total, segment_size):
        # The last segment may be shorter than segment_size; clamp so that
        # no empty sub-block is ever handed to ReliefF.
        seg_end = min(seg_start + segment_size, n_total)
        for sub_start in range(seg_start, seg_end, subsegment_size):
            sub_end = min(sub_start + subsegment_size, seg_end)
            relief = ReliefF(n_neighbors=n_neighbors,
                             n_features_to_keep=n_features_per_subsegment)
            local_idx = relief.fit_transform(X[:, sub_start:sub_end], y)
            # Shift sub-block-local indices to global column indices.
            selected_features.extend(np.asarray(local_idx) + sub_start)

    # Fixed integer dtype so an empty selection is still an index array.
    return np.unique(np.asarray(selected_features, dtype=np.intp))