初始提交
This commit is contained in:
176
classification_model/WaveSelect/Cars.py
Normal file
176
classification_model/WaveSelect/Cars.py
Normal file
@ -0,0 +1,176 @@
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import copy
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
|
||||
def PC_Cross_Validation(X, y, pc, cv):
    """Cross-validate PLS models with 1..pc latent components.

    Parameters
    ----------
    X : pandas.DataFrame
        Spectral matrix, shape (n_samples, n_features).
    y : pandas.Series
        Reference (chemical) values.
    pc : int
        Maximum number of latent components to evaluate.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    RMSECV : list of float
        Mean fold RMSE for each component count (index 0 -> 1 component).
    rindex : int
        Position of the smallest RMSECV (optimal component count - 1).
    """
    folds = KFold(n_splits=cv)
    rmsecv = []
    # Evaluate every candidate component count with the same fold split.
    for n_comp in range(1, pc + 1):
        fold_errors = []
        for idx_train, idx_test in folds.split(X):
            model = PLSRegression(n_components=n_comp)
            model.fit(X.iloc[idx_train], y.iloc[idx_train])
            pred = model.predict(X.iloc[idx_test])
            fold_errors.append(np.sqrt(mean_squared_error(y.iloc[idx_test], pred)))
        rmsecv.append(np.mean(fold_errors))
    return rmsecv, np.argmin(rmsecv)
|
||||
|
||||
|
||||
def Cross_Validation(X, y, pc, cv):
    """Return the cross-validated RMSE of a PLS model with ``pc`` components.

    Parameters
    ----------
    X : pandas.DataFrame
        Spectral matrix, shape (n_samples, n_features).
    y : pandas.Series
        Reference (chemical) values.
    pc : int
        Number of latent components for the PLS model.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean RMSE across the folds.
    """
    errors = []
    for idx_train, idx_test in KFold(n_splits=cv).split(X):
        model = PLSRegression(n_components=pc)
        model.fit(X.iloc[idx_train], y.iloc[idx_train])
        pred = model.predict(X.iloc[idx_test])
        errors.append(np.sqrt(mean_squared_error(y.iloc[idx_test], pred)))
    return np.mean(errors)
|
||||
|
||||
|
||||
def CARS_Cloud(X, y, N=50, f=20, cv=10, save_fig=False, save_path=None):
    """Competitive Adaptive Reweighted Sampling (CARS) wavelength selection.

    Parameters
    ----------
    X : ndarray or DataFrame, shape (n_samples, n_wavelengths)
        Spectral matrix.
    y : ndarray or Series, shape (n_samples,)
        Reference (chemical) values.
    N : int
        Number of Monte-Carlo sampling iterations.
    f : int
        Maximum number of PLS latent components.
    cv : int
        Number of cross-validation folds.
    save_fig : bool
        Whether to save the diagnostic figure.
    save_path : str or None
        File path used when ``save_fig`` is True.

    Returns
    -------
    OptWave : ndarray
        Indices of the selected wavelengths.
    """
    p = 0.8  # fraction of samples drawn into each calibration subset
    m, n = X.shape
    # Exponentially decaying retention ratio: r_1 = 1, r_N = 2 / n.
    u = np.power((n / 2), (1 / (N - 1)))
    k = (1 / (N - 1)) * np.log(n / 2)
    cal_num = np.round(m * p)
    b2 = np.arange(n)  # current wavelength ordering (indices into surviving columns)
    x = np.asarray(X)  # actually convert, so DataFrame input also works
    y = np.asarray(y)  # actually convert, so Series input also works
    # First row of D tracks the original column index of every surviving wavelength.
    D = np.vstack((np.array(b2).reshape(1, -1), x))
    WaveData = []
    WaveNum = []
    RMSECV = []
    r = []

    for i in range(1, N + 1):
        r.append(u * np.exp(-1 * k * i))
        wave_num = int(np.round(r[i - 1] * n))
        WaveNum = np.hstack((WaveNum, wave_num))
        cal_index = np.random.choice(np.arange(m), size=int(cal_num), replace=False)
        wave_index = b2[:wave_num].reshape(1, -1)[0]

        # Select the sampled rows and the surviving columns.
        xcal = x[np.ix_(cal_index, wave_index)]
        ycal = y[cal_index].ravel()  # flatten to 1-D for the PLS fit

        x = x[:, wave_index]  # shrink the working spectra
        D = D[:, wave_index]  # shrink the index-tracking matrix in lockstep
        d = D[0, :].reshape(1, -1)
        wnum = n - wave_num
        if wnum > 0:
            # Pad eliminated positions with -1 so every WaveData row has length n.
            d = np.hstack((d, np.full((1, wnum), -1)))
        if len(WaveData) == 0:
            WaveData = d
        else:
            WaveData = np.vstack((WaveData, d.reshape(1, -1)))

        # Cannot use more latent components than remaining wavelengths.
        if wave_num < f:
            f = wave_num

        pls = PLSRegression(n_components=f)
        pls.fit(xcal, ycal)
        # coef_ is (1, n_features) in new sklearn and (n_features, 1) in old
        # versions; ravel() flattens either layout to one weight per wavelength.
        # (The previous code also computed an unused `coeff` array here; besides
        # being dead code, it indexed the new coefficients with the previous,
        # longer ordering and raised IndexError from the 2nd iteration on.)
        b = np.abs(np.asarray(pls.coef_).ravel())

        # Reorder the surviving wavelengths by decreasing |coefficient|.
        b2 = np.argsort(-b, axis=0)
        rmsecv, rindex = PC_Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), f, cv)
        RMSECV.append(Cross_Validation(pd.DataFrame(xcal), pd.Series(ycal), rindex + 1, cv))

    # Rebuild, for every iteration, a length-n survival indicator per wavelength.
    WAVE = []
    for i in range(WaveData.shape[0]):
        wd = WaveData[i, :]
        WD = np.ones((len(wd)))
        for j in range(len(wd)):
            ind = np.where(wd == j)
            if len(ind[0]) == 0:
                WD[j] = 0  # wavelength j was eliminated by this iteration
            else:
                WD[j] = wd[ind[0]]
        if len(WAVE) == 0:
            WAVE = copy.deepcopy(WD)
        else:
            WAVE = np.vstack((WAVE, WD.reshape(1, -1)))

    MinIndex = np.argmin(RMSECV)  # iteration with the lowest RMSECV
    Optimal = WAVE[MinIndex, :]
    boindex = np.where(Optimal != 0)
    OptWave = boindex[0]

    plt.figure(figsize=(12, 10))
    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    fonts = 20

    plt.subplot(211)
    plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
    plt.ylabel('Number of Selected Wavelengths', fontsize=fonts)
    plt.title('Optimal Iteration: ' + str(MinIndex), fontsize=fonts)
    plt.plot(np.arange(N), WaveNum)

    plt.subplot(212)
    plt.xlabel('Monte Carlo Iterations', fontsize=fonts)
    plt.ylabel('RMSECV', fontsize=fonts)
    plt.plot(np.arange(N), RMSECV)

    if save_fig:
        plt.savefig(save_path)
        print(f"The figure has been saved as {save_path}")
        plt.close()  # release the figure so repeated calls do not leak memory

    return OptWave
|
||||
59
classification_model/WaveSelect/GA.py
Normal file
59
classification_model/WaveSelect/GA.py
Normal file
@ -0,0 +1,59 @@
|
||||
from deap import base, creator, tools, algorithms
|
||||
import numpy as np
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
|
||||
def GA(X, y, n_generations=20, population_size=50, crossover_prob=0.7, mutation_prob=0.2):
    """Select features with a genetic algorithm; return the chosen indices.

    Parameters
    ----------
    X : ndarray
        Feature matrix.
    y : ndarray
        Labels.
    n_generations : int
        Number of GA generations.
    population_size : int
        Individuals per generation.
    crossover_prob : float
        Crossover probability.
    mutation_prob : float
        Mutation probability.

    Returns
    -------
    list
        Indices of the selected features.
    """
    # DEAP boilerplate: maximising fitness over bit-string individuals.
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", lambda: np.random.randint(0, 2))
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def evaluate(chromosome):
        # Fitness = 5-fold CV accuracy of a random forest on the masked features.
        active = [idx for idx, bit in enumerate(chromosome) if bit == 1]
        if not active:
            return 0,  # an all-zero mask selects nothing and scores zero
        model = RandomForestClassifier(random_state=42)
        return cross_val_score(model, X[:, active], y, cv=5).mean(),

    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # Evolve the population.
    population = toolbox.population(n=population_size)
    result_population, _ = algorithms.eaSimple(population, toolbox, cxpb=crossover_prob,
                                               mutpb=mutation_prob, ngen=n_generations,
                                               verbose=False)

    # Decode the fittest individual back into feature indices.
    champion = tools.selBest(result_population, k=1)[0]
    return [idx for idx, bit in enumerate(champion) if bit == 1]
|
||||
41
classification_model/WaveSelect/Lar.py
Normal file
41
classification_model/WaveSelect/Lar.py
Normal file
@ -0,0 +1,41 @@
|
||||
"""
|
||||
-*- coding: utf-8 -*-
|
||||
@Time :2022/04/12 17:10
|
||||
@Author : Pengyou FU
|
||||
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
|
||||
@github : https://github.com/FuSiry/OpenSA
|
||||
@WeChat : Fu_siry
|
||||
@License:Apache-2.0 license
|
||||
"""
|
||||
|
||||
from sklearn import linear_model
|
||||
import numpy as np
|
||||
|
||||
def Lar(X, y, nums=40):
    """Select important wavelengths with LARS (Least Angle Regression).

    Parameters
    ----------
    X : np.ndarray
        Predictor matrix (input spectra).
    y : np.ndarray
        Target values.
    nums : int
        Number of wavelengths to keep, default 40.

    Returns
    -------
    np.ndarray
        Ascending indices of the selected wavelengths.
    """
    # Fit LARS and rank wavelengths by coefficient magnitude.
    lars = linear_model.Lars()
    lars.fit(X, y)
    importance = np.abs(lars.coef_)

    # Keep the `nums` largest coefficients. (The previous version also reversed
    # this slice with [::-1], which was redundant: the indices are re-sorted
    # ascending below, so the reversal had no effect on the result.)
    selected = np.argsort(importance)[-nums:]

    # Ascending order gives a stable wavelength ordering for callers.
    return np.sort(selected)
|
||||
49
classification_model/WaveSelect/MRMR.py
Normal file
49
classification_model/WaveSelect/MRMR.py
Normal file
@ -0,0 +1,49 @@
|
||||
import pymrmr
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class MRMRFeatureSelection:
    """Wrapper around pymrmr's mRMR (minimum Redundancy Maximum Relevance)."""

    def __init__(self, X, y):
        """Store the data for later selection.

        :param X: feature matrix (DataFrame), one feature per column.
        :param y: target variable (Series) aligned with X.
        """
        self.X = X
        self.y = y
        self.selected_features = None  # populated by select_features()

    def select_features(self, k=18, method='MIQ'):
        """Run mRMR feature selection.

        :param k: number of features to select.
        :param method: mRMR criterion, 'MIQ' or 'MRMR'.
        :return: list of selected features.
        """
        # pymrmr expects one frame with the target as the first column.
        merged = pd.concat([self.y, self.X], axis=1)
        self.selected_features = pymrmr.mRMR(merged, method, k)
        return self.selected_features

    def get_selected_features(self):
        """Return the features chosen by the last select_features() call."""
        return self.selected_features

    def get_selected_feature_names(self):
        """Return the selected feature column names, or None before selection."""
        if self.selected_features is None:
            return None
        return self.selected_features
|
||||
24
classification_model/WaveSelect/Pca.py
Normal file
24
classification_model/WaveSelect/Pca.py
Normal file
@ -0,0 +1,24 @@
|
||||
"""
|
||||
-*- coding: utf-8 -*-
|
||||
@Time :2022/04/12 17:10
|
||||
@Author : Pengyou FU
|
||||
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
|
||||
@github : https://github.com/FuSiry/OpenSA
|
||||
@WeChat : Fu_siry
|
||||
@License:Apache-2.0 license
|
||||
|
||||
"""
|
||||
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
def Pca(X, nums=20):
    """Project spectra onto their leading principal components.

    :param X: raw spectrum data, shape (n_samples, n_features)
    :param nums: number of principal components retained
    :return: spectral data after dimensionality reduction
    """
    model = PCA(n_components=nums)  # number of retained components
    model.fit(X)
    return model.transform(X)
|
||||
88
classification_model/WaveSelect/ReliefF.py
Normal file
88
classification_model/WaveSelect/ReliefF.py
Normal file
@ -0,0 +1,88 @@
|
||||
|
||||
import numpy as np
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
class ReliefF:
    """ReliefF feature scoring based on nearest-neighbour hits and misses."""

    def __init__(self, n_neighbors=20, n_features_to_keep=20):
        """Configure the algorithm.

        :param n_neighbors: number of neighbours examined per sample.
        :param n_features_to_keep: number of top-scoring features returned.
        """
        self.n_neighbors = n_neighbors
        self.n_features_to_keep = n_features_to_keep
        self.feature_scores = None  # per-feature relevance scores
        self.top_features = None    # indices of the highest-scoring features

    def fit(self, X, y):
        """Score every feature on (X, y) and return the selected indices.

        :param X: feature matrix, shape (n_samples, n_features).
        :param y: class labels.
        :return: indices of the n_features_to_keep best-scoring features.
        """
        n_samples, n_features = X.shape

        self.feature_scores = np.zeros(n_features)

        # Ask for k+1 neighbours because each sample is its own nearest one.
        knn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X)
        _, neighbor_idx = knn.kneighbors(X)

        for i in range(n_samples):
            label = y[i]

            # Split neighbours into same-class (hits) and other-class (misses),
            # skipping index 0, which is the sample itself.
            hits = []
            misses = []
            for j in neighbor_idx[i][1:]:
                (hits if y[j] == label else misses).append(X[j])

            # Penalise distance to hits, reward distance to misses.
            for f in range(n_features):
                for hit in hits:
                    self.feature_scores[f] -= (X[i, f] - hit[f]) ** 2 / (self.n_neighbors * n_samples)
                for miss in misses:
                    self.feature_scores[f] += (X[i, f] - miss[f]) ** 2 / (self.n_neighbors * n_samples)

        # Keep the indices of the n_features_to_keep highest scores.
        self.top_features = np.argsort(self.feature_scores)[-self.n_features_to_keep:]

        return self.top_features

    def fit_transform(self, X, y):
        """Fit and return the selected feature indices in one call."""
        return self.fit(X, y)
|
||||
|
||||
def multi_scale_relieff_stratified(X, y, segment_size=100, n_subsegments=20, n_features_per_subsegment=5):
    """Stratified multi-scale ReliefF selection covering every wavelength band.

    :param X: feature matrix, shape (n_samples, n_features).
    :param y: class labels.
    :param segment_size: width (in columns) of each wavelength band.
    :param n_subsegments: number of sub-regions per band.
    :param n_features_per_subsegment: features kept per sub-region.
    :return: sorted, de-duplicated global indices of the selected features.
    """
    selected_features = []

    # At least one column per sub-region. Guards against a zero range step
    # (ValueError in the previous version) when segment_size < n_subsegments.
    subsegment_size = max(1, segment_size // n_subsegments)

    # Walk over every wavelength band.
    for i in range(0, X.shape[1], segment_size):
        segment_X = X[:, i:i + segment_size]

        # Select features inside each sub-region of this band. Iterating over
        # the band's actual width (not the nominal segment_size) avoids empty
        # sub-region slices when the final band is truncated.
        for j in range(0, segment_X.shape[1], subsegment_size):
            subsegment_X = segment_X[:, j:j + subsegment_size]
            relief = ReliefF(n_neighbors=10, n_features_to_keep=n_features_per_subsegment)
            subsegment_selected = relief.fit_transform(subsegment_X, y)

            # Convert sub-region-local indices to global column indices.
            selected_features.extend(subsegment_selected + i + j)

    # np.unique both de-duplicates and sorts.
    return np.unique(selected_features)
|
||||
116
classification_model/WaveSelect/Spa.py
Normal file
116
classification_model/WaveSelect/Spa.py
Normal file
@ -0,0 +1,116 @@
|
||||
import scipy.stats
|
||||
import numpy as np
|
||||
from scipy.linalg import qr, inv, pinv
|
||||
import scipy.stats
|
||||
from progress.bar import Bar
|
||||
from matplotlib import pyplot as plt
|
||||
class SPA:
    """Successive Projections Algorithm (SPA) for variable (wavelength) selection.

    Phase 1 builds a candidate variable chain from every starting column via
    QR projections; phase 2 picks the chain and length minimising PRESS; a
    final scree step prunes variables by regression relevance with an F-test
    criterion, then plots (and optionally saves) the RMSEP scree curve.
    """

    def _projections_qr(self, X, k, M):
        """Return the first M indices of the projection chain starting at column k.

        X : DataFrame of (auto-scaled) calibration spectra.
        k : index of the starting column.
        M : chain length to return.
        """
        X_projected = X.copy()
        norms = np.sum((X ** 2), axis=0)
        norm_max = np.amax(norms)
        # Inflate column k so column-pivoted QR is forced to pick it first;
        # the pivot order then reproduces SPA's successive projections.
        X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]
        _, __, order = qr(X_projected.to_numpy(), 0, pivoting=True)
        return order[:M].T

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Validate an MLR model built on the columns in ``var_sel``.

        Uses the external (Xval, yval) set when provided, otherwise
        leave-one-out cross-validation on the calibration set.
        Returns (predictions, residuals).
        """
        N = Xcal.shape[0]
        NV = Xval.shape[0] if Xval is not None else 0

        yhat, e = None, None
        if NV > 0:
            # Fit on the full calibration set (with intercept column),
            # then predict the external validation set.
            Xcal_ones = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel].to_numpy()])
            b = np.linalg.lstsq(Xcal_ones, ycal, rcond=None)[0]
            Xval_ones = np.hstack([np.ones((NV, 1)), Xval.iloc[:, var_sel].to_numpy()])
            yhat = Xval_ones.dot(b)
            e = yval - yhat
        else:
            # Leave-one-out: refit with sample i held out, predict sample i.
            yhat = np.zeros((N, 1))
            for i in range(N):
                cal = np.hstack([np.arange(i), np.arange(i + 1, N)])
                X = Xcal.iloc[cal, var_sel]
                y = ycal.iloc[cal]
                X_ones = np.hstack([np.ones((N - 1, 1)), X.to_numpy()])
                b = np.linalg.lstsq(X_ones, y, rcond=None)[0]
                xtest = Xcal.iloc[i, var_sel].to_numpy()
                yhat[i] = np.hstack([1, xtest]).dot(b)
            e = ycal.to_numpy() - yhat
        return yhat, e

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Run the full SPA pipeline.

        Xcal, ycal : calibration spectra (DataFrame) and targets.
        m_min, m_max : smallest / largest chain length considered
                       (m_max defaults to min(N - 1, K)).
        Xval, yval : optional external validation set; leave-one-out otherwise.
        autoscaling : truthy -> scale columns by their std before projections.
        save_path : if given, save the scree plot there instead of showing it.
        Returns (var_sel, var_sel_phase2): the final pruned selection and the
        phase-2 selection before pruning.
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max

        # Mean-center and (optionally) scale columns before the projections.
        normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor

        # Phase 1: one projection chain per starting column.
        SEL = np.zeros((m_max, K))
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()

        # Phase 2: PRESS for every (chain length m, starting column k) pair.
        PRESS = np.full((m_max + 1, K), np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    var_sel = SEL[:m, k].astype(int)
                    _, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
                    PRESS[m, k] = e.T @ e
                    bar.next()

        # Best chain = column with the globally smallest PRESS.
        m_sel = np.argmin(PRESS, axis=0)
        k_sel = np.argmin(np.min(PRESS, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)

        # Relevance of each selected variable = |coef| * column std
        # of the intercept-augmented MLR fit.
        Xcal2 = np.hstack([np.ones((N, 1)), Xcal.iloc[:, var_sel_phase2].to_numpy()])
        b = np.linalg.lstsq(Xcal2, ycal, rcond=None)[0]
        std_deviation = Xcal2.std(ddof=1, axis=0)
        relev = np.abs(b * std_deviation)[1:]

        # Scree: PRESS when keeping the i most relevant variables.
        index_decreasing_relev = np.argsort(-relev)
        PRESS_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, e = self._validation(Xcal, ycal, var_sel, Xval, yval)
            PRESS_scree[i] = np.conj(e).T @ e

        RMSEP_scree = np.sqrt(PRESS_scree / len(e))
        # F-test pruning: keep the fewest variables whose PRESS is not
        # significantly worse than the minimum (alpha = 0.25).
        alpha = 0.25
        dof = len(e)
        fcrit = scipy.stats.f.ppf(1 - alpha, dof, dof)
        PRESS_crit = np.min(PRESS_scree) * fcrit
        i_crit = np.min(np.nonzero(PRESS_scree < PRESS_crit))
        i_crit = max(m_min, i_crit)
        var_sel = var_sel_phase2[index_decreasing_relev[:i_crit]]

        # Plot the scree curve.
        plt.figure()

        # Use the Times New Roman font.
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

        # Title, labels and grid.
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('RMSE', fontsize=14)
        plt.title(f'Final number of selected variables: {len(var_sel)} (RMSE={RMSEP_scree[i_crit]:.4f})', fontsize=16)

        # Draw the RMSEP curve.
        plt.plot(RMSEP_scree, label='RMSEP Scree Plot')
        plt.scatter(i_crit, RMSEP_scree[i_crit], color='r', marker='s', label='Selected Point')

        # Grid and legend.
        plt.grid(True)
        plt.legend()

        # Show or save the figure.
        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
        else:
            plt.show()
        return var_sel, var_sel_phase2

    def __repr__(self):
        return "SPA()"
|
||||
111
classification_model/WaveSelect/Spa_acc.py
Normal file
111
classification_model/WaveSelect/Spa_acc.py
Normal file
@ -0,0 +1,111 @@
|
||||
import lightgbm as lgb
|
||||
import numpy as np
|
||||
from scipy.linalg import qr
|
||||
from progress.bar import Bar
|
||||
from matplotlib import pyplot as plt
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
|
||||
class SPA_acc:
    """SPA variant scored by LightGBM classification accuracy instead of PRESS.

    Same QR-projection phase as SPA; candidate variable subsets are ranked by
    the accuracy of a multiclass LightGBM model rather than regression error.
    """

    def _projections_qr(self, X, k, M):
        """Return the first M indices of the projection chain starting at column k.

        X : DataFrame of (auto-scaled) calibration spectra.
        k : index of the starting column.
        M : chain length to return.
        """
        X_projected = X.copy()
        norms = np.sum((X ** 2), axis=0)
        norm_max = np.amax(norms)
        # Inflate column k so column-pivoted QR is forced to pick it first.
        X_projected.iloc[:, k] = X_projected.iloc[:, k] * 2 * norm_max / norms[k]
        _, __, order = qr(X_projected.to_numpy(), 0, pivoting=True)
        return order[:M].T

    def _validation(self, Xcal, ycal, var_sel, Xval=None, yval=None):
        """Train LightGBM on the columns in ``var_sel`` and score accuracy.

        Scores on (Xval, yval) when both are given, otherwise on the training
        data itself (NOTE(review): the fallback is training accuracy, which
        is optimistic). Returns (predicted labels, accuracy).
        """
        train_data = lgb.Dataset(Xcal.iloc[:, var_sel], label=ycal)
        params = {
            'objective': 'multiclass',  # multiclass objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',  # multiclass cross-entropy loss
            'device': 'gpu',  # GPU acceleration
            'verbosity': -1,
            'num_class': len(np.unique(ycal))  # number of classes
        }

        # Train the LightGBM model.
        model = lgb.train(params, train_data, num_boost_round=100)

        # Predict on the validation set when available, else on training data.
        if Xval is not None and yval is not None:
            yhat = model.predict(Xval.iloc[:, var_sel])
            yhat = np.argmax(yhat, axis=1)  # highest-probability class per sample
            accuracy = accuracy_score(yval, yhat)
        else:
            yhat = model.predict(Xcal.iloc[:, var_sel])
            yhat = np.argmax(yhat, axis=1)  # highest-probability class per sample
            accuracy = accuracy_score(ycal, yhat)

        return yhat, accuracy

    def spa(self, Xcal, ycal, m_min=1, m_max=None, Xval=None, yval=None, autoscaling=1, save_path=None):
        """Run the accuracy-scored SPA pipeline.

        Xcal, ycal : calibration spectra (DataFrame) and class labels.
        m_min, m_max : smallest / largest chain length considered
                       (m_max defaults to min(N - 1, K)).
        Xval, yval : optional validation set used for the accuracy score.
        autoscaling : truthy -> scale columns by their std before projections.
        save_path : if given, save the scree plot there instead of showing it.
        Returns (var_sel_phase2, ACCURACY_scree).
        """
        N, K = Xcal.shape
        m_max = min(N - 1, K) if m_max is None else m_max

        # Mean-center and (optionally) scale columns before the projections.
        normalization_factor = Xcal.std(ddof=1, axis=0) if autoscaling else np.ones(K)
        Xcaln = (Xcal - Xcal.mean()) / normalization_factor

        # Phase 1: one projection chain per starting column.
        SEL = np.zeros((m_max, K))
        with Bar('Projections :', max=K) as bar:
            for k in range(K):
                SEL[:, k] = self._projections_qr(Xcaln, k, m_max)
                bar.next()

        # Phase 2: accuracy for every (chain length m, starting column k) pair.
        ACCURACY = np.full((m_max + 1, K), -np.inf)
        with Bar('Evaluating subsets:', max=K * (m_max - m_min + 1)) as bar:
            for k in range(K):
                for m in range(m_min, m_max + 1):
                    var_sel = SEL[:m, k].astype(int)
                    _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
                    ACCURACY[m, k] = accuracy
                    bar.next()

        # Best chain = column with the globally highest accuracy.
        m_sel = np.argmax(ACCURACY, axis=0)
        k_sel = np.argmax(np.max(ACCURACY, axis=0))
        var_sel_phase2 = SEL[:m_sel[k_sel], k_sel].astype(int)

        # Final LightGBM Training
        Xcal2 = Xcal.iloc[:, var_sel_phase2]
        train_data = lgb.Dataset(Xcal2, label=ycal)
        params = {
            'objective': 'multiclass',  # multiclass objective
            'boosting_type': 'gbdt',
            'metric': 'multi_logloss',  # multiclass cross-entropy loss
            'device': 'gpu',  # GPU acceleration
            'verbosity': -1,
            'num_class': len(np.unique(ycal))  # number of classes
        }
        model = lgb.train(params, train_data, num_boost_round=100)
        # Per-feature gain importance ranks the phase-2 variables.
        relev = model.feature_importance(importance_type='gain')

        # Scree: accuracy when keeping the i most important variables.
        index_decreasing_relev = np.argsort(-relev)
        ACCURACY_scree = np.empty(len(var_sel_phase2))
        for i in range(len(var_sel_phase2)):
            var_sel = var_sel_phase2[index_decreasing_relev[:i + 1]]
            _, accuracy = self._validation(Xcal, ycal, var_sel, Xval, yval)
            ACCURACY_scree[i] = accuracy

        # Plot the accuracy scree curve.
        plt.figure()
        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.xlabel('Number of variables included in the model', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.title(f'Final number of selected variables: {len(var_sel_phase2)} (Accuracy={ACCURACY_scree.max():.4f})', fontsize=16)
        plt.plot(ACCURACY_scree, label='Accuracy Scree Plot')
        plt.scatter(np.argmax(ACCURACY_scree), ACCURACY_scree.max(), color='r', marker='s', label='Selected Point')
        plt.grid(True)
        plt.legend()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            print(f"图像已保存至: {save_path}")
        else:
            plt.show()

        return var_sel_phase2, ACCURACY_scree

    def __repr__(self):
        # NOTE(review): repr says "SPA()" although this is SPA_acc — kept as-is
        # to preserve behavior; consider renaming in a future change.
        return "SPA()"
|
||||
92
classification_model/WaveSelect/Uve.py
Normal file
92
classification_model/WaveSelect/Uve.py
Normal file
@ -0,0 +1,92 @@
|
||||
"""
|
||||
-*- coding: utf-8 -*-
|
||||
@Time :2022/04/12 17:10
|
||||
@Author : Pengyou FU
|
||||
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
|
||||
@github : https://github.com/FuSiry/OpenSA
|
||||
@WeChat : Fu_siry
|
||||
@License:Apache-2.0 license
|
||||
"""
|
||||
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.model_selection import ShuffleSplit, cross_val_score
|
||||
from numpy.linalg import matrix_rank as rank
|
||||
import numpy as np
|
||||
|
||||
|
||||
class UVE:
    """Uninformative Variable Elimination (UVE) based on PLS coefficient stability."""

    def __init__(self, x, y, ncomp=20, nrep=500, testSize=0.2):
        """Initialise the UVE model.

        Parameters
        ----------
        x : np.ndarray
            Predictor matrix (input data).
        y : np.ndarray
            Targets (labels).
        ncomp : int
            Maximum number of PLS latent variables, default 20.
        nrep : int
            Number of resampling repetitions, default 500.
        testSize : float
            Fraction of samples held out in each split, default 0.2.
        """
        self.x = x
        self.y = y
        self.ncomp = min(ncomp, rank(x))  # latent variables cannot exceed the matrix rank
        self.nrep = nrep
        self.testSize = testSize

        self.criteria = None  # stability criterion (mean/std of PLS coefficients)
        self.featureIndex = None  # feature indices sorted by |criterion| descending
        self.featureR2 = np.full(self.x.shape[1], np.nan)  # R² for each feature-count prefix
        self.selFeature = None  # indices of the finally selected features

    def calcCriteria(self):
        """Compute the stability criterion (meanCoef / stdCoef) per variable."""
        # One row of PLS coefficients per resampling repetition.
        PLSCoef = np.zeros((self.nrep, self.x.shape[1]))
        ss = ShuffleSplit(n_splits=self.nrep, test_size=self.testSize)

        # Fit a PLS model on each resampled training set and record its coefficients.
        for step, (train, test) in enumerate(ss.split(self.x, self.y)):
            xtrain, ytrain = self.x[train], self.y[train]
            plsModel = PLSRegression(n_components=min(self.ncomp, rank(xtrain)))
            plsModel.fit(xtrain, ytrain)
            PLSCoef[step, :] = plsModel.coef_.flatten()

        # np.divide with a `where` mask avoids division by zero.
        meanCoef = np.mean(PLSCoef, axis=0)
        stdCoef = np.std(PLSCoef, axis=0)
        self.criteria = np.divide(meanCoef, stdCoef, out=np.zeros_like(meanCoef), where=stdCoef != 0)

    def evalCriteria(self, cv=3):
        """Evaluate cross-validated R² for each prefix of the criterion ranking."""
        # Sort features by |criterion| in descending order.
        self.featureIndex = np.argsort(-np.abs(self.criteria))

        # Add one feature at a time and score each prefix with cross-validation.
        for i in range(self.x.shape[1]):
            xi = self.x[:, self.featureIndex[:i + 1]]  # top i+1 features

            # Plain linear regression while the prefix is small; PLS afterwards.
            if i < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))

            # Cross-validate and store the mean R².
            cvScore = cross_val_score(regModel, xi, self.y, cv=cv, scoring='r2')
            self.featureR2[i] = np.mean(cvScore)

    def cutFeature(self, *args):
        """Select the feature prefix with the highest R²; return its column indices.

        Any extra array arguments with the same number of columns as ``x``
        are returned restricted to the selected columns instead.
        """
        # Prefix length with the maximum R² (nanargmax ignores NaN entries).
        cuti = np.nanargmax(self.featureR2)
        self.selFeature = self.featureIndex[:cuti + 1]  # optimal feature indices

        # When extra data sets are passed, return them filtered to the selection.
        if len(args) != 0:
            returnx = list(args)
            for i, argi in enumerate(args):
                if argi.shape[1] == self.x.shape[1]:
                    returnx[i] = argi[:, self.selFeature]
            return returnx

        # Otherwise return the selected column indices.
        return self.selFeature
|
||||
93
classification_model/WaveSelect/WaveSelcet.py
Normal file
93
classification_model/WaveSelect/WaveSelcet.py
Normal file
@ -0,0 +1,93 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from classification_model.WaveSelect.Lar import Lar
|
||||
from classification_model.WaveSelect.Spa import SPA
|
||||
from classification_model.WaveSelect.Spa_acc import SPA_acc
|
||||
from classification_model.WaveSelect.Uve import UVE
|
||||
from classification_model.WaveSelect.Cars import CARS_Cloud
|
||||
from classification_model.WaveSelect.Pca import Pca
|
||||
from classification_model.WaveSelect.GA import GA
|
||||
from classification_model.WaveSelect.ReliefF import ReliefF
|
||||
from sklearn.model_selection import train_test_split
|
||||
# from WaveSelect.MRMR import MRMRFeatureSelection
|
||||
import os
|
||||
import matplotlib.pyplot as plt
|
||||
def SpctrumFeatureSelcet(method, X, y, name='', result_dir='', column_names=None):
    """Dispatch wavelength selection / dimensionality reduction by method name.

    :param method: one of "None", "Cars", "Lars", "Uve", "Spa", "Spa_acc",
                   "GA", "Pca", "ReliefF".
    :param X: spectral data, pandas DataFrame or numpy array (n_samples, n_features).
    :param y: labels aligned with X, shape (n_samples,).
    :param name: stem used for result image file names.
    :param result_dir: directory where result images are written.
    :param column_names: column names to use when X is a numpy array.
    :return: (X_Feature, y, selected_columns) on success — the reduced data,
             the labels, and the selected feature names/indices;
             (None, None, None) when the method is unknown.
    """
    # Normalise the input to a DataFrame so .iloc / .columns work uniformly.
    # (The previous `global X_Feature` was removed: every branch below either
    # assigns X_Feature locally or returns early, and leaking the result as a
    # module global was an accidental side effect.)
    if isinstance(X, np.ndarray):
        if column_names is None:
            column_names = [f"{i}" for i in range(X.shape[1])]  # default column names
        X_df = pd.DataFrame(X, columns=column_names)
    else:
        X_df = X

    # Run the requested selection method.
    if method == "None":
        X_Feature = X_df
        selected_columns = X_df.columns
    elif method == "Cars":
        save_path = os.path.join(result_dir, f"{name}_cars.png")
        # CARS returns the indices of the selected wavelengths.
        Featuresecletidx = CARS_Cloud(X_df.values, y, N=50, f=20, cv=10, save_fig=True, save_path=save_path)
        Featuresecletidx = Featuresecletidx.astype(int)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = Featuresecletidx
    elif method == "Lars":
        Featuresecletidx = Lar(X_df.values, y)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Uve":
        uve = UVE(X_df.values, y, 20)
        uve.calcCriteria()
        uve.evalCriteria(cv=5)
        Featuresecletidx = uve.cutFeature()  # indices of the retained features
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Spa":
        save_path = os.path.join(result_dir, f"{name}_spa.png")
        Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.3)
        Featuresecletidx, var_sel_phase2 = SPA().spa(
            Xcal, ycal, m_min=2, m_max=50, Xval=Xval, yval=yval, autoscaling=1, save_path=save_path)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Spa_acc":
        save_path = os.path.join(result_dir, f"{name}_spa_acc.png")
        Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.3)
        Featuresecletidx, var_sel_phase2 = SPA_acc().spa(
            Xcal, ycal, m_min=2, m_max=50, Xval=Xval, yval=yval, autoscaling=1, save_path=save_path)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "GA":
        Featuresecletidx = GA(X_df.values, y, 10)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    elif method == "Pca":
        # PCA produces new components rather than selecting original columns.
        X_Feature = Pca(X_df.values)
        selected_columns = [f"PC{i+1}" for i in range(X_Feature.shape[1])]
    elif method == "ReliefF":
        relieff = ReliefF(n_neighbors=20, n_features_to_keep=20)
        Featuresecletidx = relieff.fit(X_df.values, y)
        X_Feature = X_df.iloc[:, Featuresecletidx]
        selected_columns = X_df.columns[Featuresecletidx]
    else:
        print("没有这个波长筛选方法!")
        # Bug fix: return a 3-tuple (was 2) so callers unpacking three values
        # do not crash on the unknown-method path.
        return None, None, None

    return X_Feature, y, selected_columns  # reduced data, labels, chosen columns
|
||||
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-310.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-310.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-38.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-38.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-39.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Cars.cpython-39.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-310.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-310.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-38.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-38.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-39.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/GA.cpython-39.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-310.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-310.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-38.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-38.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-39.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Lar.cpython-39.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/MRMR.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/MRMR.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/MRMR.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/MRMR.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-310.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-310.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-38.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-38.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-39.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Pca.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-310.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-310.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-38.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-38.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-39.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Spa.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-310.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-310.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-311.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-311.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-312.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-312.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-38.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-38.pyc
Normal file
Binary file not shown.
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-39.pyc
Normal file
BIN
classification_model/WaveSelect/__pycache__/Uve.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
123
classification_model/WaveSelect/centry.py
Normal file
123
classification_model/WaveSelect/centry.py
Normal file
@ -0,0 +1,123 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from WaveSelect.Lar import Lar
|
||||
from WaveSelect.Spa import SPA
|
||||
from WaveSelect.Uve import UVE
|
||||
from WaveSelect.Cars import CARS_Cloud
|
||||
from WaveSelect.GA import GA
|
||||
from WaveSelect.ReliefF import ReliefF
|
||||
from sklearn.model_selection import train_test_split
|
||||
from collections import Counter
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
|
||||
def IntegratedWaveSelect(methods, X, y, strategy="voting", column_names=None, name='', result_dir=''):
    """Run several wavelength (feature) selection methods and fuse their results.

    Each method in `methods` produces a set of column indices; the chosen
    `strategy` combines them into one final feature subset.

    Parameters
    ----------
    methods : list[str]
        Selection methods to apply. Recognized values: "Cars", "Lars",
        "Uve", "Spa", "GA", "ReliefF". Unrecognized names are skipped
        with a message.
    X : np.ndarray or pd.DataFrame
        Spectral matrix, samples x wavelengths.
    y : array-like
        Target values (chemical reference / labels).
    strategy : str
        Fusion strategy: "voting" (keep indices picked by a strict
        majority of methods), "weighted" (weighted counts, keep indices
        at or above the mean count; all weights are currently 1), or
        "bagging" (re-run Spa/Cars on random half-splits and union the
        results). Any other value raises ValueError.
    column_names : list[str] or None
        Column labels used when X is an ndarray; defaults to "0", "1", ...
    name : str
        Basename prefix for diagnostic plots saved by Cars/Spa.
    result_dir : str
        Directory where those plots are written.

    Returns
    -------
    (X_Feature, y, selected_columns) : the reduced DataFrame, the
    unchanged targets, and the names of the kept columns.

    Raises
    ------
    ValueError
        If `strategy` is not one of the three supported values.
    """
    # Normalize X to a DataFrame so positional selection (.iloc) and
    # column labels work uniformly below.
    if isinstance(X, np.ndarray):
        if column_names is None:
            column_names = [f"{i}" for i in range(X.shape[1])]
        X_df = pd.DataFrame(X, columns=column_names)
    else:
        X_df = X

    # One entry per successful method: an array/list of selected column indices.
    feature_indices_list = []

    for method in tqdm(methods, desc="Processing Feature Selection Methods"):
        print(f"Applying method: {method}")  # debug output
        if method == "Cars":
            save_path = os.path.join(result_dir, f"{name}_cars.png")
            # NOTE(review): Cars receives the raw X while every other method
            # uses X_df.values — presumably CARS_Cloud accepts either; confirm.
            Featuresecletidx = CARS_Cloud(X, y, N=50, f=20, cv=10, save_path=save_path)
            # CARS may return float indices; cast so .iloc accepts them.
            Featuresecletidx = Featuresecletidx.astype(int)
            feature_indices_list.append(Featuresecletidx)
        elif method == "Lars":
            Featuresecletidx = Lar(X_df.values, y)
            feature_indices_list.append(Featuresecletidx)
        elif method == "Uve":
            uve = UVE(X_df.values, y, 20)
            uve.calcCriteria()
            uve.evalCriteria(cv=5)
            Featuresecletidx = uve.cutFeature()
            feature_indices_list.append(Featuresecletidx)
        elif method == "Spa":
            save_path = os.path.join(result_dir, f"{name}_spa.png")
            # SPA needs a calibration/validation split of its own.
            Xcal, Xval, ycal, yval = train_test_split(X_df, y, test_size=0.2)
            Featuresecletidx = SPA().spa(
                Xcal=Xcal.values, ycal=ycal, m_min=4, m_max=32, Xval=Xval.values, yval=yval, autoscaling=1,save_path=save_path)
            feature_indices_list.append(Featuresecletidx)
        elif method == "GA":
            Featuresecletidx = GA(X_df.values, y, 10)
            feature_indices_list.append(Featuresecletidx)
        elif method == "ReliefF":
            relieff = ReliefF(n_neighbors=20, n_features_to_keep=20)
            Featuresecletidx = relieff.fit(X_df.values, y)
            feature_indices_list.append(Featuresecletidx)
        else:
            # Unknown method: report and move on without contributing indices.
            print(f"No such method: {method}")
            continue

        print(f"Selected indices by {method}: {Featuresecletidx}")  # debug output

    print("Feature indices list after all methods:", feature_indices_list)  # debug output

    if strategy == "voting":
        if feature_indices_list:
            # Pool all indices and keep those chosen by a strict majority
            # of the requested methods (skipped methods still count in the
            # denominator via len(methods)).
            all_indices = np.concatenate(feature_indices_list)
            print("All indices concatenated:", all_indices)  # debug output
            counter = Counter(all_indices)
            print("Counter result:", counter)  # debug output
            selected_features = [
                idx for idx, count in tqdm(counter.items(), desc="Voting Selection")
                if count > len(methods) / 2
            ]
            print("Selected features after voting:", selected_features)  # debug output
        else:
            print("No features selected by any method.")  # informational
            selected_features = []
    elif strategy == "weighted":
        # All methods currently weigh 1, so this degenerates to plain
        # counting with a mean-count threshold.
        weights = {method: 1 for method in methods}
        weighted_counts = Counter()
        for method, indices in zip(methods, feature_indices_list):
            for idx in indices:
                weighted_counts[idx] += weights[method]
        print("Weighted counts:", weighted_counts)  # debug output
        selected_features = [
            idx for idx, count in tqdm(weighted_counts.items(), desc="Weighted Selection")
            if count >= np.mean(list(weighted_counts.values()))
        ]
        print("Selected features after weighted strategy:", selected_features)  # debug output
    elif strategy == "bagging":
        # Re-run a subset of methods (only Spa and Cars are supported here)
        # on random half-samples and take the union of everything selected.
        num_iterations = 5
        selected_features = set()
        for _ in tqdm(range(num_iterations), desc="Bagging Iterations"):
            X_sample, _, y_sample, _ = train_test_split(X_df, y, test_size=0.5)
            sub_feature_indices_list = []
            for method in methods:
                if method == "Spa":
                    Xcal, Xval, ycal, yval = train_test_split(X_sample, y_sample, test_size=0.2)
                    sub_feature_indices = SPA().spa(Xcal=Xcal.values, ycal=ycal, m_min=4, m_max=32, Xval=Xval.values,
                                                    yval=yval, autoscaling=1)
                elif method == "Cars":
                    sub_feature_indices = CARS_Cloud(X_sample.values, y_sample)
                else:
                    # Other methods are not re-run under bagging.
                    continue
                sub_feature_indices_list.append(sub_feature_indices)
            for sub_indices in sub_feature_indices_list:
                selected_features.update(sub_indices)
        selected_features = list(selected_features)
        print("Selected features after bagging strategy:", selected_features)  # debug output
    else:
        raise ValueError("Invalid strategy. Choose from 'voting', 'weighted', or 'bagging'.")

    # Indices may arrive as numpy integer types; coerce to plain int for .iloc.
    selected_features = list(map(int, selected_features))
    X_Feature = X_df.iloc[:, selected_features]
    selected_columns = X_df.columns[selected_features]

    # Close any figure left open by the plotting selection methods.
    plt.close()

    return X_Feature, y, selected_columns
|
||||
Reference in New Issue
Block a user