Initial commit
This commit is contained in:
classification_model/WaveSelect/Uve.py (new file, 92 lines)
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
-*- coding: utf-8 -*-
|
||||
@Time :2022/04/12 17:10
|
||||
@Author : Pengyou FU
|
||||
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
|
||||
@github : https://github.com/FuSiry/OpenSA
|
||||
@WeChat : Fu_siry
|
||||
@License:Apache-2.0 license
|
||||
"""
|
||||
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.model_selection import ShuffleSplit, cross_val_score
|
||||
from numpy.linalg import matrix_rank as rank
|
||||
import numpy as np
|
||||
|
||||
|
||||
class UVE:
    """Uninformative Variable Elimination (UVE) feature selection.

    Ranks input variables (e.g. spectral wavelengths) by the stability of
    their PLS regression coefficients across repeated random train/test
    splits, then keeps the prefix of the ranking that maximizes the
    cross-validated R-squared score.

    Expected call order: ``calcCriteria()`` -> ``evalCriteria()`` ->
    ``cutFeature()``.
    """

    def __init__(self, x, y, ncomp=20, nrep=500, testSize=0.2, random_state=None):
        """Initialize the UVE model.

        Parameters
        ----------
        x : np.ndarray
            Predictor matrix (input data), shape (n_samples, n_features).
        y : np.ndarray
            Target values (labels).
        ncomp : int, optional
            Maximum number of PLS latent variables, default 20.
        nrep : int, optional
            Number of resampling repetitions, default 500.
        testSize : float, optional
            Fraction of the training data held out in each split, default 0.2.
        random_state : int or None, optional
            Seed for the repeated random splits. ``None`` (the default)
            keeps the previous non-deterministic behavior; pass an int
            for reproducible variable rankings.
        """
        self.x = x
        self.y = y
        # Latent-variable count must not exceed the rank of x, otherwise
        # PLSRegression raises.
        self.ncomp = min(ncomp, rank(x))
        self.nrep = nrep
        self.testSize = testSize
        self.random_state = random_state

        self.criteria = None      # stability criterion per variable (mean/std of PLS coefs)
        self.featureIndex = None  # variable indices sorted by |criterion|, descending
        self.featureR2 = np.full(self.x.shape[1], np.nan)  # R² for each ranking prefix
        self.selFeature = None    # indices of the finally selected variables

    def calcCriteria(self):
        """Compute the stability criterion (meanCoef / stdCoef) per variable.

        Fits a PLS model on ``nrep`` random training subsets and measures,
        for each variable, how stable its regression coefficient is across
        repetitions. A large |mean/std| marks an informative variable.
        """
        # One row of PLS coefficients per repetition.
        PLSCoef = np.zeros((self.nrep, self.x.shape[1]))
        ss = ShuffleSplit(
            n_splits=self.nrep,
            test_size=self.testSize,
            random_state=self.random_state,  # None -> legacy nondeterministic splits
        )

        # Fit a PLS model per split; only the training portion is used.
        for step, (train, _test) in enumerate(ss.split(self.x, self.y)):
            xtrain, ytrain = self.x[train], self.y[train]
            # Re-clamp components: a subsample can have a lower rank than x.
            plsModel = PLSRegression(n_components=min(self.ncomp, rank(xtrain)))
            plsModel.fit(xtrain, ytrain)
            PLSCoef[step, :] = plsModel.coef_.flatten()

        meanCoef = np.mean(PLSCoef, axis=0)
        stdCoef = np.std(PLSCoef, axis=0)
        # np.divide with a `where` mask avoids divide-by-zero warnings;
        # variables with zero coefficient variance get criterion 0.
        self.criteria = np.divide(
            meanCoef, stdCoef, out=np.zeros_like(meanCoef), where=stdCoef != 0
        )

    def evalCriteria(self, cv=3):
        """Evaluate the R² of every prefix of the stability ranking.

        Parameters
        ----------
        cv : int, optional
            Number of cross-validation folds, default 3.
        """
        # Sort variables by |criterion|, most stable first.
        self.featureIndex = np.argsort(-np.abs(self.criteria))

        # Grow the feature set one variable at a time and score each prefix.
        for i in range(self.x.shape[1]):
            xi = self.x[:, self.featureIndex[:i + 1]]  # top (i + 1) variables

            # With fewer variables than latent components, plain OLS is
            # used; otherwise PLS with a rank-clamped component count.
            if i < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))

            cvScore = cross_val_score(regModel, xi, self.y, cv=cv, scoring='r2')
            self.featureR2[i] = np.mean(cvScore)

    def cutFeature(self, *args):
        """Select the prefix with maximal R² and return the chosen columns.

        Parameters
        ----------
        *args : np.ndarray
            Optional data matrices; any with the same number of columns as
            ``x`` are sliced down to the selected variables.

        Returns
        -------
        np.ndarray or list
            The selected column indices, or — when ``args`` are given — a
            list with each matching matrix reduced to those columns
            (non-matching matrices are passed through unchanged).
        """
        # nanargmax skips prefixes never scored (still NaN); raises if
        # evalCriteria() has not been run.
        cuti = np.nanargmax(self.featureR2)
        self.selFeature = self.featureIndex[:cuti + 1]  # optimal variable indices

        # If extra datasets were passed, return them column-filtered.
        if len(args) != 0:
            returnx = list(args)
            for i, argi in enumerate(args):
                if argi.shape[1] == self.x.shape[1]:
                    returnx[i] = argi[:, self.selFeature]
            return returnx

        # Otherwise return the selected column indices themselves.
        return self.selFeature
|
||||
Reference in New Issue
Block a user