292 lines
9.5 KiB
Python
292 lines
9.5 KiB
Python
import numpy as np
|
||
from sklearn.model_selection import cross_val_score
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.base import clone
|
||
import copy
|
||
|
||
|
||
class ShuffledFrogLeaping:
|
||
"""
|
||
随机蛙跳算法 (Shuffled Frog Leaping Algorithm, SFLA) 进行特征选择
|
||
|
||
算法原理:
|
||
1. 将青蛙种群分成多个小组(memeplexes)
|
||
2. 在每个小组内进行局部搜索和进化
|
||
3. 定期重组所有青蛙,进行全局信息交换
|
||
4. 重复直到满足停止条件
|
||
"""
|
||
|
||
def __init__(self, n_frogs=50, n_memeplexes=5, n_evolution_steps=10,
|
||
n_shuffle_iterations=10, classifier=None, cv=5):
|
||
"""
|
||
初始化随机蛙跳算法参数
|
||
|
||
参数:
|
||
n_frogs: 青蛙种群大小
|
||
n_memeplexes: 小组数量
|
||
n_evolution_steps: 每个小组的进化步数
|
||
n_shuffle_iterations: 重组迭代次数
|
||
classifier: 用于评估特征子集的分类器
|
||
cv: 交叉验证折数
|
||
"""
|
||
self.n_frogs = n_frogs
|
||
self.n_memeplexes = n_memeplexes
|
||
self.n_evolution_steps = n_evolution_steps
|
||
self.n_shuffle_iterations = n_shuffle_iterations
|
||
self.classifier = classifier or RandomForestClassifier(random_state=42, n_estimators=50)
|
||
self.cv = cv
|
||
|
||
# 算法内部变量
|
||
self.n_features = None
|
||
self.frogs = None # 青蛙种群,每个青蛙是一个二进制向量
|
||
self.fitness_values = None
|
||
self.best_frog = None
|
||
self.best_fitness = -np.inf
|
||
self.selected_features = None
|
||
|
||
def _initialize_population(self):
|
||
"""初始化青蛙种群"""
|
||
self.frogs = []
|
||
for _ in range(self.n_frogs):
|
||
# 随机初始化二进制向量,1表示选择该特征,0表示不选择
|
||
frog = np.random.randint(0, 2, self.n_features)
|
||
self.frogs.append(frog)
|
||
self.frogs = np.array(self.frogs)
|
||
|
||
def _evaluate_fitness(self, X, y):
|
||
"""评估所有青蛙的适应度"""
|
||
self.fitness_values = []
|
||
for frog in self.frogs:
|
||
fitness = self._calculate_fitness(frog, X, y)
|
||
self.fitness_values.append(fitness)
|
||
|
||
# 更新全局最优
|
||
if fitness > self.best_fitness:
|
||
self.best_fitness = fitness
|
||
self.best_frog = frog.copy()
|
||
|
||
self.fitness_values = np.array(self.fitness_values)
|
||
|
||
def _calculate_fitness(self, frog, X, y):
|
||
"""计算单个青蛙的适应度"""
|
||
selected_features = np.where(frog == 1)[0]
|
||
|
||
# 如果没有选择任何特征,返回最低适应度
|
||
if len(selected_features) == 0:
|
||
return 0.0
|
||
|
||
# 使用选择的特征进行交叉验证
|
||
X_selected = X[:, selected_features]
|
||
|
||
try:
|
||
scores = cross_val_score(clone(self.classifier), X_selected, y, cv=self.cv)
|
||
return np.mean(scores)
|
||
except:
|
||
# 如果交叉验证失败,返回低适应度
|
||
return 0.0
|
||
|
||
def _divide_into_memeplexes(self):
|
||
"""将青蛙按适应度排序并分成小组"""
|
||
# 按适应度降序排序
|
||
sorted_indices = np.argsort(self.fitness_values)[::-1]
|
||
self.frogs = self.frogs[sorted_indices]
|
||
self.fitness_values = self.fitness_values[sorted_indices]
|
||
|
||
# 分成小组
|
||
memeplexes = []
|
||
frogs_per_memeplex = self.n_frogs // self.n_memeplexes
|
||
|
||
for i in range(self.n_memeplexes):
|
||
start_idx = i * frogs_per_memeplex
|
||
if i == self.n_memeplexes - 1:
|
||
# 最后一个小组包含剩余的所有青蛙
|
||
end_idx = self.n_frogs
|
||
else:
|
||
end_idx = (i + 1) * frogs_per_memeplex
|
||
|
||
memeplex = {
|
||
'frogs': self.frogs[start_idx:end_idx].copy(),
|
||
'fitness': self.fitness_values[start_idx:end_idx].copy()
|
||
}
|
||
memeplexes.append(memeplex)
|
||
|
||
return memeplexes
|
||
|
||
def _evolve_memeplex(self, memeplex, X, y):
|
||
"""进化单个小组"""
|
||
frogs = memeplex['frogs']
|
||
fitness = memeplex['fitness']
|
||
|
||
# 找出小组中的最好和最坏青蛙
|
||
best_idx = np.argmax(fitness)
|
||
worst_idx = np.argmin(fitness)
|
||
|
||
best_frog = frogs[best_idx]
|
||
worst_frog = frogs[worst_idx]
|
||
|
||
# 对最坏的青蛙进行进化
|
||
for step in range(self.n_evolution_steps):
|
||
# 生成新的青蛙: worst_frog + rand() * (best_frog - worst_frog)
|
||
rand = np.random.random(self.n_features)
|
||
new_frog = worst_frog + rand * (best_frog - worst_frog)
|
||
|
||
# 二进制化:大于0.5的为1,否则为0
|
||
new_frog = (new_frog > 0.5).astype(int)
|
||
|
||
# 确保至少选择一个特征
|
||
if np.sum(new_frog) == 0:
|
||
new_frog[np.random.randint(self.n_features)] = 1
|
||
|
||
# 计算新青蛙的适应度
|
||
new_fitness = self._calculate_fitness(new_frog, X, y)
|
||
|
||
# 如果新青蛙更好,替换最坏的青蛙
|
||
if new_fitness > fitness[worst_idx]:
|
||
frogs[worst_idx] = new_frog
|
||
fitness[worst_idx] = new_fitness
|
||
|
||
# 更新小组内的最好青蛙
|
||
if new_fitness > fitness[best_idx]:
|
||
best_idx = worst_idx
|
||
best_frog = new_frog
|
||
|
||
# 重新找出最坏的青蛙
|
||
worst_idx = np.argmin(fitness)
|
||
worst_frog = frogs[worst_idx]
|
||
else:
|
||
# 如果没有改善,随机生成一个新青蛙
|
||
new_frog = np.random.randint(0, 2, self.n_features)
|
||
if np.sum(new_frog) == 0:
|
||
new_frog[np.random.randint(self.n_features)] = 1
|
||
|
||
new_fitness = self._calculate_fitness(new_frog, X, y)
|
||
|
||
if new_fitness > fitness[worst_idx]:
|
||
frogs[worst_idx] = new_frog
|
||
fitness[worst_idx] = new_fitness
|
||
|
||
return frogs, fitness
|
||
|
||
def fit(self, X, y):
|
||
"""
|
||
运行随机蛙跳算法进行特征选择
|
||
|
||
参数:
|
||
X: 特征矩阵 (n_samples, n_features)
|
||
y: 标签向量 (n_samples,)
|
||
|
||
返回:
|
||
selected_features: 选择的特征索引列表
|
||
"""
|
||
self.n_features = X.shape[1]
|
||
|
||
# 初始化种群
|
||
self._initialize_population()
|
||
|
||
# 初始评估
|
||
self._evaluate_fitness(X, y)
|
||
|
||
# 主循环
|
||
for iteration in range(self.n_shuffle_iterations):
|
||
# 将青蛙分成小组
|
||
memeplexes = self._divide_into_memeplexes()
|
||
|
||
# 进化每个小组
|
||
evolved_frogs = []
|
||
evolved_fitness = []
|
||
|
||
for memeplex in memeplexes:
|
||
evolved_frog, evolved_fit = self._evolve_memeplex(memeplex, X, y)
|
||
evolved_frogs.extend(evolved_frog)
|
||
evolved_fitness.extend(evolved_fit)
|
||
|
||
# 更新种群
|
||
self.frogs = np.array(evolved_frogs)
|
||
self.fitness_values = np.array(evolved_fitness)
|
||
|
||
# 再次评估所有青蛙(确保一致性)
|
||
self._evaluate_fitness(X, y)
|
||
|
||
# 返回最优解
|
||
self.selected_features = np.where(self.best_frog == 1)[0]
|
||
return self.selected_features.tolist()
|
||
|
||
def get_feature_importance(self):
|
||
"""获取特征选择结果的统计信息"""
|
||
if self.selected_features is None:
|
||
raise ValueError("请先运行 fit 方法")
|
||
|
||
n_selected = len(self.selected_features)
|
||
selection_ratio = n_selected / self.n_features
|
||
|
||
return {
|
||
'selected_features': self.selected_features,
|
||
'n_selected': n_selected,
|
||
'n_total': self.n_features,
|
||
'selection_ratio': selection_ratio,
|
||
'best_fitness': self.best_fitness
|
||
}
|
||
|
||
|
||
def shuffled_frog_leaping_selection(X, y, n_frogs=50, n_memeplexes=5,
|
||
n_evolution_steps=10, n_shuffle_iterations=10,
|
||
classifier=None, cv=5):
|
||
"""
|
||
使用随机蛙跳算法进行特征选择
|
||
|
||
参数:
|
||
X: 特征矩阵 (n_samples, n_features)
|
||
y: 标签向量 (n_samples,)
|
||
n_frogs: 青蛙种群大小
|
||
n_memeplexes: 小组数量
|
||
n_evolution_steps: 每个小组的进化步数
|
||
n_shuffle_iterations: 重组迭代次数
|
||
classifier: 用于评估特征子集的分类器
|
||
cv: 交叉验证折数
|
||
|
||
返回:
|
||
selected_features: 选择的特征索引列表
|
||
"""
|
||
sfla = ShuffledFrogLeaping(
|
||
n_frogs=n_frogs,
|
||
n_memeplexes=n_memeplexes,
|
||
n_evolution_steps=n_evolution_steps,
|
||
n_shuffle_iterations=n_shuffle_iterations,
|
||
classifier=classifier,
|
||
cv=cv
|
||
)
|
||
|
||
return sfla.fit(X, y)
|
||
|
||
|
||
# 使用示例
|
||
if __name__ == "__main__":
|
||
# 生成示例数据
|
||
from sklearn.datasets import make_classification
|
||
|
||
X, y = make_classification(
|
||
n_samples=200,
|
||
n_features=50,
|
||
n_informative=10,
|
||
n_redundant=10,
|
||
n_clusters_per_class=1,
|
||
random_state=42
|
||
)
|
||
|
||
print("原始特征数量:", X.shape[1])
|
||
|
||
# 使用随机蛙跳算法进行特征选择
|
||
selected_features = shuffled_frog_leaping_selection(
|
||
X, y,
|
||
n_frogs=30,
|
||
n_memeplexes=3,
|
||
n_evolution_steps=5,
|
||
n_shuffle_iterations=5
|
||
)
|
||
|
||
print("选择的特征数量:", len(selected_features))
|
||
print("选择的特征索引:", selected_features)
|
||
|
||
# 计算选择率
|
||
selection_ratio = len(selected_features) / X.shape[1]
|
||
print(".2f") |