import copy

import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


class ShuffledFrogLeaping:
    """Feature selection with the Shuffled Frog Leaping Algorithm (SFLA).

    Every frog is a binary mask over the feature columns (1 = keep the
    feature, 0 = drop it). The search proceeds as follows:

    1. Split the frog population into several groups (memeplexes).
    2. Run a local search inside each memeplex, repeatedly trying to
       improve its worst frog.
    3. Merge all frogs back together so the next split exchanges
       information globally.
    4. Repeat for a fixed number of shuffle iterations.
    """

    def __init__(self, n_frogs=50, n_memeplexes=5, n_evolution_steps=10,
                 n_shuffle_iterations=10, classifier=None, cv=5):
        """Store the search hyper-parameters.

        Args:
            n_frogs: Size of the frog population.
            n_memeplexes: Number of memeplexes the population is split into.
            n_evolution_steps: Local-search steps per memeplex and shuffle.
            n_shuffle_iterations: Number of split/evolve/merge rounds.
            classifier: Estimator used to score a feature subset. When None,
                a RandomForestClassifier(random_state=42, n_estimators=50)
                is used.
            cv: Value forwarded to ``cross_val_score`` as ``cv``.
        """
        self.n_frogs = n_frogs
        self.n_memeplexes = n_memeplexes
        self.n_evolution_steps = n_evolution_steps
        self.n_shuffle_iterations = n_shuffle_iterations
        # Compare against None explicitly: truth-testing an estimator calls
        # its __len__ when it defines one, and scikit-learn ensembles
        # implement __len__ via the fitted ``estimators_`` attribute, so
        # ``classifier or default`` fails for an unfitted ensemble.
        if classifier is None:
            classifier = RandomForestClassifier(random_state=42, n_estimators=50)
        self.classifier = classifier
        self.cv = cv

        # Internal search state, (re)initialised by fit().
        self.n_features = None
        self.frogs = None  # population: one binary mask per row
        self.fitness_values = None
        self.best_frog = None
        self.best_fitness = -np.inf
        self.selected_features = None

    def _validate_parameters(self):
        """Reject population settings that cannot be split into memeplexes."""
        if self.n_memeplexes < 1:
            raise ValueError("n_memeplexes 必须至少为 1")
        if self.n_frogs < self.n_memeplexes:
            # Otherwise some memeplexes would be empty and argmax/argmin on
            # their fitness arrays would fail.
            raise ValueError("n_frogs 不能小于 n_memeplexes")

    def _initialize_population(self):
        """Create ``n_frogs`` random binary masks of length ``n_features``."""
        self.frogs = np.random.randint(
            0, 2, size=(self.n_frogs, self.n_features)
        )

    def _ensure_non_empty(self, frog):
        """Switch on one random feature if ``frog`` selects none; return it."""
        if not frog.any():
            frog[np.random.randint(self.n_features)] = 1
        return frog

    def _update_global_best(self):
        """Record the best frog of the current population if it is a new best."""
        if self.fitness_values is None or len(self.fitness_values) == 0:
            return
        top = int(np.argmax(self.fitness_values))
        if self.fitness_values[top] > self.best_fitness:
            self.best_fitness = self.fitness_values[top]
            self.best_frog = self.frogs[top].copy()

    def _evaluate_fitness(self, X, y):
        """Score every frog in the population and refresh the global best."""
        self.fitness_values = np.array(
            [self._calculate_fitness(frog, X, y) for frog in self.frogs],
            dtype=float,
        )
        self._update_global_best()

    def _calculate_fitness(self, frog, X, y):
        """Return the mean cross-validation score of one frog's feature subset.

        Returns 0.0 when the frog selects no feature, when cross-validation
        raises, or when the mean score is not finite.
        """
        selected = np.flatnonzero(frog == 1)
        if selected.size == 0:
            return 0.0

        try:
            scores = cross_val_score(
                clone(self.classifier), X[:, selected], y, cv=self.cv
            )
        except Exception:
            # Best effort: a subset the classifier cannot handle is simply a
            # bad candidate. ``Exception`` (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            return 0.0

        score = np.mean(scores)
        # A NaN score would win np.argmax and poison the ranking, so treat
        # non-finite results like a failed evaluation.
        return float(score) if np.isfinite(score) else 0.0

    def _divide_into_memeplexes(self):
        """Sort frogs by descending fitness and cut them into memeplexes.

        The population is split into ``n_memeplexes`` contiguous chunks of
        ``n_frogs // n_memeplexes`` frogs; the last chunk also takes the
        remainder.

        Returns:
            A list of dicts with keys ``'frogs'`` and ``'fitness'``, each
            holding copies of the corresponding slice.
        """
        order = np.argsort(self.fitness_values)[::-1]
        self.frogs = self.frogs[order]
        self.fitness_values = self.fitness_values[order]

        chunk = self.n_frogs // self.n_memeplexes
        memeplexes = []
        for i in range(self.n_memeplexes):
            start = i * chunk
            is_last = i == self.n_memeplexes - 1
            stop = self.n_frogs if is_last else start + chunk
            memeplexes.append({
                'frogs': self.frogs[start:stop].copy(),
                'fitness': self.fitness_values[start:stop].copy(),
            })
        return memeplexes

    def _evolve_memeplex(self, memeplex, X, y):
        """Run the local search on one memeplex.

        Each step moves the worst frog towards the best frog of the
        memeplex; if that does not improve it, a random frog is tried
        instead. An accepted candidate replaces the worst frog in place.

        Returns:
            The memeplex's ``(frogs, fitness)`` arrays after evolution.
        """
        frogs = memeplex['frogs']
        fitness = memeplex['fitness']

        best_idx = int(np.argmax(fitness))
        worst_idx = int(np.argmin(fitness))

        for _ in range(self.n_evolution_steps):
            # Leap: worst + rand() * (best - worst), thresholded at 0.5 to
            # get back to a binary mask.
            step = np.random.random(self.n_features)
            leap = frogs[worst_idx] + step * (frogs[best_idx] - frogs[worst_idx])
            candidate = self._ensure_non_empty((leap > 0.5).astype(int))
            candidate_fitness = self._calculate_fitness(candidate, X, y)

            if not candidate_fitness > fitness[worst_idx]:
                # The leap did not help: fall back to a random frog.
                candidate = self._ensure_non_empty(
                    np.random.randint(0, 2, self.n_features)
                )
                candidate_fitness = self._calculate_fitness(candidate, X, y)
                if not candidate_fitness > fitness[worst_idx]:
                    continue

            frogs[worst_idx] = candidate
            fitness[worst_idx] = candidate_fitness
            # Refresh both extremes after every accepted replacement so the
            # next step never works on a stale "worst" or "best" frog.
            if candidate_fitness > fitness[best_idx]:
                best_idx = worst_idx
            worst_idx = int(np.argmin(fitness))

        return frogs, fitness

    def fit(self, X, y):
        """Run the SFLA search and return the selected feature indices.

        Args:
            X: Feature matrix of shape (n_samples, n_features).
            y: Label vector of shape (n_samples,).

        Returns:
            List of the column indices selected by the best frog found.

        Raises:
            ValueError: If X is not two-dimensional with at least one
                feature, or if the population cannot be split into
                ``n_memeplexes`` non-empty memeplexes.
        """
        self._validate_parameters()

        # Column slicing below uses X[:, idx]; convert inputs that do not
        # support it (objects exposing to_numpy such as pandas DataFrames,
        # or nested lists) instead of silently scoring every frog 0.0.
        if hasattr(X, "to_numpy"):
            X = X.to_numpy()
        elif not hasattr(X, "shape"):
            X = np.asarray(X)
        if len(X.shape) != 2 or X.shape[1] == 0:
            raise ValueError("X 必须是至少包含一个特征的二维数组")

        # Reset the search state so a second fit() cannot return a mask
        # found on previously fitted data.
        self.n_features = X.shape[1]
        self.best_frog = None
        self.best_fitness = -np.inf
        self.selected_features = None

        self._initialize_population()
        self._evaluate_fitness(X, y)

        for _ in range(self.n_shuffle_iterations):
            memeplexes = self._divide_into_memeplexes()
            evolved = [self._evolve_memeplex(m, X, y) for m in memeplexes]

            # Merge the memeplexes back into one population (the shuffle).
            self.frogs = np.concatenate([frogs for frogs, _ in evolved])
            self.fitness_values = np.concatenate([fit for _, fit in evolved])

            # Every frog already carries an up-to-date fitness from the
            # evolution step, so only the global best needs refreshing; no
            # second round of cross-validation is required.
            self._update_global_best()

        self.selected_features = np.where(self.best_frog == 1)[0]
        return self.selected_features.tolist()

    def get_feature_importance(self):
        """Return summary statistics about the selected feature subset.

        Returns:
            Dict with keys ``'selected_features'``, ``'n_selected'``,
            ``'n_total'``, ``'selection_ratio'`` and ``'best_fitness'``.

        Raises:
            ValueError: If fit() has not been run yet.
        """
        if self.selected_features is None:
            raise ValueError("请先运行 fit 方法")

        n_selected = len(self.selected_features)
        return {
            'selected_features': self.selected_features,
            'n_selected': n_selected,
            'n_total': self.n_features,
            'selection_ratio': n_selected / self.n_features,
            'best_fitness': self.best_fitness,
        }


def shuffled_frog_leaping_selection(X, y, n_frogs=50, n_memeplexes=5,
                                    n_evolution_steps=10,
                                    n_shuffle_iterations=10,
                                    classifier=None, cv=5):
    """Select features with the Shuffled Frog Leaping Algorithm.

    Convenience wrapper that builds a ShuffledFrogLeaping instance with the
    given hyper-parameters and runs fit() on it.

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        y: Label vector of shape (n_samples,).
        n_frogs: Size of the frog population.
        n_memeplexes: Number of memeplexes.
        n_evolution_steps: Local-search steps per memeplex and shuffle.
        n_shuffle_iterations: Number of split/evolve/merge rounds.
        classifier: Estimator used to score feature subsets.
        cv: Value forwarded to ``cross_val_score`` as ``cv``.

    Returns:
        List of selected feature indices.
    """
    sfla = ShuffledFrogLeaping(
        n_frogs=n_frogs,
        n_memeplexes=n_memeplexes,
        n_evolution_steps=n_evolution_steps,
        n_shuffle_iterations=n_shuffle_iterations,
        classifier=classifier,
        cv=cv,
    )
    return sfla.fit(X, y)


# Usage example
if __name__ == "__main__":
    # Generate a synthetic demo dataset.
    from sklearn.datasets import make_classification

    X, y = make_classification(
        n_samples=200,
        n_features=50,
        n_informative=10,
        n_redundant=10,
        n_clusters_per_class=1,
        random_state=42,
    )

    print("原始特征数量:", X.shape[1])

    # Run the SFLA feature selection.
    selected_features = shuffled_frog_leaping_selection(
        X, y,
        n_frogs=30,
        n_memeplexes=3,
        n_evolution_steps=5,
        n_shuffle_iterations=5,
    )

    print("选择的特征数量:", len(selected_features))
    print("选择的特征索引:", selected_features)

    # Report the fraction of features that were kept.
    selection_ratio = len(selected_features) / X.shape[1]
    print(f"特征选择率: {selection_ratio:.2f}")