from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # noqa: F401  (kept: other parts of the project may rely on it)
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold


def _interval_indices(intervals, combo):
    """Return the flat list of column indices covered by the intervals in *combo*."""
    indices = []
    for interval_idx in combo:
        start_idx, end_idx = intervals[interval_idx]
        indices.extend(range(start_idx, end_idx))
    return indices


def synergy_interval_pls(X, y, n_intervals=20, n_combinations=2,
                         max_components=15, cv_folds=5):
    """
    Select wavelengths with Synergy Interval PLS (SiPLS).

    The spectrum is cut into ``n_intervals`` contiguous intervals of equal
    width (the last one absorbs the remainder). Every combination of
    ``n_combinations`` intervals is scored by K-fold cross-validated RMSE of a
    PLS regression, and the combination with the lowest RMSECV wins.

    Parameters:
        X: spectra matrix, shape (n_samples, n_wavelengths)
        y: concentration / property vector, shape (n_samples,)
        n_intervals: number of equal-width intervals to split the spectrum into
        n_combinations: number of intervals joined in each candidate (usually 2-4)
        max_components: upper bound on the number of PLS components tried
        cv_folds: number of cross-validation folds

    Returns:
        A 3-tuple ``(selected_wavelengths, best_rmsecv, best_n_components)``:
        selected_wavelengths: list of column indices of the best combination
        best_rmsecv: RMSECV of the best combination
        best_n_components: number of PLS components that achieved it

    Raises:
        ValueError: if ``n_intervals``, ``n_combinations`` or
            ``max_components`` are outside their valid ranges, or if KFold
            rejects ``cv_folds`` for the given number of samples.
    """
    # Work on plain arrays so that positional indexing below is well defined
    # even when the caller passes pandas objects.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples, n_wavelengths = X.shape

    if not 1 <= n_intervals <= n_wavelengths:
        raise ValueError(
            f"n_intervals must be in [1, {n_wavelengths}], got {n_intervals}"
        )
    if not 1 <= n_combinations <= n_intervals:
        raise ValueError(
            f"n_combinations must be in [1, {n_intervals}], got {n_combinations}"
        )
    if max_components < 1:
        raise ValueError(f"max_components must be >= 1, got {max_components}")

    # Split the spectrum into equal-width intervals; the last interval takes
    # whatever wavelengths are left over.
    interval_size = n_wavelengths // n_intervals
    intervals = [
        (i * interval_size,
         n_wavelengths if i == n_intervals - 1 else (i + 1) * interval_size)
        for i in range(n_intervals)
    ]

    print(f"将 {n_wavelengths} 个波长分成 {n_intervals} 个区间:")
    for i, (start, end) in enumerate(intervals):
        print(f" 区间 {i+1}: 波长 {start}-{end-1} (宽度: {end-start})")

    # Enumerate every candidate combination of intervals.
    interval_combinations = list(combinations(range(n_intervals), n_combinations))
    print(f"\n总共 {len(interval_combinations)} 个 {n_combinations} 区间的组合")

    # The folds depend only on n_samples and the fixed seed, so compute them
    # once instead of once per (combination, n_components) pair.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_splits = list(kf.split(X))
    # PLSRegression rejects n_components above min(n_train_samples, n_features).
    min_train_size = min(len(train_idx) for train_idx, _ in cv_splits)

    best_rmsecv = float('inf')
    best_intervals = None
    best_n_components = None

    for combo_idx, combo in enumerate(interval_combinations):
        if (combo_idx + 1) % 50 == 0:
            print(f"正在处理组合 {combo_idx + 1}/{len(interval_combinations)}")

        # Stack the columns of the chosen intervals side by side.
        X_selected = X[:, _interval_indices(intervals, combo)]

        # Cross-validate each admissible number of components.
        upper = min(max_components, X_selected.shape[1], min_train_size)
        rmse_by_n_comp = []
        for n_comp in range(1, upper + 1):
            fold_rmse = []
            for train_idx, test_idx in cv_splits:
                pls = PLSRegression(n_components=n_comp)
                pls.fit(X_selected[train_idx], y[train_idx])
                y_pred = pls.predict(X_selected[test_idx])
                fold_rmse.append(
                    np.sqrt(mean_squared_error(y[test_idx], y_pred))
                )
            rmse_by_n_comp.append(np.mean(fold_rmse))

        # Best number of components for this combination (1-based).
        best_idx = int(np.argmin(rmse_by_n_comp))
        combo_rmse = rmse_by_n_comp[best_idx]

        # Strict '<' keeps the first combination on ties.
        if combo_rmse < best_rmsecv:
            best_rmsecv = combo_rmse
            best_intervals = combo
            best_n_components = best_idx + 1

    # Column indices of the winning combination.
    selected_wavelengths = _interval_indices(intervals, best_intervals)

    print("最优结果:")
    print(f" 区间组合: {best_intervals}")
    print(f" RMSECV: {best_rmsecv:.6f}")
    print(f" 主成分数: {best_n_components}")
    # Report the size of the best combination, not of the last one evaluated.
    print(f" 选择的波长数: {len(selected_wavelengths)}")

    return selected_wavelengths, best_rmsecv, best_n_components


def sipls_feature_selection(X, y, n_intervals_list=(10, 15, 20),
                            n_combinations_list=(2, 3, 4),
                            max_components=15, cv_folds=5):
    """
    Run SiPLS over a grid of interval counts and combination sizes.

    Parameters:
        X: spectra matrix, shape (n_samples, n_wavelengths)
        y: concentration / property vector, shape (n_samples,)
        n_intervals_list: interval counts to try
        n_combinations_list: combination sizes to try
        max_components: upper bound on the number of PLS components tried
        cv_folds: number of cross-validation folds

    Returns:
        dict describing the best run, with keys ``selected_wavelengths``,
        ``rmsecv``, ``n_components``, ``n_intervals``, ``n_combinations`` and
        ``selection_ratio``; ``None`` if every parameter pair failed.
    """
    best_overall_rmsecv = float('inf')
    best_overall_result = None

    print("=== SiPLS 特征选择 ===")
    print(f"数据形状: {X.shape}")
    print(f"尝试的参数组合: {len(n_intervals_list)} × {len(n_combinations_list)} = {len(n_intervals_list) * len(n_combinations_list)}")

    for n_intervals in n_intervals_list:
        for n_combinations in n_combinations_list:
            print(f"\n--- 测试参数: 区间数={n_intervals}, 组合数={n_combinations} ---")

            # Best effort: a failing parameter pair is reported and skipped so
            # the remaining pairs are still evaluated.
            try:
                selected_wavelengths, rmsecv, n_components = synergy_interval_pls(
                    X, y,
                    n_intervals=n_intervals,
                    n_combinations=n_combinations,
                    max_components=max_components,
                    cv_folds=cv_folds,
                )
            except Exception as e:
                print(f"参数组合 (区间数={n_intervals}, 组合数={n_combinations}) 处理失败: {str(e)}")
                continue

            if rmsecv < best_overall_rmsecv:
                best_overall_rmsecv = rmsecv
                best_overall_result = {
                    'selected_wavelengths': selected_wavelengths,
                    'rmsecv': rmsecv,
                    'n_components': n_components,
                    'n_intervals': n_intervals,
                    'n_combinations': n_combinations,
                    'selection_ratio': len(selected_wavelengths) / X.shape[1],
                }

    if best_overall_result is not None:
        print("=== 最终最优结果 ===")
        print(f"区间数: {best_overall_result['n_intervals']}")
        print(f"组合数: {best_overall_result['n_combinations']}")
        print(f"RMSECV: {best_overall_result['rmsecv']:.6f}")
        print(f"主成分数: {best_overall_result['n_components']}")
        print(f"选择的波长数: {len(best_overall_result['selected_wavelengths'])}")
        print(f"选择率: {best_overall_result['selection_ratio']:.3f}")

    return best_overall_result


def plot_sipls_results(X, selected_wavelengths, title="SiPLS Selected Wavelengths"):
    """
    Plot the mean spectrum and highlight the selected wavelengths.

    Parameters:
        X: original spectra matrix, shape (n_samples, n_wavelengths)
        selected_wavelengths: column indices to highlight
        title: figure title

    Returns:
        The matplotlib figure that was drawn on.
    """
    n_wavelengths = X.shape[1]
    wavelength_indices = np.arange(n_wavelengths)

    # Boolean mask marking the selected columns.
    selection_mask = np.zeros(n_wavelengths, dtype=bool)
    selection_mask[selected_wavelengths] = True

    plt.figure(figsize=(12, 6))

    # Mean spectrum across all samples.
    mean_spectrum = np.mean(X, axis=0)
    plt.plot(wavelength_indices, mean_spectrum, 'b-', alpha=0.7,
             label='Mean Spectrum')

    # Overlay the selected wavelengths.
    plt.scatter(wavelength_indices[selection_mask],
                mean_spectrum[selection_mask],
                color='red', s=50, alpha=0.8, label='Selected Wavelengths')

    plt.xlabel('Wavelength Index')
    plt.ylabel('Intensity')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)

    return plt.gcf()


# Usage example
if __name__ == "__main__":
    # Generate synthetic spectra.
    np.random.seed(42)
    n_samples = 100
    n_wavelengths = 1000

    # Synthetic spectra built from Gaussian peaks.
    wavelengths = np.linspace(400, 2500, n_wavelengths)
    X = np.zeros((n_samples, n_wavelengths))

    # Characteristic peaks.
    peak_positions = [500, 800, 1200, 1800, 2200]  # nm
    peak_indices = [np.argmin(np.abs(wavelengths - pos)) for pos in peak_positions]

    # The peak shapes do not depend on the sample, so build them once. The
    # random draws below stay in the original order so the data is unchanged.
    column_positions = np.arange(n_wavelengths)
    peak_profiles = [
        np.exp(-0.5 * ((column_positions - peak_idx) / 50)**2)
        for peak_idx in peak_indices
    ]

    for i in range(n_samples):
        for gaussian in peak_profiles:
            # Add the peak with a random amplitude.
            X[i] += gaussian * np.random.uniform(0.5, 1.5)
        # Add measurement noise.
        X[i] += np.random.normal(0, 0.1, n_wavelengths)

    # Synthetic concentration, tied to three of the peaks.
    y = (X[:, peak_indices[0]] + X[:, peak_indices[2]] + X[:, peak_indices[4]]) / 3
    y += np.random.normal(0, 0.05, n_samples)  # add noise

    print("模拟数据生成完成")
    print(f"数据形状: {X.shape}")
    # NOTE(review): the original printed the literal text ".3f" (an f-string
    # whose body was lost). Reporting the range of y is an assumption about
    # what was intended - confirm.
    print(f"浓度范围: {y.min():.3f} - {y.max():.3f}")

    # Run the SiPLS parameter search.
    result = sipls_feature_selection(
        X, y,
        n_intervals_list=[10, 15],
        n_combinations_list=[2, 3],
        max_components=10,
        cv_folds=5
    )

    if result:
        print(f"\n选择的波长索引: {result['selected_wavelengths'][:10]}...")  # first 10 only

        # Plot the result.
        fig = plot_sipls_results(X, result['selected_wavelengths'])
        plt.show()