""" -*- coding: utf-8 -*- @Time :2022/04/12 17:10 @Author : Pengyou FU @blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343 @github : https://github.com/FuSiry/OpenSA @WeChat : Fu_siry @License:Apache-2.0 license """ from sklearn.model_selection import train_test_split import numpy as np import pandas as pd #随机划分数据集 def random(data, label, test_ratio=0.2, random_state=123): """ :param data: shape (n_samples, n_features) :param label: shape (n_sample, ) :param test_size: the ratio of test_size, default: 0.2 :param random_state: the randomseed, default: 123 :return: X_train :(n_samples, n_features) X_test: (n_samples, n_features) y_train: (n_sample, ) y_test: (n_sample, ) """ X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=test_ratio, random_state=random_state)#,stratify=label return X_train, X_test, y_train, y_test def spxy(data, label, test_size=0.2): """ :param data: shape (n_samples, n_features) :param label: shape (n_samples, ) :param test_size: the ratio of test_size, default: 0.2 :return: X_train :(n_samples, n_features) X_test: (n_samples, n_features) y_train: (n_samples, ) y_test: (n_samples, ) """ # 确保 data 和 label 是 NumPy 数组 data = data.to_numpy() if isinstance(data, pd.DataFrame) else data label = label.to_numpy() if isinstance(label, pd.Series) else label # 备份原始数据和标签 x_backup = data y_backup = label M = data.shape[0] N = round((1 - test_size) * M) samples = np.arange(M) # 归一化标签数据 label = (label - np.mean(label)) / np.std(label) D = np.zeros((M, M)) Dy = np.zeros((M, M)) # 计算样本之间的距离 for i in range(M - 1): xa = data[i, :] ya = label[i] for j in range((i + 1), M): xb = data[j, :] yb = label[j] D[i, j] = np.linalg.norm(xa - xb) Dy[i, j] = np.linalg.norm(ya - yb) # 距离归一化 Dmax = np.max(D) Dymax = np.max(Dy) D = D / Dmax + Dy / Dymax # 找到最远的两个点 maxD = D.max(axis=0) index_row = D.argmax(axis=0) index_column = maxD.argmax() m = np.zeros(N, dtype=int) m[0] = index_row[index_column] m[1] = index_column dminmax = np.zeros(N) dminmax[1] = D[m[0], m[1]] # 根据距离选择训练集 for i in range(2, N): pool = np.delete(samples, m[:i]) dmin = np.zeros(M - i) for j in range(M - i): indexa = pool[j] d = np.zeros(i) for k in range(i): indexb = m[k] if indexa < indexb: d[k] = D[indexa, indexb] else: d[k] = D[indexb, indexa] dmin[j] = np.min(d) dminmax[i] = np.max(dmin) index = np.argmax(dmin) m[i] = pool[index] m_complement = np.delete(samples, m) # 划分训练集和测试集 X_train = data[m, :] y_train = y_backup[m] X_test = data[m_complement, :] y_test = y_backup[m_complement] return X_train, X_test, y_train, y_test #利用kennard-stone算法划分数据集 def ks(data, label, test_size=0.2): """ :param data: shape (n_samples, n_features) :param label: shape (n_sample, ) :param test_size: the ratio of test_size, default: 0.2 :return: X_train: (n_samples, n_features) X_test: (n_samples, n_features) y_train: (n_samples, ) y_test: (n_samples, ) """ # 确保 data 和 label 是 NumPy 数组 data = data.to_numpy() if isinstance(data, pd.DataFrame) else data label = label.to_numpy() if isinstance(label, pd.Series) else label M = data.shape[0] N = round((1 - test_size) * M) samples = np.arange(M) D = np.zeros((M, M)) for i in range((M - 1)): xa = data[i, :] for j in range((i + 1), M): xb = data[j, :] D[i, j] = np.linalg.norm(xa - xb) maxD = np.max(D, axis=0) index_row = np.argmax(D, axis=0) index_column = np.argmax(maxD) m = np.zeros(N) m[0] = np.array(index_row[index_column]) m[1] = np.array(index_column) m = m.astype(int) dminmax = np.zeros(N) dminmax[1] = D[m[0], m[1]] for i in range(2, N): pool = np.delete(samples, m[:i]) 
dmin = np.zeros((M - i)) for j in range((M - i)): indexa = pool[j] d = np.zeros(i) for k in range(i): indexb = m[k] if indexa < indexb: d[k] = D[indexa, indexb] else: d[k] = D[indexb, indexa] dmin[j] = np.min(d) dminmax[i] = np.max(dmin) index = np.argmax(dmin) m[i] = pool[index] m_complement = np.delete(np.arange(data.shape[0]), m) X_train = data[m, :] y_train = label[m] X_test = data[m_complement, :] y_test = label[m_complement] return X_train, X_test, y_train, y_test # 分别使用一个回归、一个分类的公开数据集做为example def LoadNirtest(type): if type == "Rgs": CDataPath1 = r'G:\UAV\dazhou\20m\新,无条带\output.csv' data1 = np.loadtxt(open(CDataPath1, 'rb'), dtype=np.float64, delimiter=',', skiprows=0) data = data1[:, 2:] label = data1[:, 0] elif type == "Cls": path = r"G:\danzhu_test\rgb_refine\reflence\yellow_green_deepgreen\sum.csv" Nirdata = np.loadtxt(open(path, 'rb'), dtype=np.float64, delimiter=',', skiprows=0) data = Nirdata[1:, 1:463] label = Nirdata[1:,0] return data, label def SetSplit(method, data, label, test_size=0.3, randomseed=123): """ :param method: the method to split trainset and testset, include: random, kennard-stone(ks), spxy :param data: shape (n_samples, n_features) :param label: shape (n_sample, ) :param test_size: the ratio of test_size, default: 0.2 :return: X_train: (n_samples, n_features) X_test: (n_samples, n_features) y_train: (n_sample, ) y_test: (n_sample, ) """ if method == "random": X_train, X_test, y_train, y_test = random(data, label, test_size, randomseed) elif method == "spxy": X_train, X_test, y_train, y_test = spxy(data, label, test_size) elif method == "ks": X_train, X_test, y_train, y_test = ks(data, label, test_size) else: print("no this method of split dataset! ") return X_train, X_test, y_train, y_test
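

# Minimal usage sketch (an illustrative addition, not part of the original
# module): it exercises all three split methods on synthetic data rather
# than the machine-specific CSV paths in LoadNirtest, so it should run
# anywhere. The array shapes and the seed below are arbitrary assumptions.
if __name__ == "__main__":
    rng = np.random.default_rng(123)
    X = rng.normal(size=(50, 10))  # 50 synthetic "spectra" with 10 bands
    y = rng.normal(size=50)        # synthetic regression targets

    for method in ("random", "ks", "spxy"):
        X_train, X_test, y_train, y_test = SetSplit(method, X, y, test_size=0.3)
        print(f"{method}: train {X_train.shape}, test {X_test.shape}")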