初始提交
This commit is contained in:
220
classification_model/DataLoad/DataLoad.py
Normal file
220
classification_model/DataLoad/DataLoad.py
Normal file
@ -0,0 +1,220 @@
|
||||
"""
|
||||
-*- coding: utf-8 -*-
|
||||
@Time :2022/04/12 17:10
|
||||
@Author : Pengyou FU
|
||||
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
|
||||
@github : https://github.com/FuSiry/OpenSA
|
||||
@WeChat : Fu_siry
|
||||
@License:Apache-2.0 license
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# Randomly split a dataset into train/test subsets (thin wrapper around
# sklearn's train_test_split).
# NOTE(review): the function name shadows Python's builtin `random` module;
# kept unchanged for backward compatibility with existing callers.
def random(data, label, test_ratio=0.2, random_state=123):
    """Randomly split (data, label) into train and test sets.

    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_ratio: the ratio of the test set size, default: 0.2
    :param random_state: the random seed, default: 123
    :return: X_train: (n_samples, n_features)
             X_test:  (n_samples, n_features)
             y_train: (n_samples, )
             y_test:  (n_samples, )
    """
    # stratify=label could be passed to keep class ratios, but the original
    # behavior (plain shuffled split) is preserved here.
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=test_ratio, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
|
||||
|
||||
def spxy(data, label, test_size=0.2):
    """Split a dataset with the SPXY algorithm (sample set partitioning
    based on joint x-y distances).

    Training samples are chosen greedily so that they are maximally spread
    out in the joint (feature, label) distance space; the remaining samples
    form the test set.

    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_size: the ratio of the test set size, default: 0.2
    :return: X_train: (n_samples, n_features)
             X_test:  (n_samples, n_features)
             y_train: (n_samples, )
             y_test:  (n_samples, )
    """
    # Accept pandas containers as well as plain NumPy arrays.
    data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
    label = label.to_numpy() if isinstance(label, pd.Series) else label

    # Keep the un-normalized labels: the returned y values must be the
    # original ones, not the standardized copy used only for distances.
    y_backup = label

    M = data.shape[0]
    N = round((1 - test_size) * M)
    samples = np.arange(M)

    # Standardize labels so x- and y-distances are comparable in scale.
    # Guard the degenerate constant-label case, where std == 0 would turn
    # every label (and hence Dy) into NaN.
    label_std = np.std(label)
    if label_std > 0:
        label = (label - np.mean(label)) / label_std
    else:
        label = label - np.mean(label)

    # Pairwise Euclidean distances, computed vectorized and stored
    # upper-triangular (entries with i < j only) to match the lookup
    # convention used by the selection loop below.
    diff = data[:, None, :] - data[None, :, :]
    D = np.triu(np.sqrt((diff ** 2).sum(axis=2)), k=1)
    Dy = np.triu(np.abs(label[:, None] - label[None, :]), k=1)

    # Joint normalized distance: equal weight to x-space and y-space.
    Dmax = np.max(D)
    Dymax = np.max(Dy)
    D = D / Dmax + Dy / Dymax

    # Seed the training set with the two mutually farthest samples.
    maxD = D.max(axis=0)
    index_row = D.argmax(axis=0)
    index_column = maxD.argmax()

    m = np.zeros(N, dtype=int)
    m[0] = index_row[index_column]
    m[1] = index_column

    dminmax = np.zeros(N)
    dminmax[1] = D[m[0], m[1]]

    # Greedily add the candidate whose minimum distance to the already
    # selected samples is largest (max-min criterion).
    for i in range(2, N):
        pool = np.delete(samples, m[:i])
        dmin = np.zeros(M - i)
        for j in range(M - i):
            indexa = pool[j]
            d = np.zeros(i)
            for k in range(i):
                indexb = m[k]
                # D is upper-triangular: index with the smaller row first.
                if indexa < indexb:
                    d[k] = D[indexa, indexb]
                else:
                    d[k] = D[indexb, indexa]
            dmin[j] = np.min(d)
        dminmax[i] = np.max(dmin)
        index = np.argmax(dmin)
        m[i] = pool[index]

    m_complement = np.delete(samples, m)

    # Partition into training (selected) and test (complement) sets.
    X_train = data[m, :]
    y_train = y_backup[m]
    X_test = data[m_complement, :]
    y_test = y_backup[m_complement]

    return X_train, X_test, y_train, y_test
|
||||
# Split a dataset with the Kennard-Stone algorithm.
def ks(data, label, test_size=0.2):
    """Split a dataset with the Kennard-Stone (KS) algorithm.

    Training samples are chosen greedily so that they are maximally spread
    out in feature space (max-min Euclidean distance criterion); the
    remaining samples form the test set.  Labels are not used for
    selection, only carried along.

    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_size: the ratio of the test set size, default: 0.2
    :return: X_train: (n_samples, n_features)
             X_test:  (n_samples, n_features)
             y_train: (n_samples, )
             y_test:  (n_samples, )
    """
    # Accept pandas containers as well as plain NumPy arrays.
    data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
    label = label.to_numpy() if isinstance(label, pd.Series) else label

    M = data.shape[0]
    N = round((1 - test_size) * M)
    samples = np.arange(M)

    # Pairwise Euclidean distances, computed vectorized and stored
    # upper-triangular (entries with i < j only) to match the lookup
    # convention used by the selection loop below.
    diff = data[:, None, :] - data[None, :, :]
    D = np.triu(np.sqrt((diff ** 2).sum(axis=2)), k=1)

    # Seed the training set with the two mutually farthest samples.
    maxD = np.max(D, axis=0)
    index_row = np.argmax(D, axis=0)
    index_column = np.argmax(maxD)

    m = np.zeros(N, dtype=int)
    m[0] = index_row[index_column]
    m[1] = index_column

    dminmax = np.zeros(N)
    dminmax[1] = D[m[0], m[1]]

    # Greedily add the candidate whose minimum distance to the already
    # selected samples is largest (max-min criterion).
    for i in range(2, N):
        pool = np.delete(samples, m[:i])
        dmin = np.zeros(M - i)
        for j in range(M - i):
            indexa = pool[j]
            d = np.zeros(i)
            for k in range(i):
                indexb = m[k]
                # D is upper-triangular: index with the smaller row first.
                if indexa < indexb:
                    d[k] = D[indexa, indexb]
                else:
                    d[k] = D[indexb, indexa]
            dmin[j] = np.min(d)
        dminmax[i] = np.max(dmin)
        index = np.argmax(dmin)
        m[i] = pool[index]

    m_complement = np.delete(samples, m)

    # Partition into training (selected) and test (complement) sets.
    X_train = data[m, :]
    y_train = label[m]
    X_test = data[m_complement, :]
    y_test = label[m_complement]

    return X_train, X_test, y_train, y_test
|
||||
|
||||
# Load one of two example datasets (one regression, one classification)
# from CSV files.
# NOTE(review): the parameter name `type` shadows the builtin; kept
# unchanged for backward compatibility with existing callers.
def LoadNirtest(type, path=None):
    """Load spectra and labels from a CSV file.

    :param type: "Rgs" for the regression dataset, "Cls" for classification
    :param path: optional CSV path overriding the hard-coded default
    :return: (data, label) as numpy float64 arrays
    :raises ValueError: if `type` is neither "Rgs" nor "Cls"
    """
    if type == "Rgs":
        rgs_path = path or r'G:\UAV\dazhou\20m\新,无条带\output.csv'
        # `with` ensures the file handle is closed (the original leaked it).
        with open(rgs_path, 'rb') as f:
            data1 = np.loadtxt(f, dtype=np.float64, delimiter=',', skiprows=0)
        # Column 0 is the target; features start at column 2.
        data = data1[:, 2:]
        label = data1[:, 0]
    elif type == "Cls":
        cls_path = path or r"G:\danzhu_test\rgb_refine\reflence\yellow_green_deepgreen\sum.csv"
        with open(cls_path, 'rb') as f:
            Nirdata = np.loadtxt(f, dtype=np.float64, delimiter=',', skiprows=0)
        # Row 0 is skipped (presumably a header — confirm against the data
        # file); columns 1..462 are features, column 0 the class label.
        data = Nirdata[1:, 1:463]
        label = Nirdata[1:, 0]
    else:
        # The original fell through to an UnboundLocalError; fail clearly.
        raise ValueError(f"unknown dataset type: {type!r} (expected 'Rgs' or 'Cls')")

    return data, label
|
||||
|
||||
def SetSplit(method, data, label, test_size=0.3, randomseed=123):
    """Split (data, label) into train/test sets with the chosen method.

    :param method: splitting method: "random", "ks" (Kennard-Stone) or "spxy"
    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_size: the ratio of the test set size, default: 0.3
    :param randomseed: random seed, only used by the "random" method
    :return: X_train: (n_samples, n_features)
             X_test:  (n_samples, n_features)
             y_train: (n_samples, )
             y_test:  (n_samples, )
    :raises ValueError: if `method` is not one of the supported names
    """
    if method == "random":
        return random(data, label, test_size, randomseed)
    elif method == "spxy":
        return spxy(data, label, test_size)
    elif method == "ks":
        return ks(data, label, test_size)
    # The original printed a message and then crashed with an
    # UnboundLocalError on the return; raise a clear error instead.
    raise ValueError(f"unknown split method: {method!r} (expected 'random', 'ks' or 'spxy')")
|
||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user