Files
micro_plastic/classification_model/DataLoad/DataLoad.py
2026-02-25 09:42:51 +08:00

221 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
-*- coding: utf-8 -*-
@Time :2022/04/12 17:10
@Author : Pengyou FU
@blogs : https://blog.csdn.net/Echo_Code?spm=1000.2115.3001.5343
@github : https://github.com/FuSiry/OpenSA
@WeChat : Fu_siry
@License : Apache-2.0 license
"""
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# Randomly partition the dataset into train/test subsets.
def random(data, label, test_ratio=0.2, random_state=123):
    """Randomly split (data, label) into train and test sets.

    Thin wrapper around sklearn's ``train_test_split``.

    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_ratio: fraction of samples assigned to the test set, default: 0.2
    :param random_state: random seed for reproducibility, default: 123
    :return: X_train: (n_samples, n_features)
             X_test:  (n_samples, n_features)
             y_train: (n_samples, )
             y_test:  (n_samples, )
    """
    # Stratified splitting (stratify=label) was considered by the original
    # author but is deliberately disabled.
    split = train_test_split(
        data, label, test_size=test_ratio, random_state=random_state
    )
    return tuple(split)
def spxy(data, label, test_size=0.2):
    """Split a dataset with the SPXY algorithm (sample set partitioning
    based on joint x-y distances).

    Training samples are chosen greedily to be maximally spread out in the
    combined feature/label distance space; the remainder is the test set.

    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_size: the ratio of test_size, default: 0.2
    :return: X_train :(n_samples, n_features)
             X_test: (n_samples, n_features)
             y_train: (n_samples, )
             y_test: (n_samples, )
    """
    # Accept pandas containers but work on plain NumPy arrays.
    data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
    label = label.to_numpy() if isinstance(label, pd.Series) else label

    # Keep the unscaled labels: they are what gets returned to the caller.
    y_backup = label
    M = data.shape[0]
    N = round((1 - test_size) * M)
    samples = np.arange(M)

    # Standardize labels so the y-distance is commensurate with the
    # x-distance.  NOTE(review): if all labels are equal, std is 0 and this
    # produces NaN — same as the original behaviour; confirm label variance.
    label = (label - np.mean(label)) / np.std(label)

    # Pairwise Euclidean distances in feature space and label space,
    # vectorized (the original filled these with O(M^2) Python loops).
    # Only the upper triangle is kept, matching the original storage layout.
    diff = data[:, None, :] - data[None, :, :]
    D = np.triu(np.sqrt((diff ** 2).sum(axis=2)), k=1)
    Dy = np.triu(np.abs(label[:, None] - label[None, :]), k=1)

    # Combine the two distance matrices, each normalized by its maximum.
    D = D / np.max(D) + Dy / np.max(Dy)

    # Seed the training set with the two most distant samples.
    maxD = D.max(axis=0)
    index_row = D.argmax(axis=0)
    index_column = maxD.argmax()
    m = np.zeros(N, dtype=int)
    m[0] = index_row[index_column]
    m[1] = index_column

    # Symmetric view of D so min-distance lookups need no index ordering
    # (the original branched on indexa < indexb for the triangular storage).
    D_sym = D + D.T

    # Greedily add the candidate farthest from the already-selected set.
    for i in range(2, N):
        pool = np.delete(samples, m[:i])
        # For every candidate, distance to its nearest selected sample.
        dmin = D_sym[np.ix_(pool, m[:i])].min(axis=1)
        m[i] = pool[np.argmax(dmin)]

    m_complement = np.delete(samples, m)
    # Partition data and (original) labels by the selected indices.
    X_train = data[m, :]
    y_train = y_backup[m]
    X_test = data[m_complement, :]
    y_test = y_backup[m_complement]
    return X_train, X_test, y_train, y_test
# Split the dataset with the Kennard-Stone algorithm.
def ks(data, label, test_size=0.2):
    """Kennard-Stone split: greedily pick training samples that maximally
    cover the feature space; the remainder is the test set.

    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_size: the ratio of test_size, default: 0.2
    :return: X_train: (n_samples, n_features)
             X_test: (n_samples, n_features)
             y_train: (n_samples, )
             y_test: (n_samples, )
    """
    # Accept pandas containers but work on plain NumPy arrays.
    data = data.to_numpy() if isinstance(data, pd.DataFrame) else data
    label = label.to_numpy() if isinstance(label, pd.Series) else label

    M = data.shape[0]
    N = round((1 - test_size) * M)
    samples = np.arange(M)

    # Upper-triangular pairwise Euclidean distance matrix, vectorized
    # (the original filled it with O(M^2) Python loops).
    diff = data[:, None, :] - data[None, :, :]
    D = np.triu(np.sqrt((diff ** 2).sum(axis=2)), k=1)

    # Seed the training set with the two most distant samples.
    maxD = np.max(D, axis=0)
    index_row = np.argmax(D, axis=0)
    index_column = np.argmax(maxD)
    m = np.zeros(N, dtype=int)
    m[0] = index_row[index_column]
    m[1] = index_column

    # Symmetric view of D so min-distance lookups need no index ordering
    # (the original branched on indexa < indexb for the triangular storage).
    D_sym = D + D.T

    # Greedily add the candidate farthest from the already-selected set.
    for i in range(2, N):
        pool = np.delete(samples, m[:i])
        # For every candidate, distance to its nearest selected sample.
        dmin = D_sym[np.ix_(pool, m[:i])].min(axis=1)
        m[i] = pool[np.argmax(dmin)]

    m_complement = np.delete(samples, m)
    X_train = data[m, :]
    y_train = label[m]
    X_test = data[m_complement, :]
    y_test = label[m_complement]
    return X_train, X_test, y_train, y_test
# Example loaders: one regression and one classification public dataset.
def LoadNirtest(type):
    """Load one of two example NIR datasets from fixed local CSV paths.

    :param type: "Rgs" for the regression dataset, "Cls" for classification
    :return: (data, label) as float64 NumPy arrays
    :raises ValueError: if *type* is neither "Rgs" nor "Cls"
    """
    if type == "Rgs":
        # Regression example: column 0 is the label, columns 2+ are features.
        CDataPath1 = r'G:\UAV\dazhou\20m\新,无条带\output.csv'
        data1 = np.loadtxt(open(CDataPath1, 'rb'), dtype=np.float64, delimiter=',', skiprows=0)
        data = data1[:, 2:]
        label = data1[:, 0]
    elif type == "Cls":
        # Classification example: row 0 is a header; column 0 is the label,
        # columns 1..462 are the spectral features.
        path = r"G:\danzhu_test\rgb_refine\reflence\yellow_green_deepgreen\sum.csv"
        Nirdata = np.loadtxt(open(path, 'rb'), dtype=np.float64, delimiter=',', skiprows=0)
        data = Nirdata[1:, 1:463]
        label = Nirdata[1:, 0]
    else:
        # Fail loudly: the original fell through to `return data, label`
        # and crashed with a confusing UnboundLocalError.
        raise ValueError("unknown dataset type: %r (expected 'Rgs' or 'Cls')" % (type,))
    return data, label
def SetSplit(method, data, label, test_size=0.3, randomseed=123):
    """Dispatch to one of the train/test splitting strategies.

    :param method: the method to split trainset and testset, include:
                   "random", kennard-stone ("ks"), "spxy"
    :param data: shape (n_samples, n_features)
    :param label: shape (n_samples, )
    :param test_size: the ratio of test_size, default: 0.3
    :param randomseed: seed used by the "random" strategy only, default: 123
    :return: X_train: (n_samples, n_features)
             X_test: (n_samples, n_features)
             y_train: (n_samples, )
             y_test: (n_samples, )
    :raises ValueError: if *method* is not one of the supported names
    """
    if method == "random":
        return random(data, label, test_size, randomseed)
    elif method == "spxy":
        return spxy(data, label, test_size)
    elif method == "ks":
        return ks(data, label, test_size)
    # The original printed a message and then crashed with an
    # UnboundLocalError on the return; raise an explicit error instead.
    raise ValueError("no this method of split dataset: %r" % (method,))