Files
micro_plastic/classification_model/Preprocessing/Preprocessing.py
2026-03-05 17:12:01 +08:00

158 lines
5.5 KiB
Python

import numpy as np
from scipy import signal
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
import pywt
from copy import deepcopy
import joblib # 用于保存和加载模型
# Min-max normalization
def MMS(input_spectrum):
    """Rescale ``input_spectrum`` with a freshly fitted ``MinMaxScaler``.

    A new scikit-learn ``MinMaxScaler`` with default settings is fitted on
    the data passed in and immediately used to transform that same data.
    The fitted scaler is discarded afterwards.

    Args:
        input_spectrum: 2-D array-like accepted by ``MinMaxScaler``.

    Returns:
        The transformed data as returned by ``fit_transform``.
    """
    scaler = MinMaxScaler()
    return scaler.fit_transform(input_spectrum)
# Standardization
def SS(input_spectrum, save_path=None):
    """Standardize ``input_spectrum`` with a freshly fitted ``StandardScaler``.

    Args:
        input_spectrum: 2-D array-like accepted by ``StandardScaler``.
        save_path: Optional destination for the fitted scaler. When truthy,
            the scaler object is written there with ``joblib.dump`` and a
            confirmation line is printed.

    Returns:
        The transformed data as returned by ``fit_transform``.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(input_spectrum)

    # Nothing to persist: hand back the result straight away.
    if not save_path:
        return standardized

    # Persist the fitted scaler so the same parameters can be reloaded later.
    joblib.dump(scaler, save_path)
    print(f"Scaler parameters saved to {save_path}")
    return standardized
# Mean centering
def CT(input_spectrum):
    """Subtract from every spectrum (row) its own mean value.

    The input is copied, so the caller's data is never modified. Integer
    (and other non-floating) input is promoted to ``float64`` before the
    subtraction so the centered values are not truncated; floating-point
    input keeps its dtype.

    Args:
        input_spectrum: 2-D array-like of shape (n_spectra, n_points).

    Returns:
        numpy.ndarray of the same shape whose rows each have zero mean.
    """
    centered = np.array(input_spectrum)  # always a fresh copy
    if not np.issubdtype(centered.dtype, np.inexact):
        # Writing float means back into an integer array would truncate them.
        centered = centered.astype(np.float64)
    centered -= centered.mean(axis=1, keepdims=True)
    return centered
# Standard normal variate (SNV) transform
def SNV(input_spectrum):
    """Center and scale every row of a DataFrame by its own mean and std.

    For each row the row mean is subtracted and the result is divided by the
    row standard deviation as computed by pandas' ``DataFrame.std`` with its
    default settings. A standard deviation that is exactly 0 is replaced by 1
    so constant rows become all zeros instead of dividing by zero.

    Args:
        input_spectrum: ``pandas.DataFrame`` with one spectrum per row.

    Returns:
        ``pandas.DataFrame`` with the same index and columns.

    Raises:
        ValueError: If ``input_spectrum`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(input_spectrum, pd.DataFrame):
        raise ValueError("Input spectrum must be a Pandas DataFrame")

    row_mean = input_spectrum.mean(axis=1)
    # Guard against division by zero for flat rows.
    row_std = input_spectrum.std(axis=1).replace(0, 1)

    centered = input_spectrum.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)
# Moving-average smoothing
def MA(input_spectrum, WSZ=11):
    """Smooth every spectrum (row) with a centered moving average.

    Interior points are the mean of the ``WSZ`` samples centered on them.
    Near both ends, where a full window does not fit, progressively smaller
    odd windows (1, 3, ..., ``WSZ - 2`` samples) are used, so the output has
    the same length as the input.

    The input is copied; non-floating input is promoted to ``float64`` so
    the averages are not truncated when stored.

    Args:
        input_spectrum: 2-D array-like of shape (n_spectra, n_points).
        WSZ: Window size; must be a positive odd integer no larger than
            ``n_points``.

    Returns:
        numpy.ndarray with the same shape as the input.

    Raises:
        ValueError: If ``WSZ`` is not a positive odd integer, or a spectrum
            is shorter than ``WSZ``.
    """
    if WSZ < 1 or WSZ % 2 == 0:
        raise ValueError(f"WSZ must be a positive odd integer, got {WSZ}")

    smoothed = np.array(input_spectrum)  # always a fresh copy
    if not np.issubdtype(smoothed.dtype, np.inexact):
        # Storing averages in an integer array would truncate them.
        smoothed = smoothed.astype(np.float64)

    n_rows, n_points = smoothed.shape
    if n_rows and n_points < WSZ:
        raise ValueError(
            f"each spectrum needs at least WSZ={WSZ} points, got {n_points}"
        )

    # Loop invariants: the box kernel and the sizes of the shrinking edge windows.
    kernel = np.ones(WSZ, dtype=int)
    edge_windows = np.arange(1, WSZ - 1, 2)

    for i in range(n_rows):
        row = smoothed[i]
        core = np.convolve(row, kernel, 'valid') / WSZ
        # Every second cumulative sum is the sum of an odd-sized edge window.
        head = np.cumsum(row[:WSZ - 1])[::2] / edge_windows
        tail = (np.cumsum(row[:-WSZ:-1])[::2] / edge_windows)[::-1]
        smoothed[i] = np.concatenate((head, core, tail))
    return smoothed
# Savitzky-Golay smoothing filter
def SG(input_spectrum, w=15, p=2):
    """Apply ``scipy.signal.savgol_filter`` to ``input_spectrum``.

    All other filter options (axis, derivative order, edge mode, ...) are
    left at SciPy's defaults.

    Args:
        input_spectrum: Array-like of spectra to smooth.
        w: Window length passed to the filter.
        p: Polynomial order passed to the filter.

    Returns:
        The filtered data as returned by ``savgol_filter``.
    """
    return signal.savgol_filter(input_spectrum, window_length=w, polyorder=p)
# First derivative (first-order difference)
def D1(input_spectrum):
    """Return the first-order difference of every spectrum (row).

    Args:
        input_spectrum: 2-D ``numpy.ndarray`` of shape (n, p).

    Returns:
        ``float64`` ``numpy.ndarray`` of shape (n, p - 1) where element
        ``[i, j]`` equals ``input_spectrum[i, j + 1] - input_spectrum[i, j]``.
    """
    n_rows, n_cols = input_spectrum.shape
    # Pre-allocate a float64 result, one column shorter than the input.
    derivative = np.empty((n_rows, n_cols - 1))
    derivative[...] = np.diff(input_spectrum, axis=1)
    return derivative
# Second derivative (second-order difference)
def D2(input_spectrum):
    """Return the second-order difference of every spectrum (row).

    The first-order difference along the columns is taken twice. After each
    pass the leading column, which pandas fills with NaN, is removed, so the
    result is two columns narrower than the input.

    Args:
        input_spectrum: 2-D array-like of shape (n, p).

    Returns:
        ``numpy.ndarray`` of shape (n, p - 2).
    """
    current = input_spectrum
    for _ in range(2):
        stepwise = pd.DataFrame(current).diff(axis=1)
        # Column 0 has no left neighbour and is therefore NaN: drop it.
        current = np.delete(stepwise.values, 0, axis=1)
    return current
# Detrending (linear baseline removal)
def DT(input_spectrum):
    """Remove the least-squares straight line from every spectrum (row).

    For each row a line is fitted against the sample index and subtracted,
    leaving the residual. The computation is done in ``float64`` on a copy of
    the data, so the caller's array is never modified and integer input is
    not truncated. The fit and subtraction for all rows are delegated to
    ``scipy.signal.detrend``.

    Args:
        input_spectrum: 2-D array-like of shape (n_spectra, n_points).

    Returns:
        ``float64`` ``numpy.ndarray`` of the same shape containing the
        detrended spectra.
    """
    spectra = np.asarray(input_spectrum, dtype=np.float64)
    if spectra.shape[0] == 0:
        # No spectra to fit; return an independent empty array.
        return spectra.copy()
    # type='linear' fits and subtracts slope + intercept along axis 1;
    # detrend works on its own copy (overwrite_data defaults to False).
    return signal.detrend(spectra, axis=1, type='linear')
# Multiplicative scatter correction (MSC)
def MSC(input_spectrum):
    """Correct every spectrum against the mean spectrum of the data set.

    The column-wise mean of ``input_spectrum`` serves as the reference. Each
    row is regressed on that reference with ``LinearRegression``
    (``row ~ slope * reference + intercept``) and then replaced by
    ``(row - intercept) / slope``.

    Args:
        input_spectrum: 2-D ``numpy.ndarray`` of shape (n, p).

    Returns:
        ``float64`` ``numpy.ndarray`` of shape (n, p).
    """
    n_samples, n_points = input_spectrum.shape
    corrected = np.ones((n_samples, n_points))

    # Reference spectrum as a single-feature design matrix of shape (p, 1).
    reference = np.mean(input_spectrum, axis=0).reshape(-1, 1)

    for idx in range(n_samples):
        spectrum = input_spectrum[idx, :]
        model = LinearRegression()
        model.fit(reference, spectrum.reshape(-1, 1))
        # NOTE(review): a fitted slope of 0 makes this a division by zero;
        # confirm whether such spectra can occur in practice.
        corrected[idx, :] = (spectrum - model.intercept_) / model.coef_
    return corrected
# Wavelet-threshold denoising
def wave(input_spectrum):
    """Denoise every spectrum (row) by thresholding its 'db8' wavelet bands.

    Each row is decomposed with ``pywt.wavedec`` to the maximum level
    reported by ``pywt.dwt_max_level``. Every coefficient band except the
    first one is passed through ``pywt.threshold`` (default mode) with a
    threshold of 0.04 times the largest coefficient of that band, and the
    row is then rebuilt with ``pywt.waverec``.

    Args:
        input_spectrum: 2-D ``numpy.ndarray`` with one spectrum per row.

    Returns:
        ``None`` when there are no rows, the single reconstructed 1-D row
        when there is exactly one row, otherwise a 2-D array of the
        reconstructed rows stacked vertically.
    """
    wavelet_name = 'db8'
    relative_threshold = 0.04

    def denoise_row(row):
        """Decompose one row, threshold its detail bands and rebuild it."""
        wavelet = pywt.Wavelet(wavelet_name)
        depth = pywt.dwt_max_level(len(row), wavelet.dec_len)
        bands = pywt.wavedec(row, wavelet_name, level=depth)
        # bands[0] is skipped and stays untouched.
        for k in range(1, len(bands)):
            # NOTE(review): uses the signed maximum, not the maximum absolute
            # value, so an all-negative band yields a negative threshold;
            # confirm this is intended.
            bands[k] = pywt.threshold(bands[k], relative_threshold * max(bands[k]))
        return pywt.waverec(bands, wavelet_name)

    denoised_rows = [
        denoise_row(input_spectrum[i]) for i in range(input_spectrum.shape[0])
    ]
    if not denoised_rows:
        return None
    if len(denoised_rows) == 1:
        # NOTE(review): a single spectrum comes back 1-D, not (1, p).
        return denoised_rows[0]
    return np.vstack(denoised_rows)
# Generic preprocessing entry point
def Preprocessing(method, input_spectrum,
                  scaler_save_path=r'D:\plastic\plastic\modelsave\240model\new\0303\scaler_params.pkl'):
    """Apply the preprocessing step selected by ``method``.

    Supported ``method`` values: ``"None"``, ``'MMS'``, ``'SS'``, ``'CT'``,
    ``'SNV'``, ``'MA'``, ``'SG'``, ``'MSC'``, ``'D1'``, ``'D2'``, ``'DT'``
    and ``'WAVE'`` (the historical spelling ``'WVAE'`` is still accepted).

    Args:
        method: Name of the preprocessing step.
        input_spectrum: Spectra as a ``numpy.ndarray`` (wrapped in a
            ``pandas.DataFrame`` internally) or a ``pandas.DataFrame``.
        scaler_save_path: Where the ``'SS'`` step stores its fitted scaler.
            Defaults to the path that used to be hard-coded here; pass a
            different path (or ``None`` to skip saving) on other machines.

    Returns:
        ``"None"`` returns the (possibly wrapped) DataFrame unchanged and
        ``'SNV'`` is called with the DataFrame itself; every other method is
        called with the underlying ``.values`` array and its result is
        returned as-is. For an unknown ``method`` a message is printed and
        the unprocessed ``.values`` array is returned.
    """
    if isinstance(input_spectrum, np.ndarray):
        input_spectrum = pd.DataFrame(input_spectrum)

    if method == "None":
        return input_spectrum

    # Lazy dispatch table: only the selected entry is ever evaluated.
    handlers = {
        'MMS': lambda: MMS(input_spectrum.values),
        'SS': lambda: SS(input_spectrum.values, scaler_save_path),
        'CT': lambda: CT(input_spectrum.values),
        'SNV': lambda: SNV(input_spectrum),
        'MA': lambda: MA(input_spectrum.values),
        'SG': lambda: SG(input_spectrum.values),
        'MSC': lambda: MSC(input_spectrum.values),
        'D1': lambda: D1(input_spectrum.values),
        'D2': lambda: D2(input_spectrum.values),
        'DT': lambda: DT(input_spectrum.values),
        'WAVE': lambda: wave(input_spectrum.values),
        # Misspelled key kept so existing callers keep working.
        'WVAE': lambda: wave(input_spectrum.values),
    }

    handler = handlers.get(method)
    if handler is None:
        print("No such method of preprocessing!")
        return input_spectrum.values
    return handler()