
1. 去掉多余的引用包,以减小打包后的程序大小 → 结论:不能减小打包后的大小; 2. 在原始csv文件中2个光谱仪的位置2、4、6实际上都是测的地物(向下),所以分割后都改为对应的实际位置:P2、P3、P4;
127 lines
4.9 KiB
Python
127 lines
4.9 KiB
Python
import pandas as pd
|
||
import os
|
||
import time
|
||
import argparse
|
||
|
||
|
||
class DataSplit:
    """Split raw two-spectrometer CSV files into three per-position CSV files.

    Each input CSV holds six recorded measurement positions per spectrometer.
    Because the sky is re-measured before every ground target, positions
    1/3/5 are all sky measurements (renamed P1) and positions 2/4/6 are the
    actual ground targets (renamed P2/P3/P4).  The rows are regrouped into
    three output files mirroring the input directory tree under
    ``outputPath/split1``, ``split2`` and ``split3``.
    """

    def __init__(self, inputDataPath, outputDataPath):
        self.inputPath = inputDataPath    # root folder containing one sub-folder per batch of CSVs
        self.outputPath = outputDataPath  # root folder for the split1/split2/split3 output trees

    def create_directory(self, directory):
        """Create *directory*, including any missing parent folders.

        E.g. creating /A/B/C/D succeeds even if B and C do not exist yet.

        :param directory: path of the directory to create
        :return: None
        """
        # exist_ok=True avoids the check-then-create race of the previous
        # `if not os.path.exists(...)` pattern and is a no-op if it exists.
        os.makedirs(directory, exist_ok=True)

    def read_data(self, file_path):
        """Read one raw CSV and patch its metadata to suit the split files.

        The file has a ragged layout, so it is first read with a generous
        column count and then trimmed down to the real width.

        :param file_path: path of the raw CSV file
        :return: trimmed and patched ``pandas.DataFrame``
        """
        guess_column_number = 10000

        df = pd.read_csv(file_path, header=None, sep=',', names=range(guess_column_number))
        # Cells [2, 6] and [4, 6] hold the two spectrometers' sample counts;
        # the larger one plus 3 metadata columns is the true file width.
        # NOTE(review): assumes these cells parse as numbers — confirm.
        correct_column_number = max(df.iloc[2, 6], df.iloc[4, 6]) + 3

        # Drop the surplus (all-NaN) columns; inplace=True modifies df
        # directly instead of creating a copy.
        df.drop(list(range(int(correct_column_number), guess_column_number)), axis=1,
                inplace=True)
        # Alternative: re-read the file with the correct column count:
        # df = pd.read_csv(file_path, header=None, sep=',', names=range(int(correct_column_number)))

        # After splitting, each output file contains 2 positions.
        df.iloc[1, 3] = 2

        # Requested change: acquisition took too long, so the sky is now
        # re-measured before every ground target.  For each spectrometer the
        # 6 recorded positions therefore map as follows:
        #   positions 1, 3, 5: sky measurements (fiber position 1) -> all P1
        #   positions 2, 4, 6: ground targets -> actual positions P2, P3, P4
        # Rows 9-12 carry spectrometer 1's labels, rows 15-18 spectrometer 2's.
        df.iloc[9, 0] = df.iloc[9, 0].replace("3", "1")
        df.iloc[10, 0] = df.iloc[10, 0].replace("4", "3")

        df.iloc[11, 0] = df.iloc[11, 0].replace("5", "1")
        df.iloc[12, 0] = df.iloc[12, 0].replace("6", "4")

        df.iloc[15, 0] = df.iloc[15, 0].replace("3", "1")
        df.iloc[16, 0] = df.iloc[16, 0].replace("4", "3")

        df.iloc[17, 0] = df.iloc[17, 0].replace("5", "1")
        df.iloc[18, 0] = df.iloc[18, 0].replace("6", "4")

        return df

    def split_data(self, df):
        """Distribute the patched DataFrame's rows into three new DataFrames.

        Rows 0-6 (shared header/metadata) are copied into every output; the
        per-position data rows are distributed one position per output.

        :param df: DataFrame returned by :meth:`read_data`
        :return: tuple ``(df1, df2, df3)`` of the three split DataFrames
        """
        df1 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14]]
        df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 9, 10, 15, 16]]
        df3 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 11, 12, 17, 18]]

        return df1, df2, df3

    def write_data(self, df, file_path):
        """Write one split DataFrame to *file_path* with no index or header.

        Known quirk: some integers are written with a trailing ``.0``
        (e.g. 77777 -> 77777.0), so the downstream C# (easysif) reader must
        parse them as double before converting to int.  Writing through
        ``numpy.tofile`` shows the same behaviour — pandas is built on
        numpy, so both share the problem.

        :param df: DataFrame to write
        :param file_path: destination CSV path
        :return: None
        """
        df.to_csv(file_path, index=False, header=False)

    def start_split_process(self):
        """Main driver: split every ``.csv`` found under ``inputPath``.

        Mirrors the input's one-level directory structure under
        ``outputPath/split1..3`` and prints the elapsed time.

        :return: 0 on completion
        """
        time_start = time.time()  # record the start time

        directories = os.listdir(self.inputPath)
        # NOTE(review): populated nowhere; kept only for compatibility.
        self.validFiles = []

        for directory in directories:
            # Skip stray files at the top level — only batch folders are
            # expected; without this guard the inner listdir would crash.
            if not os.path.isdir(os.path.join(self.inputPath, directory)):
                continue

            directory1_tmp = os.path.join(self.outputPath, "split1", directory)
            directory2_tmp = os.path.join(self.outputPath, "split2", directory)
            directory3_tmp = os.path.join(self.outputPath, "split3", directory)
            self.create_directory(directory1_tmp)
            self.create_directory(directory2_tmp)
            self.create_directory(directory3_tmp)

            files = os.listdir(os.path.join(self.inputPath, directory))
            for file in files:
                file_path = os.path.join(self.inputPath, directory, file)
                # Only CSV files are split; everything else is ignored.
                if os.path.splitext(file_path)[1] != '.csv':
                    continue

                df = self.read_data(file_path)
                df1, df2, df3 = self.split_data(df)

                file_path_out1 = os.path.join(directory1_tmp, file)
                file_path_out2 = os.path.join(directory2_tmp, file)
                file_path_out3 = os.path.join(directory3_tmp, file)

                self.write_data(df1, file_path_out1)
                self.write_data(df2, file_path_out2)
                self.write_data(df3, file_path_out3)

        time_end = time.time()  # record the end time
        time_sum = time_end - time_start  # elapsed wall-clock time in seconds
        print("处理用时:%d" % time_sum)

        return 0
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the two required paths and run the split.
    cli = argparse.ArgumentParser()
    cli.add_argument("inputDataPath", help="输入数据路径")
    cli.add_argument("outputDataPath", help="输出路径。")
    options = cli.parse_args()

    DataSplit(options.inputDataPath, options.outputDataPath).start_split_process()

    print("completed!!")
|