Files
towerDataSplit/towerDataSplit.py
tangchao0503 ad9f6a1042 更新:张欣欣的修改意见
1. 去掉多余的引用包,以减小打包后的程序大小 → 结论:不能减小打包后的大小;
2. 在原始csv文件中2个光谱仪的位置2、4、6实际上都是测的地物(向下),所以分割后都改为对应的实际位置:P2、P3、P4;
2022-09-08 16:19:53 +08:00

127 lines
4.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import os
import time
import argparse
class DataSplit:
    """Split tower spectrometer CSV files into three smaller CSV files.

    The input root is expected to contain sub-directories of ``.csv`` files.
    Every CSV is read, relabelled and cut into three row subsets that are
    written to ``<output>/split1``, ``<output>/split2`` and
    ``<output>/split3``, mirroring the input sub-directory and file names.

    Row layout used by this class (taken from the row indices in the code):
    rows 0-6 are copied into every output file, rows 7-12 and rows 13-18 are
    the six position rows of the first and second spectrometer.
    """

    # Number of columns assumed when first parsing a file. Rows have
    # different lengths, so the real width is only known after parsing.
    _GUESS_COLUMN_NUMBER = 10000

    # (row, old, new) substitutions applied to the label in column 0.
    # Per the original author's note: CSV positions 1, 3 and 5 all measure
    # fibre position 1 (sky), so they all become P1; positions 2, 4 and 6
    # measure the downward-looking targets and become P2, P3 and P4.
    _POSITION_RELABELS = (
        (9, "3", "1"),
        (10, "4", "3"),
        (11, "5", "1"),
        (12, "6", "4"),
        (15, "3", "1"),
        (16, "4", "3"),
        (17, "5", "1"),
        (18, "6", "4"),
    )

    # Row indices that make up each of the three output files.
    _SPLIT_ROWS = (
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14],
        [0, 1, 2, 3, 4, 5, 6, 9, 10, 15, 16],
        [0, 1, 2, 3, 4, 5, 6, 11, 12, 17, 18],
    )

    def __init__(self, inputDataPath, outputDataPath):
        """Store the input root and the output root.

        :param inputDataPath: directory whose sub-directories hold the CSVs
        :param outputDataPath: directory that receives split1/2/3
        """
        self.inputPath = inputDataPath
        self.outputPath = outputDataPath
        # Paths of the CSV files handled by the last start_split_process().
        self.validFiles = []

    def create_directory(self, directory):
        """Create ``directory`` including any missing parent directories.

        Does nothing when the directory already exists.

        :param directory: path of the directory to create
        :return: None
        """
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(directory, exist_ok=True)

    def read_data(self, file_path):
        """Read one CSV file and adapt its contents for the split files.

        All cells are read as text so that values are written back exactly
        as they appear in the input (an integer such as ``77777`` is no
        longer turned into ``77777.0`` by float inference).

        :param file_path: path of the CSV file to read
        :return: DataFrame trimmed to the used columns, with relabelled rows
        :raises ValueError: if neither width cell (row 2 / row 4, column 6)
            holds a number
        """
        df = pd.read_csv(
            file_path,
            header=None,
            sep=',',
            names=range(self._GUESS_COLUMN_NUMBER),
            dtype=str,
        )

        # Column 6 of rows 2 and 4 decides how many columns are kept
        # (presumably the pixel count of each spectrometer - confirm against
        # the file format). Missing cells are ignored instead of poisoning
        # max() with NaN.
        widths = [
            float(cell)
            for cell in (df.iloc[2, 6], df.iloc[4, 6])
            if not pd.isna(cell)
        ]
        if not widths:
            raise ValueError(
                "cannot determine column count of %r: "
                "row 2 / row 4, column 6 are both empty" % (file_path,)
            )
        column_count = int(max(widths) + 3)
        df = df.iloc[:, :column_count].copy()

        # Header field overwritten for the split files. Stored as text to
        # match the all-text frame. NOTE(review): meaning of this field is
        # not visible here - confirm.
        df.iloc[1, 3] = "2"

        for row, old, new in self._POSITION_RELABELS:
            df.iloc[row, 0] = df.iloc[row, 0].replace(old, new)
        return df

    def split_data(self, df):
        """Select the rows of each of the three output files.

        :param df: DataFrame returned by :meth:`read_data`
        :return: tuple ``(df1, df2, df3)`` of row subsets of ``df``
        """
        df1, df2, df3 = (df.iloc[rows] for rows in self._SPLIT_ROWS)
        return df1, df2, df3

    def write_data(self, df, file_path):
        """Write ``df`` to ``file_path`` as CSV without index or header.

        :param df: DataFrame to write
        :param file_path: destination CSV path
        :return: None
        """
        df.to_csv(file_path, index=False, header=False)

    def start_split_process(self):
        """Split every ``.csv`` file found one level below the input root.

        Entries of the input root that are not directories are skipped.
        The processed input paths are recorded in ``self.validFiles`` and
        the elapsed time in seconds is printed.

        :return: 0
        """
        time_start = time.time()  # start of the timed section
        self.validFiles = []

        for directory in sorted(os.listdir(self.inputPath)):
            input_dir = os.path.join(self.inputPath, directory)
            if not os.path.isdir(input_dir):
                # A stray file in the input root has nothing to list.
                continue

            output_dirs = [
                os.path.join(self.outputPath, split_name, directory)
                for split_name in ("split1", "split2", "split3")
            ]
            for output_dir in output_dirs:
                self.create_directory(output_dir)

            for file in sorted(os.listdir(input_dir)):
                file_path = os.path.join(input_dir, file)
                if os.path.splitext(file_path)[1] != '.csv':
                    continue

                parts = self.split_data(self.read_data(file_path))
                for part, output_dir in zip(parts, output_dirs):
                    self.write_data(part, os.path.join(output_dir, file))
                self.validFiles.append(file_path)

        time_sum = time.time() - time_start  # run time in seconds
        print("处理用时:%d" % time_sum)
        return 0
if __name__ == "__main__":
    # Command-line entry point: two positional paths, the input root first
    # and the output root second.
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ("inputDataPath", "输入数据路径"),
        ("outputDataPath", "输出路径。"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    # Run the whole split and report once it has finished.
    DataSplit(options.inputDataPath, options.outputDataPath).start_split_process()
    print("completed!!")