Files
towerDataSplit/towerDataSplit.py
tangchao0503 ba8cab6751 第一次提交:用于根河用户towersif原始数据分割
1. 任总:csv中的2个光谱仪分别有6个位置的数据,位置1、3、5实际上都是测的光纤位置1的数据(测天空),所以分割后都改为P1;
2. 添加了界面分别输入原始数据文件夹和输出文件夹,在输出文件夹中新建文件夹split1、split2和split3用于保存分割后的数据;
2022-09-08 15:31:33 +08:00

122 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import pandas as pd
import os
from datetime import datetime
import time
import math
import argparse
import copy
class DataSplit:
    """Split raw tower SIF CSV files into three smaller CSV files each.

    The input root is expected to contain sub-directories of ``.csv`` files.
    For every sub-directory ``D`` and every CSV ``F`` inside it, three files
    are written: ``<output>/split1/D/F``, ``<output>/split2/D/F`` and
    ``<output>/split3/D/F``.

    Per the project notes, each raw file holds two spectrometers with six
    positions each, and positions 1, 3 and 5 are all measurements of fibre
    position 1 (sky).  Each split therefore keeps one (sky, target) pair per
    spectrometer and the sky row is relabelled to position 1.
    """

    def __init__(self, inputDataPath, outputDataPath):
        """Remember where to read raw data from and where to write splits.

        :param inputDataPath: root folder containing one sub-folder per batch
        :param outputDataPath: root folder that receives split1/2/3
        """
        self.inputPath = inputDataPath
        self.outputPath = outputDataPath

    def create_directory(self, directory):
        """Create ``directory`` including any missing parent folders.

        :param directory: path of the folder to create
        :return: None
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)

    def read_data(self, file_path):
        """Read one raw CSV and patch its cells so it suits the split files.

        Every cell is read as text (``dtype=str``) so that values are written
        back exactly as they appeared in the raw file; letting pandas infer
        numeric dtypes turned integers such as ``77777`` into ``77777.0``.

        :param file_path: path of the raw CSV file
        :return: DataFrame of strings, trimmed to the real column count
        :raises ValueError: if neither row 2 nor row 4 holds a number in
            column 6
        """
        # Rows have different lengths, so read with a generous fixed width
        # first and trim afterwards.
        guess_column_number = 10000
        df = pd.read_csv(
            file_path,
            header=None,
            sep=',',
            names=range(guess_column_number),
            dtype=str,
        )

        # Column 6 of rows 2 and 4 holds a per-row value count (presumably one
        # per spectrometer -- TODO confirm); a missing cell is ignored.
        value_counts = []
        for row in (2, 4):
            cell = df.iloc[row, 6]
            if pd.isna(cell):
                continue
            value_counts.append(int(float(cell)))
        if not value_counts:
            raise ValueError(
                "no column count found in rows 2/4, column 6 of %r" % (file_path,)
            )
        # +3: presumably three leading non-data columns -- TODO confirm.
        correct_column_number = max(value_counts) + 3

        # Copy so the edits below act on an independent frame, not a view.
        df = df.iloc[:, :correct_column_number].copy()

        # Patch the header cell at row 1, column 3 to 2 (presumably the number
        # of positions kept per spectrometer -- TODO confirm).  Written as
        # text to match the all-string frame.
        df.iloc[1, 3] = "2"

        # The rows labelled position 3 and 5 are also measurements of
        # position 1 (the sky is sampled more than once), so relabel them.
        # NOTE(review): replace() rewrites every "3"/"5" in the label cell,
        # not just the position digit -- confirm labels contain only one.
        df.iloc[9, 0] = df.iloc[9, 0].replace("3", "1")
        df.iloc[11, 0] = df.iloc[11, 0].replace("5", "1")
        df.iloc[15, 0] = df.iloc[15, 0].replace("3", "1")
        df.iloc[17, 0] = df.iloc[17, 0].replace("5", "1")
        return df

    def split_data(self, df):
        """Pick the rows belonging to each of the three output files.

        Rows 0-6 are copied into every split; each split then takes two
        consecutive rows from the 7-12 range and two from the 13-18 range.

        :param df: DataFrame returned by :meth:`read_data`
        :return: tuple ``(df1, df2, df3)`` of new DataFrames
        """
        shared_rows = [0, 1, 2, 3, 4, 5, 6]
        df1 = df.iloc[shared_rows + [7, 8, 13, 14]]
        df2 = df.iloc[shared_rows + [9, 10, 15, 16]]
        df3 = df.iloc[shared_rows + [11, 12, 17, 18]]
        return df1, df2, df3

    def write_data(self, df, file_path):
        """Write ``df`` to ``file_path`` as CSV without index or header row.

        :param df: DataFrame to write
        :param file_path: destination CSV path
        :return: None
        """
        # Cells are strings (see read_data), so integers are no longer written
        # with a trailing ".0" that the downstream C# reader could not parse
        # directly as int.
        df.to_csv(file_path, index=False, header=False)

    def start_split_process(self):
        """Split every CSV found one level below the input root.

        :return: 0 when processing finishes
        """
        time_start = time.time()  # start of the timed section
        self.validFiles = []
        for directory in os.listdir(self.inputPath):
            source_dir = os.path.join(self.inputPath, directory)
            # Stray files next to the batch folders used to crash listdir()
            # below and left empty output folders behind; skip them.
            if not os.path.isdir(source_dir):
                continue

            target_dirs = [
                os.path.join(self.outputPath, split_name, directory)
                for split_name in ("split1", "split2", "split3")
            ]
            for target_dir in target_dirs:
                self.create_directory(target_dir)

            for file in os.listdir(source_dir):
                # Accept ".CSV" as well as ".csv".
                if os.path.splitext(file)[1].lower() != '.csv':
                    continue
                df = self.read_data(os.path.join(source_dir, file))
                for part, target_dir in zip(self.split_data(df), target_dirs):
                    self.write_data(part, os.path.join(target_dir, file))

        time_end = time.time()  # end of the timed section
        time_sum = time_end - time_start  # elapsed wall-clock seconds
        print("处理用时:%d" % time_sum)
        return 0
if __name__ == "__main__":
    # Command-line entry point: two required positional arguments, the raw
    # data root followed by the output root.
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ("inputDataPath", "输入数据路径"),
        ("outputDataPath", "输出路径。"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    # Run the whole split in one go, then report completion.
    DataSplit(options.inputDataPath, options.outputDataPath).start_split_process()
    print("completed!!")