import argparse
import copy
import math
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd


class DataSplit:
    """Split every CSV under an input tree into three smaller CSV files.

    The input root is expected to contain sub-directories holding ``.csv``
    files.  For each CSV, three files with the same name are written to
    ``<outputPath>/split1/<subdir>/``, ``<outputPath>/split2/<subdir>/`` and
    ``<outputPath>/split3/<subdir>/``.  Each output contains the seven shared
    leading rows plus four rows specific to that split.
    """

    def __init__(self, inputDataPath, outputDataPath):
        """Store the input root and the output root directories."""
        self.inputPath = inputDataPath
        self.outputPath = outputDataPath

    def create_directory(self, directory: str) -> None:
        """Create ``directory`` together with any missing parent directories.

        Nested paths such as ``/A/B/C/D`` are created in one call even when
        ``B`` and ``C`` do not exist yet.  An already existing directory is
        left untouched.

        :param directory: path of the directory to create
        :return: None
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(directory, exist_ok=True)

    def read_data(self, file_path: str) -> pd.DataFrame:
        """Read one CSV file and patch its header cells for the split files.

        The file is read without a header row and with a generous column
        guess, because its rows may have different lengths.  The real column
        count is taken from the larger of the cells at (row 2, column 6) and
        (row 4, column 6), plus 3; all columns beyond that are discarded.

        Every cell is read as text so that values are written back exactly
        as they appeared in the input (for example ``77777`` stays ``77777``
        instead of turning into ``77777.0``).

        :param file_path: path of the CSV file to read
        :return: the trimmed and patched data frame
        :raises ValueError: if neither column-count cell holds a number
        :raises IndexError: if the file has fewer rows than the cells
            accessed here (rows 0-17 are touched)
        """
        guess_column_number = 10000
        df = pd.read_csv(file_path, header=None, sep=',',
                         names=range(guess_column_number), dtype=str)

        # The cells are text now, so convert them before comparing.
        # Series.max() skips NaN, making the result independent of which of
        # the two cells happens to be empty.
        counts = pd.to_numeric(df.iloc[[2, 4], 6], errors="coerce")
        if counts.isna().all():
            raise ValueError(
                "cannot determine the column count of %r: cells (2, 6) and "
                "(4, 6) are not numeric" % (file_path,)
            )
        correct_column_number = int(counts.max() + 3)

        # Keep only the real columns; copy() so the edits below act on an
        # independent frame.
        df = df.iloc[:, :correct_column_number].copy()

        # Patch the header information.
        # Stored as text because every column holds text (see dtype=str).
        df.iloc[1, 3] = "2"
        # Original author's note (attributed to "Mr. Ren"): the data shown as
        # P3 was also collected at position 1; the acquisition was considered
        # too long, so the sky was meant to be sampled twice.  Hence the
        # labels in column 0 of these rows are rewritten to "1".
        for row, old_label in ((9, "3"), (11, "5"), (15, "3"), (17, "5")):
            df.iloc[row, 0] = df.iloc[row, 0].replace(old_label, "1")
        return df

    def split_data(self, df: pd.DataFrame) -> tuple:
        """Select the rows belonging to each of the three output files.

        Rows 0-6 are shared by all three results.  The remaining rows are
        7, 8, 13, 14 for the first frame, 9, 10, 15, 16 for the second and
        11, 12, 17, 18 for the third.

        :param df: data frame returned by :meth:`read_data` (needs at least
            19 rows)
        :return: tuple ``(df1, df2, df3)`` of the three row selections
        """
        shared_rows = [0, 1, 2, 3, 4, 5, 6]
        df1 = df.iloc[shared_rows + [7, 8, 13, 14]]
        df2 = df.iloc[shared_rows + [9, 10, 15, 16]]
        df3 = df.iloc[shared_rows + [11, 12, 17, 18]]
        return df1, df2, df3

    def write_data(self, df: pd.DataFrame, file_path: str) -> None:
        """Write ``df`` to ``file_path`` as CSV without index or header row.

        :param df: data frame to write
        :param file_path: destination CSV path
        :return: None
        """
        # Cells are text (see read_data), so integers are emitted verbatim
        # and no trailing ".0" is appended by float formatting.
        df.to_csv(file_path, index=False, header=False)

    def start_split_process(self) -> int:
        """Split every ``.csv`` file found in the sub-directories of the input root.

        Entries of the input root that are not directories are skipped, as
        are files whose extension is not exactly ``.csv``.  The elapsed time
        is printed when all files have been processed.

        :return: 0 on completion
        """
        time_start = time.time()  # start of the timed section

        # Reset on every run; nothing in this class fills the list.
        self.validFiles = []

        for directory in os.listdir(self.inputPath):
            input_dir = os.path.join(self.inputPath, directory)
            # A plain file in the input root cannot be listed; skip it
            # instead of creating output folders for it and then failing.
            if not os.path.isdir(input_dir):
                continue

            output_dirs = [
                os.path.join(self.outputPath, split_name, directory)
                for split_name in ("split1", "split2", "split3")
            ]
            for output_dir in output_dirs:
                self.create_directory(output_dir)

            for file_name in os.listdir(input_dir):
                file_path = os.path.join(input_dir, file_name)
                if os.path.splitext(file_path)[1] != '.csv':
                    continue

                parts = self.split_data(self.read_data(file_path))
                for output_dir, part in zip(output_dirs, parts):
                    self.write_data(part, os.path.join(output_dir, file_name))

        time_end = time.time()  # end of the timed section
        time_sum = time_end - time_start  # run time in seconds
        print("处理用时:%d" % time_sum)

        return 0


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("inputDataPath", help="输入数据路径")
    parser.add_argument("outputDataPath", help="输出路径。")
    args = parser.parse_args()

    data_split = DataSplit(args.inputDataPath, args.outputDataPath)
    data_split.start_split_process()

    print("completed!!")