First commit: splits raw towersif data for the Genhe user

1. Per Mr. Ren: each of the two spectrometers in the csv carries data for 6 positions, and positions 1, 3 and 5 are in fact all measurements of fiber position 1 (sky measurements), so after splitting they are all relabelled P1;
2. Added a dialog for picking the raw-data folder and the output folder; subfolders split1, split2 and split3 are created inside the output folder to hold the split data (sketched below);
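
For illustration, reading split1/split2/split3 as the 1st, 2nd and 3rd repeated sky measurement (an inference from the row selection in towerDataSplit.py; the day folder and file name below are hypothetical), the output is expected to look like:

outputFolder/
├── split1/2022-09-01/xxx.csv
├── split2/2022-09-01/xxx.csv
└── split3/2022-09-01/xxx.csv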
Author: tangchao0503
Date:   2022-09-08 15:31:33 +08:00
Commit: ba8cab6751
11 changed files with 453 additions and 0 deletions

.gitignore vendored Normal file (166 lines)

@@ -0,0 +1,166 @@
# Added by Tang Chao
/.idea
*.zip
/Data
Data_split
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

main.py Normal file (14 lines)

@@ -0,0 +1,14 @@
import easygui

from towerDataSplit import DataSplit

# Ask for the raw-data folder and the output folder through two dialogs.
inputDataPath = easygui.diropenbox('Step 1/2', 'Select the towersif raw data folder')
outputDataPath = easygui.diropenbox('Step 2/2', 'Select the output folder')
# Hard-coded paths kept for local debugging:
# inputDataPath = r"D:\PycharmProjects\towerDataSplit\sampleData"
# outputDataPath = r"D:\PycharmProjects\towerDataSplit\Data_split"

data_split = DataSplit(inputDataPath, outputDataPath)
data_split.start_split_process()
easygui.msgbox("Data splitting finished!", "Notice")
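# Usage note: the same split can also be run without the GUI through the
# argparse entry point at the bottom of towerDataSplit.py:
#   python towerDataSplit.py <inputDataPath> <outputDataPath>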

File diff suppressed because one or more lines are too long (8 files)

towerDataSplit.py Normal file (121 lines)

@@ -0,0 +1,121 @@
import os
import time
import argparse

import numpy as np  # kept only for the commented-out alternative in write_data
import pandas as pd


class DataSplit:
    def __init__(self, inputDataPath, outputDataPath):
        self.inputPath = inputDataPath
        self.outputPath = outputDataPath

    def create_directory(self, directory):
        """
        Create nested directories in one call: creating /A/B/C/D succeeds even
        if B and C do not exist yet.
        :param directory: path of the directory to create
        """
        if not os.path.exists(directory):  # skip if the directory already exists
            os.makedirs(directory)
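
    # Note: os.makedirs(directory, exist_ok=True) would fold the existence
    # check into a single call and avoid a race between check and creation.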

    def read_data(self, file_path):
        """
        Read a raw csv and patch its metadata to match the split csvs.
        :param file_path: path of the raw csv
        :return: the patched DataFrame
        """
        # The rows have different lengths, so read with a generous column
        # count first, then trim to the real width taken from the header.
        guess_column_number = 10000
        df = pd.read_csv(file_path, header=None, sep=',', names=range(guess_column_number))
        correct_column_number = max(df.iloc[2, 6], df.iloc[4, 6]) + 3
        df.drop(list(range(int(correct_column_number), guess_column_number)), axis=1,
                inplace=True)  # inplace=True modifies df in place instead of returning a copy
        # df = pd.read_csv(file_path, header=None, sep=',', names=range(int(correct_column_number)))  # re-read the file
        # Patch the metadata.
        df.iloc[1, 3] = 2
        # In Mr. Ren's data the rows labelled P3 and P5 were also acquired at
        # fiber position 1: the teacher found the acquisition too slow and
        # wanted the sky captured twice, so relabel those rows P1.
        df.iloc[9, 0] = df.iloc[9, 0].replace("3", "1")
        df.iloc[11, 0] = df.iloc[11, 0].replace("5", "1")
        df.iloc[15, 0] = df.iloc[15, 0].replace("3", "1")
        df.iloc[17, 0] = df.iloc[17, 0].replace("5", "1")
        return df
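
    # Assumed layout of the raw csv, inferred from the index arithmetic above
    # and the row selections in split_data below (not stated in the original):
    #   rows 0-6    shared file header / metadata
    #   rows 7-8    spectrometer 1, 1st sky measurement (labelled P1)
    #   rows 9-10   spectrometer 1, 2nd sky measurement (P3, relabelled P1)
    #   rows 11-12  spectrometer 1, 3rd sky measurement (P5, relabelled P1)
    #   rows 13-18  the same three measurements for spectrometer 2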

    def split_data(self, df):
        """
        Split the rows of the DataFrame into three new DataFrames, one per
        measurement; each keeps the shared header rows 0-6.
        :param df: the patched DataFrame from read_data
        :return: the three split DataFrames
        """
        df1 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14]]
        df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 9, 10, 15, 16]]
        df3 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 11, 12, 17, 18]]
        return df1, df2, df3

    def write_data(self, df, file_path):
        # Problem with writing method 1: some integers gain a trailing ".0"
        # (e.g. 77777 becomes 77777.0), so the C# code in easysif cannot cast
        # the cell straight to int and has to go through double first.
        df.to_csv(file_path, index=False, header=False)
        # Writing method 2 has the same problem; pandas is presumably built on
        # numpy, so if numpy shows this behaviour, pandas does too.
        # np.array(df).tofile(file_path, sep=',')
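
    def write_data_clean(self, df, file_path):
        """
        A minimal sketch, not part of the original code, of one way around the
        trailing-".0" problem noted in write_data: render integral floats as
        plain ints before writing. The method name is hypothetical and the
        approach is untested against the real easysif pipeline.
        """
        def fmt(value):
            # 77777 becomes 77777.0 once pandas upcasts a mixed column to
            # float; turn such values back into integer literals for the csv
            if isinstance(value, float) and value.is_integer():
                return str(int(value))
            return value

        # applymap is called DataFrame.map on pandas >= 2.1
        df.applymap(fmt).to_csv(file_path, index=False, header=False)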

    def start_split_process(self):
        """
        Main entry point: create the split1/split2/split3 output trees and
        split every csv found under the input path.
        :return: 0 on completion
        """
        time_start = time.time()  # record the start time
        directories = os.listdir(self.inputPath)
        self.validFiles = []  # currently unused
        for directory in directories:
            # Mirror each input subfolder under split1, split2 and split3.
            directory1_tmp = os.path.join(self.outputPath, "split1", directory)
            directory2_tmp = os.path.join(self.outputPath, "split2", directory)
            directory3_tmp = os.path.join(self.outputPath, "split3", directory)
            self.create_directory(directory1_tmp)
            self.create_directory(directory2_tmp)
            self.create_directory(directory3_tmp)

            files = os.listdir(os.path.join(self.inputPath, directory))
            for file in files:
                file_path = os.path.join(self.inputPath, directory, file)
                if os.path.splitext(file_path)[1] != '.csv':
                    continue
                df = self.read_data(file_path)
                df1, df2, df3 = self.split_data(df)
                file_path_out1 = os.path.join(directory1_tmp, file)
                file_path_out2 = os.path.join(directory2_tmp, file)
                file_path_out3 = os.path.join(directory3_tmp, file)
                self.write_data(df1, file_path_out1)
                self.write_data(df2, file_path_out2)
                self.write_data(df3, file_path_out3)

        time_end = time.time()  # record the end time
        time_sum = time_end - time_start  # elapsed wall-clock time in seconds
        print("Processing time: %.2f s" % time_sum)
        return 0


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("inputDataPath", help="path of the raw towersif data folder")
    parser.add_argument("outputDataPath", help="path of the output folder")
    args = parser.parse_args()

    data_split = DataSplit(args.inputDataPath, args.outputDataPath)
    data_split.start_split_process()
    print("completed!!")