第一次提交:用于根河用户towersif原始数据分割
1. 任总:csv中的2个光谱仪分别有6个位置的数据,位置1、3、5实际上都是测的光纤位置1的数据(测天空),所以分割后都改为P1; 2. 添加了界面分别输入原始数据文件夹和输出文件夹,在输出文件夹中新建文件夹split1、split2和split3用于保存分割后的数据;
This commit is contained in:
166
.gitignore
vendored
Normal file
166
.gitignore
vendored
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
# 唐超添加
|
||||||
|
/.idea
|
||||||
|
*.zip
|
||||||
|
/Data
|
||||||
|
Data_split
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
14
main.py
Normal file
14
main.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import easygui

from towerDataSplit import DataSplit


# GUI entry point: ask the user for the raw towersif data folder and the
# output folder, then run the split.  ``diropenbox`` returns ``None`` when
# the user cancels the dialog, so guard against handing ``None`` paths to
# DataSplit (which would crash inside ``os.listdir``).
inputDataPath = easygui.diropenbox('第1/2步', '选择towersif原始数据文件夹')
outputDataPath = easygui.diropenbox('第2/2步', '选择输出文件夹')

# inputDataPath = r"D:\PycharmProjects\towerDataSplit\sampleData"
# outputDataPath = r"D:\PycharmProjects\towerDataSplit\Data_split"

if inputDataPath is None or outputDataPath is None:
    # User cancelled one of the dialogs -- inform and exit cleanly.
    easygui.msgbox("未选择文件夹,程序退出。", "提示")
else:
    data_split = DataSplit(inputDataPath, outputDataPath)
    data_split.start_split_process()

    easygui.msgbox("数据分割完成!", "提示")
|
19
sampleData/2022_09_03/genhe_06_44_44.csv
Normal file
19
sampleData/2022_09_03/genhe_06_44_44.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_03/genhe_07_07_02.csv
Normal file
19
sampleData/2022_09_03/genhe_07_07_02.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_04/genhe_06_45_25.csv
Normal file
19
sampleData/2022_09_04/genhe_06_45_25.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_04/genhe_07_04_33.csv
Normal file
19
sampleData/2022_09_04/genhe_07_04_33.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_05/genhe_06_43_52.csv
Normal file
19
sampleData/2022_09_05/genhe_06_43_52.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_05/genhe_07_08_22.csv
Normal file
19
sampleData/2022_09_05/genhe_07_08_22.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_06/genhe_06_43_48.csv
Normal file
19
sampleData/2022_09_06/genhe_06_43_48.csv
Normal file
File diff suppressed because one or more lines are too long
19
sampleData/2022_09_06/genhe_07_06_59.csv
Normal file
19
sampleData/2022_09_06/genhe_07_06_59.csv
Normal file
File diff suppressed because one or more lines are too long
121
towerDataSplit.py
Normal file
121
towerDataSplit.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
import argparse
|
||||||
|
import copy
|
||||||
|
|
||||||
|
|
||||||
|
class DataSplit:
    """Split raw towersif CSV files into per-position CSV files.

    Each raw CSV holds measurements from two spectrometers at six
    positions.  The rows are regrouped into three output files saved
    under ``<outputPath>/split1``, ``/split2`` and ``/split3``,
    mirroring the per-day directory layout of the input folder.
    """

    def __init__(self, inputDataPath, outputDataPath):
        # Root folder holding per-day subfolders of raw CSV files.
        self.inputPath = inputDataPath
        # Root folder under which the split1/split2/split3 trees are created.
        self.outputPath = outputDataPath

    def create_directory(self, directory):
        """Create *directory* (and any missing parents) if it does not exist.

        Uses ``exist_ok=True`` so a directory created between a check and
        the call cannot raise -- this replaces the original
        check-then-create (``os.path.exists`` + ``makedirs``) race.

        :param directory: path of the directory to create (deep paths like
            ``/A/B/C/D`` are created in one call)
        :return: None
        """
        os.makedirs(directory, exist_ok=True)

    def read_data(self, file_path):
        """Read one raw CSV and patch its metadata for the split files.

        The file has a ragged layout, so it is first read with a generous
        guessed column count and then trimmed to the real width, derived
        from the channel counts stored at rows 2 and 4, column 6.

        :param file_path: path of the raw CSV file
        :return: trimmed and patched ``pandas.DataFrame``
        """
        guess_column_number = 10000

        df = pd.read_csv(file_path, header=None, sep=',', names=range(guess_column_number))
        # Real width = widest spectrometer channel count + 3 metadata columns.
        correct_column_number = max(df.iloc[2, 6], df.iloc[4, 6]) + 3

        df.drop(list(range(int(correct_column_number), guess_column_number)), axis=1,
                inplace=True)  # inplace=True mutates df, no copy is made

        # After splitting, each output file holds data for 2 positions.
        df.iloc[1, 3] = 2

        # Per the client: the rows labelled P3 and P5 were actually measured
        # at fibre position 1 (the sky was sampled twice to shorten the
        # acquisition), so relabel them as P1.
        # NOTE(review): str.replace substitutes EVERY matching character in
        # the cell, not just the position digit -- assumed safe because these
        # cells are short labels like "P3"/"P5"; confirm against real data.
        df.iloc[9, 0] = df.iloc[9, 0].replace("3", "1")
        df.iloc[11, 0] = df.iloc[11, 0].replace("5", "1")
        df.iloc[15, 0] = df.iloc[15, 0].replace("3", "1")
        df.iloc[17, 0] = df.iloc[17, 0].replace("5", "1")

        return df

    def split_data(self, df):
        """Split the patched dataframe into the three per-position frames.

        Rows 0-6 (shared header/metadata) are copied into every output;
        the remaining rows are distributed by measurement position.

        :param df: dataframe returned by :meth:`read_data`
        :return: tuple ``(df1, df2, df3)`` of the three split frames
        """
        header_rows = [0, 1, 2, 3, 4, 5, 6]
        df1 = df.iloc[header_rows + [7, 8, 13, 14]]
        df2 = df.iloc[header_rows + [9, 10, 15, 16]]
        df3 = df.iloc[header_rows + [11, 12, 17, 18]]

        return df1, df2, df3

    def write_data(self, df, file_path):
        """Write one split dataframe to *file_path* as a headerless CSV.

        Known quirk: pandas may render some integers with a trailing ``.0``
        (e.g. 77777 -> 77777.0); the downstream C# (easysif) reader must
        parse such fields via double before converting to int.

        :param df: dataframe to write
        :param file_path: destination CSV path
        :return: None
        """
        df.to_csv(file_path, index=False, header=False)

    def start_split_process(self):
        """Main driver: walk the input tree and write the split CSV files.

        :return: 0 on completion
        """
        time_start = time.time()  # record start time

        directories = os.listdir(self.inputPath)
        # NOTE(review): populated nowhere in this class; kept so any external
        # reader of the attribute keeps working.
        self.validFiles = []

        for directory in directories:
            # Skip stray files at the top level; only per-day folders are
            # expected, and os.listdir on a file would raise.
            if not os.path.isdir(os.path.join(self.inputPath, directory)):
                continue

            directory1_tmp = os.path.join(self.outputPath, "split1", directory)
            directory2_tmp = os.path.join(self.outputPath, "split2", directory)
            directory3_tmp = os.path.join(self.outputPath, "split3", directory)
            self.create_directory(directory1_tmp)
            self.create_directory(directory2_tmp)
            self.create_directory(directory3_tmp)

            files = os.listdir(os.path.join(self.inputPath, directory))
            for file in files:
                file_path = os.path.join(self.inputPath, directory, file)
                if os.path.splitext(file_path)[1] != '.csv':
                    continue  # only CSV files are split

                df = self.read_data(file_path)
                df1, df2, df3 = self.split_data(df)

                self.write_data(df1, os.path.join(directory1_tmp, file))
                self.write_data(df2, os.path.join(directory2_tmp, file))
                self.write_data(df3, os.path.join(directory3_tmp, file))

        time_end = time.time()  # record end time
        time_sum = time_end - time_start  # elapsed wall-clock time in seconds
        print("处理用时:%d" % time_sum)

        return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: read the two folder paths from argv and run
    # the split (alternative to the GUI entry point in main.py).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("inputDataPath", help="输入数据路径")
    arg_parser.add_argument("outputDataPath", help="输出路径。")
    cli_args = arg_parser.parse_args()

    splitter = DataSplit(cli_args.inputDataPath, cli_args.outputDataPath)
    splitter.start_split_process()

    print("completed!!")
|
Reference in New Issue
Block a user