import pandas as pd import matplotlib.pyplot as plt import matplotlib.dates as mdates import numpy as np from datetime import datetime # 设置叶绿素a阈值 CHLOROPHYLL_THRESHOLD = 100.0 # 大于此值的叶绿素a数据将被剔除 # 1. 读取CSV文件 try: # 尝试自动检测文件格式 df = pd.read_csv( r"D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\不去耀斑index.csv", header=None, skip_blank_lines=True, on_bad_lines='warn') # 检查列数,可能有多余列 if df.shape[1] > 2: print(f"警告:检测到 {df.shape[1]} 列,但预期只有2列。尝试提取前两列。") df = df.iloc[:, :2] # 只保留前两列 # 重命名列 df.columns = ['Time', 'Chl_a'] # 打印前几行用于调试 print("原始数据预览:") print(df.head()) # 检查是否有标题行 if not isinstance(df['Time'].iloc[0], str) or any(char.isalpha() for char in str(df['Time'].iloc[0])): print("检测到可能的标题行,跳过第一行") df = pd.read_csv( r"D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\不去耀斑index.csv", header=None, skiprows=1, skip_blank_lines=True) df = df.iloc[:, :2] # 只保留前两列 df.columns = ['Time', 'Chl_a'] print("处理后数据预览:") print(df.head()) except Exception as e: print(f"读取文件时出错: {e}") exit() # 2. 转换时间格式 try: # 尝试解析时间 df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y/%m/%d %H:%M') # 检查是否有解析失败的时间 if df['Time'].isna().any(): print("部分时间解析失败,尝试其他格式...") # 尝试其他常见格式 df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y-%m-%d %H:%M') df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y/%m/%d %H:%M:%S') df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y-%m-%d %H:%M:%S') # 如果仍有失败,尝试自动推断 if df['Time'].isna().any(): print("使用混合格式解析时间") df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='mixed') # 删除无法解析时间的行 na_count = df['Time'].isna().sum() if na_count > 0: print(f"警告: 删除了 {na_count} 行无法解析的时间数据") df = df.dropna(subset=['Time']) # 提取日期和时间 df['Date'] = df['Time'].dt.date df['Time_of_day'] = df['Time'].dt.time except Exception as e: print(f"时间解析错误: {e}") exit() # 3. 确保叶绿素a列是数值型并应用阈值过滤 try: df['Chl_a'] = pd.to_numeric(df['Chl_a'], errors='coerce') # 记录过滤前的数据量 original_count = len(df) # 应用阈值过滤 above_threshold = df[df['Chl_a'] > CHLOROPHYLL_THRESHOLD] if not above_threshold.empty: print(f"警告: 发现 {len(above_threshold)} 行叶绿素a值超过阈值 {CHLOROPHYLL_THRESHOLD} μg/L,将被剔除") print(above_threshold) df = df[df['Chl_a'] <= CHLOROPHYLL_THRESHOLD] # 删除无效值 na_count = df['Chl_a'].isna().sum() if na_count > 0: print(f"警告: 删除了 {na_count} 行无效的叶绿素值") df = df.dropna(subset=['Chl_a']) # 报告过滤结果 filtered_count = len(df) print(f"数据过滤结果: 原始 {original_count} 行 → 保留 {filtered_count} 行") except Exception as e: print(f"叶绿素值转换错误: {e}") exit() # 4. 按日期分组计算统计量 daily_stats = df.groupby('Date')['Chl_a'].agg(['mean', 'std', 'count']).reset_index() daily_stats.columns = ['Date', 'Daily_Mean', 'Std_Dev', 'Sample_Count'] # 获取唯一日期 unique_dates = sorted(df['Date'].unique()) # 5. 创建绘图 - 箱型图 fig1, ax1 = plt.subplots(figsize=(14, 6)) # 绘制箱型图 ax1.boxplot([df[df['Date'] == date]['Chl_a'] for date in unique_dates], labels=[str(date) for date in unique_dates]) # 添加平均线 ax1.plot(range(1, len(unique_dates) + 1), daily_stats['Daily_Mean'], color='red', linestyle='--', label='Daily Mean') ax1.set_title('Daily Chlorophyll-a Concentration with Box Plot', fontsize=14) ax1.set_ylabel('Chl-a (μg/L)', fontsize=12) ax1.grid(axis='y', linestyle='--', alpha=0.7) # 添加图例 ax1.legend() plt.tight_layout() plt.savefig( r'D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\plot\daily_chlorophyll_boxplot.png', dpi=300, bbox_inches='tight') # # # 6. 创建每日时间序列图 - 每个日期一个子图 # fig2, axs = plt.subplots(len(unique_dates), 1, figsize=(14, 4 * len(unique_dates)), sharex=True) # # if len(unique_dates) == 1: # axs = [axs] # 确保单个子图时也能正确处理 # # fig2.suptitle('Chlorophyll-a Concentration Variation Throughout the Day', fontsize=16, y=0.98) # # for i, date in enumerate(unique_dates): # daily_data = df[df['Date'] == date].sort_values('Time') # ax = axs[i] # # # 使用真实时间作横轴 # ax.plot(daily_data['Time'], daily_data['Chl_a'], marker='', linestyle='-', color='royalblue', # label=f'Chl-a on {date}', alpha=0.8) # # # 添加统计信息 # daily_mean = daily_data['Chl_a'].mean() # daily_std = daily_data['Chl_a'].std() # ax.axhline(y=daily_mean, color='g', linestyle='--', alpha=0.7) # ax.axhspan(daily_mean - daily_std, daily_mean + daily_std, color='lightgreen', alpha=0.3) # # ax.set_title(f"{date.strftime('%Y-%m-%d')} (n={len(daily_data)})", fontsize=12) # ax.set_ylabel('Chl-a (μg/L)', fontsize=10) # ax.grid(True, linestyle='--', alpha=0.7) # # ax.legend(loc='upper left') # # # 自动优化时间刻度显示 # fig2.autofmt_xdate(rotation=45) # # # 保存每个日期的图像,确保文件名唯一 # save_path = f'D:/WQ/zhanghuilai/hyperspectral-inversion/data/input/一代高光谱/2025-07-11_2025-07-21/plot/daily_chlorophyll_timeseries_{date.strftime("%Y-%m-%d")}.png' # fig2.savefig(save_path, dpi=300, bbox_inches='tight') # 6. 为每个日期单独创建图像 for date in unique_dates: daily_data = df[df['Date'] == date].sort_values('Time') fig, ax = plt.subplots(figsize=(14, 4)) # 使用真实时间作横轴 ax.plot(daily_data['Time'], daily_data['Chl_a'], marker='o', linestyle='-', color='royalblue', label=f'Chl-a on {date}', alpha=0.8) # 添加统计信息 daily_mean = daily_data['Chl_a'].mean() daily_std = daily_data['Chl_a'].std() ax.axhline(y=daily_mean, color='g', linestyle='--', alpha=0.7, label='Daily Mean') ax.axhspan(daily_mean - daily_std, daily_mean + daily_std, color='lightgreen', alpha=0.3, label='±1 Std Dev') ax.set_title(f"Chlorophyll-a Time Series on {date.strftime('%Y-%m-%d')} (n={len(daily_data)})", fontsize=13) ax.set_xlabel("Time", fontsize=11) ax.set_ylabel("Chl-a (μg/L)", fontsize=11) ax.grid(True, linestyle='--', alpha=0.7) # 设置时间格式优化显示 ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M')) fig.autofmt_xdate(rotation=45) ax.legend(loc='upper left') # 保存图像 save_path = f'D:/WQ/zhanghuilai/hyperspectral-inversion/data/input/一代高光谱/2025-07-11_2025-07-21/plot/daily_chlorophyll_timeseries_{date.strftime("%Y-%m-%d")}.png' fig.savefig(save_path, dpi=300, bbox_inches='tight') plt.close(fig) # 防止内存过多占用 # 打印数据统计摘要 print("\n数据统计摘要:") print(f"叶绿素a阈值: {CHLOROPHYLL_THRESHOLD} μg/L") print(f"总样本数: {len(df)}") print(f"监测天数: {len(unique_dates)}") print(f"平均每日样本数: {len(df) / len(unique_dates):.1f}") print(f"叶绿素a总体均值: {df['Chl_a'].mean():.2f} ± {df['Chl_a'].std():.2f} μg/L") print(f"叶绿素a最小值: {df['Chl_a'].min():.2f} μg/L") print(f"叶绿素a最大值: {df['Chl_a'].max():.2f} μg/L") print("\n每日统计:") print(daily_stats)