211 lines
8.2 KiB
Python
211 lines
8.2 KiB
Python
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
import matplotlib.dates as mdates
|
||
import numpy as np
|
||
from datetime import datetime
|
||
|
||
# 设置叶绿素a阈值
|
||
CHLOROPHYLL_THRESHOLD = 100.0 # 大于此值的叶绿素a数据将被剔除
|
||
|
||
# 1. 读取CSV文件
|
||
try:
|
||
# 尝试自动检测文件格式
|
||
df = pd.read_csv(
|
||
r"D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\不去耀斑index.csv",
|
||
header=None, skip_blank_lines=True, on_bad_lines='warn')
|
||
|
||
# 检查列数,可能有多余列
|
||
if df.shape[1] > 2:
|
||
print(f"警告:检测到 {df.shape[1]} 列,但预期只有2列。尝试提取前两列。")
|
||
df = df.iloc[:, :2] # 只保留前两列
|
||
|
||
# 重命名列
|
||
df.columns = ['Time', 'Chl_a']
|
||
|
||
# 打印前几行用于调试
|
||
print("原始数据预览:")
|
||
print(df.head())
|
||
|
||
# 检查是否有标题行
|
||
if not isinstance(df['Time'].iloc[0], str) or any(char.isalpha() for char in str(df['Time'].iloc[0])):
|
||
print("检测到可能的标题行,跳过第一行")
|
||
df = pd.read_csv(
|
||
r"D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\不去耀斑index.csv",
|
||
header=None, skiprows=1, skip_blank_lines=True)
|
||
df = df.iloc[:, :2] # 只保留前两列
|
||
df.columns = ['Time', 'Chl_a']
|
||
print("处理后数据预览:")
|
||
print(df.head())
|
||
|
||
except Exception as e:
|
||
print(f"读取文件时出错: {e}")
|
||
exit()
|
||
|
||
# 2. 转换时间格式
|
||
try:
|
||
# 尝试解析时间
|
||
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y/%m/%d %H:%M')
|
||
|
||
# 检查是否有解析失败的时间
|
||
if df['Time'].isna().any():
|
||
print("部分时间解析失败,尝试其他格式...")
|
||
# 尝试其他常见格式
|
||
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y-%m-%d %H:%M')
|
||
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y/%m/%d %H:%M:%S')
|
||
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
|
||
|
||
# 如果仍有失败,尝试自动推断
|
||
if df['Time'].isna().any():
|
||
print("使用混合格式解析时间")
|
||
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='mixed')
|
||
|
||
# 删除无法解析时间的行
|
||
na_count = df['Time'].isna().sum()
|
||
if na_count > 0:
|
||
print(f"警告: 删除了 {na_count} 行无法解析的时间数据")
|
||
df = df.dropna(subset=['Time'])
|
||
|
||
# 提取日期和时间
|
||
df['Date'] = df['Time'].dt.date
|
||
df['Time_of_day'] = df['Time'].dt.time
|
||
|
||
except Exception as e:
|
||
print(f"时间解析错误: {e}")
|
||
exit()
|
||
|
||
# 3. 确保叶绿素a列是数值型并应用阈值过滤
|
||
try:
|
||
df['Chl_a'] = pd.to_numeric(df['Chl_a'], errors='coerce')
|
||
|
||
# 记录过滤前的数据量
|
||
original_count = len(df)
|
||
|
||
# 应用阈值过滤
|
||
above_threshold = df[df['Chl_a'] > CHLOROPHYLL_THRESHOLD]
|
||
if not above_threshold.empty:
|
||
print(f"警告: 发现 {len(above_threshold)} 行叶绿素a值超过阈值 {CHLOROPHYLL_THRESHOLD} μg/L,将被剔除")
|
||
print(above_threshold)
|
||
df = df[df['Chl_a'] <= CHLOROPHYLL_THRESHOLD]
|
||
|
||
# 删除无效值
|
||
na_count = df['Chl_a'].isna().sum()
|
||
if na_count > 0:
|
||
print(f"警告: 删除了 {na_count} 行无效的叶绿素值")
|
||
df = df.dropna(subset=['Chl_a'])
|
||
|
||
# 报告过滤结果
|
||
filtered_count = len(df)
|
||
print(f"数据过滤结果: 原始 {original_count} 行 → 保留 {filtered_count} 行")
|
||
|
||
except Exception as e:
|
||
print(f"叶绿素值转换错误: {e}")
|
||
exit()
|
||
|
||
# 4. 按日期分组计算统计量
|
||
daily_stats = df.groupby('Date')['Chl_a'].agg(['mean', 'std', 'count']).reset_index()
|
||
daily_stats.columns = ['Date', 'Daily_Mean', 'Std_Dev', 'Sample_Count']
|
||
|
||
# 获取唯一日期
|
||
unique_dates = sorted(df['Date'].unique())
|
||
|
||
# 5. 创建绘图 - 箱型图
|
||
fig1, ax1 = plt.subplots(figsize=(14, 6))
|
||
|
||
# 绘制箱型图
|
||
ax1.boxplot([df[df['Date'] == date]['Chl_a'] for date in unique_dates], labels=[str(date) for date in unique_dates])
|
||
|
||
# 添加平均线
|
||
ax1.plot(range(1, len(unique_dates) + 1), daily_stats['Daily_Mean'], color='red', linestyle='--', label='Daily Mean')
|
||
|
||
ax1.set_title('Daily Chlorophyll-a Concentration with Box Plot', fontsize=14)
|
||
ax1.set_ylabel('Chl-a (μg/L)', fontsize=12)
|
||
ax1.grid(axis='y', linestyle='--', alpha=0.7)
|
||
|
||
# 添加图例
|
||
ax1.legend()
|
||
|
||
plt.tight_layout()
|
||
plt.savefig(
|
||
r'D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\plot\daily_chlorophyll_boxplot.png',
|
||
dpi=300, bbox_inches='tight')
|
||
|
||
#
|
||
# # 6. 创建每日时间序列图 - 每个日期一个子图
|
||
# fig2, axs = plt.subplots(len(unique_dates), 1, figsize=(14, 4 * len(unique_dates)), sharex=True)
|
||
#
|
||
# if len(unique_dates) == 1:
|
||
# axs = [axs] # 确保单个子图时也能正确处理
|
||
#
|
||
# fig2.suptitle('Chlorophyll-a Concentration Variation Throughout the Day', fontsize=16, y=0.98)
|
||
#
|
||
# for i, date in enumerate(unique_dates):
|
||
# daily_data = df[df['Date'] == date].sort_values('Time')
|
||
# ax = axs[i]
|
||
#
|
||
# # 使用真实时间作横轴
|
||
# ax.plot(daily_data['Time'], daily_data['Chl_a'], marker='', linestyle='-', color='royalblue',
|
||
# label=f'Chl-a on {date}', alpha=0.8)
|
||
#
|
||
# # 添加统计信息
|
||
# daily_mean = daily_data['Chl_a'].mean()
|
||
# daily_std = daily_data['Chl_a'].std()
|
||
# ax.axhline(y=daily_mean, color='g', linestyle='--', alpha=0.7)
|
||
# ax.axhspan(daily_mean - daily_std, daily_mean + daily_std, color='lightgreen', alpha=0.3)
|
||
#
|
||
# ax.set_title(f"{date.strftime('%Y-%m-%d')} (n={len(daily_data)})", fontsize=12)
|
||
# ax.set_ylabel('Chl-a (μg/L)', fontsize=10)
|
||
# ax.grid(True, linestyle='--', alpha=0.7)
|
||
#
|
||
# ax.legend(loc='upper left')
|
||
#
|
||
# # 自动优化时间刻度显示
|
||
# fig2.autofmt_xdate(rotation=45)
|
||
#
|
||
# # 保存每个日期的图像,确保文件名唯一
|
||
# save_path = f'D:/WQ/zhanghuilai/hyperspectral-inversion/data/input/一代高光谱/2025-07-11_2025-07-21/plot/daily_chlorophyll_timeseries_{date.strftime("%Y-%m-%d")}.png'
|
||
# fig2.savefig(save_path, dpi=300, bbox_inches='tight')
|
||
|
||
# 6. 为每个日期单独创建图像
|
||
for date in unique_dates:
|
||
daily_data = df[df['Date'] == date].sort_values('Time')
|
||
|
||
fig, ax = plt.subplots(figsize=(14, 4))
|
||
|
||
# 使用真实时间作横轴
|
||
ax.plot(daily_data['Time'], daily_data['Chl_a'], marker='o', linestyle='-', color='royalblue',
|
||
label=f'Chl-a on {date}', alpha=0.8)
|
||
|
||
# 添加统计信息
|
||
daily_mean = daily_data['Chl_a'].mean()
|
||
daily_std = daily_data['Chl_a'].std()
|
||
ax.axhline(y=daily_mean, color='g', linestyle='--', alpha=0.7, label='Daily Mean')
|
||
ax.axhspan(daily_mean - daily_std, daily_mean + daily_std, color='lightgreen', alpha=0.3, label='±1 Std Dev')
|
||
|
||
ax.set_title(f"Chlorophyll-a Time Series on {date.strftime('%Y-%m-%d')} (n={len(daily_data)})", fontsize=13)
|
||
ax.set_xlabel("Time", fontsize=11)
|
||
ax.set_ylabel("Chl-a (μg/L)", fontsize=11)
|
||
ax.grid(True, linestyle='--', alpha=0.7)
|
||
|
||
# 设置时间格式优化显示
|
||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
|
||
fig.autofmt_xdate(rotation=45)
|
||
|
||
ax.legend(loc='upper left')
|
||
|
||
# 保存图像
|
||
save_path = f'D:/WQ/zhanghuilai/hyperspectral-inversion/data/input/一代高光谱/2025-07-11_2025-07-21/plot/daily_chlorophyll_timeseries_{date.strftime("%Y-%m-%d")}.png'
|
||
fig.savefig(save_path, dpi=300, bbox_inches='tight')
|
||
plt.close(fig) # 防止内存过多占用
|
||
|
||
# 打印数据统计摘要
|
||
print("\n数据统计摘要:")
|
||
print(f"叶绿素a阈值: {CHLOROPHYLL_THRESHOLD} μg/L")
|
||
print(f"总样本数: {len(df)}")
|
||
print(f"监测天数: {len(unique_dates)}")
|
||
print(f"平均每日样本数: {len(df) / len(unique_dates):.1f}")
|
||
print(f"叶绿素a总体均值: {df['Chl_a'].mean():.2f} ± {df['Chl_a'].std():.2f} μg/L")
|
||
print(f"叶绿素a最小值: {df['Chl_a'].min():.2f} μg/L")
|
||
print(f"叶绿素a最大值: {df['Chl_a'].max():.2f} μg/L")
|
||
print("\n每日统计:")
|
||
print(daily_stats)
|