Initial commit of WQ_GUI

This commit is contained in:
2026-04-08 15:25:08 +08:00
commit 91e36407ae
302 changed files with 40872 additions and 0 deletions

View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

View File

@ -0,0 +1,124 @@
import pandas as pd
# ---- 工具:在多个候选列名里自动匹配实际列名 ----
def _find_col(df, candidates, required=True):
cols = [c.strip() for c in df.columns]
colmap = {c.strip(): c for c in df.columns} # strip 后到原名的映射
for cand in candidates:
if cand in cols:
return colmap[cand]
if required:
raise KeyError(f"找不到列:候选 {candidates} ,实际列有:{list(df.columns)}")
return None
# ---- 主函数:输入文件路径,输出文件路径(直接传参)----
def pick_best_by_target(input_csv: str,
output_csv: str = "best_by_target.csv",
tie_break_priority: list | None = None) -> pd.DataFrame:
"""
读取一个CSV表头包含目标列、测试集R² 等),
按“目标列”分组挑选“测试集R²”最高的那一行并做可选的并列打破
导出到 output_csv并返回结果 DataFrame。
"""
df = pd.read_csv(input_csv)
# 处理表头空格/BOM
df.columns = df.columns.str.replace("\ufeff", "", regex=False).str.strip()
# 兼容多种列名写法
target_col = _find_col(df, ["目标列", "Target", "target"])
test_r2_col = _find_col(df, ["测试集R²", "测试集R2", "测试集R^2", "Test R2", "test_R2", "test r2"])
# 常见可选并列指标(按需要会自动忽略不存在的列)
default_ties = [
# metric, order: "min" 表示越小越好;"max" 表示越大越好
("测试集RMSE", "min"), ("Test RMSE", "min"), ("test_RMSE", "min"),
("测试集MAE", "min"), ("Test MAE", "min"), ("test_MAE", "min"),
("测试集MSE", "min"), ("Test MSE", "min"), ("test_MSE", "min"),
]
# 如果用户传入自定义优先级,就覆盖;否则用默认
tie_break_priority = tie_break_priority or default_ties
# 转数值(无法解析置 NaN
df[test_r2_col] = pd.to_numeric(df[test_r2_col], errors="coerce")
# 仅使用有 R² 的行参与选择
df_valid = df.dropna(subset=[test_r2_col]).copy()
if df_valid.empty:
raise ValueError("没有有效的测试集R²数值全为空无法挑选最佳。")
# 每个目标列的候选数量
counts = df.groupby(target_col).size().rename("模型条数")
# 构造排序键:先按 测试集R² 降序,其次按若干并列指标(若列不存在会被跳过)
sort_cols = [test_r2_col]
sort_ascending = [False] # R² 越大越好
for col_name, order in tie_break_priority:
if col_name in df_valid.columns:
sort_cols.append(col_name)
sort_ascending.append(order == "min") # min → True, max → False
# 对每个目标列分组排序后取第一行
best = (
df_valid
.sort_values(by=sort_cols, ascending=sort_ascending, kind="mergesort")
.groupby(target_col, as_index=False)
.head(1)
)
# 合并候选数量,并按 测试集R² 再整体排序一下(可选)
best = best.merge(counts, left_on=target_col, right_index=True)
best = best.sort_values(by=[test_r2_col], ascending=False)
# 导出
best.to_csv(output_csv, index=False, encoding="utf-8-sig")
return best
# ---- 另一个便捷函数:直接传 DataFrame不用落盘读写----
def pick_best_by_target_df(df: pd.DataFrame,
tie_break_priority: list | None = None) -> pd.DataFrame:
"""
与 pick_best_by_target 相同逻辑,但输入是 DataFrame返回挑选后的 DataFrame。
"""
df = df.copy()
df.columns = df.columns.str.replace("\ufeff", "", regex=False).str.strip()
target_col = _find_col(df, ["目标列", "Target", "target"])
test_r2_col = _find_col(df, ["测试集R²", "测试集R2", "测试集R^2", "Test R2", "test_R2", "test r2"])
default_ties = [
("测试集RMSE", "min"), ("Test RMSE", "min"), ("test_RMSE", "min"),
("测试集MAE", "min"), ("Test MAE", "min"), ("test_MAE", "min"),
("测试集MSE", "min"), ("Test MSE", "min"), ("test_MSE", "min"),
]
tie_break_priority = tie_break_priority or default_ties
df[test_r2_col] = pd.to_numeric(df[test_r2_col], errors="coerce")
df_valid = df.dropna(subset=[test_r2_col]).copy()
if df_valid.empty:
raise ValueError("没有有效的测试集R²数值全为空无法挑选最佳。")
counts = df.groupby(target_col).size().rename("模型条数")
sort_cols = [test_r2_col]
sort_ascending = [False]
for col_name, order in tie_break_priority:
if col_name in df_valid.columns:
sort_cols.append(col_name)
sort_ascending.append(order == "min")
best = (
df_valid
.sort_values(by=sort_cols, ascending=sort_ascending, kind="mergesort")
.groupby(target_col, as_index=False)
.head(1)
.merge(counts, left_on=target_col, right_index=True)
.sort_values(by=[test_r2_col], ascending=False)
)
return best
# Path-based usage: read the batch results CSV and write the per-target winners.
res = pick_best_by_target(
    input_csv=r"E:\code\WQ\yaobao925\qvchuyaoban\batch_detailed_results.csv",
    output_csv=r"E:\code\WQ\yaobao925\qvchuyaoban\best_by_target.csv",
)
print(res.head())
# DataFrame-based usage (when a DataFrame `df` already exists, e.g. in a notebook):
# res_df = pick_best_by_target_df(df)
# res_df.to_csv("best_by_target.csv", index=False, encoding="utf-8-sig")

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,392 @@
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
class SingleVariableRegressionAnalysis:
    """Single-variable regression of every (x, y) column pair.

    Four model forms are supported: linear, exponential, power and
    logarithmic. The non-linear forms are fitted by ordinary least squares
    after a log transform; R² is always computed on the original y scale.

    ``results`` maps each dependent variable to the list of result records
    produced by the last ``batch_single_variable_regression`` call.
    """

    def __init__(self):
        # {y column: [result record, ...]} -- the same shape that
        # batch_single_variable_regression stores and returns.
        self.results = {}

    @staticmethod
    def _fit_line(feature, target):
        """Least-squares fit of target = intercept + slope * feature.

        Returns (intercept, slope, fitted values of *target*).
        """
        design = feature.reshape(-1, 1)
        model = LinearRegression()
        model.fit(design, target)
        return model.intercept_, model.coef_[0], model.predict(design)

    def linear_regression(self, x, y):
        """Linear regression: y = a + b*x.

        Returns (r2, equation string, predictions); on any failure returns
        (nan, "Error: ...", None) instead of raising.
        """
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)
            intercept, slope, y_pred = self._fit_line(x, y)
            r2 = r2_score(y, y_pred)
            params = f"y = {intercept:.6f} + {slope:.6f}*x"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def exponential_regression(self, x, y):
        """Exponential regression: y = a * exp(b*x); requires y > 0.

        Fitted as ln(y) = ln(a) + b*x. Returns (r2, equation string,
        predictions) or (nan, "Error: ...", None).
        """
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)
            if np.any(y <= 0):
                return np.nan, "Error: y must be positive for exponential regression", None
            log_a, b, _ = self._fit_line(x, np.log(y))
            # Transform back to the exponential form.
            a = np.exp(log_a)
            y_pred = a * np.exp(b * x)
            r2 = r2_score(y, y_pred)
            params = f"y = {a:.6f} * exp({b:.6f}*x)"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def power_regression(self, x, y):
        """Power regression: y = a * x^b; requires x > 0 and y > 0.

        Fitted as ln(y) = ln(a) + b*ln(x). Returns (r2, equation string,
        predictions) or (nan, "Error: ...", None).
        """
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)
            if np.any(x <= 0) or np.any(y <= 0):
                return np.nan, "Error: x and y must be positive for power regression", None
            log_a, b, _ = self._fit_line(np.log(x), np.log(y))
            # Transform back to the power-law form.
            a = np.exp(log_a)
            y_pred = a * np.power(x, b)
            r2 = r2_score(y, y_pred)
            params = f"y = {a:.6f} * x^{b:.6f}"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def logarithmic_regression(self, x, y):
        """Logarithmic regression: y = a + b*ln(x); requires x > 0.

        Returns (r2, equation string, predictions) or (nan, "Error: ...", None).
        """
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)
            if np.any(x <= 0):
                return np.nan, "Error: x must be positive for logarithmic regression", None
            intercept, slope, y_pred = self._fit_line(np.log(x), y)
            r2 = r2_score(y, y_pred)
            params = f"y = {intercept:.6f} + {slope:.6f}*ln(x)"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def batch_single_variable_regression(self, data, x_columns, y_columns, methods='all', output_dir='custom_regression_results'):
        """
        Run every requested regression for every (x, y) column pair.

        Parameters:
        -----------
        data : pandas.DataFrame
            Input data.
        x_columns : str or list
            Independent-variable column name(s); each one is regressed on its own.
        y_columns : str or list
            Dependent-variable column name(s).
        methods : str or list
            'all', a single method name, or a list drawn from
            ['linear', 'exponential', 'power', 'logarithmic'].
            Unknown names are reported and skipped.
        output_dir : str
            Output directory (created if missing). One CSV is written per
            dependent variable, plus 'all_regression_results.csv'.

        Returns:
        --------
        dict
            {y column: [result record, ...]}; also stored in ``self.results``.

        Raises:
        -------
        KeyError
            If a requested column is not present in ``data``.
        """
        from pathlib import Path

        method_functions = {
            'linear': self.linear_regression,
            'exponential': self.exponential_regression,
            'power': self.power_regression,
            'logarithmic': self.logarithmic_regression
        }
        # A bare string is either 'all' or ONE method name; iterating it
        # character by character would silently run nothing.
        if isinstance(methods, str):
            methods = list(method_functions) if methods == 'all' else [methods]
        else:
            methods = list(methods)
        x_columns = [x_columns] if isinstance(x_columns, str) else list(x_columns)
        y_columns = [y_columns] if isinstance(y_columns, str) else list(y_columns)

        missing = [col for col in [*x_columns, *y_columns] if col not in data.columns]
        if missing:
            raise KeyError(f"Columns not found in data: {missing}")

        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True, parents=True)
        self.results = {}
        all_results = []
        print("开始单变量回归分析:")
        print(f"因变量数量: {len(y_columns)}")
        print(f"自变量数量: {len(x_columns)}")
        print(f"回归方法: {methods}")
        print(f"输出目录: {output_dir}")
        print("-" * 80)

        for y_col in y_columns:
            print(f"\n分析因变量: {y_col}")
            self.results[y_col] = []
            # Non-numeric cells become NaN and are dropped with the mask below
            # (np.isnan alone raises TypeError on object-dtype columns).
            y_data = pd.to_numeric(data[y_col], errors='coerce').to_numpy(dtype=float)

            for x_col in x_columns:
                print(f"\n 分析自变量: {x_col}")
                x_data = pd.to_numeric(data[x_col], errors='coerce').to_numpy(dtype=float)
                # Keep only the rows where both values are present.
                valid_mask = ~(np.isnan(x_data) | np.isnan(y_data))
                x_clean = x_data[valid_mask]
                y_clean = y_data[valid_mask]
                if len(x_clean) == 0:
                    print(" ⚠ 无有效数据,跳过")
                    continue
                print(f" 有效样本数: {len(x_clean)}")

                for method_name in methods:
                    regression_func = method_functions.get(method_name)
                    if regression_func is None:
                        print(f" ⚠ 未知回归方法 {method_name},跳过")
                        continue
                    try:
                        r2, equation, _ = regression_func(x_clean, y_clean)
                        if np.isnan(r2):
                            print(f" {method_name:12} | 失败")
                            continue
                        result = {
                            'regression_method': method_name,
                            'x_variable': x_col,
                            'y_variable': y_col,
                            'r_squared': r2,
                            'equation': equation,
                            'sample_size': len(x_clean),
                            'x_mean': np.mean(x_clean),
                            'x_std': np.std(x_clean),
                            'y_mean': np.mean(y_clean),
                            'y_std': np.std(y_clean)
                        }
                        self.results[y_col].append(result)
                        all_results.append(result)
                        print(f" {method_name:12} | R² = {r2:.6f}")
                    except Exception as e:
                        print(f" {method_name:12} | 错误: {str(e)}")

            # One CSV per dependent variable, best R² first within each x.
            if self.results[y_col]:
                results_df = pd.DataFrame(self.results[y_col])
                results_df = results_df.sort_values(['x_variable', 'r_squared'], ascending=[True, False])
                # Path separators and spaces are replaced so the name is a plain file name.
                safe_y_name = str(y_col).replace('/', '_').replace('\\', '_').replace(' ', '_')
                output_file = output_path / f"{safe_y_name}_regression_results.csv"
                results_df.to_csv(output_file, index=False, encoding='utf-8')
                print(f"\n {y_col} 的结果已保存到: {output_file}")
                self._show_best_models_for_y(results_df, y_col)

        # Combined CSV over all dependent variables.
        if all_results:
            summary_df = pd.DataFrame(all_results)
            summary_df = summary_df.sort_values(['y_variable', 'x_variable', 'r_squared'], ascending=[True, True, False])
            summary_file = output_path / "all_regression_results.csv"
            summary_df.to_csv(summary_file, index=False, encoding='utf-8')
            print(f"\n汇总结果已保存到: {summary_file}")
        return self.results

    def _flat_results(self):
        """Return all result records as one flat list.

        ``self.results`` is a dict of per-y lists after a batch run; a plain
        list of records is accepted as well.
        """
        if isinstance(self.results, dict):
            return [record for records in self.results.values() for record in records]
        return list(self.results)

    def _show_best_models_for_y(self, results_df, y_variable):
        """Print, for one dependent variable, the best model of every x variable."""
        if results_df.empty:
            return
        print(f"\n {y_variable} 的最佳回归模型:")
        for x_var in results_df['x_variable'].unique():
            x_results = results_df[results_df['x_variable'] == x_var]
            best_model = x_results.loc[x_results['r_squared'].idxmax()]
            print(f" 自变量 {x_var}:")
            print(f" 方法: {best_model['regression_method']}")
            print(f" R²: {best_model['r_squared']:.6f}")
            print(f" 方程: {best_model['equation']}")

    def _show_best_models(self):
        """Print the best model of every (dependent, independent) variable pair."""
        best_models = self.get_best_models_summary()
        if best_models.empty:
            return
        print("\n" + "=" * 80)
        print("每个自变量的最佳回归模型:")
        print("=" * 80)
        for _, best_model in best_models.iterrows():
            print(f"\n自变量: {best_model['x_variable']}")
            print(f" 因变量: {best_model['y_variable']}")
            print(f" 最佳方法: {best_model['regression_method']}")
            print(f" R²: {best_model['r_squared']:.6f}")
            print(f" 方程: {best_model['equation']}")
            print(f" 样本数: {best_model['sample_size']}")

    def get_results_df(self):
        """Return every stored result record as one DataFrame (one row per fit)."""
        return pd.DataFrame(self._flat_results())

    def get_best_models_summary(self):
        """Return the highest-R² record for every (y_variable, x_variable) pair.

        Returns an empty DataFrame when no results are stored.
        """
        records = self._flat_results()
        if not records:
            return pd.DataFrame()
        results_df = pd.DataFrame(records)
        # sort=False keeps the pairs in the order they were analysed.
        best_rows = results_df.groupby(['y_variable', 'x_variable'], sort=False)['r_squared'].idxmax()
        return results_df.loc[best_rows].reset_index(drop=True)
def main():
    """Example run: regress chlorophyll on two spectral indices and save the best models.

    Reads the training-spectra CSV, runs every regression method for each
    spectral index, then prints and saves the best-model summary. All paths
    are hard-coded for the original author's machine.
    """
    analyzer = SingleVariableRegressionAnalysis()
    print("=" * 80)
    print("水质参数单变量回归分析")
    print("=" * 80)

    # Example 1: all regression methods on the spectral indices.
    print("\n1. 光谱指数与叶绿素a的回归分析:")
    sample_data = pd.read_csv(r"E:\code\WQ\pipeline_result\work_dir\5_training_spectra\water_quality_results.csv")
    spectral_indices = ['Al10SABI', 'Am092Bsub']
    # The method takes `y_columns` and `output_dir` (a directory receiving one
    # CSV per dependent variable); the former `y_column=` / `output_file=`
    # keywords do not exist and raised TypeError.
    analyzer.batch_single_variable_regression(
        data=sample_data,
        x_columns=spectral_indices,
        y_columns='Chlorophyll',
        methods='all',
        output_dir=r'E:\code\WQ\pipeline_result\work_dir\5_training_spectra\spectral_indices_regression',
    )

    # Example 2 (disabled): selected methods on the reflectance bands.
    # print("\n2. 反射率波段与叶绿素a的回归分析:")
    # reflectance_bands = ['R443', 'R490', 'R560', 'R665', 'R705', 'R740']
    #
    # analyzer.batch_single_variable_regression(
    #     data=sample_data,
    #     x_columns=reflectance_bands,
    #     y_columns='Chl_a',
    #     methods=['linear', 'power', 'logarithmic'],
    #     output_dir='reflectance_bands_regression',
    # )

    # Example 3: best-model summary.
    print("\n3. 最佳模型汇总:")
    best_models = analyzer.get_best_models_summary()
    if not best_models.empty:
        print(best_models[['x_variable', 'regression_method', 'r_squared', 'equation']].to_string(index=False))
        best_models.to_csv(r'E:\code\WQ\pipeline_result\work_dir\5_training_spectra\best_models_summary.csv', index=False)
        print("\n最佳模型汇总已保存到 'best_models_summary.csv'")
#
# def advanced_usage_example():
# """高级使用示例 - 处理实际数据"""
# # 读取您的实际数据
# try:
# # 替换为您的实际数据文件路径
# data = pd.read_csv('your_actual_water_data.csv')
#
# # 假设您的数据包含以下列(根据实际情况调整)
# # 光谱指数列: ['NDCI', 'FLH', 'NDTI', 'SABI', ...]
# # 反射率列: ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', ...] 或 ['R443', 'R490', ...]
# # 水质参数列: ['Chl_a', 'Turbidity', 'TSS', 'CDOM', ...]
#
# analyzer = SingleVariableRegressionAnalysis()
#
# # 分析叶绿素a与所有光谱指数的关系
# spectral_indices = ['NDCI', 'FLH', 'NDTI', 'SABI'] # 替换为您的实际列名
# analyzer.batch_single_variable_regression(
# data=data,
# x_columns=spectral_indices,
# y_column='Chl_a', # 替换为您的实际水质参数列名
# methods='all',
# output_file='chl_a_spectral_regression.csv'
# )
#
# # 分析浊度与反射率波段的关系
# reflectance_bands = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7'] # 替换为您的实际列名
# analyzer.batch_single_variable_regression(
# data=data,
# x_columns=reflectance_bands,
# y_column='Turbidity', # 替换为您的实际水质参数列名
# methods=['linear', 'power'],
# output_file='turbidity_reflectance_regression.csv'
# )
#
# except FileNotFoundError:
# print("请准备您的实际数据文件 'your_actual_water_data.csv'")
# except Exception as e:
# print(f"处理数据时出错: {str(e)}")
# Run the example analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
    # Uncomment the following line to process your actual data
    # advanced_usage_example()