Initial commit of WQ_GUI
This commit is contained in:
1
src/core/modeling/__init__.py
Normal file
1
src/core/modeling/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
124
src/core/modeling/best_R2.py
Normal file
124
src/core/modeling/best_R2.py
Normal file
@ -0,0 +1,124 @@
|
||||
import pandas as pd
|
||||
|
||||
# ---- 工具:在多个候选列名里自动匹配实际列名 ----
|
||||
def _find_col(df, candidates, required=True):
|
||||
cols = [c.strip() for c in df.columns]
|
||||
colmap = {c.strip(): c for c in df.columns} # strip 后到原名的映射
|
||||
for cand in candidates:
|
||||
if cand in cols:
|
||||
return colmap[cand]
|
||||
if required:
|
||||
raise KeyError(f"找不到列:候选 {candidates} ,实际列有:{list(df.columns)}")
|
||||
return None
|
||||
|
||||
# ---- 主函数:输入文件路径,输出文件路径(直接传参)----
|
||||
def pick_best_by_target(input_csv: str,
|
||||
output_csv: str = "best_by_target.csv",
|
||||
tie_break_priority: list | None = None) -> pd.DataFrame:
|
||||
"""
|
||||
读取一个CSV(表头包含:目标列、测试集R² 等),
|
||||
按“目标列”分组,挑选“测试集R²”最高的那一行(并做可选的并列打破),
|
||||
导出到 output_csv,并返回结果 DataFrame。
|
||||
"""
|
||||
df = pd.read_csv(input_csv)
|
||||
# 处理表头空格/BOM
|
||||
df.columns = df.columns.str.replace("\ufeff", "", regex=False).str.strip()
|
||||
|
||||
# 兼容多种列名写法
|
||||
target_col = _find_col(df, ["目标列", "Target", "target"])
|
||||
test_r2_col = _find_col(df, ["测试集R²", "测试集R2", "测试集R^2", "Test R2", "test_R2", "test r2"])
|
||||
|
||||
# 常见可选并列指标(按需要会自动忽略不存在的列)
|
||||
default_ties = [
|
||||
# metric, order: "min" 表示越小越好;"max" 表示越大越好
|
||||
("测试集RMSE", "min"), ("Test RMSE", "min"), ("test_RMSE", "min"),
|
||||
("测试集MAE", "min"), ("Test MAE", "min"), ("test_MAE", "min"),
|
||||
("测试集MSE", "min"), ("Test MSE", "min"), ("test_MSE", "min"),
|
||||
]
|
||||
# 如果用户传入自定义优先级,就覆盖;否则用默认
|
||||
tie_break_priority = tie_break_priority or default_ties
|
||||
|
||||
# 转数值(无法解析置 NaN)
|
||||
df[test_r2_col] = pd.to_numeric(df[test_r2_col], errors="coerce")
|
||||
|
||||
# 仅使用有 R² 的行参与选择
|
||||
df_valid = df.dropna(subset=[test_r2_col]).copy()
|
||||
if df_valid.empty:
|
||||
raise ValueError("没有有效的测试集R²数值(全为空),无法挑选最佳。")
|
||||
|
||||
# 每个目标列的候选数量
|
||||
counts = df.groupby(target_col).size().rename("模型条数")
|
||||
|
||||
# 构造排序键:先按 测试集R² 降序,其次按若干并列指标(若列不存在会被跳过)
|
||||
sort_cols = [test_r2_col]
|
||||
sort_ascending = [False] # R² 越大越好
|
||||
|
||||
for col_name, order in tie_break_priority:
|
||||
if col_name in df_valid.columns:
|
||||
sort_cols.append(col_name)
|
||||
sort_ascending.append(order == "min") # min → True, max → False
|
||||
|
||||
# 对每个目标列分组排序后取第一行
|
||||
best = (
|
||||
df_valid
|
||||
.sort_values(by=sort_cols, ascending=sort_ascending, kind="mergesort")
|
||||
.groupby(target_col, as_index=False)
|
||||
.head(1)
|
||||
)
|
||||
|
||||
# 合并候选数量,并按 测试集R² 再整体排序一下(可选)
|
||||
best = best.merge(counts, left_on=target_col, right_index=True)
|
||||
best = best.sort_values(by=[test_r2_col], ascending=False)
|
||||
|
||||
# 导出
|
||||
best.to_csv(output_csv, index=False, encoding="utf-8-sig")
|
||||
return best
|
||||
|
||||
# ---- 另一个便捷函数:直接传 DataFrame(不用落盘读写)----
|
||||
def pick_best_by_target_df(df: pd.DataFrame,
|
||||
tie_break_priority: list | None = None) -> pd.DataFrame:
|
||||
"""
|
||||
与 pick_best_by_target 相同逻辑,但输入是 DataFrame,返回挑选后的 DataFrame。
|
||||
"""
|
||||
df = df.copy()
|
||||
df.columns = df.columns.str.replace("\ufeff", "", regex=False).str.strip()
|
||||
target_col = _find_col(df, ["目标列", "Target", "target"])
|
||||
test_r2_col = _find_col(df, ["测试集R²", "测试集R2", "测试集R^2", "Test R2", "test_R2", "test r2"])
|
||||
|
||||
default_ties = [
|
||||
("测试集RMSE", "min"), ("Test RMSE", "min"), ("test_RMSE", "min"),
|
||||
("测试集MAE", "min"), ("Test MAE", "min"), ("test_MAE", "min"),
|
||||
("测试集MSE", "min"), ("Test MSE", "min"), ("test_MSE", "min"),
|
||||
]
|
||||
tie_break_priority = tie_break_priority or default_ties
|
||||
|
||||
df[test_r2_col] = pd.to_numeric(df[test_r2_col], errors="coerce")
|
||||
df_valid = df.dropna(subset=[test_r2_col]).copy()
|
||||
if df_valid.empty:
|
||||
raise ValueError("没有有效的测试集R²数值(全为空),无法挑选最佳。")
|
||||
|
||||
counts = df.groupby(target_col).size().rename("模型条数")
|
||||
|
||||
sort_cols = [test_r2_col]
|
||||
sort_ascending = [False]
|
||||
for col_name, order in tie_break_priority:
|
||||
if col_name in df_valid.columns:
|
||||
sort_cols.append(col_name)
|
||||
sort_ascending.append(order == "min")
|
||||
|
||||
best = (
|
||||
df_valid
|
||||
.sort_values(by=sort_cols, ascending=sort_ascending, kind="mergesort")
|
||||
.groupby(target_col, as_index=False)
|
||||
.head(1)
|
||||
.merge(counts, left_on=target_col, right_index=True)
|
||||
.sort_values(by=[test_r2_col], ascending=False)
|
||||
)
|
||||
return best
|
||||
if __name__ == "__main__":
    # Example usage, run only when this file is executed as a script.
    # Without the guard, importing this module would immediately read the
    # hard-coded CSV below.

    # Path-based usage
    res = pick_best_by_target(r"E:\code\WQ\yaobao925\qvchuyaoban\batch_detailed_results.csv", output_csv=r"E:\code\WQ\yaobao925\qvchuyaoban\best_by_target.csv")
    print(res.head())

    # DataFrame-based usage (if you already have a df, e.g. in a notebook)
    # res_df = pick_best_by_target_df(df)
    # res_df.to_csv("best_by_target.csv", index=False, encoding="utf-8-sig")
|
||||
1134
src/core/modeling/modeling_batch.py
Normal file
1134
src/core/modeling/modeling_batch.py
Normal file
File diff suppressed because it is too large
Load Diff
392
src/core/modeling/regression.py
Normal file
392
src/core/modeling/regression.py
Normal file
@ -0,0 +1,392 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.metrics import r2_score
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
class SingleVariableRegressionAnalysis:
    """Single-predictor regression analysis.

    Fits linear, exponential, power and logarithmic models of one dependent
    variable against one independent variable at a time. Each fitting method
    returns ``(r2, equation_text, predictions)``; on failure it returns
    ``(nan, "Error: ...", None)`` instead of raising.
    """

    def __init__(self):
        # After batch_single_variable_regression: maps each dependent-variable
        # name to the list of result dicts produced for it.
        self.results = {}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _fit_line(feature, response):
        """Least-squares fit of ``response = intercept + slope * feature``.

        Returns ``(intercept, slope, fitted_values)``.
        """
        design = feature.reshape(-1, 1)
        model = LinearRegression()
        model.fit(design, response)
        return model.intercept_, model.coef_[0], model.predict(design)

    @staticmethod
    def _numeric_column(data, column):
        """Return ``data[column]`` as a float array; unparseable cells become NaN."""
        return pd.to_numeric(data[column], errors="coerce").to_numpy(dtype=float, na_value=np.nan)

    def _flat_results(self):
        """Return all stored result dicts as one flat list.

        ``self.results`` is a dict of lists after a batch run; a plain list of
        result dicts is accepted as well.
        """
        stored = self.results
        if isinstance(stored, dict):
            return [row for rows in stored.values() for row in rows]
        return list(stored)

    @staticmethod
    def _best_per_x(results_df):
        """Yield ``(x_variable, best_row)`` in first-appearance order.

        ``best_row`` is the row with the highest ``r_squared`` for that
        independent variable.
        """
        for x_var, x_results in results_df.groupby('x_variable', sort=False):
            yield x_var, x_results.loc[x_results['r_squared'].idxmax()]

    # ------------------------------------------------------------------
    # Regression models
    # ------------------------------------------------------------------
    def linear_regression(self, x, y):
        """Linear regression: y = a + b*x."""
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)

            intercept, slope, y_pred = self._fit_line(x, y)
            r2 = r2_score(y, y_pred)

            params = f"y = {intercept:.6f} + {slope:.6f}*x"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def exponential_regression(self, x, y):
        """Exponential regression: y = a * exp(b*x)."""
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)

            # ln(y) is only defined for positive y.
            if np.any(y <= 0):
                return np.nan, "Error: y must be positive for exponential regression", None

            # Fit the linearised form ln(y) = ln(a) + b*x.
            log_a, b, _ = self._fit_line(x, np.log(y))
            a = np.exp(log_a)

            # R² is evaluated on the original (untransformed) y values.
            y_pred = a * np.exp(b * x)
            r2 = r2_score(y, y_pred)

            params = f"y = {a:.6f} * exp({b:.6f}*x)"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def power_regression(self, x, y):
        """Power regression: y = a * x^b."""
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)

            # Both logarithms require strictly positive values.
            if np.any(x <= 0) or np.any(y <= 0):
                return np.nan, "Error: x and y must be positive for power regression", None

            # Fit the linearised form ln(y) = ln(a) + b*ln(x).
            log_a, b, _ = self._fit_line(np.log(x), np.log(y))
            a = np.exp(log_a)

            # R² is evaluated on the original (untransformed) y values.
            y_pred = a * np.power(x, b)
            r2 = r2_score(y, y_pred)

            params = f"y = {a:.6f} * x^{b:.6f}"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    def logarithmic_regression(self, x, y):
        """Logarithmic regression: y = a + b*ln(x)."""
        try:
            x = np.asarray(x, dtype=float)
            y = np.asarray(y, dtype=float)

            # ln(x) is only defined for positive x.
            if np.any(x <= 0):
                return np.nan, "Error: x must be positive for logarithmic regression", None

            intercept, slope, y_pred = self._fit_line(np.log(x), y)
            r2 = r2_score(y, y_pred)

            params = f"y = {intercept:.6f} + {slope:.6f}*ln(x)"
            return r2, params, y_pred
        except Exception as e:
            return np.nan, f"Error: {str(e)}", None

    # ------------------------------------------------------------------
    # Batch driver
    # ------------------------------------------------------------------
    def batch_single_variable_regression(self, data, x_columns, y_columns, methods='all', output_dir='custom_regression_results'):
        """Regress every dependent variable on every independent variable.

        Parameters
        ----------
        data : pandas.DataFrame
            Input data.
        x_columns : str or list
            Independent-variable column name(s); each is fitted on its own.
        y_columns : str or list
            Dependent-variable column name(s).
        methods : str or list
            ``'all'``, a single method name, or a list drawn from
            ``['linear', 'exponential', 'power', 'logarithmic']``. Unknown
            names are skipped.
        output_dir : str
            Output directory (created if missing). One CSV is written per
            dependent variable, plus ``all_regression_results.csv``.

        Returns
        -------
        dict
            Maps each dependent-variable name to its list of result dicts
            (also stored in ``self.results``).
        """
        method_functions = {
            'linear': self.linear_regression,
            'exponential': self.exponential_regression,
            'power': self.power_regression,
            'logarithmic': self.logarithmic_regression
        }

        # A bare string is either the keyword 'all' or a single method name;
        # iterating it directly would loop over its characters.
        if isinstance(methods, str):
            methods = list(method_functions) if methods == 'all' else [methods]

        if isinstance(x_columns, str):
            x_columns = [x_columns]
        if isinstance(y_columns, str):
            y_columns = [y_columns]

        from pathlib import Path
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True, parents=True)

        self.results = {}
        all_results = []

        print("开始单变量回归分析:")
        print(f"因变量数量: {len(y_columns)}")
        print(f"自变量数量: {len(x_columns)}")
        print(f"回归方法: {methods}")
        print(f"输出目录: {output_dir}")
        print("-" * 80)

        for y_col in y_columns:
            print(f"\n分析因变量: {y_col}")
            self.results[y_col] = []

            for x_col in x_columns:
                print(f"\n 分析自变量: {x_col}")

                # Coerce to float so text cells become NaN instead of making
                # np.isnan fail on an object-dtype column.
                x_data = self._numeric_column(data, x_col)
                y_data = self._numeric_column(data, y_col)

                # Keep only rows where both values are present.
                valid_mask = ~(np.isnan(x_data) | np.isnan(y_data))
                x_clean = x_data[valid_mask]
                y_clean = y_data[valid_mask]

                if len(x_clean) == 0:
                    print(" ⚠ 无有效数据,跳过")
                    continue

                print(f" 有效样本数: {len(x_clean)}")

                for method_name in methods:
                    if method_name not in method_functions:
                        continue

                    regression_func = method_functions[method_name]

                    try:
                        r2, equation, y_pred = regression_func(x_clean, y_clean)

                        if not np.isnan(r2):
                            result = {
                                'regression_method': method_name,
                                'x_variable': x_col,
                                'y_variable': y_col,
                                'r_squared': r2,
                                'equation': equation,
                                'sample_size': len(x_clean),
                                'x_mean': np.mean(x_clean),
                                'x_std': np.std(x_clean),
                                'y_mean': np.mean(y_clean),
                                'y_std': np.std(y_clean)
                            }

                            self.results[y_col].append(result)
                            all_results.append(result)
                            print(f" {method_name:12} | R² = {r2:.6f}")
                        else:
                            print(f" {method_name:12} | 失败")

                    except Exception as e:
                        print(f" {method_name:12} | 错误: {str(e)}")

            # Write one CSV per dependent variable.
            if self.results[y_col]:
                results_df = pd.DataFrame(self.results[y_col])
                results_df = results_df.sort_values(['x_variable', 'r_squared'], ascending=[True, False])

                # str() so non-string column labels also yield a file name.
                safe_y_name = str(y_col).replace('/', '_').replace('\\', '_').replace(' ', '_')
                output_file = output_path / f"{safe_y_name}_regression_results.csv"

                results_df.to_csv(output_file, index=False, encoding='utf-8')
                print(f"\n {y_col} 的结果已保存到: {output_file}")

                self._show_best_models_for_y(results_df, y_col)

        # Write the combined results of all dependent variables.
        if all_results:
            summary_df = pd.DataFrame(all_results)
            summary_df = summary_df.sort_values(['y_variable', 'x_variable', 'r_squared'], ascending=[True, True, False])

            summary_file = output_path / "all_regression_results.csv"
            summary_df.to_csv(summary_file, index=False, encoding='utf-8')
            print(f"\n汇总结果已保存到: {summary_file}")

        return self.results

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def _show_best_models_for_y(self, results_df, y_variable):
        """Print the best model per independent variable for one dependent variable."""
        if results_df.empty:
            return

        print(f"\n {y_variable} 的最佳回归模型:")

        for x_var, best_model in self._best_per_x(results_df):
            print(f" 自变量 {x_var}:")
            print(f" 方法: {best_model['regression_method']}")
            print(f" R²: {best_model['r_squared']:.6f}")
            print(f" 方程: {best_model['equation']}")

    def _show_best_models(self):
        """Print the best stored model for each independent variable."""
        rows = self._flat_results()
        if not rows:
            return

        print("\n" + "=" * 80)
        print("每个自变量的最佳回归模型:")
        print("=" * 80)

        for x_var, best_model in self._best_per_x(pd.DataFrame(rows)):
            print(f"\n自变量: {x_var}")
            print(f" 最佳方法: {best_model['regression_method']}")
            print(f" R²: {best_model['r_squared']:.6f}")
            print(f" 方程: {best_model['equation']}")
            print(f" 样本数: {best_model['sample_size']}")

    def get_results_df(self):
        """Return every stored result as one DataFrame (one row per fitted model)."""
        return pd.DataFrame(self._flat_results())

    def get_best_models_summary(self):
        """Return the best stored model for each independent variable.

        The result has one row per ``x_variable``: the stored result with the
        highest ``r_squared``. If several dependent variables were analysed,
        the comparison runs across all of them. Returns an empty DataFrame
        when nothing has been stored.
        """
        rows = self._flat_results()
        if not rows:
            return pd.DataFrame()

        best_models = [best.to_dict() for _, best in self._best_per_x(pd.DataFrame(rows))]
        return pd.DataFrame(best_models)
|
||||
|
||||
def main():
    """Example driver: regress chlorophyll on two spectral indices.

    Reads a hard-coded results CSV, runs every regression method for each
    spectral index, then prints and saves the best model per index.
    """
    analyzer = SingleVariableRegressionAnalysis()

    print("=" * 80)
    print("水质参数单变量回归分析")
    print("=" * 80)

    # Example 1: analyse the spectral indices with every regression method.
    print("\n1. 光谱指数与叶绿素a的回归分析:")
    sample_data = pd.read_csv(r"E:\code\WQ\pipeline_result\work_dir\5_training_spectra\water_quality_results.csv")
    spectral_indices = ['Al10SABI', 'Am092Bsub']

    # The method's keywords are ``y_columns`` and ``output_dir``; the former
    # ``y_column=`` / ``output_file=`` spelling raised TypeError. Because the
    # method writes one CSV per dependent variable, the former file path
    # (minus ".csv") is used as the output directory.
    analyzer.batch_single_variable_regression(
        data=sample_data,
        x_columns=spectral_indices,
        y_columns='Chlorophyll',
        methods='all',
        output_dir=r'E:\code\WQ\pipeline_result\work_dir\5_training_spectra\spectral_indices_regression'
    )

    # # Example 2: analyse reflectance bands with selected methods only
    # print("\n2. 反射率波段与叶绿素a的回归分析:")
    # reflectance_bands = ['R443', 'R490', 'R560', 'R665', 'R705', 'R740']
    #
    # results2 = analyzer.batch_single_variable_regression(
    #     data=sample_data,
    #     x_columns=reflectance_bands,
    #     y_columns='Chl_a',
    #     methods=['linear', 'power', 'logarithmic'],
    #     output_dir='reflectance_bands_regression'
    # )

    # Example 3: summary of the best model per independent variable.
    print("\n3. 最佳模型汇总:")
    best_models = analyzer.get_best_models_summary()
    if not best_models.empty:
        print(best_models[['x_variable', 'regression_method', 'r_squared', 'equation']].to_string(index=False))
        best_models.to_csv(r'E:\code\WQ\pipeline_result\work_dir\5_training_spectra\best_models_summary.csv', index=False)
        print("\n最佳模型汇总已保存到 'best_models_summary.csv'")
|
||||
#
|
||||
# def advanced_usage_example():
|
||||
# """高级使用示例 - 处理实际数据"""
|
||||
# # 读取您的实际数据
|
||||
# try:
|
||||
# # 替换为您的实际数据文件路径
|
||||
# data = pd.read_csv('your_actual_water_data.csv')
|
||||
#
|
||||
# # 假设您的数据包含以下列(根据实际情况调整)
|
||||
# # 光谱指数列: ['NDCI', 'FLH', 'NDTI', 'SABI', ...]
|
||||
# # 反射率列: ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', ...] 或 ['R443', 'R490', ...]
|
||||
# # 水质参数列: ['Chl_a', 'Turbidity', 'TSS', 'CDOM', ...]
|
||||
#
|
||||
# analyzer = SingleVariableRegressionAnalysis()
|
||||
#
|
||||
# # 分析叶绿素a与所有光谱指数的关系
|
||||
# spectral_indices = ['NDCI', 'FLH', 'NDTI', 'SABI'] # 替换为您的实际列名
|
||||
# analyzer.batch_single_variable_regression(
|
||||
# data=data,
|
||||
# x_columns=spectral_indices,
|
||||
# y_column='Chl_a', # 替换为您的实际水质参数列名
|
||||
# methods='all',
|
||||
# output_file='chl_a_spectral_regression.csv'
|
||||
# )
|
||||
#
|
||||
# # 分析浊度与反射率波段的关系
|
||||
# reflectance_bands = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7'] # 替换为您的实际列名
|
||||
# analyzer.batch_single_variable_regression(
|
||||
# data=data,
|
||||
# x_columns=reflectance_bands,
|
||||
# y_column='Turbidity', # 替换为您的实际水质参数列名
|
||||
# methods=['linear', 'power'],
|
||||
# output_file='turbidity_reflectance_regression.csv'
|
||||
# )
|
||||
#
|
||||
# except FileNotFoundError:
|
||||
# print("请准备您的实际数据文件 'your_actual_water_data.csv'")
|
||||
# except Exception as e:
|
||||
# print(f"处理数据时出错: {str(e)}")
|
||||
|
||||
# Run the example analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()

    # Uncomment the following line to process your own data
    # advanced_usage_example()
|
||||
Reference in New Issue
Block a user