首页 > 分享 > 数据可视化应用与实print(npzfile['feature_names'],' ',npzfile['data']) data=npzfile['data'][:

数据可视化应用与实print(npzfile['feature_names'],' ',npzfile['data']) data=npzfile['data'][:

花匠小妙招
2025-12-11 16:02

import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV from sklearn.metrics import (roc_curve, auc, precision_recall_curve, average_precision_score, f1_score, PrecisionRecallDisplay, RocCurveDisplay, roc_auc_score, accuracy_score, precision_score, recall_score) from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline import scipy.stats as stats import warnings warnings.filterwarnings('ignore') # 导入SHAP import shap # 定义计算AUC置信区间的函数 def calculate_auc_ci(y_true, y_pred_proba, confidence=0.95): """ 计算AUC的置信区间 """ auc_score = roc_auc_score(y_true, y_pred_proba) # 使用DeLong方法计算标准误 n1 = np.sum(y_true == 1) # 正例数量 n0 = np.sum(y_true == 0) # 负例数量 n = n0 + n1 if n1 == 0 or n0 == 0: return auc_score, (auc_score, auc_score) # 简化的标准误估计 Q1 = auc_score / (2 - auc_score) Q2 = 2 * auc_score**2 / (1 + auc_score) se_auc = np.sqrt((auc_score * (1 - auc_score) + (n1 - 1) * (Q1 - auc_score**2) + (n0 - 1) * (Q2 - auc_score**2)) / (n0 * n1)) # 计算置信区间 z_value = stats.norm.ppf(1 - (1 - confidence) / 2) lower_bound = auc_score - z_value * se_auc upper_bound = auc_score + z_value * se_auc # 确保置信区间在[0,1]范围内 lower_bound = max(0, lower_bound) upper_bound = min(1, upper_bound) return auc_score, (lower_bound, upper_bound) # 定义性能指标计算函数 def calculate_metrics(y_true, y_pred, y_pred_proba): """ 计算模型的各种性能指标 """ metrics = { 'AUC': roc_auc_score(y_true, y_pred_proba), 'F1': f1_score(y_true, y_pred, zero_division=0), 'Accuracy': accuracy_score(y_true, y_pred), 'Precision': precision_score(y_true, y_pred, zero_division=0), 'Recall': recall_score(y_true, y_pred, zero_division=0) } return metrics def print_detailed_metrics(metrics, model_name): """ 打印详细的性能指标 """ print(f"n=== {model_name} 性能指标 ===") print(f"AUC: {metrics['AUC']:.4f}") print(f"F1 Score: {metrics['F1']:.4f}") print(f"Accuracy: {metrics['Accuracy']:.4f}") print(f"Precision: {metrics['Precision']:.4f}") print(f"Recall: {metrics['Recall']:.4f}") def plot_shap_summary(model, X, feature_names, model_name="Model"): """ 绘制SHAP摘要图 """ print(f"n=== 生成 {model_name} 的SHAP摘要图 ===") # 提取随机森林模型 rf_model = model.named_steps['randomforestclassifier'] # 预处理数据 X_processed = model.named_steps['standardscaler'].transform(X) # 创建SHAP解释器 explainer = shap.TreeExplainer(rf_model) # 计算SHAP值 shap_values = explainer.shap_values(X_processed) # 打印形状信息用于调试 print(f"原始SHAP值形状: {np.array(shap_values).shape}") print(f"处理后的数据形状: {X_processed.shape}") # 处理SHAP值的格式 # 对于二分类问题，SHAP返回一个列表，包含两个数组：[负类SHAP值, 正类SHAP值] # 每个数组的形状都是(样本数, 特征数) if isinstance(shap_values, list) and len(shap_values) == 2: # 对于二分类，我们通常关注正类（类别1）的SHAP值 shap_values_positive = shap_values[1] print(f"正类SHAP值形状: {shap_values_positive.shape}") # 创建SHAP摘要图 - 使用正类的SHAP值 plt.figure(figsize=(12, 10)) shap.summary_plot(shap_values_positive, X_processed, feature_names=feature_names, show=False) plt.title(f'SHAP Summary Plot - {model_name} (正类)', fontsize=16, fontweight='bold') plt.tight_layout() plt.show() # 打印特征重要性 print(f"n{model_name} 特征重要性排序 (基于正类SHAP值):") feature_importance = np.abs(shap_values_positive).mean(axis=0) importance_df = pd.DataFrame({ 'Feature': feature_names, 'Importance': feature_importance }).sort_values('Importance', ascending=False) print(importance_df.to_string(index=False)) return importance_df else: # 如果SHAP值不是预期的列表格式，尝试直接使用 print("SHAP值不是预期的列表格式，尝试直接处理") # 确保SHAP值是二维的 if len(shap_values.shape) == 3: # 如果是三维，取正类的SHAP值 shap_values_2d = shap_values[:, :, 1] else: shap_values_2d = shap_values print(f"处理后的SHAP值形状: {shap_values_2d.shape}") # 创建SHAP摘要图 plt.figure(figsize=(12, 10)) shap.summary_plot(shap_values_2d, X_processed, feature_names=feature_names, show=False) plt.title(f'SHAP Summary Plot - {model_name}', fontsize=16, fontweight='bold') plt.tight_layout() plt.show() # 打印特征重要性 print(f"n{model_name} 特征重要性排序:") feature_importance = np.abs(shap_values_2d).mean(axis=0) importance_df = pd.DataFrame({ 'Feature': feature_names, 'Importance': feature_importance }).sort_values('Importance', ascending=False) print(importance_df.to_string(index=False)) return importance_df def train_random_forest(X, y): """ 训练并优化随机森林模型，返回详细性能指标和SHAP分析 """ print("=== 开始训练随机森林模型 ===") # 参数网格 param_grid = { 'randomforestclassifier__n_estimators': [1000, 2000, 3000,4000,5000,10000], 'randomforestclassifier__max_depth': [3, 5, 7, 10,15,20, None], 'randomforestclassifier__min_samples_split': [2, 5, 10,15], 'randomforestclassifier__min_samples_leaf': [1, 2, 4,8], 'randomforestclassifier__max_features': ['sqrt', 'log2', 0.3, 0.5,0.7], 'randomforestclassifier__bootstrap': [True, False], 'randomforestclassifier__class_weight': ['balanced', 'balanced_subsample'] } # 创建管道 pipeline = make_pipeline( StandardScaler(), RandomForestClassifier(random_state=42) ) # 十折交叉验证 X_np = X.values if hasattr(X, 'values') else X y_np = y.values if hasattr(y, 'values') else y cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) results = {'actual': [], 'proba': [], 'pred': []} best_params_list = [] best_scores_list = [] fold_auc_scores = [] for fold, (train_idx, test_idx) in enumerate(cv.split(X_np, y_np)): X_train, X_test = X_np[train_idx], X_np[test_idx] y_train, y_test = y_np[train_idx], y_np[test_idx] print(f"随机森林 - 第 {fold+1}/5 折") # 随机搜索 random_search = RandomizedSearchCV( pipeline, param_distributions=param_grid, n_iter=10, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), scoring='roc_auc', n_jobs=-1, random_state=42, verbose=0 ) random_search.fit(X_train, y_train) best_model = random_search.best_estimator_ best_params_list.append(random_search.best_params_) best_scores_list.append(random_search.best_score_) # 预测 y_pred_proba = best_model.predict_proba(X_test)[:, 1] y_pred = best_model.predict(X_test) results['actual'].extend(y_test) results['proba'].extend(y_pred_proba) results['pred'].extend(y_pred) # 计算并存储当前fold的测试集AUC fold_auc = roc_auc_score(y_test, y_pred_proba) fold_auc_scores.append(fold_auc) print(f"随机森林 - 第 {fold+1} 折最佳分数: {random_search.best_score_:.4f}") print(f"随机森林 - 第 {fold+1} 折测试集AUC: {fold_auc:.4f}") # 计算整体性能指标 final_metrics = calculate_metrics( results['actual'], results['pred'], results['proba'] ) # 计算交叉验证AUC的置信区间 cv_auc_mean = np.mean(fold_auc_scores) cv_auc_std = np.std(fold_auc_scores) cv_auc_ci = stats.t.interval(0.95, len(fold_auc_scores)-1, loc=cv_auc_mean, scale=cv_auc_std/np.sqrt(len(fold_auc_scores))) # 打印详细指标 print_detailed_metrics(final_metrics, "随机森林") # 结果汇总 print(f"n随机森林交叉验证结果:") print(f"平均最佳AUC: {np.mean(best_scores_list):.4f} (+/- {np.std(best_scores_list):.4f})") print(f"测试集AUC均值: {cv_auc_mean:.4f} (95% CI: {cv_auc_ci[0]:.4f}-{cv_auc_ci[1]:.4f})") # 在整个数据集上训练最终模型用于SHAP分析 print("n=== 训练最终模型用于SHAP分析 ===") final_random_search = RandomizedSearchCV( pipeline, param_distributions=param_grid, n_iter=10, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), scoring='roc_auc', n_jobs=-1, random_state=42, verbose=0 ) final_random_search.fit(X_np, y_np) final_model = final_random_search.best_estimator_ print(f"最终模型最佳参数: {final_random_search.best_params_}") print(f"最终模型最佳分数: {final_random_search.best_score_:.4f}") # 生成SHAP图 feature_names = X.columns.tolist() if hasattr(X, 'columns') else [f'Feature_{i}' for i in range(X_np.shape[1])] importance_df = plot_shap_summary(final_model, X_np, feature_names, "随机森林") return { 'results': results, 'best_params': best_params_list, 'best_scores': best_scores_list, 'metrics': final_metrics, 'model_name': 'Random Forest', 'fold_auc_scores': fold_auc_scores, 'cv_auc_ci': cv_auc_ci, 'final_model': final_model, 'feature_importance': importance_df } # 训练模型并生成SHAP图 rf_results = train_random_forest(X, y) def plot_additional_shap_plots(model, X, y, feature_names, model_name="Model"): """ 绘制其他类型的SHAP图：依赖图、力图和瀑布图 """ print(f"n=== 生成 {model_name} 的其他SHAP图 ===") # 提取模型和预处理数据 rf_model = model.named_steps['randomforestclassifier'] scaler = model.named_steps['standardscaler'] X_processed = scaler.transform(X) # 创建SHAP解释器 explainer = shap.TreeExplainer(rf_model) # 计算SHAP值 shap_values = explainer.shap_values(X_processed) # 处理SHAP值格式 if isinstance(shap_values, list) and len(shap_values) == 2: shap_values_positive = shap_values[1] expected_value = explainer.expected_value[1] # 正类的期望值 else: if len(shap_values.shape) == 3: shap_values_positive = shap_values[:, :, 1] expected_value = explainer.expected_value[1] if hasattr(explainer.expected_value, '__len__') else explainer.expected_value else: shap_values_positive = shap_values expected_value = explainer.expected_value # 确保expected_value是标量 if hasattr(expected_value, '__len__'): expected_value = expected_value[0] if len(expected_value) > 0 else expected_value print(f"期望值 (基准值): {expected_value}") print(f"SHAP值形状: {shap_values_positive.shape}") # 计算特征重要性并选择前8个最重要的特征 feature_importance = np.abs(shap_values_positive).mean(axis=0) top_features_idx = np.argsort(feature_importance)[-8:][::-1] # 前8个特征 top_feature_names = [feature_names[i] for i in top_features_idx] print(f"前8个最重要的特征: {top_feature_names}") # 1. 改进的SHAP特征重要性条形图（只显示前8个） print("n1. 绘制SHAP特征重要性条形图（前8个特征）...") # 创建自定义的条形图 plt.figure(figsize=(12, 8)) top_importance = feature_importance[top_features_idx] # 创建水平条形图 y_pos = np.arange(len(top_feature_names)) colors = plt.cm.viridis(np.linspace(0, 1, len(top_feature_names))) plt.barh(y_pos, top_importance, color=colors, alpha=0.7) plt.yticks(y_pos, top_feature_names) plt.xlabel('平均|SHAP值| (特征重要性)') plt.title(f'SHAP特征重要性排名 - {model_name}', fontsize=16, fontweight='bold') plt.gca().invert_yaxis() # 最重要的在顶部 # 在条形上添加数值标签 for i, v in enumerate(top_importance): plt.text(v + 0.001, i, f'{v:.4f}', va='center', fontsize=10) plt.grid(True, alpha=0.3, axis='x') plt.tight_layout() plt.show() # 2. 改进的SHAP特征依赖图（只显示前8个特征） print("n2. 绘制SHAP特征依赖图（前8个特征）...") # 创建2x4的子图布局 fig, axes = plt.subplots(2, 4, figsize=(20, 10)) axes = axes.ravel() for i, feature_idx in enumerate(top_features_idx): if i >= 8: # 只显示前8个 break # 清除当前子图 axes[i].clear() # 获取当前特征的数据 feature_data = X_processed[:, feature_idx] shap_data = shap_values_positive[:, feature_idx] # 创建散点图 scatter = axes[i].scatter(feature_data, shap_data, alpha=0.6, c=shap_data, cmap='coolwarm', s=20) axes[i].set_xlabel(feature_names[feature_idx]) axes[i].set_ylabel('SHAP value') axes[i].set_title(f'{feature_names[feature_idx]}', fontsize=12, fontweight='bold') axes[i].grid(True, alpha=0.3) # 添加趋势线 if len(feature_data) > 1: z = np.polyfit(feature_data, shap_data, 1) p = np.poly1d(z) axes[i].plot(np.sort(feature_data), p(np.sort(feature_data)), "r--", alpha=0.8, linewidth=2) # 隐藏多余的子图 for i in range(len(top_features_idx), 8): axes[i].set_visible(False) plt.suptitle(f'SHAP特征依赖图 - {model_name} (前8个最重要特征)', fontsize=16, fontweight='bold') plt.tight_layout() plt.show() # 3. SHAP蜂群图 (只显示前8个特征) print("n3. 绘制SHAP蜂群图（前8个特征）...") plt.figure(figsize=(12, 10)) # 只显示前8个特征 shap.summary_plot(shap_values_positive, X_processed, feature_names=feature_names, max_display=8, # 只显示前8个 show=False) plt.title(f'SHAP蜂群图 - {model_name} (前8个最重要特征)', fontsize=16, fontweight='bold') plt.tight_layout() plt.show() # 4. SHAP单个样本力图（前8个特征） print("n4. 绘制SHAP单个样本力图（前8个特征）...") # 选择几个有代表性的样本进行展示 positive_indices = np.where(y == 1)[0] negative_indices = np.where(y == 0)[0] # 选择1个正例和1个负例样本 sample_indices = [] if len(positive_indices) > 0: sample_indices.append(positive_indices[0]) if len(negative_indices) > 0: sample_indices.append(negative_indices[0]) # 绘制单个样本的力图（只显示前8个特征） for i, idx in enumerate(sample_indices[:2]): # 最多显示2个样本 plt.figure(figsize=(12, 6)) actual_label = "正例(有事件)" if y.iloc[idx] == 1 else "负例(无事件)" # 创建只包含前8个特征的SHAP值 top_shap_values = shap_values_positive[idx, top_features_idx] top_feature_values = X_processed[idx, top_features_idx] # 使用force_plot shap.force_plot( expected_value, top_shap_values, top_feature_values, feature_names=top_feature_names, matplotlib=True, show=False ) plt.title(f'SHAP单个样本力图 - 样本 {idx} ({actual_label}) - {model_name}', fontsize=14, fontweight='bold') plt.tight_layout() plt.show() # 5. 决策图 - 显示多个样本的SHAP决策路径（前8个特征） print("n5. 绘制SHAP决策图（前8个特征）...") plt.figure(figsize=(12, 8)) # 选择前100个样本以避免图像过于拥挤 num_samples_decision = min(100, len(X_processed)) # 只使用前8个特征的SHAP值 top_shap_values_all = shap_values_positive[:num_samples_decision, top_features_idx] shap.decision_plot( expected_value, top_shap_values_all, feature_names=top_feature_names, show=False, feature_order='importance' ) plt.title(f'SHAP决策图 (前{num_samples_decision}个样本, 前8个特征) - {model_name}', fontsize=14, fontweight='bold') plt.tight_layout() plt.show() return top_features_idx, top_feature_names, feature_importance[top_features_idx] def plot_shap_comparison(rf_model, X, y, feature_names, top_features_idx, top_feature_names): """ 比较正例和负例的SHAP值分布（只关注前8个特征） """ print("n=== 比较正例和负例的SHAP值分布（前8个特征）===") # 提取模型和预处理数据 rf_model_final = rf_model.named_steps['randomforestclassifier'] scaler = rf_model.named_steps['standardscaler'] X_processed = scaler.transform(X) # 创建SHAP解释器 explainer = shap.TreeExplainer(rf_model_final) shap_values = explainer.shap_values(X_processed) # 处理SHAP值格式 if isinstance(shap_values, list) and len(shap_values) == 2: shap_values_positive = shap_values[1] else: if len(shap_values.shape) == 3: shap_values_positive = shap_values[:, :, 1] else: shap_values_positive = shap_values # 分离正例和负例的SHAP值 positive_indices = y == 1 negative_indices = y == 0 # 为每个重要特征绘制正负例的SHAP值分布 fig, axes = plt.subplots(2, 4, figsize=(20, 10)) axes = axes.ravel() for i, feature_idx in enumerate(top_features_idx): if i >= 8: # 只显示前8个 break # 正例和负例的SHAP值 shap_positive_class = shap_values_positive[positive_indices, feature_idx] shap_negative_class = shap_values_positive[negative_indices, feature_idx] # 绘制箱线图 data_to_plot = [shap_negative_class, shap_positive_class] axes[i].boxplot(data_to_plot, labels=['负例', '正例']) axes[i].set_title(f'{feature_names[feature_idx]}', fontsize=12, fontweight='bold') axes[i].set_ylabel('SHAP值') axes[i].grid(True, alpha=0.3) # 添加统计检验 if len(shap_positive_class) > 0 and len(shap_negative_class) > 0: from scipy import stats try: t_stat, p_value = stats.ttest_ind(shap_positive_class, shap_negative_class, equal_var=False) stars = "" if p_value < 0.001: stars = "***" elif p_value < 0.01: stars = "**" elif p_value < 0.05: stars = "*" axes[i].text(0.5, 0.95, f'p={p_value:.4f}{stars}', transform=axes[i].transAxes, ha='center', va='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8), fontsize=10) except: pass # 隐藏多余的子图 for i in range(len(top_features_idx), 8): axes[i].set_visible(False) plt.suptitle('正例 vs 负例: 前8个最重要特征的SHAP值分布比较', fontsize=16, fontweight='bold') plt.tight_layout() plt.show() # 特征重要性对比图（前8个特征） print("n6. 绘制特征重要性对比图（前8个特征）...") # 分别计算正例和负例的特征重要性 importance_positive = np.abs(shap_values_positive[positive_indices]).mean(axis=0) importance_negative = np.abs(shap_values_positive[negative_indices]).mean(axis=0) plt.figure(figsize=(14, 8)) x_pos = np.arange(len(top_features_idx)) width = 0.35 plt.bar(x_pos - width/2, importance_positive[top_features_idx], width, label='正例', alpha=0.7, color='lightcoral') plt.bar(x_pos + width/2, importance_negative[top_features_idx], width, label='负例', alpha=0.7, color='lightblue') plt.xlabel('特征') plt.ylabel('平均|SHAP值|') plt.title('正例 vs 负例: 前8个最重要特征的重要性对比', fontsize=16, fontweight='bold') plt.xticks(x_pos, top_feature_names, rotation=45, ha='right') plt.legend() plt.grid(True, alpha=0.3, axis='y') plt.tight_layout() plt.show() # 在您现有的代码后面添加以下代码来生成这些图： # 生成其他SHAP图 print("开始生成其他SHAP可视化图...") # 获取最终模型和数据 final_model = rf_results['final_model'] feature_names = X.columns.tolist() # 绘制其他SHAP图，并获取前8个特征的信息 top_features_idx, top_feature_names, top_importance = plot_additional_shap_plots( final_model, X, y, feature_names, "随机森林" ) # 绘制正负例比较图（使用前8个特征） plot_shap_comparison(final_model, X, y, feature_names, top_features_idx, top_feature_names) print("n=== 所有SHAP图生成完成 ===") # 打印特征重要性总结（前8个） print("n=== 前8个特征重要性总结 ===") importance_df_top8 = pd.DataFrame({ 'Feature': top_feature_names, 'Importance': top_importance }).sort_values('Importance', ascending=False) print(importance_df_top8.to_string(index=False)) # 保存前8个特征重要性到Excel文件 importance_df_top8.to_excel("随机森林前8特征重要性.xlsx", index=False) print("前8个特征重要性已保存到 '随机森林前8特征重要性.xlsx'") # 额外的分析：显示前8个特征的影响方向 print("n=== 前8个最重要特征及其影响方向 ===") rf_model_final = final_model.named_steps['randomforestclassifier'] scaler = final_model.named_steps['standardscaler'] X_processed = scaler.transform(X) explainer = shap.TreeExplainer(rf_model_final) shap_values = explainer.shap_values(X_processed) if isinstance(shap_values, list) and len(shap_values) == 2: shap_values_positive = shap_values[1] else: if len(shap_values.shape) == 3: shap_values_positive = shap_values[:, :, 1] else: shap_values_positive = shap_values for i, feature_idx in enumerate(top_features_idx): shap_values_feature = shap_values_positive[:, feature_idx] mean_shap = np.mean(shap_values_feature) direction = "正向" if mean_shap > 0 else "负向" print(f"{top_feature_names[i]}: 重要性={top_importance[i]:.4f}, 平均影响方向={direction} (平均SHAP值={mean_shap:.4f})") 帮我把它改成一个MLP模型的解释器