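# NOTE: the commented-out lines below are an earlier revision of the live
# block that follows (same flow, without the printed interpretation notes).
# They are kept for reference only and are never executed.
# if evaluation_mode == 'supervised':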
#     # Simple logic: check if single model (regardless of dataset count)
#     num_models = len(model_endpoints)
#     num_datasets = len(all_results)
    
#     # For single model: skip cross-model comparison entirely
#     if num_models == 1:
#         print("\n" + "=" * 60)
#         print("EVALUATION RESULTS")
#         print("=" * 60)
        
#         # Just show the visualizations without confusing headers
#         plot_confusion_matrices_all_datasets(all_results)
        
#         # Show ROC curves for all datasets (including challenging with adversarial examples)
#         print("\n" + "=" * 60)
#         print("ROC CURVES")
#         print("=" * 60)
#         plot_roc_curves_all_datasets(all_results)
        
#         plot_confidence_distribution_all_datasets(all_results)
#         print("\n")
#         plot_latency_comparison_all_datasets(all_results)
        
#     else:
#         # Multiple models - show full analysis including cross-model comparison
#         metrics, robustness_scores = cross_dataset_comparison(all_results)

#         print("\n" + "=" * 60)
#         if num_datasets > 1:
#             print("CONFUSION MATRICES ACROSS ALL DATASETS")
#         else:
#             print("CONFUSION MATRICES")
#         print("=" * 60)
#         plot_confusion_matrices_all_datasets(all_results)

#         # Add ROC curves for multiple models (now including challenging dataset)
#         print("\n" + "=" * 60)
#         if num_datasets > 1:
#             print("ROC CURVES ACROSS ALL DATASETS")
#         else:
#             print("ROC CURVES")
#         print("=" * 60)
        
#         # Show ROC curves for all datasets including challenging (now with adversarial examples)
#         plot_roc_curves_all_datasets(all_results)

#         print("\n" + "=" * 60)
#         if num_datasets > 1:
#             print("CONFIDENCE DISTRIBUTIONS ACROSS ALL DATASETS")
#         else:
#             print("CONFIDENCE DISTRIBUTIONS")
#         print("=" * 60)
#         plot_confidence_distribution_all_datasets(all_results)

#         print("\n" + "=" * 60)
#         if num_datasets > 1:
#             print("LATENCY COMPARISON ACROSS ALL DATASETS")
#         else:
#             print("LATENCY COMPARISON")
#         print("=" * 60)
#         plot_latency_comparison_all_datasets(all_results)

# else:
#     # NEW UNSUPERVISED VISUALIZATIONS
#     print("\n" + "=" * 60)
#     print("UNSUPERVISED EVALUATION RESULTS")
#     print("=" * 60)
    
#     # Import unsupervised visualization functions
#     exec(open('src/unsupervised_visualization.py').read())
    
#     dataset_name = list(all_results.keys())[0]
#     dataset_data = all_results[dataset_name]
    
#     # Create summary table
#     summary_df = create_unsupervised_summary_table(
#         dataset_data['model_results'], 
#         dataset_data['metrics']
#     )
    
#     # Create visualizations
#     plot_unsupervised_results(
#         dataset_data['model_results'], 
#         dataset_data['metrics']
#     )
    
#     # Generate recommendations
#     generate_unsupervised_recommendations(
#         dataset_data['model_results'], 
#         dataset_data['metrics']
#     )


# ---------------------------------------------------------------------------
# Evaluation results: plots followed by short reading guides.
#
# This section relies on names defined elsewhere: ``evaluation_mode``,
# ``model_endpoints``, ``all_results``, the ``plot_*_all_datasets`` helpers
# and ``cross_dataset_comparison``.
# ---------------------------------------------------------------------------

_RULE = "=" * 60
_BULLET_PREFIX = "   • "
_UNSUPERVISED_VIS_PATH = 'src/unsupervised_visualization.py'

# Each notes constant is ``(heading, bullets)``; see ``_print_notes``.
_CONFUSION_MATRIX_NOTES = (
    "Confusion Matrix Interpretation:",
    (
        "Y-axis (vertical) = True Label, X-axis (horizontal) = Predicted Label",
        "Dark blue squares = high counts, light blue = low counts",
        "Perfect model: only dark squares on diagonal (top-left to bottom-right)",
        "Off-diagonal squares indicate misclassifications",
    ),
)

_ROC_CURVE_NOTES = (
    "ROC Curve Analysis:",
    (
        "Orange line = model performance, black dashed = random guessing",
        "Perfect model: line goes straight up then right (AUC = 1.0)",
        "Higher AUC scores indicate better classification performance",
        "Compare AUC values across datasets to assess model consistency",
    ),
)

# The single-model view uses shorter confidence/latency notes than the
# multi-model view; both wordings are kept exactly as they were.
_CONFIDENCE_NOTES_SINGLE_MODEL = (
    "Confidence distributions show how certain models are about predictions.",
    (
        "Higher peaks near 0 or 1 = more confident predictions",
        "Peaks near 0.5 = uncertain/borderline predictions",
        "Well-calibrated models should show clear separation",
    ),
)

_LATENCY_NOTES_SINGLE_MODEL = (
    "Latency comparison shows inference speed across models.",
    (
        "Lower bars = faster inference times",
        "Consider speed vs accuracy tradeoffs for production use",
    ),
)

_CONFIDENCE_NOTES_MULTI_MODEL = (
    "Confidence Distribution Analysis:",
    (
        "Green bars = correct predictions, Red bars = incorrect predictions",
        "X-axis: confidence score (0 = negative, 1 = positive)",
        "Well-calibrated models: green bars at extremes (0.0, 1.0), few red bars",
        "Mean values show average confidence across predictions",
        "Higher separation between correct/incorrect indicates better calibration",
    ),
)

_LATENCY_NOTES_MULTI_MODEL = (
    "Latency Analysis:",
    (
        "Bars show inference time in seconds (lower = faster)",
        "Average, P50, P90, P99 represent different percentiles of response times",
        "P99 = 99% of requests complete within this time (important for SLA)",
        "Compare models: similar accuracy but different speed = choose faster",
        "Consistent bars across datasets = stable performance under different loads",
    ),
)


def _print_banner(title, across_datasets=False):
    """Print a section title framed by two 60-character ``=`` rules.

    A blank line is emitted before the first rule. When ``across_datasets``
    is true, `` ACROSS ALL DATASETS`` is appended to ``title``.
    """
    if across_datasets:
        title = title + " ACROSS ALL DATASETS"
    print("\n" + _RULE)
    print(title)
    print(_RULE)


def _print_notes(notes):
    """Print a ``(heading, bullets)`` pair: the heading, then one bullet per line."""
    heading, bullets = notes
    print(heading)
    for bullet in bullets:
        print(_BULLET_PREFIX + bullet)


if evaluation_mode == 'supervised':
    # The branch is chosen by model count alone; the dataset count only
    # affects the wording of the section banners in the multi-model view.
    num_models = len(model_endpoints)
    num_datasets = len(all_results)

    if num_models == 1:
        # Single model: no cross-model comparison, and the confidence and
        # latency plots are shown without their own banners.
        _print_banner("EVALUATION RESULTS")
        plot_confusion_matrices_all_datasets(all_results)
        _print_notes(_CONFUSION_MATRIX_NOTES)

        _print_banner("ROC CURVES")
        plot_roc_curves_all_datasets(all_results)
        _print_notes(_ROC_CURVE_NOTES)

        plot_confidence_distribution_all_datasets(all_results)
        _print_notes(_CONFIDENCE_NOTES_SINGLE_MODEL)
        print("\n")
        plot_latency_comparison_all_datasets(all_results)
        _print_notes(_LATENCY_NOTES_SINGLE_MODEL)

    else:
        # Multiple models: run the cross-dataset comparison first. Its two
        # results are kept as top-level names even though this section does
        # not read them again (later code may).
        metrics, robustness_scores = cross_dataset_comparison(all_results)

        _print_banner("CONFUSION MATRICES", across_datasets=num_datasets > 1)
        plot_confusion_matrices_all_datasets(all_results)
        _print_notes(_CONFUSION_MATRIX_NOTES)

        _print_banner("ROC CURVES", across_datasets=num_datasets > 1)
        plot_roc_curves_all_datasets(all_results)
        _print_notes(_ROC_CURVE_NOTES)

        _print_banner("CONFIDENCE DISTRIBUTIONS", across_datasets=num_datasets > 1)
        plot_confidence_distribution_all_datasets(all_results)
        _print_notes(_CONFIDENCE_NOTES_MULTI_MODEL)

        _print_banner("LATENCY COMPARISON", across_datasets=num_datasets > 1)
        plot_latency_comparison_all_datasets(all_results)
        _print_notes(_LATENCY_NOTES_MULTI_MODEL)

else:
    # Any mode other than 'supervised' is treated as unsupervised.
    _print_banner("UNSUPERVISED EVALUATION RESULTS")

    # Load the unsupervised visualization helpers into this namespace.
    # The file is read inside a ``with`` block so the handle is closed
    # deterministically, decoded as UTF-8 (Python's source default), and
    # compiled with its real path so tracebacks point at the file.
    # NOTE(review): exec of a source file runs arbitrary code; a regular
    # import of the module would be safer if the project layout allows it.
    with open(_UNSUPERVISED_VIS_PATH, encoding='utf-8') as _vis_file:
        _vis_source = _vis_file.read()
    exec(compile(_vis_source, _UNSUPERVISED_VIS_PATH, 'exec'))

    # Only the first dataset in ``all_results`` is visualized here.
    # IndexError is kept as the failure type for an empty mapping, matching
    # what indexing the empty key list raised before, but with a message.
    if not all_results:
        raise IndexError("all_results is empty: no dataset to visualize")
    dataset_name = next(iter(all_results))
    dataset_data = all_results[dataset_name]

    # The three functions below are presumably defined by the exec'd file
    # above; each receives the dataset's 'model_results' and 'metrics'.
    summary_df = create_unsupervised_summary_table(
        dataset_data['model_results'],
        dataset_data['metrics'],
    )

    plot_unsupervised_results(
        dataset_data['model_results'],
        dataset_data['metrics'],
    )

    generate_unsupervised_recommendations(
        dataset_data['model_results'],
        dataset_data['metrics'],
    )