def evaluate_single_model(endpoint_name, test_data):
    """
    Evaluate a single model endpoint with test data

    Every entry of ``test_data['text']`` is sent to the endpoint in its own
    request. Each input row contributes exactly one entry to each of the
    per-row result lists, so the lists always stay index-aligned with the
    input: a row is either recorded completely (after the whole response has
    been parsed) or recorded as an error placeholder.

    Args:
        endpoint_name (str): SageMaker endpoint name
        test_data (DataFrame): Test dataset with 'text' column

    Returns:
        dict: Evaluation results with the keys
            - 'predictions': labels normalized to 'LABEL_<n>', or 'ERROR'
            - 'raw_predictions': labels as returned by the endpoint, or 'ERROR'
            - 'confidences': max probability per row (0.0 for errors)
            - 'probabilities': probability array per row ([0.0, 0.0] for errors)
            - 'inference_times': seconds per row (0.0 for errors)
            - 'avg_inference_time', 'latency_p50', 'latency_p90',
              'latency_p99': latency statistics over successful rows only
            - 'total_time': sum of 'inference_times'
            - 'label_format': 'string', 'numeric', or None if never detected
    """
    # Get the region from the current session or use a default
    import boto3
    session = boto3.Session()
    region = session.region_name or 'us-east-1'  # Default to us-east-1 if no region configured

    client = boto3.client('runtime.sagemaker', region_name=region)
    predictions = []
    raw_predictions = []  # Store the original predictions
    confidences = []
    probabilities = []  # Store full probability arrays for ROC curve
    inference_times = []  # One entry per input row (0.0 for failed rows)
    successful_times = []  # Latencies of successful rows only, used for statistics
    label_format = None  # Will be detected from first successful response
    error_count = 0
    first_error = None

    print(f"  Testing endpoint: {endpoint_name}")

    for text in test_data['text']:
        try:
            # Time the inference with a monotonic clock so that system clock
            # adjustments cannot produce negative or skewed durations
            start_time = time.perf_counter()
            response = client.invoke_endpoint(
                EndpointName=endpoint_name,
                ContentType='application/x-text',
                Body=text.encode('utf-8'),
                Accept='application/json;verbose'
            )
            elapsed = time.perf_counter() - start_time

            # Parse the complete response BEFORE touching any result list, so
            # a malformed response cannot leave the lists partially updated
            result = json.loads(response['Body'].read())
            raw_pred = result['predicted_label']
            row_probabilities = result['probabilities']
            confidence = max(row_probabilities)

            # Detect label format if not already detected: prefer the
            # 'labels' field, fall back to the predicted label itself
            if label_format is None:
                sample = result['labels'][0] if 'labels' in result else raw_pred
                if isinstance(sample, str) and sample.startswith('LABEL_'):
                    label_format = 'string'  # Format: 'LABEL_0', 'LABEL_1'
                else:
                    label_format = 'numeric'  # Format: 0, 1
                print(f"    Detected label format: {label_format}")

            # Normalize the prediction to the expected format (LABEL_0, LABEL_1)
            if label_format == 'numeric':
                normalized_pred = f"LABEL_{raw_pred}"
            else:
                normalized_pred = raw_pred

            # Everything succeeded: record the row in one go
            inference_times.append(elapsed)
            successful_times.append(elapsed)
            raw_predictions.append(raw_pred)
            probabilities.append(row_probabilities)
            predictions.append(normalized_pred)
            confidences.append(confidence)

        except Exception as e:
            # Best-effort evaluation: a failing row is recorded as a
            # placeholder instead of aborting the whole run
            error_count += 1
            if first_error is None:
                first_error = str(e)[:100]
            predictions.append('ERROR')
            raw_predictions.append('ERROR')
            confidences.append(0.0)
            probabilities.append([0.0, 0.0])  # Default probabilities for error cases
            inference_times.append(0.0)

    # Show error summary instead of individual errors
    if error_count > 0:
        if error_count == len(test_data):
            print(f"    ❌ All {len(test_data)} predictions failed")
            print(f"    Error: {first_error}...")
        else:
            print(f"    ⚠️ {error_count}/{len(test_data)} predictions failed")
            print(f"    First error: {first_error}...")

    # Calculate latency statistics from successful requests only; the 0.0
    # placeholders of failed rows would otherwise make the endpoint look faster
    if successful_times:
        latency_p50 = np.percentile(successful_times, 50)
        latency_p90 = np.percentile(successful_times, 90)
        latency_p99 = np.percentile(successful_times, 99)
    else:
        latency_p50 = latency_p90 = latency_p99 = 0.0

    return {
        'predictions': predictions,
        'raw_predictions': raw_predictions,
        'confidences': confidences,
        'probabilities': probabilities,
        'inference_times': inference_times,
        'avg_inference_time': np.mean(successful_times) if successful_times else 0,
        'latency_p50': latency_p50,
        'latency_p90': latency_p90,
        'latency_p99': latency_p99,
        'total_time': sum(inference_times),
        'label_format': label_format
    }

def normalize_labels_to_binary(labels):
    """
    Convert various label formats to binary (0, 1) for sklearn compatibility

    Handles formats like (case-insensitive, surrounding whitespace ignored):
    - LABEL_0, LABEL_1
    - 0, 1 (also as floats 0.0, 1.0)
    - positive, negative
    - True, False
    - yes, no

    Args:
        labels: Iterable of labels (list, tuple, numpy array, pandas Series,
            ...). ``None`` and empty inputs are accepted.

    Returns:
        numpy.ndarray: One 0/1 entry per input label; an empty array for
        ``None`` or empty input. Labels that match none of the supported
        formats are mapped to 0.
    """
    # `not labels` is ambiguous for numpy arrays / pandas Series, so check the
    # length first and only fall back to truthiness for unsized inputs
    # (None, generators, ...)
    try:
        is_empty = len(labels) == 0
    except TypeError:
        is_empty = not labels
    if is_empty:
        return np.array([])

    # Lookup table from the normalized string form of a label to its class
    label_to_binary = {
        'label_1': 1, '1': 1, '1.0': 1, 'true': 1, 'positive': 1, 'yes': 1,
        'label_0': 0, '0': 0, '0.0': 0, 'false': 0, 'negative': 0, 'no': 0,
    }

    # Unknown labels shouldn't happen with the supported formats; they
    # default to 0 for safety
    return np.array([
        label_to_binary.get(str(label).lower().strip(), 0)
        for label in labels
    ])

def calculate_metrics(true_labels, predictions, probabilities):
    """
    Calculate comprehensive evaluation metrics

    Rows whose prediction is the string 'ERROR' are excluded. The remaining
    labels are converted to 0/1 with ``normalize_labels_to_binary`` before
    any metric is computed.

    Args:
        true_labels (list): Ground truth labels (any sequence; it is read
            positionally, so a pandas Series with an arbitrary index works)
        predictions (list): Model predictions, 'ERROR' marks a failed row
        probabilities (list): Probability scores for each prediction; the
            entry at index 1 is used as the positive-class probability

    Returns:
        dict: Dictionary containing various metrics. The same keys are
        present whether or not there were any valid predictions.
    """
    # Filter out error predictions
    valid_indices = [i for i, p in enumerate(predictions) if p != 'ERROR']

    if not valid_indices:
        return {
            'accuracy': 0.0,
            'balanced_accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0,
            'mcc': 0.0,
            'auc_roc': 0.0,
            'confusion_matrix': np.zeros((2, 2), dtype=int),
            'valid_predictions': 0,
            'true_labels': np.array([], dtype=int),
            'pred_labels': np.array([], dtype=int),
            'pos_probs': [],
            'label_mismatch': False,
            'missing_classes': [],
            'extra_classes': [],
            'unique_true_classes': [],
            'confidences': []
        }

    # Materialize the ground truth as a list so that indexing is positional;
    # indexing a pandas Series with [i] would be a label-based lookup
    true_list = list(true_labels)

    # Get valid predictions and true labels
    valid_preds = [predictions[i] for i in valid_indices]
    valid_true = [true_list[i] for i in valid_indices]
    valid_probs = [probabilities[i] for i in valid_indices]

    # Convert to numeric for sklearn using robust label conversion
    true_numeric = normalize_labels_to_binary(valid_true)
    pred_numeric = normalize_labels_to_binary(valid_preds)

    # Quick debug to see if we have valid data
    print(f"    Valid predictions: {len(valid_preds)}, True labels: {len(valid_true)}")
    print(f"    Sample predictions: {valid_preds[:3]} -> {pred_numeric[:3]}")
    print(f"    Sample true labels: {valid_true[:3]} -> {true_numeric[:3]}")

    # Check for label mismatches
    unique_true_classes = np.unique(true_numeric)
    unique_pred_classes = np.unique(pred_numeric)

    # Identify missing and extra classes
    missing_classes = list(set(unique_true_classes) - set(unique_pred_classes))
    extra_classes = list(set(unique_pred_classes) - set(unique_true_classes))
    label_mismatch = len(extra_classes) > 0 or len(missing_classes) > 0

    # Accuracy is the share of exact matches. A prediction of a class that
    # never occurs in the ground truth can never match, so it is counted as
    # wrong without special handling.
    accuracy = float(np.mean(true_numeric == pred_numeric))

    # For the remaining sklearn metrics, map any predicted class that does not
    # occur in the ground truth to the most common ground-truth class.
    # This prevents sklearn warnings.
    if label_mismatch and len(unique_true_classes) > 0:
        most_common_class = np.bincount(true_numeric).argmax()
        filtered_pred_numeric = np.array([
            p if p in unique_true_classes else most_common_class for p in pred_numeric
        ])
    else:
        filtered_pred_numeric = pred_numeric

    # Extract probability for positive class (LABEL_1)
    pos_probs = [probs[1] for probs in valid_probs]

    # Calculate balanced accuracy
    if len(unique_true_classes) < 2:
        # With a single-class ground truth, report plain accuracy; it is more
        # intuitive than 100% balanced accuracy while the model makes errors
        balanced_acc = accuracy
    else:
        try:
            balanced_acc = balanced_accuracy_score(true_numeric, filtered_pred_numeric)
        except Exception:
            balanced_acc = 0.0

    try:
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_numeric, filtered_pred_numeric,
            average='binary', zero_division=0,
            labels=unique_true_classes if len(unique_true_classes) > 0 else None
        )
        # Handle case where precision_recall_fscore_support returns arrays
        # instead of scalars
        if not isinstance(precision, float):
            precision = float(precision[0]) if len(precision) > 0 else 0.0
        if not isinstance(recall, float):
            recall = float(recall[0]) if len(recall) > 0 else 0.0
        if not isinstance(f1, float):
            f1 = float(f1[0]) if len(f1) > 0 else 0.0
    except Exception:
        precision, recall, f1 = 0.0, 0.0, 0.0

    try:
        mcc = matthews_corrcoef(true_numeric, filtered_pred_numeric)
    except Exception:
        mcc = 0.0

    # Calculate AUC-ROC only if there are two classes in the true labels
    auc_roc = 0.5  # Default for single-class case and for failures
    if len(unique_true_classes) > 1:
        try:
            auc_roc = roc_auc_score(true_numeric, pos_probs)
        except Exception:
            auc_roc = 0.5

    # Confusion matrix over the UNFILTERED predictions so that predictions of
    # classes missing from the ground truth still show up as errors.
    # labels=[0, 1] guarantees a 2x2 result even if only one class is present.
    try:
        cm = confusion_matrix(true_numeric, pred_numeric, labels=[0, 1])
    except Exception:
        cm = np.zeros((2, 2), dtype=int)

    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'mcc': mcc,
        'auc_roc': auc_roc,
        'confusion_matrix': cm,
        'valid_predictions': len(valid_indices),
        'true_labels': true_numeric,
        'pred_labels': filtered_pred_numeric,
        'pos_probs': pos_probs,
        'label_mismatch': label_mismatch,
        'missing_classes': missing_classes,
        'extra_classes': extra_classes,
        'unique_true_classes': list(unique_true_classes),
        # Positive-class probability per valid row (same values as 'pos_probs')
        'confidences': list(pos_probs)
    }

def compare_multiple_models(model_endpoints, test_data):
    """
    Compare multiple models and return comprehensive results

    Each endpoint is evaluated with ``evaluate_single_model`` and scored with
    ``calculate_metrics``; progress is printed along the way.

    Args:
        model_endpoints (dict): Dictionary of model names and endpoint names
        test_data (DataFrame): Test dataset with 'text' and 'true_label' columns

    Returns:
        dict: Comparison results for all models, keyed by model name. Each
        value holds the metrics plus latency statistics, 'avg_confidence',
        'cost_efficiency', 'endpoint_name' and 'label_format'.
    """
    comparison_results = {}

    # Predictions come back in row order, so the ground truth must be read
    # positionally as well. A plain list avoids label-based lookups on a
    # DataFrame column whose index is not 0..n-1 (e.g. after sampling).
    true_labels = list(test_data['true_label'])

    print("Starting model comparison...\n")

    for model_name, endpoint_name in model_endpoints.items():
        print(f"Evaluating {model_name}...")

        # Evaluate model
        eval_results = evaluate_single_model(endpoint_name, test_data)

        # Calculate metrics
        metrics = calculate_metrics(
            true_labels,
            eval_results['predictions'],
            eval_results['probabilities']
        )

        # Calculate cost efficiency (a simple metric based on inference time).
        # It is the inverse of the average latency, so HIGHER is better; the
        # small constant only keeps the value bounded for tiny latencies.
        # This is a relative measure for comparison.
        avg_time = eval_results['avg_inference_time']
        cost_efficiency = 1.0 / (avg_time + 0.001) if avg_time > 0 else 0

        # Failed rows carry a confidence of 0.0 and are left out of the average
        valid_confidences = [c for c in eval_results['confidences'] if c > 0]

        # Combine results
        comparison_results[model_name] = {
            **metrics,
            'avg_confidence': np.mean(valid_confidences) if valid_confidences else 0.0,
            'avg_inference_time': eval_results['avg_inference_time'],
            'latency_p50': eval_results['latency_p50'],
            'latency_p90': eval_results['latency_p90'],
            'latency_p99': eval_results['latency_p99'],
            'total_time': eval_results['total_time'],
            'cost_efficiency': cost_efficiency,
            'endpoint_name': endpoint_name,
            'label_format': eval_results['label_format']
        }

        # Only show success message if evaluation actually succeeded
        if metrics['valid_predictions'] > 0:
            print(f"  ✅ {model_name} evaluation completed\n")
        else:
            print(f"  ❌ {model_name} evaluation failed - no valid predictions\n")

    return comparison_results

# DEBUG: File updated at 2025-08-18 19:21 - Label conversion fix applied
