Evaluating Emotion Classification with evaluate_emotions()

transforEmotion Team

2026-01-05

Introduction

The evaluate_emotions() function provides comprehensive evaluation capabilities for discrete emotion classification tasks. This vignette demonstrates how to use the function to assess model performance using standard metrics and visualizations.

Installation and Setup

# Install transforEmotion if not already installed
# install.packages("transforEmotion")
library(transforEmotion)

Basic Usage

Creating Sample Data

First, let’s create some sample evaluation data to demonstrate the function:

# Create synthetic evaluation data
set.seed(42)
n_samples <- 200

# Generate ground truth labels
emotions <- c("anger", "joy", "sadness", "fear", "surprise")
eval_data <- data.frame(
  id = 1:n_samples,
  truth = sample(emotions, n_samples, replace = TRUE, 
                prob = c(0.2, 0.3, 0.2, 0.15, 0.15)),
  stringsAsFactors = FALSE
)

# Generate realistic predictions (correlated with truth but with some errors)
eval_data$pred <- eval_data$truth
# Introduce some classification errors
error_indices <- sample(1:n_samples, size = 0.25 * n_samples)
eval_data$pred[error_indices] <- sample(emotions, length(error_indices), replace = TRUE)

# Generate probability scores
for (emotion in emotions) {
  # Higher probability for correct class, lower for others
  eval_data[[paste0("prob_", emotion)]] <- ifelse(
    eval_data$truth == emotion,
    runif(n_samples, 0.6, 0.95),  # Higher prob for correct class
    runif(n_samples, 0.01, 0.4)   # Lower prob for incorrect classes
  )
}

# Normalize probabilities to sum to 1
prob_cols <- paste0("prob_", emotions)
prob_sums <- rowSums(eval_data[, prob_cols])
eval_data[, prob_cols] <- eval_data[, prob_cols] / prob_sums

# Display sample data
head(eval_data)
#>   id    truth     pred prob_anger   prob_joy prob_sadness  prob_fear
#> 1  1     fear surprise 0.10212549 0.02829695   0.16916418 0.63349125
#> 2  2     fear surprise 0.22000913 0.08829725   0.04855244 0.44341087
#> 3  3      joy      joy 0.05642642 0.40583491   0.17749856 0.18908123
#> 4  4 surprise surprise 0.19460955 0.09289653   0.03606787 0.06384095
#> 5  5    anger    anger 0.47644873 0.21565003   0.08878616 0.17295498
#> 6  6    anger    anger 0.50608394 0.09512487   0.17226538 0.21580694
#>   prob_surprise
#> 1    0.06692213
#> 2    0.19973031
#> 3    0.17115887
#> 4    0.61258509
#> 5    0.04616011
#> 6    0.01071888
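
As a quick sanity check, each row of normalized probabilities should sum to one:

# Verify the normalization: every row of probabilities sums to 1
all(abs(rowSums(eval_data[, prob_cols]) - 1) < 1e-8)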

Basic Evaluation

Now let’s evaluate the model performance with basic metrics:

# Basic evaluation with default metrics
results <- evaluate_emotions(
  data = eval_data,
  truth_col = "truth",
  pred_col = "pred"
)

# Print results
print(results)
#> Emotion Classification Evaluation Results
#> ========================================
#> 
#> Summary:
#>   Total instances: 200
#>   Classes: 5 (anger, fear, joy, sadness, surprise)
#>   Overall accuracy: 0.805
#>   Macro F1: 0.799
#> 
#> Metrics:
#>               metric     value
#> 1           accuracy 0.8050000
#> 2    precision_macro 0.7986522
#> 3       recall_macro 0.8004904
#> 4           f1_macro 0.7993882
#> 5           f1_micro 0.8050000
#> 6 krippendorff_alpha 0.7531802
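
The metrics table is a plain data frame, so individual values can be pulled out directly, for example:

# Extract a single value from the metrics table
results$metrics$value[results$metrics$metric == "accuracy"]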

Evaluation with Probabilities

For more comprehensive evaluation including calibration metrics:

# Full evaluation with probability scores
results_full <- evaluate_emotions(
  data = eval_data,
  truth_col = "truth",
  pred_col = "pred",
  probs_cols = prob_cols,
  classes = emotions,
  return_plot = TRUE
)

# Display summary
summary(results_full)
#> Emotion Classification Evaluation Summary
#> =======================================
#> 
#> Dataset Information:
#>   - Total instances: 200
#>   - Number of classes: 5
#>   - Classes: anger, fear, joy, sadness, surprise
#> 
#> Overall Performance:
#>   - Accuracy: 0.805
#>   - Macro F1: 0.799
#>   - Micro F1: 0.805
#>   - Macro AUROC: 1.000
#>   - Expected Calibration Error: 0.313
#>   - Krippendorff's alpha: 0.753
#> 
#> Per-Class Metrics:
#>     class precision    recall        f1
#>     anger 0.8048780 0.8250000 0.8148148
#>       joy 0.8518519 0.8214286 0.8363636
#>   sadness 0.8125000 0.7878788 0.8000000
#>      fear 0.7333333 0.7586207 0.7457627
#>  surprise 0.7906977 0.8095238 0.8000000
#> 
#> Confusion Matrix:
#>           Actual
#> Predicted  anger joy sadness fear surprise Sum
#>   anger       33   2       0    2        4  41
#>   joy          1  46       3    2        2  54
#>   sadness      3   1      26    1        1  32
#>   fear         2   3       2   22        1  30
#>   surprise     1   4       2    2       34  43
#>   Sum         40  56      33   29       42 200

Understanding the Metrics

Classification Metrics

The function computes several standard classification metrics:

# Access per-class metrics
results_full$per_class_metrics
#>      class precision    recall        f1
#> 1    anger 0.8048780 0.8250000 0.8148148
#> 2      joy 0.8518519 0.8214286 0.8363636
#> 3  sadness 0.8125000 0.7878788 0.8000000
#> 4     fear 0.7333333 0.7586207 0.7457627
#> 5 surprise 0.7906977 0.8095238 0.8000000
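
The macro-averaged scores reported earlier are unweighted means of these per-class values, which you can confirm directly:

# Macro F1 equals the unweighted mean of the per-class F1 scores (0.799 above)
mean(results_full$per_class_metrics$f1)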

Probabilistic Metrics

When probability scores are provided:

# AUROC results
results_full$auroc
#> $per_class
#>      class auroc
#> 1    anger     1
#> 2      joy     1
#> 3  sadness     1
#> 4     fear     1
#> 5 surprise     1
#> 
#> $macro
#> [1] 1

# Calibration error
cat("Expected Calibration Error:", round(results_full$ece, 3))
#> Expected Calibration Error: 0.313
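
Conceptually, ECE bins predictions by their top probability and averages the gap between confidence and accuracy across bins. Here is a hand-rolled sketch, assuming 10 equal-width bins; the binning and pairing of confidence with predictions inside evaluate_emotions() may differ, so the value need not match the 0.313 reported above:

# Illustrative ECE with 10 equal-width confidence bins
confidence <- apply(eval_data[, prob_cols], 1, max)                  # top probability per row
top_class <- emotions[apply(eval_data[, prob_cols], 1, which.max)]   # class with the top probability
correct <- top_class == eval_data$truth
bins <- cut(confidence, breaks = seq(0, 1, by = 0.1), include.lowest = TRUE)
per_bin <- tapply(seq_along(confidence), bins, function(idx) {
  abs(mean(correct[idx]) - mean(confidence[idx])) * length(idx)
})
sum(per_bin, na.rm = TRUE) / length(confidence)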

Inter-rater Reliability

Krippendorff’s α measures chance-corrected agreement between raters; here the ground-truth annotations and the model’s predictions are treated as the two raters:

cat("Krippendorff's α:", round(results_full$krippendorff_alpha, 3))
#> Krippendorff's α: 0.753
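
If the irr package is installed, the coefficient can be cross-checked independently; this sketch recodes the labels to integers and treats truth and prediction as two raters:

# Optional cross-check with irr::kripp.alpha() for nominal data
if (requireNamespace("irr", quietly = TRUE)) {
  label_levels <- sort(unique(c(eval_data$truth, eval_data$pred)))
  ratings <- rbind(match(eval_data$truth, label_levels),
                   match(eval_data$pred, label_levels))
  irr::kripp.alpha(ratings, method = "nominal")
}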

Visualization

The function provides built-in plotting capabilities:

# Plot confusion matrix and metrics (requires ggplot2)
if (requireNamespace("ggplot2", quietly = TRUE)) {
  plots <- plot(results_full)
  
  # Display confusion matrix
  print(plots$confusion_matrix)
  
  # Display per-class metrics
  print(plots$metrics)
}
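
If the returned plots are standard ggplot objects (as the ggplot2 requirement suggests), they can also be saved to disk; the file names below are only examples:

# Save the plots (example file names; adjust as needed)
if (requireNamespace("ggplot2", quietly = TRUE)) {
  ggplot2::ggsave("confusion_matrix.png", plots$confusion_matrix, width = 6, height = 5)
  ggplot2::ggsave("per_class_metrics.png", plots$metrics, width = 6, height = 4)
}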

Integration with transforEmotion Workflow

Complete Pipeline Example

Here’s how to integrate evaluate_emotions() into a complete emotion analysis workflow:

# Step 1: Get emotion predictions using transforEmotion
text_data <- c(
  "I am so happy today!",
  "This makes me really angry.",
  "I feel very sad about this news."
)

# Get transformer-based predictions
predictions <- transformer_scores(
  x = text_data,
  classes = c("anger", "joy", "sadness"),
  return_prob = TRUE
)

# Step 2: Prepare evaluation data (assuming you have ground truth)
ground_truth <- c("joy", "anger", "sadness")  # Your ground truth labels

eval_df <- data.frame(
  id = 1:length(text_data),
  truth = ground_truth,
  pred = predictions$predicted_class,
  prob_anger = predictions$prob_anger,
  prob_joy = predictions$prob_joy,
  prob_sadness = predictions$prob_sadness,
  stringsAsFactors = FALSE
)

# Step 3: Evaluate performance
evaluation <- evaluate_emotions(
  data = eval_df,
  probs_cols = c("prob_anger", "prob_joy", "prob_sadness")
)

print(evaluation)

Using with CSV Data

You can also evaluate models using data stored in CSV files:

# Save evaluation data to CSV
write.csv(eval_data, "model_evaluation.csv", row.names = FALSE)

# Load and evaluate from CSV
csv_results <- evaluate_emotions(
  data = "model_evaluation.csv",
  probs_cols = prob_cols
)
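
Before evaluating, it can help to read the file back and confirm that the expected columns survived the round trip (a small base R check):

# Read the CSV back and confirm the required columns are present
csv_data <- read.csv("model_evaluation.csv", stringsAsFactors = FALSE)
stopifnot(all(c("truth", "pred", prob_cols) %in% names(csv_data)))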

Advanced Usage

Custom Metrics Selection

Select only specific metrics for faster computation:

# Evaluate only accuracy and F1 scores
quick_eval <- evaluate_emotions(
  data = eval_data,
  metrics = c("accuracy", "f1_macro", "f1_micro"),
  return_plot = FALSE
)

print(quick_eval$metrics)
#>     metric     value
#> 1 accuracy 0.8050000
#> 2 f1_macro 0.7993882
#> 3 f1_micro 0.8050000

Handling Missing Data

The function automatically handles missing values:

# Create data with missing values
eval_data_missing <- eval_data
eval_data_missing$truth[1:5] <- NA
eval_data_missing$pred[6:10] <- NA

# Evaluate with automatic missing value removal
results_clean <- evaluate_emotions(
  data = eval_data_missing,
  na_rm = TRUE  # Default behavior
)
#> Warning: Removed 10 rows with missing values

cat("Original samples:", nrow(eval_data_missing), "\n")
#> Original samples: 200
cat("Samples after cleaning:", results_clean$summary$n_instances, "\n")
#> Samples after cleaning: 190
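
If you prefer to drop incomplete rows yourself before calling the function, base R's complete.cases() gives a comparable result:

# Manual alternative: keep only rows where both labels are present
eval_data_complete <- eval_data_missing[
  complete.cases(eval_data_missing[, c("truth", "pred")]), ]
nrow(eval_data_complete)  # 190 rows remain, matching the cleaned evaluation above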

Custom Column Names

Use custom column names for your data:

# Rename columns in your data
custom_data <- eval_data
names(custom_data)[names(custom_data) == "truth"] <- "ground_truth"
names(custom_data)[names(custom_data) == "pred"] <- "model_prediction"

# Evaluate with custom column names
custom_results <- evaluate_emotions(
  data = custom_data,
  truth_col = "ground_truth",
  pred_col = "model_prediction",
  metrics = c("accuracy", "f1_macro")
)

print(custom_results)
#> Emotion Classification Evaluation Results
#> ========================================
#> 
#> Summary:
#>   Total instances: 200
#>   Classes: 5 (anger, fear, joy, sadness, surprise)
#>   Overall accuracy: 0.805
#>   Macro F1: 0.799
#> 
#> Metrics:
#>     metric     value
#> 1 accuracy 0.8050000
#> 2 f1_macro 0.7993882

Best Practices

1. Always Include Probability Scores

When possible, include probability scores for more comprehensive evaluation:

# Good: Include probabilities for calibration analysis
results_with_probs <- evaluate_emotions(
  data = eval_data,
  probs_cols = prob_cols
)

2. Use Appropriate Metrics

Choose metrics based on your use case: accuracy is easy to interpret when classes are roughly balanced; macro F1 weights each emotion equally and is more informative when they are not; AUROC and the Expected Calibration Error require probability scores and describe ranking quality and calibration; Krippendorff’s α is useful when you frame the comparison as agreement between annotations and predictions. A small selection sketch follows.
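
A minimal illustration of letting the class distribution guide the choice (the 2:1 threshold is arbitrary and purely for demonstration):

# Pick metrics based on how imbalanced the ground-truth classes are
class_counts <- table(eval_data$truth)
imbalanced <- max(class_counts) / min(class_counts) > 2
chosen_metrics <- if (imbalanced) c("f1_macro", "confusion_matrix") else c("accuracy", "f1_macro")
evaluate_emotions(data = eval_data, metrics = chosen_metrics, return_plot = FALSE)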

3. Validate Data Quality

Always check your evaluation data before analysis:

# Check class distribution
table(eval_data$truth)
#> 
#>    anger     fear      joy  sadness surprise 
#>       40       29       56       33       42
table(eval_data$pred)
#> 
#>    anger     fear      joy  sadness surprise 
#>       41       30       54       32       43

# Check for missing values
sum(is.na(eval_data$truth))
#> [1] 0
sum(is.na(eval_data$pred))
#> [1] 0
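
Class proportions make imbalance easier to spot than raw counts:

# Ground-truth class proportions (handy for spotting imbalance)
round(prop.table(table(eval_data$truth)), 2)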

4. Report Multiple Metrics

Don’t rely on a single metric; report a comprehensive set of results:

# Get comprehensive evaluation
comprehensive_eval <- evaluate_emotions(
  data = eval_data,
  probs_cols = prob_cols,
  metrics = c("accuracy", "precision", "recall", "f1_macro", "f1_micro", 
             "auroc", "ece", "krippendorff", "confusion_matrix")
)

# Report key metrics
key_metrics <- comprehensive_eval$metrics[
  comprehensive_eval$metrics$metric %in% c("accuracy", "f1_macro", "f1_micro"),
]
print(key_metrics)
#>     metric     value
#> 1 accuracy 0.8050000
#> 4 f1_macro 0.7993882
#> 5 f1_micro 0.8050000
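
Because the metrics come back as a plain data frame, the full table is easy to export alongside a report; the file name below is only an example:

# Export the full metrics table (example file name)
write.csv(comprehensive_eval$metrics, "evaluation_metrics.csv", row.names = FALSE)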

Conclusion

The evaluate_emotions() function provides a comprehensive toolkit for evaluating emotion classification models. It integrates seamlessly with the transforEmotion package workflow and follows best practices from the machine learning evaluation literature.

Key features:

- Standard classification metrics (accuracy, precision, recall, F1)
- Probabilistic evaluation (AUROC, calibration)
- Inter-rater reliability (Krippendorff’s α)
- Built-in visualization capabilities
- Flexible input handling and data validation

For more information, see the function documentation with ?evaluate_emotions.