## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## quickSentiment: Retaining negation words (e.g., 'not', 'no', 'never') to preserve sentiment polarity. To apply the strict stopword list instead, set `retain_negations = FALSE`. View qs_negations for more
# Run the full quickSentiment training pipeline: vectorize the cleaned
# tweets, train the chosen model, and return the fitted artifacts.
result <- pipeline(
  # Vectorization method.
  # Options: "bow" (raw counts), "tf" (term frequency), "tfidf", "binary"
  vect_method = "tf",
  # Model to train.
  # Options: "logit", "rf", "xgb", "nb"
  model_name = "rf",
  # Data: preprocessed text column and the target variable.
  text_vector = tweets$cleaned_text,
  sentiment_vector = tweets$sentiment,
  # Vectorization options: n_gram = 2 gives unigrams + bigrams,
  # n_gram = 1 gives unigrams only.
  n_gram = 1,
  # Number of worker cores for parallel execution.
  parallel = cores
)
## --- Running Pipeline: TERM_FREQUENCY + RANDOM_FOREST ---
## Data split: 944 training elements, 237 test elements.
## Vectorizing with TERM_FREQUENCY (ngram=1)...
## - Fitting BoW model (term_frequency) on training data...
## - Applying BoW transformation (term_frequency) to new data...
##
## --- Training Random Forest Model (ranger) ---
## --- Random Forest complete. Returning results. ---
##
## ======================================================
## --- quickSentiment Pipeline Complete ---
## Model Type: RANDOM_FOREST
## Vectorizer: TERM_FREQUENCY (ngram=1)
## Test Set Size: 237 rows
## Accuracy of 75.53% under baseline threshold.
## ======================================================
## =========================================
## quickSentiment Model Evaluation
## =========================================
## Target Class: P
##
## --- Global Metrics ---
## ROC AUC: 0.6901
## PR AUC: 0.4404
##
## --- Optimal Thresholds ---
## Best ROC Threshold (Youden's J): 0.2792
## Best PR Threshold (F1-Score): 0.2792
## Accuracy at Best PR Threshold: 0.7257
##
## --- Threshold Summary Table ---
## Threshold Accuracy Precision Recall F1
## 0.0 0.257 0.257 1.000 0.409
## 0.1 0.473 0.307 0.836 0.449
## 0.2 0.633 0.377 0.656 0.479
## 0.3 0.726 0.469 0.492 0.480
## 0.4 0.738 0.480 0.197 0.279
## 0.5 0.755 0.636 0.115 0.194
## 0.6 0.751 0.667 0.066 0.119
## 0.7 0.747 1.000 0.016 0.032
## 0.8 0.743 0.000 0.000 0.000
## 0.9 0.743 0.000 0.000 0.000
## 1.0 0.743 0.000 0.000 0.000
##
## (Note: Use plot() to view ROC and PR curves)
# =================================================================== #
# --- 6. PREDICTION ON NEW, UNSEEN DATA --- #
# =================================================================== #
## The training is complete. The 'result' object now contains our trained
## model and all the necessary "artifacts" for prediction.
## --- Preparing new data for prediction ---
## - Applying BoW transformation (term_frequency) to new data...
## --- Making Predictions ---
## --- Prediction Complete ---
## predicted_class prob_N prob_P
## 1 P 0.4664830 0.5335170
## 2 P 0.3152195 0.6847805
## 3 P 0.3464905 0.6535095
## 4 P 0.3411345 0.6588655
## 5 P 0.3740126 0.6259874
## 6 P 0.2542101 0.7457899