mlspatial: Spatial Machine Learning workflow

# Vignette housekeeping: save and restore the working directory and global options
olddir <- getwd()
setwd(tempdir())
setwd(olddir)
oldopt <- options(digits = 4)
options(oldopt)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(mlspatial)
library(dplyr)
library(ggplot2)
library(tmap)
library(sf)
library(spdep)
library(randomForest)
library(xgboost) 
library(e1071) 
library(caret)
library(tidyr)
# Quick thematic map with country labels
plot(africa_shp)

qtm(africa_shp)

qtm(africa_shp, text = "NAME")  # Replace with actual field name


tm_shape(africa_shp) +
  tm_polygons() +
  tm_text("NAME", size = 0.5) +  # Replace with correct column
  tm_title ("Africa Countries")


ggplot(africa_shp) +
  geom_sf(fill = "lightgray", color = "black") +
  geom_sf_text(aes(label = NAME), size = 2) +  # Replace as needed
  ggtitle("Africa countries") +
  theme_minimal()

# Join the incidence data to the shapefile
mapdata <- join_data(africa_shp, panc_incidence, by = "NAME")

## Or join/merge the data and shapefile with dplyr:
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
## e.g. mapdata <- left_join(nat, codata, by = "DISTRICT_N")
str(mapdata)
#> Classes 'sf' and 'data.frame':   53 obs. of  26 variables:
#>  $ OBJECTID  : int  2 3 5 6 7 8 9 10 11 12 ...
#>  $ FIPS_CNTRY: chr  "UV" "CV" "GA" "GH" ...
#>  $ ISO_2DIGIT: chr  "BF" "CV" "GM" "GH" ...
#>  $ ISO_3DIGIT: chr  "BFA" "CPV" "GMB" "GHA" ...
#>  $ NAME      : chr  "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#>  $ COUNTRYAFF: chr  "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#>  $ CONTINENT : chr  "Africa" "Africa" "Africa" "Africa" ...
#>  $ TOTPOP    : int  20107509 560899 2051363 27499924 12413867 1792338 4689021 17885245 3758571 33986655 ...
#>  $ incidence : num  330.4 53.4 31.4 856.3 163.1 ...
#>  $ female    : num  1683 362 140 4566 375 ...
#>  $ male      : num  1869 211 197 4640 1378 ...
#>  $ ageb      : num  669.7 93.7 68.7 2047 336.7 ...
#>  $ agec      : num  2878 480 268 7147 1414 ...
#>  $ agea      : num  4.597 0.265 0.718 11.888 2.13 ...
#>  $ fageb     : num  250.3 40.2 23.1 782 59.1 ...
#>  $ fagec     : num  1429 322 116 3775 315 ...
#>  $ fagea     : num  3.413 0.146 0.548 8.816 1.228 ...
#>  $ mageb     : num  419.5 53.5 45.6 1265 277.6 ...
#>  $ magec     : num  1448 158 152 3372 1100 ...
#>  $ magea     : num  1.184 0.12 0.17 3.073 0.902 ...
#>  $ yra       : num  182.4 30.2 16.6 524.7 73.1 ...
#>  $ yrb       : num  187.2 34.1 17.1 552.6 74.9 ...
#>  $ yrc       : num  193.1 35 18 578.5 76.9 ...
#>  $ yrd       : num  198.5 35.9 18.3 602.7 78.6 ...
#>  $ yre       : num  204.3 36.5 18.7 621.5 79.4 ...
#>  $ geometry  :sfc_MULTIPOLYGON of length 53; first list element: List of 1
#>   ..$ :List of 1
#>   .. ..$ : num [1:317, 1:2] 102188 90385 80645 74151 70224 ...
#>   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
#>  - attr(*, "sf_column")= chr "geometry"
#>  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
#>   ..- attr(*, "names")= chr [1:25] "OBJECTID" "FIPS_CNTRY" "ISO_2DIGIT" "ISO_3DIGIT" ...
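# Before mapping, it is worth checking for countries that failed to match on NAME
# (a quick sketch; zero rows means the join was complete):
anti_join(st_drop_geometry(africa_shp), panc_incidence, by = "NAME")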
# Visualize pancreatic cancer incidence by country
# Quantile choropleth maps for each variable
# Helper to avoid repeating the same tmap specification twelve times
quant_map <- function(data, var, title, palette = "brewer.reds") {
  tm_shape(data) +
    tm_fill(var,
            fill.scale = tm_scale_intervals(values = palette, style = "quantile"),
            fill.legend = tm_legend(title = title)) +
    tm_borders(fill_alpha = .3) + tm_compass() +
    tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
              frame = TRUE, component.autoscale = FALSE)
}

p1  <- quant_map(mapdata, "incidence", "Incidence")
p2  <- quant_map(mapdata, "female", "Female")
p3  <- quant_map(mapdata, "male", "Male")
p4  <- quant_map(mapdata, "ageb", "Age:20-54yrs")
p5  <- quant_map(mapdata, "agec", "Age:55+yrs")
p6  <- quant_map(mapdata, "agea", "Age:<20yrs")
p7  <- quant_map(mapdata, "fageb", "Female:20-54yrs")
p8  <- quant_map(mapdata, "fagec", "Female:55+yrs")
p9  <- quant_map(mapdata, "fagea", "Female:<20yrs")
p10 <- quant_map(mapdata, "mageb", "Male:20-54yrs")
p11 <- quant_map(mapdata, "magec", "Male:55+yrs")
p12 <- quant_map(mapdata, "magea", "Male:<20yrs")

current.mode <- tmap_mode("plot")
tmap_arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, 
             widths = c(.75, .75))

tmap_mode(current.mode)
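# To export the arranged panel to disk, tmap_save() accepts the arrange object
# (a sketch; the filename and dimensions are illustrative):
# tmap_save(tmap_arrange(p1, p2, p3, p4), "incidence_panels.png", width = 12, height = 9)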
## Machine Learning Model building
# 1. Random Forest Regression
# Train Random Forest
set.seed(123)
rf_model <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                           magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500, 
                         importance = TRUE)

# View model results
print(rf_model)
#> 
#> Call:
#>  randomForest(formula = incidence ~ female + male + agea + ageb +      agec + fagea + fageb + fagec + magea + mageb + magec + yrb +      yrc + yrd + yre, data = mapdata, ntree = 500, importance = TRUE) 
#>                Type of random forest: regression
#>                      Number of trees: 500
#> No. of variables tried at each split: 5
#> 
#>           Mean of squared residuals: 76299.33
#>                     % Var explained: 91.35
varImpPlot(rf_model)


# Plot variable importance
importance_df <- data.frame(
  Variable = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "IncNodePurity"])

ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance (IncNodePurity)", x = "Variable", y = "Importance")


# 2. Gradient Boosting Machine (XGBoost)
# Prepare the data
x_vars <- model.matrix(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                         magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata)[,-1]
y <- mapdata$incidence

# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_vars, label = y)

# Train the model with xgb.train()
params <- list(
  objective = "reg:squarederror",
  max_depth = 4,
  learning_rate = 0.1,
  verbosity = 0
)

xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100
)
# Feature importance
xgb.importance(model = xgb_model)
#>     Feature         Gain       Cover   Frequency
#>      <char>        <num>       <num>       <num>
#>  1:  female 9.907517e-01 0.604198127 0.614457831
#>  2:    agec 5.419284e-03 0.084314910 0.105421687
#>  3:    ageb 2.267636e-03 0.113560858 0.105421687
#>  4:    male 1.553178e-03 0.114869626 0.106927711
#>  5:   mageb 5.295916e-06 0.013188362 0.013554217
#>  6:   fagec 1.610951e-06 0.019832880 0.016566265
#>  7:     yrd 4.180083e-07 0.007449914 0.006024096
#>  8:    agea 4.159760e-07 0.018222088 0.013554217
#>  9:   magea 2.949594e-07 0.015201852 0.010542169
#> 10:   fageb 7.749886e-08 0.003976644 0.003012048
#> 11:     yrb 4.104050e-08 0.003070573 0.003012048
#> 12:   fagea 3.490967e-09 0.002114165 0.001506024
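# xgboost can plot the same importance table directly (a sketch):
xgb.plot.importance(xgb.importance(model = xgb_model), top_n = 10)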

# 3. Support Vector Regression (SVR)
# Train SVR model
svr_model <- svm(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                   magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, 
                 type = "eps-regression", 
                 kernel = "radial")

# Summary and predictions
summary(svr_model)
#> 
#> Call:
#> svm(formula = incidence ~ female + male + agea + ageb + agec + fagea + 
#>     fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, 
#>     data = mapdata, type = "eps-regression", kernel = "radial")
#> 
#> 
#> Parameters:
#>    SVM-Type:  eps-regression 
#>  SVM-Kernel:  radial 
#>        cost:  1 
#>       gamma:  0.06666667 
#>     epsilon:  0.1 
#> 
#> 
#> Number of Support Vectors:  7
#mapdata$pred_svr <- predict(svr_model)
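# The default cost/gamma above are rarely optimal; e1071::tune() can grid-search them
# (a minimal sketch; the grid values are illustrative):
svr_tuned <- tune(
  svm,
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  ranges = list(cost = c(0.5, 1, 2), gamma = c(0.01, 0.05, 0.1))
)
# summary(svr_tuned)  # best parameters and cross-validated error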
# Model evaluation (in-sample predictions): evaluate each model step by step.
# Note: these metrics are computed on the training data, so flexible models
# (especially XGBoost here) will look optimistic; the cross-validation section
# below gives a fairer comparison.
# 1. Random Forest Evaluation
rf_preds <- predict(rf_model, newdata = mapdata)
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
print(rf_metrics)
#>       RMSE   Rsquared        MAE 
#> 98.2197100  0.9976235 30.2558297
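# postResample is equivalent to computing the metrics by hand (a quick check):
sqrt(mean((rf_preds - mapdata$incidence)^2))  # RMSE
mean(abs(rf_preds - mapdata$incidence))       # MAE
cor(rf_preds, mapdata$incidence)^2            # R-squared (squared correlation)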

# 2. XGBoost Evaluation
xgb_preds <- predict(xgb_model, newdata = x_vars)
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
print(xgb_metrics)
#>      RMSE  Rsquared       MAE 
#> 2.1004543 0.9999983 0.7290456

# 3. SVR Evaluation
svr_preds <- predict(svr_model, newdata = mapdata)
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
print(svr_metrics)
#>        RMSE    Rsquared         MAE 
#> 445.9372342   0.9178101 127.7105534

# Compare All Models
model_eval <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
  MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
  Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"]))

print(model_eval)
#>           Model       RMSE         MAE  Rsquared
#> 1 Random Forest  98.219710  30.2558297 0.9976235
#> 2       XGBoost   2.100454   0.7290456 0.9999983
#> 3           SVR 445.937234 127.7105534 0.9178101

# Choosing the best model
# Lowest RMSE and MAE = most accurate predictions
# Highest R² = most variance explained
# Sort and interpret:
model_eval[order(model_eval$RMSE), ]  # Best = Top row
#>           Model       RMSE         MAE  Rsquared
#> 2       XGBoost   2.100454   0.7290456 0.9999983
#> 1 Random Forest  98.219710  30.2558297 0.9976235
#> 3           SVR 445.937234 127.7105534 0.9178101
#### Model prediction plots
# Combine the predictions into a data frame
model_preds <- data.frame(rf_preds, xgb_preds, svr_preds)
# Add observation ID
model_preds$ID <- 1:nrow(model_preds)

# Reshape for plotting
model_long <- model_preds %>%
  tidyr::pivot_longer(cols = c("rf_preds", "xgb_preds", "svr_preds"), names_to = "Model", values_to = "Predicted")

# Plot
ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) +
  geom_line(linewidth = 0.5) +
  labs(title = "Model Predictions Over Observations",
       x = "Observation", y = "Predicted Incidence") +
  theme_minimal()
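# A useful variant overlays the observed incidence for reference (a sketch):
obs_df <- data.frame(ID = model_preds$ID, Observed = mapdata$incidence)
ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) +
  geom_line(linewidth = 0.5) +
  geom_line(data = obs_df, aes(x = ID, y = Observed),
            color = "black", linetype = "dashed", inherit.aes = FALSE) +
  labs(title = "Model Predictions vs Observed Incidence",
       x = "Observation", y = "Incidence") +
  theme_minimal()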


## Plot actual vs predicted (using the prediction vectors computed above)
oldpar <- par(mfrow = c(1, 3)) # 3 plots side-by-side

# Random Forest
plot(mapdata$incidence, rf_preds,
     xlab = "Observed", ylab = "Predicted",
     main = "Random Forest", pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2)

# XGBoost
plot(mapdata$incidence, xgb_preds,
     xlab = "Observed", ylab = "Predicted",
     main = "XGBoost", pch = 19, col = "darkgreen")
abline(0, 1, col = "red", lwd = 2)

# SVR
plot(mapdata$incidence, svr_preds,
     xlab = "Observed", ylab = "Predicted",
     main = "SVR", pch = 19, col = "purple")
abline(0, 1, col = "red", lwd = 2)


par(oldpar)

## Actual vs predicted plot with correlations

library(ggpubr)  # for stat_cor()

# Store predictions on mapdata (same newdata as used for the metrics above)
mapdata$rf_pred <- predict(rf_model, newdata = mapdata)
mapdata$xgb_pred <- predict(xgb_model, newdata = x_vars)
mapdata$svr_pred <- predict(svr_model, newdata = mapdata)

# Random Forest Plot
p1 <- ggplot(mapdata, aes(x = incidence, y = rf_pred)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(method = "pearson", aes(label = after_stat(rr.label)), label.x = 0) +
  labs(title = "Random Forest", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

# XGBoost Plot
p2 <- ggplot(mapdata, aes(x = incidence, y = xgb_pred)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(method = "pearson", aes(label = after_stat(rr.label)), label.x = 0) +
  labs(title = "XGBoost", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

# SVR Plot
p3 <- ggplot(mapdata, aes(x = incidence, y = svr_pred)) +
  geom_point(color = "purple", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(method = "pearson", aes(label = after_stat(rr.label)), label.x = 0) +
  labs(title = "SVR", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

combined_plot <- ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = FALSE)
combined_plot
## CROSS-VALIDATION
#Step 1: Prepare common setup
# Set seed for reproducibility
set.seed(123)
library(caret)
# Define 3-fold cross-validation
cv_control <- trainControl(method = "cv", number = 3)

# 1. Random Forest
library(randomForest)

rf_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "rf",
  trControl = cv_control,
  tuneLength = 3,
  importance = TRUE
)
print(rf_cv)
#> Random Forest 
#> 
#> 53 samples
#> 16 predictors
#> 
#> No pre-processing
#> Resampling: Cross-Validated (3 fold) 
#> Summary of sample sizes: 36, 34, 36 
#> Resampling results across tuning parameters:
#> 
#>   mtry  RMSE      Rsquared   MAE     
#>    2    444.8157  0.8580149  161.2283
#>    8    450.7475  0.8548550  163.6862
#>   15    454.0229  0.8542215  165.2292
#> 
#> RMSE was used to select the optimal model using the smallest value.
#> The final value used for the model was mtry = 2.

# 2. XGBoost
library(xgboost)
mapdata <- st_drop_geometry(mapdata)  # caret/xgboost expect a plain data frame, not sf
mapdata$incidence <- as.numeric(mapdata$incidence)

# Redefine CV as 5-fold (also reused by the SVR fit below)
cv_control <- trainControl(
  method = "cv",
  number = 5
)

# Cross-validate XGBoost with xgb.cv()
xgb_cv <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 100,
  nfold = 5,           # 5-fold CV
  early_stopping_rounds = 10, # stop if no improvement in 10 rounds
  metrics = list("rmse"),
  verbose = 0
)
print(xgb_cv)
#> ##### xgb.cv 5-folds
#>   iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#>  <int>           <num>          <num>          <num>         <num>
#>      1      869.239393    127.3707020       790.2016      556.6500
#>      2      811.895853    113.6967223       753.6544      548.5258
#>      3      758.695958    101.1327812       719.3175      541.3886
#>      4      709.258868     89.5985897       687.8282      534.0328
#>      5      663.309677     79.0397616       658.7448      527.2620
#>      6      620.545640     69.4028269       632.3349      520.5086
#>      7      580.734477     60.6017161       608.0614      514.3051
#>      8      543.630439     52.5890340       585.6143      508.4693
#>      9      509.061361     45.2924036       565.3273      502.8183
#>     10      476.811169     38.6662221       545.6182      498.5917
#>     11      446.728563     32.6587860       527.4934      495.3552
#>     12      418.645545     27.2396970       510.8609      491.8151
#>     13      392.428753     22.3383309       495.3064      488.7539
#>     14      367.936479     17.9487507       480.7644      485.8850
#>     15      345.050737     14.0417407       467.7380      482.9319
#>     16      323.657820     10.5989823       455.5772      480.3232
#>     17      303.658221      7.6468438       444.2999      478.0095
#>     18      284.949110      5.3008448       433.7681      476.0269
#>     19      267.446830      3.9041583       423.8386      474.4026
#>     20      251.072552      3.8834340       414.6807      472.9013
#>     21      235.748441      4.8799317       406.5349      471.3381
#>     22      221.402044      6.2001606       398.7159      470.1287
#>     23      207.972113      7.5265587       391.4943      469.2266
#>     24      195.395832      8.7500043       384.7359      468.2956
#>     25      183.618137      9.8448664       378.4595      467.4727
#>     26      172.579433     10.8036044       372.8544      466.6354
#>     27      162.233649     11.6317973       367.5725      465.6864
#>     28      152.531762     12.3298351       362.3877      464.9543
#>     29      143.434085     12.9128515       357.7527      464.1551
#>     30      134.899409     13.3869674       353.4512      463.4164
#>     31      126.894472     13.7625645       349.4280      462.7497
#>     32      119.383493     14.0499628       345.6474      462.1675
#>     33      112.336085     14.2548206       342.1012      461.6551
#>     34      105.721359     14.3870126       338.8096      461.1720
#>     35       99.516489     14.4496212       335.7164      460.7433
#>     36       93.683744     14.4612375       332.8313      460.3522
#>     37       88.207532     14.4191607       330.1307      460.0082
#>     38       83.066339     14.3300654       327.5992      459.7049
#>     39       78.239213     14.2001212       325.2061      459.4532
#>     40       73.706952     14.0344257       322.9586      459.2320
#>     41       69.453772     13.8329782       320.8131      459.0585
#>     42       65.455340     13.6042703       318.8138      458.8997
#>     43       61.694388     13.3553540       316.8836      458.7848
#>     44       58.161545     13.0847456       315.0938      458.6745
#>     45       54.843352     12.7942086       313.4106      458.5800
#>     46       51.727013     12.4870438       311.8410      458.4893
#>     47       48.800109     12.1668617       310.3461      458.4239
#>     48       46.050852     11.8341907       308.9519      458.3600
#>     49       43.470091     11.4911414       307.6304      458.3118
#>     50       41.045067     11.1414634       306.3959      458.2640
#>     51       38.766216     10.7874114       305.2391      458.2214
#>     52       36.628218     10.4270416       304.1478      458.1830
#>     53       34.619590     10.0648898       303.1232      458.1492
#>     54       32.735149      9.7000239       302.1662      458.1140
#>     55       30.967443      9.3330799       301.2729      458.0778
#>     56       29.310898      8.9641345       300.4338      458.0435
#>     57       27.756956      8.5961935       299.6458      458.0095
#>     58       26.298972      8.2304126       298.9090      457.9752
#>     59       24.933027      7.8658598       298.2208      457.9401
#>     60       23.653849      7.5020737       297.5853      457.8992
#>     61       22.451003      7.1460083       296.9914      457.8580
#>     62       21.309321      6.8065028       296.4383      457.8143
#>     63       20.226651      6.4820813       295.9174      457.7738
#>     64       19.202495      6.1707325       295.3967      457.7511
#>     65       18.228768      5.8757562       294.9482      457.7059
#>     66       17.310864      5.5886756       294.4956      457.6777
#>     67       16.437824      5.3168546       294.0978      457.6350
#>     68       15.614714      5.0534595       293.7217      457.5942
#>     69       14.832464      4.8022642       293.3740      457.5513
#>     70       14.086469      4.5660667       293.0471      457.5093
#>     71       13.381798      4.3394510       292.7398      457.4681
#>     72       12.712208      4.1234158       292.4562      457.4248
#>     73       12.076774      3.9185987       292.1869      457.3840
#>     74       11.472889      3.7233422       291.9417      457.3399
#>     75       10.899541      3.5380380       291.7111      457.2972
#>     76       10.355206      3.3622312       291.4919      457.2569
#>     77        9.837891      3.1947687       291.2877      457.2165
#>     78        9.344513      3.0377197       291.1026      457.1739
#>     79        8.878307      2.8866616       290.9245      457.1346
#>     80        8.435075      2.7427897       290.7651      457.0920
#>     81        8.014227      2.6064334       290.6128      457.0525
#>     82        7.614482      2.4769421       290.4711      457.0131
#>     83        7.234784      2.3536897       290.3374      456.9753
#>     84        6.873823      2.2364538       290.2146      456.9371
#>     85        6.531273      2.1254459       290.0973      456.9014
#>     86        6.205946      2.0200509       289.9841      456.8685
#>     87        5.896936      1.9198917       289.8765      456.8371
#>     88        5.603172      1.8247213       289.7767      456.8061
#>     89        5.324408      1.7343520       289.6757      456.7801
#>     90        5.059483      1.6487059       289.5910      456.7492
#>     91        4.808006      1.5671828       289.5027      456.7241
#>     92        4.569061      1.4898766       289.4191      456.7002
#>     93        4.342314      1.4165729       289.3420      456.6764
#>     94        4.126835      1.3469663       289.2688      456.6538
#>     95        3.922361      1.2809817       289.1989      456.6325
#>     96        3.728132      1.2182728       289.1344      456.6112
#>     97        3.543669      1.1588310       289.0731      456.5911
#>     98        3.368373      1.1020500       289.0163      456.5712
#>     99        3.202143      1.0484843       288.9618      456.5526
#>    100        3.044323      0.9976821       288.9115      456.5340
#>   iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> Best iteration:
#>   iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#>  <int>           <num>          <num>          <num>         <num>
#>    100        3.044323      0.9976821       288.9115       456.534
best_nrounds <- xgb_cv$best_iteration
# best_iteration can be NULL when early stopping never triggers; fall back to the log
if (is.null(best_nrounds)) best_nrounds <- which.min(xgb_cv$evaluation_log$test_rmse_mean)
cat("Best number of rounds:", best_nrounds, "\n")
#> Best number of rounds: 100
# Extract mean RMSE
mean_rmse <- min(xgb_cv$evaluation_log$test_rmse_mean)
cat("XGBoost CV RMSE:", mean_rmse, "\n")
#> XGBoost CV RMSE: 288.9115
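# Refit the final model at the selected number of rounds (a sketch):
xgb_final <- xgb.train(params = params, data = dtrain, nrounds = best_nrounds)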


# 3. Support Vector Regression (SVR)
library(e1071)
library(kernlab)

svr_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "svmRadial",
  trControl = cv_control,
  preProcess = c("center", "scale"),  # SVR often benefits from scaling
  tuneLength = 3
)
print(svr_cv)
#> Support Vector Machines with Radial Basis Function Kernel 
#> 
#> 53 samples
#> 15 predictors
#> 
#> Pre-processing: centered (15), scaled (15) 
#> Resampling: Cross-Validated (5 fold) 
#> Summary of sample sizes: 43, 42, 42, 43, 42 
#> Resampling results across tuning parameters:
#> 
#>   C     RMSE      Rsquared   MAE     
#>   0.25  713.0466  0.4939345  363.8929
#>   0.50  704.6990  0.4971961  355.2001
#>   1.00  701.0369  0.5012774  350.6448
#> 
#> Tuning parameter 'sigma' was held constant at a value of 17.63942
#> RMSE was used to select the optimal model using the smallest value.
#> The final values used for the model were sigma = 17.63942 and C = 1.
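# Collect the cross-validated RMSEs for a fairer, out-of-sample comparison than the
# in-sample table above (note the folds differ: RF used 3-fold, XGBoost and SVR 5-fold):
cv_rmse <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  CV_RMSE = c(min(rf_cv$results$RMSE), mean_rmse, min(svr_cv$results$RMSE))
)
print(cv_rmse)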
## Spatial maps of predicted values of each model
# 1. Random Forest spatial map
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")  # re-join to restore the geometry dropped above
mapdata$pred_rf <- predict(rf_model, newdata = mapdata)

map_pred_rf <- tm_shape(mapdata) + 
  tm_fill("pred_rf", fill.scale = tm_scale_intervals(values = "brewer.greens", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)
map_pred_rf


# 2. XGBoost spatial map
# Use the same model matrix as in training
mapdata$pred_xgb <- predict(xgb_model, newdata = x_vars)

map_pred_xgb <- tm_shape(mapdata) + 
  tm_fill("pred_xgb", fill.scale = tm_scale_intervals(values = "brewer.purples", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)
map_pred_xgb


# 3. Support Vector Regression (SVR) spatial map
mapdata$pred_svr <- predict(svr_model, newdata = mapdata)

map_pred_svr <- tm_shape(mapdata) + 
  tm_fill("pred_svr", fill.scale = tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)
map_pred_svr


# Compare side by side (reusing the map objects above)
tmap_arrange(map_pred_rf, map_pred_xgb, map_pred_svr, nrow = 1)

# Predicted residuals
# All three models are trained, and mapdata$incidence holds the observed values.
### Step 1: Generate predictions and residuals for each model
# Random Forest
rf_preds <- predict(rf_model, newdata = mapdata)
rf_resid <- mapdata$incidence - rf_preds

# XGBoost
xgb_preds <- predict(xgb_model, newdata = x_vars)  # x_vars = model.matrix(...)
xgb_resid <- mapdata$incidence - xgb_preds

# SVR
svr_preds <- predict(svr_model, newdata = mapdata)
svr_resid <- mapdata$incidence - svr_preds

### Step 2: Combine into a single data frame
residuals_df <- data.frame(
  actual = mapdata$incidence,
  rf_pred = rf_preds,
  rf_resid = rf_resid,
  xgb_pred = xgb_preds,
  xgb_resid = xgb_resid,
  svr_pred = svr_preds,
  svr_resid = svr_resid
)
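# Quick numeric summary of the residuals before plotting (a sketch):
summary(residuals_df[, c("rf_resid", "xgb_resid", "svr_resid")])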

# Export the residual table to Excel if needed (filename illustrative)
library(writexl)
# write_xlsx(residuals_df, "model_residuals.xlsx")

### Compare residual distributions

boxplot(residuals_df$rf_resid, residuals_df$xgb_resid, residuals_df$svr_resid,
        names = c("RF", "XGB", "SVR"),
        main = "Model Residuals",
        ylab = "Prediction Error (Residual)")

## Spatial maps of residual values from each model
#Add residuals to mapdata
#You should already have these from the previous steps:
mapdata$rf_resid <- residuals_df$rf_resid
mapdata$xgb_resid <- residuals_df$xgb_resid
mapdata$svr_resid <- residuals_df$svr_resid

# Set tmap mode to plot (static map)
tmap_mode("plot")

# Create individual residual maps
map_rf <- tm_shape(mapdata) +
  tm_fill("rf_resid", fill.scale = tm_scale_intervals(values = "brewer.greens", style = "quantile",
                                                      midpoint = 0),
          fill.legend = tm_legend(title = "Inci_rf_resid")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
                                          frame = TRUE, component.autoscale = FALSE)

map_xgb <- tm_shape(mapdata) +
  tm_fill("xgb_resid", fill.scale = tm_scale_intervals(values = "brewer.purples", style = "quantile",
                                                       midpoint = 0),
          fill.legend = tm_legend(title = "Inci_xgb_resid")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
                                          frame = TRUE, component.autoscale = FALSE)

map_svr <- tm_shape(mapdata) +
  tm_fill("svr_resid", fill.scale = tm_scale_intervals(values = "brewer.reds", style = "quantile",
                                                       midpoint = 0),
          fill.legend = tm_legend(title = "Inci_svr_resid")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
                                          frame = TRUE, component.autoscale = FALSE)

# Step 3: Combine the maps in a grid layout
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

## Barplots and spatial maps of RMSE and MAE
# Step 1: Calculate RMSE and MAE for each model
# Random Forest
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)

# XGBoost
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)

# SVR
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)

#Step 2: Combine into a summary data frame
model_eval <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
  MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
  Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"])
)

print(model_eval)
#>           Model       RMSE         MAE  Rsquared
#> 1 Random Forest  98.219710  30.2558297 0.9976235
#> 2       XGBoost   2.100454   0.7290456 0.9999983
#> 3           SVR 445.937234 127.7105534 0.9178101

# Visualize RMSE, MAE, and R-squared
oldpar <- par(mfrow = c(1, 3))

#Barplot of RMSE
barplot(model_eval$RMSE, names.arg = model_eval$Model, 
        col = "skyblue", las = 1, main = "Model RMSE", ylab = "RMSE")

#Barplot of MAE
barplot(model_eval$MAE, names.arg = model_eval$Model, 
        col = "lightgreen", las = 1, main = "Model MAE", ylab = "MAE")
# Barplot of R-squared
barplot(model_eval$Rsquared, names.arg = model_eval$Model, 
        col = "grey", las = 1, main = "Model Rsquared", ylab = "Rsquared")


par(oldpar)

# Add metrics to the residual maps as legend titles
map_rf <- tm_shape(mapdata) +
  tm_fill("rf_resid", fill.scale = tm_scale_intervals(values = "brewer.greens", midpoint = 0),
          fill.legend = tm_legend(title = paste0("rf_resid\nRMSE: ", round(rf_metrics["RMSE"], 2),
                                                 "\nMAE: ", round(rf_metrics["MAE"], 2)))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

map_xgb <- tm_shape(mapdata) +
  tm_fill("xgb_resid", fill.scale = tm_scale_intervals(values = "-RdBu", midpoint = 0),
          fill.legend = tm_legend(title = paste0("xgb_resid\nRMSE: ", round(xgb_metrics["RMSE"], 2),
                                                 "\nMAE: ", round(xgb_metrics["MAE"], 2)))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

map_svr <- tm_shape(mapdata) +
  tm_fill("svr_resid", fill.scale = tm_scale_intervals(values = "-RdBu", midpoint = 0),
          fill.legend = tm_legend(title = paste0("svr_resid\nRMSE: ", round(svr_metrics["RMSE"], 2),
                                                 "\nMAE: ", round(svr_metrics["MAE"], 2)))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

# Global Moran's I, local Moran's I (LISA), cluster categories (e.g., High-High, Low-Low),
# and LISA cluster maps using the predicted values from the three machine learning models.
# Assumptions: the spatial data are in mapdata (an sf object) and the predicted values are
# stored in rf_preds, xgb_preds, and svr_preds.
# STEP 1: Create spatial weights (if not done yet)
neighbors <- poly2nb(mapdata)  # if this warns, repair the geometries and retry as below
mapdata <- st_as_sf(mapdata)       # ensure mapdata is sf
mapdata <- st_make_valid(mapdata)  # fix any invalid geometries
neighbors <- poly2nb(mapdata, snap = 1e-15)  # increase snap (1e-6, 1e-5, ...) until the warnings disappear

listw <- nb2listw(neighbors, style = "W", zero.policy = TRUE)
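# Inspect the neighbour structure; islands (units with no neighbours) explain the
# "n reduced by no-neighbour observations" notes in the Moran tests below:
summary(neighbors)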

#STEP 2: Define a function to compute Moran’s I and cluster categories
analyze_spatial_autocorrelation <- function(mapdata, values, listw, model_name, signif_level = 0.05) {
  # Standardize predicted values
  mapdata$val_st <- scale(values)[, 1]
  # Compute lag
  mapdata$lag_val <- lag.listw(listw, mapdata$val_st, zero.policy = TRUE)
  # Global Moran's I
  global_moran <- moran.test(values, listw, zero.policy = TRUE)
  # Local Moran's I (LISA)
  lisa <- localmoran(values, listw, zero.policy = TRUE)
  lisa_df <- as.data.frame(lisa)
  #rename p-value column
  colnames(lisa_df)[5] <- "Pr_z"
  # Add to mapdata
  mapdata$Ii <- lisa_df$Ii
  mapdata$Z_Ii <- lisa_df$Z.Ii
  mapdata$Pr_z <- lisa_df$Pr_z
  #mapdata$Pr_z <- lisa_df[, "Pr(z > 0)"]
  # Classify clusters
  mapdata <- mapdata %>%
    mutate(
      cluster = case_when(
        val_st > 0 & lag_val > 0 & Pr_z <= signif_level ~ "High-High",
        val_st < 0 & lag_val < 0 & Pr_z <= signif_level ~ "Low-Low",
        val_st < 0 & lag_val > 0 & Pr_z <= signif_level ~ "Low-High",
        val_st > 0 & lag_val < 0 & Pr_z <= signif_level ~ "High-Low",
        TRUE ~ "Not Significant"
      )
    )
  return(list(
    map = mapdata,
    global_moran = global_moran
  ))
}

#STEP 3: Run the function for each model
rf_result <- analyze_spatial_autocorrelation(mapdata, rf_preds, listw, "Random Forest")
xgb_result <- analyze_spatial_autocorrelation(mapdata, xgb_preds, listw, "XGBoost")
svr_result <- analyze_spatial_autocorrelation(mapdata, svr_preds, listw, "SVR")
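# Tabulate the cluster categories per model (a quick sketch):
table(rf_result$map$cluster)
table(xgb_result$map$cluster)
table(svr_result$map$cluster)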

#STEP 4: Mapping LISA Clusters and High-High Areas for Random Forest
tmap_mode("plot")
# LISA cluster map
tm_rf <- tm_shape(rf_result$map) +
  tm_fill(
    "cluster",
    fill.scale = tm_scale(values = c(
      "High-High" = "red",
      "Low-Low" = "blue",
      "Low-High" = "lightblue",
      "High-Low" = "pink",
      "Not Significant" = "gray90")),
    fill.legend = tm_legend(title = "LISA Clusters - RF")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) 

# Mapping LISA Clusters and High-High Areas for XGBoost
tmap_mode("plot")
# LISA Cluster Map
tm_xgb <- tm_shape(xgb_result$map) +
  tm_fill("cluster", 
          fill.scale = tm_scale(values = c(
            "High-High" = "red", 
            "Low-Low" = "blue",
            "Low-High" = "lightblue", 
            "High-Low" = "pink",
            "Not Significant" = "gray90")),
          fill.legend = tm_legend(title = "LISA Clusters - XGBoost")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) 

#Mapping LISA Clusters and High-High Areas for SVR
tmap_mode("plot")
# LISA Cluster Map
tm_svr <- tm_shape(svr_result$map) +
  tm_fill("cluster", 
          fill.scale = tm_scale(values = c(
            "High-High" = "red", 
            "Low-Low" = "blue",
            "Low-High" = "lightblue", 
            "High-Low" = "pink",
            "Not Significant" = "gray90")),
          fill.legend = tm_legend(title = "LISA Clusters - SVR")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

tmap_arrange(tm_rf, tm_xgb, tm_svr, nrow = 1)


# View global Moran's I results
# These print the test statistic and p-value for global spatial autocorrelation of the predictions.
rf_result$global_moran
#> 
#>  Moran I test under randomisation
#> 
#> data:  values  
#> weights: listw  
#> n reduced by no-neighbour observations  
#> 
#> Moran I statistic standard deviate = -0.24595, p-value = 0.5971
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic       Expectation          Variance 
#>      -0.044681255      -0.022727273       0.007967958
xgb_result$global_moran
#> 
#>  Moran I test under randomisation
#> 
#> data:  values  
#> weights: listw  
#> n reduced by no-neighbour observations  
#> 
#> Moran I statistic standard deviate = -0.32513, p-value = 0.6275
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic       Expectation          Variance 
#>      -0.051250814      -0.022727273       0.007696578
svr_result$global_moran
#> 
#>  Moran I test under randomisation
#> 
#> data:  values  
#> weights: listw  
#> n reduced by no-neighbour observations  
#> 
#> Moran I statistic standard deviate = -0.28034, p-value = 0.6104
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic       Expectation          Variance 
#>       -0.05111299       -0.02272727        0.01025262
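# The same test applied to the residuals is the more common diagnostic: spatially
# clustered residuals point to an unmodelled spatial effect (a sketch using the RF residuals):
moran.test(mapdata$rf_resid, listw, zero.policy = TRUE)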