---
title: "B_01_RandomForest_all_data.R"
format:
  html:
    self-contained: true
    embed-resources: true
    toc: true # optional: adds a table of contents
    theme: cosmo # optional: Bootstrap theme
    code-fold: show # optional: collapsible code blocks
    code-tools: true # optional: adds copy/paste buttons
    toc-depth: 4
editor:
  markdown:
    wrap: 72
---
# B - 01: RandomForest (pooled models)
```{r}
#| echo: false
#| message: false
#| warning: false
#| results: "hide"
#| output: false
rm(list = ls())
gc()
```
## Libraries
```{r}
#| message: false
#| warning: false
#| results: "hide"
#| output: false
suppressPackageStartupMessages({
  library(here)
  source(here::here("Code/00_Configuration.R"))
  package_list <-
    c(
      package_list,
      "ranger",
      "tidymodels",
      "caret",
      "skimr",
      "DALEXtra",
      "DALEX",
      "ggthemes",
      "hstats"
    )
  lapply(package_list, require, character.only = TRUE)
  tidymodels_prefer()
})
rm(list = ls())
set.seed(123)
```
## Read data
```{r}
#| message: false
#| warning: false
#| error: false
dta <-
  readRDS(
    here::here(
      "Data/output/1_all_predictors_merged.rds"
    )
  ) %>%
  filter(samplingPeriodID == 1)
```
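A quick structural check of the merged predictor table can catch join problems early. This is an optional sketch using `skimr` (already in the package list above); it only inspects `dta` and changes nothing.

```{r}
#| eval: false
# Optional sanity check (sketch): dimensions, column types, missingness
dplyr::glimpse(dta)
skimr::skim(dta)
```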
## Set variable vectors for hypotheses
```{r}
sp_id <-
  c(
    "verbatimIdentification",
    "scientificName"
  )
H1 <-
  c(
    "Mass",
    "GlobRangeSize_km2",
    "Migration",
    "Habitat_5",
    "Generalism",
    "Threatened",
    "pd"
  )
H2 <-
  c(
    "D_AOO_a",
    "mean_lnLac",
    "AOO",
    "joincount_delta",
    "circNorm",
    "minDist_toBorder_centr"
  )
H3 <-
  c("datasetID")
predictors <-
  c(H1, H2, H3)
responses <-
  c(
    "Jaccard_dissim",
    "log_R2_1",
    "log_R2_1_per_year" # yearly rate
  )
```
## Make model data
Reduce to model variables
```{r}
dta_new <-
  dta %>%
  select(all_of(c(sp_id, responses, H3, H1, H2))) %>%
  ungroup()
```
Create data subsets for both responses
```{r}
dta_J <-
  dta_new %>%
  select(-log_R2_1, -log_R2_1_per_year)
dta_lnRR <-
  dta_new %>%
  select(-Jaccard_dissim, -log_R2_1_per_year)
dta_ln_RR_year <-
  dta_new %>%
  select(-Jaccard_dissim, -log_R2_1)
all_datasets <-
  list(dta_J, dta_lnRR, dta_ln_RR_year)
```
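Before looping, it is worth confirming that each subset carries exactly one response column, since the model recipes below rely on that. A minimal sketch over the list defined above:

```{r}
#| eval: false
# Each subset should contain exactly one of the three response columns
vapply(
  all_datasets,
  function(d) sum(responses %in% names(d)),
  integer(1)
)
# expected: 1 1 1
```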
```{r}
#| echo: false
#| eval: true
list_res <-
  readRDS(here("Data/output/B_models/B_01_list_all_results_rf.rds"))
tuned_res <-
  list_res$tuned_res
predictions <-
  list_res$predictions
res <-
  list_res$res
list_split_data <-
  list_res$list_split_data
```
Create empty lists to fill in the loop
```{r}
#| eval: false
tuned_res <-
  replicate(length(all_datasets), list())
predictions <-
  replicate(length(all_datasets), list())
res <-
  replicate(length(all_datasets), list())
list_split_data <-
  replicate(length(all_datasets), list())
```
# Fit RandomForest models
### START LOOP
```{r}
#| eval: false
set.seed(123)
for (data_i in seq_along(all_datasets)) {
  set.seed(123)
  model_dd <- all_datasets[[data_i]]

  # create data split (80/20)
  set.seed(123)
  split_info <- initial_split(model_dd, strata = datasetID, prop = 0.8)
  train <- training(split_info)
  test <- testing(split_info)

  # create 3x repeated 10-fold cross-validation subsets from training data
  set.seed(123)
  folds <- vfold_cv(train, v = 10, repeats = 3)

  # save splits to list for later reference
  list_split_data[[data_i]] <- list(
    split_info = split_info,
    train = train,
    test = test,
    folds = folds
  )

  # Model recipes for the different response vars:
  if (responses[data_i] == "Jaccard_dissim") {
    mod_rec <-
      recipe(Jaccard_dissim ~ ., data = train) %>%
      update_role(all_of(sp_id),
        new_role = "speciesID",
        old_role = "predictor"
      )
  } else if (responses[data_i] == "log_R2_1") {
    mod_rec <-
      recipe(log_R2_1 ~ ., data = train) %>%
      update_role(all_of(sp_id),
        new_role = "speciesID",
        old_role = "predictor"
      )
  } else if (responses[data_i] == "log_R2_1_per_year") {
    mod_rec <-
      recipe(log_R2_1_per_year ~ ., data = train) %>%
      update_role(all_of(sp_id),
        new_role = "speciesID",
        old_role = "predictor"
      )
  }

  # Specify model engine parameters & indicate tuning
  set.seed(123)
  mod_spec <- rand_forest(
    mtry = tune(),
    trees = tune(),
    min_n = tune()
  ) %>%
    set_engine("ranger",
      importance = "permutation",
      respect.unordered.factors = TRUE,
      always.split.variables = "datasetID"
    ) %>%
    set_mode("regression")

  # Create model workflow
  set.seed(123)
  mod_wf <- workflow() %>%
    add_recipe(mod_rec) %>%
    add_model(mod_spec)

  # Bayesian hyperparameter tuning over the following parameter ranges
  set.seed(123)
  rf_params <- parameters(
    mtry(range = c(2L, 10L)),
    min_n(range = c(5L, 15L)),
    trees(range = c(1000L, 5000L))
  )

  tictoc::tic()
  tuned_bayes <-
    tune_bayes(
      mod_wf,
      resamples = folds,
      param_info = rf_params,
      initial = 5,
      iter = 50,
      control = control_bayes(
        verbose = TRUE,
        no_improve = 10,
        seed = 123
      )
    )
  tictoc::toc()

  saveRDS(
    tuned_bayes,
    paste0(
      here::here("Data/output/temp/B_01_"),
      responses[data_i],
      "_tuned_bayes.rds"
    )
  )

  # select the best-fitting hyperparameters based on smallest RMSE
  best <-
    tuned_bayes %>%
    select_best(metric = "rmse")

  # save tuned results & best parameters to list for later reference
  tuned_res[[data_i]] <-
    list(tuned_bayes, best)

  # Fit final model
  final_wf <-
    mod_wf %>%
    finalize_workflow(best)

  # Evaluate the final model on the testing data directly (not OOB, not CV)
  final_fit <-
    final_wf %>%
    last_fit(split_info)

  saveRDS(
    final_fit,
    paste0(
      here::here("Data/output/temp/B_01_"),
      responses[data_i],
      "_final_fit.rds"
    )
  )

  # save to results list
  res[[data_i]] <- final_fit

  # Predictions on testing data
  p <-
    predict(
      final_fit %>%
        tune::extract_workflow(),
      new_data = test
    ) %>%
    bind_cols(
      test %>%
        select(
          all_of(responses[data_i]), # response column
          verbatimIdentification,
          scientificName,
          datasetID
        )
    ) %>%
    setNames(
      c(
        ".pred",
        responses[data_i],
        "verbatimIdentification",
        "scientificName",
        "datasetID"
      )
    ) %>%
    mutate(
      resid = .[[responses[data_i]]] - .pred
    )

  # save predictions to list
  predictions[[data_i]] <- p
}
```
### END LOOP
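After the loop, the tuning objects stored in `tuned_res` can be inspected before trusting the finalized fits. A sketch, assuming the loop above has run (each element holds the `tune_bayes()` result first and the selected parameters second):

```{r}
#| eval: false
# Top hyperparameter combinations for the Jaccard model, ranked by RMSE
show_best(tuned_res[[1]][[1]], metric = "rmse", n = 5)
# Search trajectory across the Bayesian tuning iterations
autoplot(tuned_res[[1]][[1]])
```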
# Explainable AI (xAI)
Create empty lists to save results
```{r}
plot_list <-
  replicate(3, list())
res_xAI <-
  replicate(length(res), list())
```
```{r}
#| echo: false
#| eval: true
res_xAI <-
  list_res$res_xAI
```
Start loop across the response variables
```{r}
#| eval: false
for (data_i in seq_along(res)) {
  dd <-
    list_split_data[[data_i]]
  fit <-
    res[[data_i]] %>%
    extract_fit_parsnip()

  explainer <-
    DALEXtra::explain_tidymodels(
      fit,
      data = dd$train %>%
        select(!any_of(c(
          responses,
          "verbatimIdentification",
          "scientificName"
        ))),
      y = dd$train %>%
        pull(any_of(responses))
    )

  # Partial plots / Model profile
  pd <-
    model_profile(explainer,
      groups = "datasetID",
      type = "partial",
      N = NULL,
      variables = explainer$data %>%
        names()
    )

  # Model diagnostics
  md <-
    model_diagnostics(explainer)

  # Model performance
  mp <-
    model_performance(explainer)

  # Variable importance
  vi <-
    model_parts(explainer)

  # Save results to list (model performance included for completeness)
  res_xAI[[data_i]] <-
    list(
      explainer = explainer,
      pd = pd,
      md = md,
      mp = mp,
      vi = vi
    )
}
```
Set output directory for plots
```{r}
out_dir <- here::here("Figures/B_models/xAI/")
```
## Jaccard plots
Unfortunately, rendering breaks the ggplot output here; it is unclear why `plot()` falls back to the base R method instead of dispatching to `DALEX::plot.model_profile()`.
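One workaround worth trying (a sketch, not verified for this document's render): fetch the S3 method explicitly with base R's `getS3method()`, so the DALEX method is used regardless of attach order.

```{r}
#| eval: false
# Force dispatch to DALEX's plot method for model_profile objects
plot_model_profile <- getS3method("plot", "model_profile")
plot_model_profile(res_xAI[[1]]$pd)
```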
```{r}
#| warning: false
#| message: false
#| error: false
#| fig-width: 10
#| fig-height: 10
library ("DALEX" )
plot_list[[1 ]] <-
list (
# partial plots
pd1 =
plot (res_xAI[[1 ]]$ pd),
# model diagnostics
md1 =
plot (res_xAI[[1 ]]$ md %>%
mutate (label = paste0 ("Jaccard" ))) +
ggthemes:: theme_few (),
# variable importances
vi1 =
plot (res_xAI[[1 ]]$ vi %>%
mutate (label = paste0 ("Jaccard" ))) +
ggthemes:: theme_few ()
)
plot_list[[1 ]]
```
## Log ratio plots
```{r}
#| warning: false
#| message: false
#| error: false
#| fig-width: 10
#| fig-height: 10
plot_list[[2]] <-
  list(
    pd2 =
      plot(res_xAI[[2]]$pd),
    md2 =
      plot(res_xAI[[2]]$md %>%
        mutate(label = "log ratio AOO")) +
        ggthemes::theme_few(),
    vi2 =
      plot(res_xAI[[2]]$vi %>%
        mutate(label = "log ratio AOO")) +
        ggthemes::theme_few()
  )
plot_list[[2]]
```
## Yearly rate plots
```{r}
#| warning: false
#| message: false
#| error: false
#| fig-width: 10
#| fig-height: 10
plot_list[[3]] <-
  list(
    pd3 =
      plot(res_xAI[[3]]$pd),
    md3 =
      plot(res_xAI[[3]]$md %>%
        mutate(label = "log ratio yearly rate")) +
        ggthemes::theme_few(),
    vi3 =
      plot(res_xAI[[3]]$vi %>%
        mutate(label = "log ratio yearly rate")) +
        ggthemes::theme_few()
  )
plot_list[[3]]
```
## Save plots to pdf
```{r}
#| eval: false
pdf(
  onefile = TRUE,
  paper = "a4",
  file = here::here(out_dir, "all_data_xAI_plots.pdf")
)
plot_list
dev.off()
```
# Interactions with H-stats
Create empty list for results
```{r}
interaction_res <-
  replicate(length(res), list())
```
```{r}
#| echo: false
#| eval: true
interaction_res <-
  list_res$interactions
```
Start loop
```{r}
#| eval: false
for (data_i in seq_along(list_split_data)) {
  # Prepare data for the hstats function:
  dd <-
    list_split_data[[data_i]]
  fit <-
    res[[data_i]] %>%
    extract_fit_parsnip()
  preds <-
    dd$train %>%
    select(!any_of(
      c(
        responses,
        "verbatimIdentification",
        "scientificName"
      )
    ))

  # run hstats
  interactions <-
    hstats(
      fit,
      X = preds,
      threeway_m = 5
    )

  # save results
  interaction_res[[data_i]] <-
    interactions

  # Plot interactions
  interaction_plot <-
    plot(interactions, which = 1:100) +
    theme_few()

  # save as svg (path components joined by here(), file name built separately)
  ggsave(
    filename =
      here::here(
        "Figures/B_models/interactions/",
        paste0(
          "B_01_",
          responses[data_i],
          "_interactions_hstats.svg"
        )
      ),
    plot = interaction_plot,
    device = "svg",
    width = 10, height = 10
  )
}
```
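The `hstats` objects also have direct accessors; a sketch of how the statistics saved above could be pulled out numerically (assuming `interaction_res` is filled and the `h2()` family of accessors in the hstats package):

```{r}
#| eval: false
# Total interaction strength for the Jaccard model
hstats::h2(interaction_res[[1]])
# Per-feature and pairwise contributions
hstats::h2_overall(interaction_res[[1]])
hstats::h2_pairwise(interaction_res[[1]])
```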
Save results to RDS
```{r}
#| eval: false
files_to_save <-
  list(
    interactions = interaction_res,
    res_xAI = res_xAI,
    predictions = predictions,
    tuned_res = tuned_res,
    res = res,
    list_split_data = list_split_data
  )
saveRDS(
  files_to_save,
  here::here(
    "Data/output/B_models/B_01_list_all_results_rf.rds"
  )
)
# save under a new variable name for easier recognition
list_res <- files_to_save
```
Save interaction plots to svg
```{r}
#| eval: false
#| include: true
for (resp_i in seq_along(responses)) {
  int <-
    list_res$interactions[[resp_i]]
  class(int) <-
    "hstats"
  interaction_plot <-
    plot(int, which = 1:100) +
    theme_few() +
    ggtitle(responses[resp_i]) +
    xlim(c(0, 0.14))
  ggsave(
    filename =
      here::here(
        "Figures/B_models/interactions/",
        paste0(
          "B_01_",
          responses[resp_i],
          "_interactions_hstats.svg"
        )
      ),
    plot = interaction_plot,
    device = "svg",
    width = 10,
    height = 10
  )
  print(interaction_plot)
}
```
# Create model results CSV
## RandomForest results
## Tidy models & extract performance (detailed)
```{r}
#| eval: false
# helper for MSE
mse_vec <- function(truth, estimate, na_rm = TRUE, ...) {
  mean((truth - estimate)^2, na.rm = na_rm)
}

# Function to compute R² / RMSE / MSE on *any* data frame
get_metrics <- function(wflow, data, truth_chr) {
  preds <- predict(wflow, data) %>% bind_cols(data)
  truth_vec <- preds[[truth_chr]]
  estimate <- preds$.pred
  tibble::tibble(
    r2 = yardstick::rsq_vec(truth_vec, estimate),
    rmse = yardstick::rmse_vec(truth_vec, estimate),
    mse = mse_vec(truth_vec, estimate)
  )
}

# Function to summarize six validation contexts
rf_metrics_summary <- function(wflow,
                               outcome_chr, # character string
                               train_data,
                               test_data,
                               cv_folds = NULL,
                               external_data = NULL) {
  ## OOB from ranger --------------------------------------------------
  fit_obj <- extract_fit_parsnip(wflow)$fit
  oob_mse <- fit_obj$prediction.error
  oob_r2 <- fit_obj$r.squared
  oob_rmse <- sqrt(oob_mse)

  ## Apparent (raw) ---------------------------------------------------
  raw <- get_metrics(wflow, train_data, outcome_chr)

  ## Cross-validation -------------------------------------------------
  cv <- if (!is.null(cv_folds)) {
    res <- fit_resamples(wflow,
      resamples = cv_folds,
      metrics = metric_set(rsq, rmse)
    )
    out <- collect_metrics(res)
    rmse_val <- out %>%
      filter(.metric == "rmse") %>%
      pull(mean)
    r2_val <- out %>%
      filter(.metric == "rsq") %>%
      pull(mean)
    tibble(r2 = r2_val, rmse = rmse_val, mse = rmse_val^2)
  } else {
    tibble(r2 = NA_real_, rmse = NA_real_, mse = NA_real_)
  }

  ## Train / Test -----------------------------------------------------
  tr <- get_metrics(wflow, train_data, outcome_chr)
  te <- get_metrics(wflow, test_data, outcome_chr)

  ## External ---------------------------------------------------------
  ext <- if (!is.null(external_data)) {
    get_metrics(wflow, external_data, outcome_chr)
  } else {
    tibble(r2 = NA_real_, rmse = NA_real_, mse = NA_real_)
  }

  ## Combine ----------------------------------------------------------
  tibble::tribble(
    ~context, ~r2, ~rmse, ~mse,
    "raw", raw$r2, raw$rmse, raw$mse,
    "oob", oob_r2, oob_rmse, oob_mse,
    "cv", cv$r2, cv$rmse, cv$mse,
    "train", tr$r2, tr$rmse, tr$mse,
    "test", te$r2, te$rmse, te$mse,
    "external", ext$r2, ext$rmse, ext$mse
  )
}
```
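To see what `get_metrics()` returns, here is a minimal, self-contained toy example on `mtcars` with a plain linear model; the workflow and data are illustrative stand-ins, not part of the analysis above.

```{r}
#| eval: false
library(tidymodels)
# Hypothetical workflow: predict mpg from weight and horsepower
toy_wf <-
  workflow() %>%
  add_formula(mpg ~ wt + hp) %>%
  add_model(linear_reg() %>% set_engine("lm")) %>%
  fit(data = mtcars)
get_metrics(toy_wf, mtcars, "mpg")
# returns a 1-row tibble with columns r2, rmse, mse
```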
```{r}
#| eval: false
# Load results from above
list_res <-
  readRDS(
    here(
      "Data/output/B_models/", "B_01_list_all_results_rf.rds"
    )
  )

# External validation (only for J and log ratio - not yearly rate)
external_df <-
  readRDS(here("Data/output/", "1_all_predictors_merged.rds")) %>%
  filter(samplingPeriodID == 2) %>%
  mutate(
    across(
      c(samplingPeriodID, datasetID), ~ as.factor(as.character(.))
    )
  ) %>%
  # join response columns & rename
  left_join(
    readRDS(here("Data/output/", "2_big_table_3.rds")) %>%
      select(
        datasetID, samplingPeriodID, verbatimIdentification,
        Jaccard_dissim, log_R3_2
      ) %>%
      rename(log_R2_1 = log_R3_2) %>%
      mutate(
        across(
          c(samplingPeriodID, datasetID), ~ as.factor(as.character(.))
        )
      )
  )

# Vector of responses in saved order
responses <-
  c(
    "Jaccard_dissim",
    "log_R2_1",
    "log_R2_1_per_year"
  )

metrics_list <-
  vector("list", length(responses))

for (i in seq_along(responses)) {
  resp <- responses[i]

  # Load fitted workflow safely
  file_path <- here(
    "Data/output/temp",
    paste0("B_01_", resp, "_final_fit.rds")
  )
  if (!file.exists(file_path)) {
    warning(glue("Model file not found for {resp}: {file_path} — skipping"))
    next # skip to next response
  }
  final_fit <- readRDS(file_path)
  wf <- tune::extract_workflow(final_fit)

  # Pull split elements -------------------------------------------
  split_info <-
    list_res$list_split_data[[i]][1:3]
  train <-
    split_info$train
  test <-
    split_info$test
  folds <-
    list_res$list_split_data[[i]]$folds

  # External data only for the first two responses -----------------
  ext <-
    if (resp %in% c("Jaccard_dissim", "log_R2_1")) external_df else NULL

  metrics_list[[i]] <-
    rf_metrics_summary(
      wflow = wf,
      outcome_chr = resp,
      train_data = train,
      test_data = test,
      cv_folds = folds,
      external_data = ext
    ) %>%
    mutate(response = resp)
}

all_metrics <-
  bind_rows(metrics_list)

contexts <- c("raw", "oob", "cv", "test", "external")
metrics <- c("r2", "mse", "rmse")
cols <-
  map(contexts, ~ paste0(metrics, "_", .x)) %>%
  unlist()

all_metrics2 <-
  all_metrics %>%
  filter(context != "train") %>%
  pivot_wider(
    id_cols = response,
    names_from = context,
    values_from = c(r2, rmse, mse)
  ) %>%
  select(response, all_of(cols))

write.csv(all_metrics2, here("Data/output/results/B_01_detailed_rf_results.csv"))
```
```{r}
#| echo: false
all_metrics2 <- read.csv(here("Data/output/results/B_01_detailed_rf_results.csv"))
```
```{r}
# Display nicely
kableExtra::kable(
  all_metrics2 %>%
    mutate(
      across(
        3:17,
        ~ round(.x, 5)
      )
    )
)
```
```{r}
all_metrics2 %>%
  select(-X) %>%
  pivot_longer(
    cols = -response,
    names_to = c(".value", "context"),
    names_sep = "_"
  ) %>%
  ggplot(
    aes(context, r2, colour = response, group = response)
  ) +
  geom_point(size = 3) +
  geom_line(linewidth = 0.8) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    title = "Model Generalisation Across Validation Contexts",
    y = "R²",
    x = NULL
  ) +
  theme_minimal()
```
Save simplified results to CSV:
```{r}
tuned_res_df <-
  list(
    list_res$tuned_res[[1]][[2]],
    list_res$tuned_res[[2]][[2]],
    list_res$tuned_res[[3]][[2]]
  ) %>%
  bind_rows() %>%
  select(-.config) %>%
  mutate(
    response = c(
      "Jaccard_dissim",
      "log_R2_1",
      "log_R2_1_per_year"
    )
  )

mod_res <-
  list_res$res %>%
  bind_rows() %>%
  .[[3]] %>%
  bind_rows() %>%
  select(-.config, -.estimator) %>%
  mutate(
    response = c(
      "Jaccard_dissim", "Jaccard_dissim",
      "log_R2_1", "log_R2_1",
      "log_R2_1_per_year", "log_R2_1_per_year"
    )
  ) %>%
  pivot_wider(
    id_cols = response,
    names_from = .metric,
    values_from = .estimate
  ) %>%
  full_join(tuned_res_df) %>%
  mutate(
    cv = "3x10folds",
    tuning_method = "bayesian",
    model_type = "ranger"
  )
```
Checks:
```{r}
kable (mod_res)
```
Save with `write.csv2()` (using `;` as separator, because I ran into comma issues in Excel):
```{r}
#| eval: false
write.csv2(mod_res, here::here("Data/output/results/B_01_ranger_all_data.csv"))
```
## Interaction results:
Checks: the H² statistics below can be interpreted as the proportion of prediction variability explained by interactions (the higher H², the more important the interactions).
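For reference, the pairwise statistic follows Friedman and Popescu's definition: it relates the joint (centered) partial dependence of two features to the sum of their individual partial dependences (notation adapted from the hstats documentation, not computed in this document):

$$
H^2_{jk} \;=\; \frac{\sum_{i=1}^{n} \left[ \widehat{F}_{jk}(x_{ij}, x_{ik}) - \widehat{F}_{j}(x_{ij}) - \widehat{F}_{k}(x_{ik}) \right]^2}{\sum_{i=1}^{n} \widehat{F}_{jk}(x_{ij}, x_{ik})^2}
$$

where $\widehat{F}$ denotes centered partial-dependence functions; values near 0 mean the two features act additively, and larger values mean stronger interaction.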
```{r}
# Overall interaction strengths
## Jaccard
interaction_res[[1]]
## log ratio
interaction_res[[2]]
## log ratio yearly rate
interaction_res[[3]]
```
```{r}
#| eval: false
int_res_list <-
  list()
for (resp_i in seq_along(list_res$interactions)) {
  if (resp_i == 1) {
    this_response <- "Jaccard_dissim"
  } else if (resp_i == 2) {
    this_response <- "log_R2_1"
  } else if (resp_i == 3) {
    this_response <- "log_R2_1_per_year"
  }
  int_res_list[[resp_i]] <-
    data.frame(
      H2 = c(
        summary(list_res$interactions[[resp_i]])$h2[[1]],
        summary(list_res$interactions[[resp_i]])$h2_overall[[1]],
        summary(list_res$interactions[[resp_i]])$h2_pairwise[[1]],
        summary(list_res$interactions[[resp_i]])$h2_threeway[[1]]
      ),
      response = this_response,
      test =
        c(
          "total_interaction_strength",
          rep("overall", 14),
          rep("pairwise", 10),
          rep("threeway", 10)
        ),
      variable = c(
        "all",
        row.names(summary(list_res$interactions[[resp_i]])$h2_overall[[1]]),
        row.names(summary(list_res$interactions[[resp_i]])$h2_pairwise[[1]]),
        row.names(summary(list_res$interactions[[resp_i]])$h2_threeway[[1]])
      )
    )
}
int_res <-
  bind_rows(int_res_list)
write.csv2(int_res, here::here("Data/output/results/B_01_Hstats_all_data.csv"))
```
Variable importances:
```{r}
#| eval: false
vimp_list <- list()
for (i in seq_along(res)) {
  if (i == 1) {
    this_response <- "Jaccard_dissim"
  } else if (i == 2) {
    this_response <- "log_R2_1"
  } else if (i == 3) {
    this_response <- "log_R2_1_per_year"
  }
  fit <-
    res[[i]] %>%
    extract_fit_engine()
  vimp <-
    fit$variable.importance
  vimp_list[[i]] <-
    data.frame(
      variable = names(vimp),
      importance = vimp,
      row.names = NULL,
      response = this_response,
      importance_mode = fit$importance.mode
    )
}
imp_df <-
  vimp_list %>%
  bind_rows()
write.csv2(imp_df, here::here("Data/output/results/B_01_ranger_vimp_all_data.csv"))
```