B_01_RandomForest_all_data.R

B - 01: RandomForest (pooled models)

Libraries

Code
suppressPackageStartupMessages({
  library(here)

  source(here::here("Code/00_Configuration.R"))

  package_list <-
    c(
      package_list,
      "ranger",
      "tidymodels",
      "caret",
      "skimr",
      "DALEXtra",
      "DALEX",
      "ggthemes",
      "hstats"
    )

  lapply(package_list, require, character.only = TRUE)

  tidymodels_prefer()
})


rm(list = ls())

set.seed(123)

Read data

Code
dta <-
  readRDS(
    here::here(
      "Data/output/1_all_predictors_merged.rds"
    )
  ) %>%
  filter(samplingPeriodID == 1)
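
Optional sanity check (not part of the original pipeline): skimr is already in the package list above, so a quick overview of the merged table could be obtained with:

Code
skimr::skim(dta)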

Set variable vectors for hypotheses

Code
sp_id <-
  c(
    "verbatimIdentification",
    "scientificName"
  )

H1 <-
  c(
    "Mass",
    "GlobRangeSize_km2",
    "Migration",
    "Habitat_5",
    "Generalism",
    "Threatened",
    "pd"
  )

H2 <-
  c(
    "D_AOO_a",
    "mean_lnLac",
    "AOO",
    "joincount_delta",
    "circNorm",
    "minDist_toBorder_centr"
  )

H3 <-
  c("datasetID")

predictors <-
  c(H1, H2, H3)

responses <-
  c(
    "Jaccard_dissim",
    "log_R2_1",
    "log_R2_1_per_year" # yearly rate
  )
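
For reference, log_R2_1 is the log ratio of occupancy between the two sampling periods, and the per-year variant presumably standardizes it by the length of the inter-census interval (an assumption; the variable is derived upstream, not in this script):

$$
\text{log\_R2\_1\_per\_year} = \frac{\ln\!\left(\mathrm{AOO}_{t_2} / \mathrm{AOO}_{t_1}\right)}{t_2 - t_1}
$$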

Make model data

Reduce to model variables

Code
dta_new <-
  dta %>%
  select(all_of(c(sp_id, responses, H3, H1, H2))) %>%
  ungroup()

Create data subsets for each response

Code
dta_J <-
  dta_new %>%
  select(-log_R2_1, -log_R2_1_per_year)

dta_lnRR <-
  dta_new %>%
  select(-Jaccard_dissim, -log_R2_1_per_year)

dta_ln_RR_year <-
  dta_new %>%
  select(-Jaccard_dissim, -log_R2_1)

all_datasets <-
  list(dta_J, dta_lnRR, dta_ln_RR_year)

Create empty lists to fill in the loop

Code
tuned_res <-
  vector("list", length(all_datasets))

predictions <-
  vector("list", length(all_datasets))

res <-
  vector("list", length(all_datasets))

list_split_data <-
  vector("list", length(all_datasets))

Fit RandomForest models

START LOOP

Code
set.seed(123)
for (data_i in seq_along(all_datasets)) {
  set.seed(123)
  model_dd <- all_datasets[[data_i]]


  # create data split (80/20)
  set.seed(123)
  split_info <- initial_split(model_dd, strata = datasetID, prop = 0.8)
  train <- training(split_info)
  test <- testing(split_info)

  # create 3x repeated 10-fold cross-validation subsets from training data
  set.seed(123)
  folds <- vfold_cv(train, v = 10, repeats = 3)

  # save splits to list for later reference
  list_split_data[[data_i]] <- list(
    split_info = split_info,
    train = train,
    test = test,
    folds = folds
  )

  # Model recipe: build the formula from the current response variable
  # (each data subset contains exactly one response column)
  mod_rec <-
    recipe(
      as.formula(paste(responses[data_i], "~ .")),
      data = train
    ) %>%
    update_role(all_of(sp_id),
      new_role = "speciesID",
      old_role = "predictor"
    )


  # Specify model engine parameters & indicate tuning
  set.seed(123)
  mod_spec <- rand_forest(
    mtry = tune(),
    trees = tune(),
    min_n = tune()
  ) %>%
    set_engine("ranger",
      importance = "permutation",
      respect.unordered.factors = TRUE,
      always.split.variables = "datasetID"
    ) %>%
    set_mode("regression")

  # Create model workflow
  set.seed(123)
  mod_wf <- workflow() %>%
    add_recipe(mod_rec) %>%
    add_model(mod_spec)

  # Bayesian Hyperparameter tuning based on the following parameter ranges
  set.seed(123)
  rf_params <- parameters(
    mtry(range = c(2L, 10L)),
    min_n(range = c(5L, 15L)),
    trees(range = c(1000L, 5000L))
  )

  tictoc::tic()
  tuned_bayes <-
    tune_bayes(
      mod_wf,
      resamples = folds,
      param_info = rf_params,
      initial = 5,
      iter = 50,
      control = control_bayes(
        verbose = TRUE,
        no_improve = 10,
        seed = 123
      )
    )
  tictoc::toc()

  saveRDS(
    tuned_bayes,
    paste0(
      here::here("Data/output/temp/B_01_"),
      responses[data_i],
      "_tuned_bayes.rds"
    )
  )


  # select best fitting hyperparameters based on smallest rmse
  best <-
    tuned_bayes %>%
    select_best(metric = "rmse")


  # save tuned results & best parameters to list for later reference
  tuned_res[[data_i]] <-
    list(tuned_bayes, best)


  # Fit final model
  final_wf <-
    mod_wf %>%
    finalize_workflow(best)

  # Perform the model evaluation on the testing data directly (not OOB, not CV)
  final_fit <-
    final_wf %>%
    last_fit(split_info)

  saveRDS(
    final_fit,
    paste0(
      here::here("Data/output/temp/B_01_"),
      responses[data_i],
      "_final_fit.rds"
    )
  )

  # save to results list
  res[[data_i]] <- final_fit

  # Predictions on testing data
  p <-
    predict(
      final_fit %>%
        tune::extract_workflow(),
      new_data = test
    ) %>% # testing data
    bind_cols(
      test %>%
        select(
          all_of(responses[data_i]), # current response variable
          verbatimIdentification,
          scientificName,
          datasetID
        )
    ) %>%
    setNames(
      c(
        ".pred",
        responses[data_i],
        "verbatimIdentification",
        "scientificName",
        "datasetID"
      )
    ) %>%
    mutate(
      resid = .[[responses[data_i]]] - .pred
    )

  # save predictions to list
  predictions[[data_i]] <- p
}

END LOOP
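
Before moving on, the saved tuning results can be sanity-checked; a minimal sketch using tune's own helpers (show_best() and autoplot() both accept the tune_bayes() result stored as the first element of each tuned_res entry):

Code
# top parameter combinations by RMSE, e.g. for the Jaccard model
tuned_res[[1]][[1]] %>%
  show_best(metric = "rmse", n = 5)

# performance across Bayesian search iterations
autoplot(tuned_res[[1]][[1]])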

Explainable AI (xAI)

Create empty lists to save results

Code
plot_list <-
  vector("list", 3)

res_xAI <-
  vector("list", length(res))

Start Loop across response vars

Code
for (data_i in seq_along(res)) {
  dd <-
    list_split_data[[data_i]]

  fit <-
    res[[data_i]] %>%
    extract_fit_parsnip()

  explainer <-
    DALEXtra::explain_tidymodels(
      fit,
      data = dd$train %>%
        select(!any_of(c(
          responses,
          "verbatimIdentification",
          "scientificName"
        ))),
      y = dd$train %>%
        select(any_of(responses)) %>%
        pull(1)
    )

  # Partial plots / Model profile

  pd <-
    model_profile(explainer,
      groups = "datasetID",
      type = "partial",
      N = NULL,
      variables = explainer$data %>%
        names()
    )


  # Model diagnostics
  md <-
    model_diagnostics(explainer)

  # Model performance:
  mp <-
    model_performance(explainer)

  # Variable importance:
  vi <-
    model_parts(explainer)

  # Save results to list
  res_xAI[[data_i]] <-
    list(
      explainer = explainer,
      pd = pd,
      md = md,
      vi = vi
    )
}

Set output directory for plots

Code
out_dir <- here::here("Figures/B_models/xAI/")

Jaccard plots

Unfortunately, rendering breaks the ggplot output; it is unclear why plot() does not dispatch to DALEX::plot.model_profile() and instead falls back to the base R plot() function.
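
A possible workaround (an untested sketch) is to fetch the registered S3 method explicitly, so dispatch cannot fall back to base plot():

Code
# retrieve DALEX's plot method for model_profile objects directly
# (assumes DALEX is loaded so the method is registered)
plot_model_profile <- getS3method("plot", "model_profile")
plot_model_profile(res_xAI[[1]]$pd)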

Code
library("DALEX")
plot_list[[1]] <-
  list(
    # partial plots
    pd1 =
      plot(res_xAI[[1]]$pd),

    # model diagnostics
    md1 =
      plot(res_xAI[[1]]$md %>%
        mutate(label = paste0("Jaccard"))) +
        ggthemes::theme_few(),

    # variable importances
    vi1 =
      plot(res_xAI[[1]]$vi %>%
        mutate(label = paste0("Jaccard"))) +
        ggthemes::theme_few()
  )

Code
plot_list[[1]]
[Output: pd1 and vi1 print as NULL (the plot-dispatch issue noted above); md1 renders the model-diagnostics figure.]

Log ratio plots

Code
plot_list[[2]] <-
  list(
    pd2 =
      plot(res_xAI[[2]]$pd),
    md2 =
      plot(res_xAI[[2]]$md %>%
        mutate(label = paste0("log ratio AOO"))) +
        ggthemes::theme_few(),
    vi2 =
      plot(res_xAI[[2]]$vi %>%
        mutate(label = paste0("log ratio AOO"))) +
        ggthemes::theme_few()
  )

Code
plot_list[[2]]
[Output: pd2 and vi2 print as NULL (plot-dispatch issue); md2 renders the model-diagnostics figure.]

Yearly rate plots

Code
plot_list[[3]] <-
  list(
    pd3 =
      plot(res_xAI[[3]]$pd),
    md3 =
      plot(res_xAI[[3]]$md %>%
        mutate(label = paste0("log ratio yearly rate"))) +
        ggthemes::theme_few(),
    vi3 =
      plot(res_xAI[[3]]$vi %>%
        mutate(label = paste0("log ratio yearly rate"))) +
        ggthemes::theme_few()
  )

Code
plot_list[[3]]
[Output: pd3 and vi3 print as NULL (plot-dispatch issue); md3 renders the model-diagnostics figure.]

Save plots to pdf

Code
pdf(
  onefile = TRUE,
  paper = "a4",
  file = file.path(out_dir, "all_data_xAI_plots.pdf") # out_dir is already an absolute here() path
)

print(plot_list) # explicit print so the plots are drawn to the pdf device

dev.off()

Interactions with H-stats

Create empty list for results

Code
interaction_res <-
  replicate(length(res), list())

Start Loop

Code
for (data_i in seq_along(list_split_data)) {
  # Prepare data for hstats function:

  dd <-
    list_split_data[[data_i]]

  fit <-
    res[[data_i]] %>%
    extract_fit_parsnip()

  preds <-
    dd$train %>%
    select(!any_of(
      c(
        responses,
        "verbatimIdentification",
        "scientificName"
      )
    ))

  # run hstats
  interactions <-
    hstats(
      fit,
      X = preds,
      threeway_m = 5
    )

  # save results
  interaction_res[[data_i]] <-
    interactions

  # Plot interactions
  interaction_plot <-
    plot(interactions, which = 1:100) +
    theme_few()


  # save as svg
  ggsave(
    filename =
      here::here(
        "Figures/B_models/interactions",
        paste0(
          "B_01_",
          responses[data_i],
          "_interactions_hstats.svg"
        )
      ),
    plot = interaction_plot,
    device = "svg",
    width = 10, height = 10
  )
}
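
For a quick numeric view of the strongest pairwise terms, hstats also exposes accessor functions (a sketch, applied to the last interactions object from the loop above):

Code
# normalized pairwise H-statistics, strongest interactions first
h2_pairwise(interactions)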

Save results to RDS

Code
files_to_save <-
  list(
    interactions = interaction_res,
    res_xAI = res_xAI,
    predictions = predictions,
    tuned_res = tuned_res,
    res = res,
    list_split_data = list_split_data
  )

saveRDS(
  files_to_save,
  here::here(
    "Data/output/B_models/B_01_list_all_results_rf.rds"
  )
)

# save with new variable name for easier recognition
list_res <- files_to_save

Save interaction plots to svg

Code
for (resp_i in seq_along(responses)) {
  int <-
    list_res$interactions[[resp_i]]
  class(int) <-
    "hstats"

  interaction_plot <-
    plot(int, which = 1:100) +
    theme_few() +
    ggtitle(paste(responses[resp_i])) +
    xlim(c(0, 0.14))

  ggsave(
    filename =
      here::here(
        "Figures/B_models/interactions/",
        paste0(
          "B_01_",
          responses[resp_i],
          "_interactions_hstats.svg"
        )
      ),
    plot = interaction_plot,
    device = "svg",
    width = 10,
    height = 10
  )

  print(interaction_plot) # explicit print needed inside a for loop
}

Create model results CSV

RandomForest results

Tidy models & extract performance (detailed)

Code
# helper for MSE
mse_vec <- function(truth, estimate, na_rm = TRUE, ...) {
  mean((truth - estimate)^2, na.rm = na_rm)
}

# Function to compute R² / RMSE / MSE on *any* data frame
get_metrics <- function(wflow, data, truth_chr) {
  preds <- predict(wflow, data) %>% bind_cols(data)
  truth_vec <- preds[[truth_chr]]
  estimate <- preds$.pred
  tibble::tibble(
    r2   = yardstick::rsq_vec(truth_vec, estimate),
    rmse = yardstick::rmse_vec(truth_vec, estimate),
    mse  = mse_vec(truth_vec, estimate)
  )
}

# Function to summarize six validation contexts
rf_metrics_summary <- function(wflow,
                               outcome_chr, # character string
                               train_data,
                               test_data,
                               cv_folds = NULL,
                               external_data = NULL) {
  ## OOB from ranger --------------------------------------------------
  fit_obj <- extract_fit_parsnip(wflow)$fit
  oob_mse <- fit_obj$prediction.error
  oob_r2 <- fit_obj$r.squared
  oob_rmse <- sqrt(oob_mse)

  ## Apparent (raw) ---------------------------------------------------
  raw <- get_metrics(wflow, train_data, outcome_chr)

  ## Cross-validation -------------------------------------------------
  cv <- if (!is.null(cv_folds)) {
    res <- fit_resamples(wflow,
      resamples = cv_folds,
      metrics = metric_set(rsq, rmse)
    )
    out <- collect_metrics(res)
    rmse_val <- out %>%
      filter(.metric == "rmse") %>%
      pull(mean)
    r2_val <- out %>%
      filter(.metric == "rsq") %>%
      pull(mean)
    tibble(r2 = r2_val, rmse = rmse_val, mse = rmse_val^2)
  } else {
    tibble(r2 = NA_real_, rmse = NA_real_, mse = NA_real_)
  }

  ## Train / Test -----------------------------------------------------
  tr <- get_metrics(wflow, train_data, outcome_chr)
  te <- get_metrics(wflow, test_data, outcome_chr)

  ## External ---------------------------------------------------------
  ext <- if (!is.null(external_data)) {
    get_metrics(wflow, external_data, outcome_chr)
  } else {
    tibble(r2 = NA_real_, rmse = NA_real_, mse = NA_real_)
  }

  ## Combine ----------------------------------------------------------
  tibble::tribble(
    ~context,   ~r2,              ~rmse,         ~mse,
    "raw",      raw$r2,           raw$rmse,      raw$mse,
    "oob",      oob_r2,           oob_rmse,      oob_mse,
    "cv",       cv$r2,            cv$rmse,       cv$mse,
    "train",    tr$r2,            tr$rmse,       tr$mse,
    "test",     te$r2,            te$rmse,       te$mse,
    "external", ext$r2,           ext$rmse,      ext$mse
  )
}
Code
# Load results from above
list_res <-
  readRDS(
    here(
      "Data/output/B_models/", "B_01_list_all_results_rf.rds"
    )
  )

# External validation (only for J and log ratio - not yearly rate)
external_df <-
  readRDS(here("Data/output/", "1_all_predictors_merged.rds")) %>%
  filter(samplingPeriodID == 2) %>%
  mutate(
    across(
      c(samplingPeriodID, datasetID), ~ as.factor(as.character(.))
    )
  ) %>%
  # join response columns & rename
  left_join(
    readRDS(here("Data/output/", "2_big_table_3.rds")) %>%
      select(
        datasetID, samplingPeriodID, verbatimIdentification,
        Jaccard_dissim, log_R3_2
      ) %>%
      rename(log_R2_1 = log_R3_2) %>%
      mutate(
        across(
          c(samplingPeriodID, datasetID), ~ as.factor(as.character(.))
        )
      )
  )
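
# Sanity check on the join (a sketch, not part of the original pipeline):
# confirm that the period-2 response columns were attached
external_df %>%
  summarise(
    n_rows = n(),
    missing_Jaccard = sum(is.na(Jaccard_dissim)),
    missing_log_ratio = sum(is.na(log_R2_1))
  )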

# Vector of responses in saved order
responses <-
  c(
    "Jaccard_dissim",
    "log_R2_1",
    "log_R2_1_per_year"
  )

metrics_list <-
  vector("list", length(responses))

for (i in seq_along(responses)) {
  resp <- responses[i]

  # Load fitted workflow safely
  file_path <- here(
    "Data/output/temp",
    paste0("B_01_", resp, "_final_fit.rds")
  )


  if (!file.exists(file_path)) {
    warning(sprintf("Model file not found for %s: %s; skipping", resp, file_path))
    next # skip to next response
  }

  final_fit <- readRDS(file_path)

  wf <- tune::extract_workflow(final_fit)

  # Pull split elements -------------------------------------------
  split_info <-
    list_res$list_split_data[[i]][1:3]
  train <-
    split_info$train
  test <-
    split_info$test
  folds <-
    list_res$list_split_data[[i]]$folds

  # External validation only for Jaccard and log ratio -------------
  ext <-
    if (resp %in% c("Jaccard_dissim", "log_R2_1")) external_df else NULL

  metrics_list[[i]] <-
    rf_metrics_summary(
      wflow = wf,
      outcome_chr = resp,
      train_data = train,
      test_data = test,
      cv_folds = folds,
      external_data = ext
    ) %>%
    mutate(response = resp)
}

all_metrics <-
  bind_rows(metrics_list)

contexts <- c("raw", "oob", "cv", "test", "external")
metrics <- c("r2", "mse", "rmse")

cols <-
  map(contexts, ~ paste0(metrics, "_", .x)) %>%
  unlist()


all_metrics2 <-
  all_metrics %>%
  filter(context != "train") %>%
  pivot_wider(
    id_cols = response,
    names_from = context,
    values_from = c(r2, rmse, mse)
  ) %>%
  select(response, all_of(cols))

write.csv(all_metrics2, here("Data/output/results/B_01_detailed_rf_results.csv"))
Code
#|echo: false
all_metrics2 <- read.csv(here("Data/output/results/B_01_detailed_rf_results.csv"))
Code
# Display nicely
kableExtra::kable(
  all_metrics2 %>%
    mutate(
      across(
        3:17,
        \(x) round(x, 5)
      )
    )
)
X response r2_raw mse_raw rmse_raw r2_oob mse_oob rmse_oob r2_cv mse_cv rmse_cv r2_test mse_test rmse_test r2_external mse_external rmse_external
1 Jaccard_dissim 0.97705 0.00209 0.04573 0.85850 0.01151 0.10727 0.85711 0.01159 0.10767 0.84991 0.01272 0.11279 0.87265 0.01079 0.10388
2 log_R2_1 0.93179 0.05594 0.23652 0.35359 0.30975 0.55655 0.31267 0.30596 0.55314 0.09189 0.38255 0.61851 0.06891 0.43397 0.65876
3 log_R2_1_per_year 0.92911 0.00007 0.00849 0.34149 0.00039 0.01964 0.31577 0.00038 0.01949 0.13992 0.00046 0.02152 NA NA NA
Code
all_metrics2 %>%
  select(-X) %>%
  pivot_longer(
    cols = -response,
    names_to = c(".value", "context"),
    names_sep = "_"
  ) %>%
  ggplot(
    aes(context, r2, colour = response, group = response)
  ) +
  geom_point(size = 3) +
  geom_line(linewidth = 0.8) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    title = "Model Generalisation Across Validation Contexts",
    y = "R²",
    x = NULL
  ) +
  theme_minimal()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

Save simplified results to csv:

Code
tuned_res_df <-
  list(
    list_res$tuned_res[[1]][[2]],
    list_res$tuned_res[[2]][[2]],
    list_res$tuned_res[[3]][[2]]
  ) %>%
  bind_rows() %>%
  select(-.config) %>%
  mutate(
    response = c(
      "Jaccard_dissim",
      "log_R2_1",
      "log_R2_1_per_year"
    )
  )


mod_res <-
  list_res$res %>%
  bind_rows() %>%
  .[[3]] %>% # extract the .metrics column
  bind_rows() %>%
  select(-.config, -.estimator) %>%
  mutate(
    response = c(
      "Jaccard_dissim", "Jaccard_dissim",
      "log_R2_1", "log_R2_1",
      "log_R2_1_per_year", "log_R2_1_per_year"
    )
  ) %>%
  pivot_wider(
    id_cols = c("response"),
    names_from = c(".metric"),
    values_from = c(".estimate")
  ) %>%
  full_join(tuned_res_df) %>%
  mutate(
    cv = "3x10folds",
    tuning_method = "bayesian",
    model_type = "ranger"
  )
Joining with `by = join_by(response)`

Checks:

Code
kable(mod_res)
response rmse rsq mtry min_n trees cv tuning_method model_type
Jaccard_dissim 0.1127898 0.8499086 6 5 4998 3x10folds bayesian ranger
log_R2_1 0.6185096 0.0918946 10 5 1074 3x10folds bayesian ranger
log_R2_1_per_year 0.0215230 0.1399250 7 5 2148 3x10folds bayesian ranger

Save with write.csv2 (semicolon separator), since comma-separated output caused issues in Excel:

Code
write.csv2(mod_res, here::here("Data/output/results/B_01_ranger_all_data.csv"))

Interaction results:

Checks: the normalized H² can be interpreted as the proportion of prediction variability explained by interactions (the higher H², the more important interactions are).
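
For reference, this is Friedman and Popescu's H² statistic. Loosely (a sketch of the definition, not the package's exact estimator), with centered prediction function $\hat f$ and centered partial-dependence functions $\hat f_j$, the total statistic is the share of prediction variability not captured by additive main effects:

$$
H^2 = \frac{\sum_i \left[ \hat f(\mathbf{x}_i) - \sum_j \hat f_j(x_{ij}) \right]^2}{\sum_i \hat f(\mathbf{x}_i)^2}
$$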

Code
# Overall interaction strengths

## Jaccard
interaction_res[[1]]
'hstats' object. Use plot() or summary() for details.

H^2 (normalized)
[1] 0.1602538
Code
## log ratio
interaction_res[[2]]
'hstats' object. Use plot() or summary() for details.

H^2 (normalized)
[1] 0.2690084
Code
## log ratio yearly rate
interaction_res[[3]]
'hstats' object. Use plot() or summary() for details.

H^2 (normalized)
[1] 0.3158487
Code
int_res_list <-
  list()

for (resp_i in seq_along(list_res$interactions)) {
  this_response <- responses[resp_i]

  # compute the summary once per response instead of once per component
  s <- summary(list_res$interactions[[resp_i]])

  int_res_list[[resp_i]] <-
    data.frame(
      H2 = c(
        s$h2[[1]],
        s$h2_overall[[1]],
        s$h2_pairwise[[1]],
        s$h2_threeway[[1]]
      ),
      response = this_response,
      test = c(
        "total_interaction_strength",
        rep("overall", nrow(s$h2_overall[[1]])),
        rep("pairwise", nrow(s$h2_pairwise[[1]])),
        rep("threeway", nrow(s$h2_threeway[[1]]))
      ),
      variable = c(
        "all",
        row.names(s$h2_overall[[1]]),
        row.names(s$h2_pairwise[[1]]),
        row.names(s$h2_threeway[[1]])
      )
    )
}

int_res <-
  bind_rows(int_res_list)

write.csv2(int_res, here::here("Data/output/results/B_01_Hstats_all_data.csv"))

Variable importances:

Code
vimp_list <- list()

for (i in seq_along(list_res$res)) {
  this_response <- responses[i]

  fit <-
    list_res$res[[i]] %>%
    extract_fit_engine()

  vimp <-
    fit$variable.importance

  vimp_list[[i]] <-
    data.frame(
      variable = names(vimp),
      importance = vimp,
      row.names = NULL,
      response = this_response,
      importance_mode = fit$importance.mode
    )
}

imp_df <-
  vimp_list %>%
  bind_rows()

write.csv2(imp_df, here::here("Data/output/results/B_01_ranger_vimp_all_data.csv"))