B_01_NY_MachineLearning

In the following, we will fit random forest models to the first New York data set, using the tidymodels framework. The goal is to predict the temporal change to the second atlas replication using Jaccard dissimilarity and log ratio AOO values of the species distribution for each species in the dataset. We use ranger random forest models with Bayesian hyperparameter tuning. The model is evaluated using 10-fold cross-validation repeated three times. We will also explore variable importance and partial dependencies, as well as interactions between predictors.

Code

rm(list=ls())
gc()

          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  595704 31.9    1358800 72.6   686422 36.7
Vcells 1091028  8.4    8388608 64.0  1877384 14.4

Code

install.packages("tidymodels", repos = "https://cloud.r-project.org/") # install here package if not already installed

Paket 'tidymodels' erfolgreich ausgepackt und MD5 Summen abgeglichen

Die heruntergeladenen Binärpakete sind in 
    C:\Users\wolke\AppData\Local\Temp\RtmpCmQG1d\downloaded_packages

Code

suppressPackageStartupMessages({
library(here)
library(dplyr)
library(ranger)
library(tidymodels)
library(caret)
library(skimr)
library(DALEXtra)
library(hstats)
library(rsample)
library(tune)
library(parsnip)
library(recipes)
library(workflows)
library(yardstick)
library(dials)
library(forcats)
})

tidymodels_prefer() # solve conflicted functions in favor of tidymodels

set.seed(123)

Model fitting

Get Data

Code

dta <- 
  read.csv(here::here("Demo_NewYork/Data/output/data_final_ny.csv"))[2:19] %>%
  mutate(
    Migration = factor(as.character(Migration), levels = c(1,2,3)),
    Habitat_5 = factor(as.character(Habitat_5), levels = c(
      "closed", "freshwater", "open", "human", "marine"
      )),
    Threatened = factor(as.character(Threatened), levels = c(0,1)),
    Generalism = factor(as.character(Generalism), levels = c(0,1))
  ) 


# Set variables for model
sp_id <- 
  c("verbatimIdentification", "scientificName")

H1 <- 
  c("Mass", "GlobRangeSize_km2", 
    "Migration", "Habitat_5", 
    "Generalism", "Threatened", "pd")

H2 <- 
  c("D_AOO_a", "mean_lnLac", 
    "AOO", "joincount_delta", 
    "circNorm", "minDist_toBorder_centr")

predictors <- 
  c(H1, H2)

responses <- 
  c("Jaccard_dissim", 
    "log_R2_1", 
    "log_R2_1_per_year")

Model workflow

Code

# create empty lists to store results
tuned_res <- list()
predictions <- list()
res <- list()
list_split_data <- list()
results_response <- replicate(3, list())

Code

# Create data split (80/20)
set.seed(123)
split_info <- initial_split(dta, prop = 0.8)
train <- training(split_info) # 186 sp
test <- testing(split_info) # 47 sp

Code

tictoc::tic()
set.seed(123)

# model specifics
mod_spec <- 
  rand_forest(mtry = tune(),
              trees = tune(),
              min_n = tune()) %>%
  set_engine("ranger",
             importance = "permutation",
             respect.unordered.factors = TRUE) %>%
  set_mode("regression")

# tuning parameter
rf_params <- 
  parameters(
    mtry(range = c(2L, 10L)),
    min_n(range = c(5L, 15L)),
    trees(range = c(1000L, 5000L))
    )

# Set up cross-validation
set.seed(123)
folds <- 
  vfold_cv(train, v = 10, repeats = 3)

# save data splits for later use in predictions
list_split_data <- 
  list(
    split_info = split_info,
    train = train,
    test = test,
    folds = folds
    )

Code

# Loop through response variables:

for (resp_i in seq_along(responses)){
  
  
  # Create model recipe:
  if (responses[resp_i] == "Jaccard_dissim") {
      mod_rec <-
        recipe(Jaccard_dissim ~ ., data = train) %>%
        update_role(all_of(sp_id),
          new_role = "speciesID",
          old_role = "predictor"
        ) %>%
        update_role(all_of(responses[-resp_i]),
          old_role = "predictor",
          new_role = "alt_resp"
        )
    } else if (responses[resp_i] == "log_R2_1") {
      mod_rec <-
        recipe(log_R2_1 ~ ., data = train) %>%
        update_role(all_of(sp_id),
          new_role = "speciesID",
          old_role = "predictor"
        ) %>%
        update_role(all_of(responses[-resp_i]),
          old_role = "predictor",
          new_role = "alt_resp"
        )
    } else if (responses[resp_i] == "log_R2_1_per_year") {
      mod_rec <-
        recipe(log_R2_1_per_year ~ ., data = train) %>%
        update_role(all_of(sp_id),
          new_role = "speciesID",
          old_role = "predictor"
        ) %>%
        update_role(all_of(responses[-resp_i]),
          old_role = "predictor",
          new_role = "alt_resp"
        )
    }

  
  # Create workflow with recipe
  set.seed(123)
  mod_wf <- 
    workflow() %>%
    add_recipe(mod_rec) %>%
    add_model(mod_spec)
  
  #----------------------------------------------------------#
  # Bayesian hyperparameter tuning -----
  #----------------------------------------------------------#

  tictoc::tic()
  set.seed(123)
  tuned_bayes <-
    tune_bayes(
      mod_wf,
      resamples = folds,
      param_info = rf_params, 
      initial = 5,
      iter = 50,
      control = 
        control_bayes(
          verbose = T,
          no_improve = 10, 
          seed = 123)
      )
  tictoc::toc() # 1233.37 sec elapsed

  best <-
    tuned_bayes %>%
    select_best(metric = "rmse")

  #----------------------------------------------------------#
  # Final model fit -----
  #----------------------------------------------------------#

  final_wf <- 
    mod_wf %>%
    finalize_workflow(best)

  # perform the model evaluation on the testing data directly
  final_fit <- 
    final_wf %>%
    last_fit(split_info) 

  #----------------------------------------------------------#
  # Predictions on test data -----
  #----------------------------------------------------------#

  p <-
    predict(
      final_fit %>% tune::extract_workflow(), 
      new_data = test
      ) %>%
    bind_cols(
      test %>% 
        select(responses[resp_i], verbatimIdentification, scientificName)
      ) %>% # change y with response var
    setNames(
      c(".pred", 
        responses[resp_i],  
        "verbatimIdentification", 
        "scientificName")
      ) %>%
    mutate(
      resid = .[[responses[resp_i]]] - .pred)


  #----------------------------------------------------------#
  # explainable AI (xAI) -----
  #----------------------------------------------------------#


  fit <- 
    final_fit %>%
    extract_fit_parsnip()

  explainer <- 
    DALEXtra::explain_tidymodels(
      label = paste0(responses[resp_i]),
      fit,
      data = train %>%
        select(
          !any_of(c(responses, "verbatimIdentification", "scientificName"))
          ),
      y = train %>% 
        pull(responses[resp_i])
      )

  # save results to list
  results_response[[resp_i]] <-
    list(
      tuned_bayes = tuned_bayes,
      best = best,
      final_fit = final_fit,
      predictions = p,
      explainer = explainer,
      pd = 
        model_profile(
          explainer,
          type = "partial",
          N = NULL,
          variables = explainer$data %>% names()
      ),
      md = model_diagnostics(explainer),
      mp = model_performance(explainer),
      vi = model_parts(explainer)
    )    
  }

tictoc::toc()


saveRDS(results_response, 
        here::here("Demo_NewYork/Data/output/B_01_NY_model_results.rds"))

Interactions (h^2)

Code

library(hstats)

interaction_res <- list()

for (resp_i in seq_along(results_response)){
  
  dd <- 
    results_response[[resp_i]]
  
  fit <- 
    dd$final_fit %>% 
    extract_fit_parsnip()
  
  preds <- 
    dd$explainer$data
  
  interaction_res[[resp_i]] <- 
    hstats(fit,
           X = preds,
           threeway_m = 13)
  
}

saveRDS(interaction_res, 
        here::here("Demo_NewYork/Data/output/B_01_NY_interactions.rds"))

Variable importances

Code

vimp_list <- list()
for(i in seq_along(results_response)){

  if(i == 1) this_response <- "Jaccard_dissim" else
    if(i == 2) this_response <- "log_R2_1" else
      if(i == 3) this_response <- "log_R2_1_per_year"

      fit <- 
        results_response[[i]]$final_fit %>% 
        extract_fit_engine()
      
      vimp <- 
        fit$variable.importance

      vimp_list[[i]] <- 
        data.frame(
          variable = names(vimp),
          importance = vimp,
          row.names = NULL,
          response = this_response,
          importance_mode = fit$importance.mode
          )
      }

# bind together
imp_df <-
  vimp_list %>%
  bind_rows()

saveRDS(imp_df,
        here::here("Demo_NewYork/Data/output/B_01_NY_variable_importance.rds"))

Evaluate model performance

Code

# read results back in 
model_results <- 
  readRDS(here::here("Demo_NewYork/Data/output/B_01_NY_model_results.rds"))
importances <- 
  readRDS(here::here("Demo_NewYork/Data/output/B_01_NY_variable_importance.rds"))
interactions <- 
  readRDS(here::here("Demo_NewYork/Data/output/B_01_NY_interactions.rds"))

Code

# Jaccard
model_results[[1]]$final_fit %>% 
    extract_fit_parsnip()

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L,      x), num.trees = ~1141L, min.node.size = min_rows(~5L, x),      importance = ~"permutation", respect.unordered.factors = ~TRUE,      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  1141 
Sample size:                      186 
Number of independent variables:  13 
Mtry:                             5 
Target node size:                 5 
Variable importance mode:         permutation 
Splitrule:                        variance 
OOB prediction error (MSE):       0.0132444 
R squared (OOB):                  0.8337896

Code

# Log ratio
model_results[[2]]$final_fit %>% 
    extract_fit_parsnip()

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~2L,      x), num.trees = ~1208L, min.node.size = min_rows(~15L, x),      importance = ~"permutation", respect.unordered.factors = ~TRUE,      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  1208 
Sample size:                      186 
Number of independent variables:  13 
Mtry:                             2 
Target node size:                 15 
Variable importance mode:         permutation 
Splitrule:                        variance 
OOB prediction error (MSE):       0.4627072 
R squared (OOB):                  0.08164338

Code

# Log ratio per year
model_results[[3]]$final_fit %>% 
    extract_fit_parsnip()

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~2L,      x), num.trees = ~1513L, min.node.size = min_rows(~15L, x),      importance = ~"permutation", respect.unordered.factors = ~TRUE,      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  1513 
Sample size:                      186 
Number of independent variables:  13 
Mtry:                             2 
Target node size:                 15 
Variable importance mode:         permutation 
Splitrule:                        variance 
OOB prediction error (MSE):       0.0183254 
R squared (OOB):                  0.09108146

Reshape model results for plotting

Code

model_hyperparam <- 
  list(model_results[[1]]$best,
       model_results[[2]]$best,
       model_results[[3]]$best) %>%
  bind_rows() %>%
  select(-.config) %>%
  mutate(
    response = c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year")
  )


model_perf <-
  list(model_results[[1]]$final_fit$.metrics,
       model_results[[2]]$final_fit$.metrics,
       model_results[[3]]$final_fit$.metrics) %>%
  bind_rows() %>%
  mutate(
    response = c(rep("Jaccard_dissim",2), 
                 rep("log_R2_1",2), 
                 rep("log_R2_1_per_year",2))
    ) %>%
  pivot_wider(id_cols = c(response),
              names_from = c(".metric"),
              values_from = c(".estimate")) %>%
  full_join(model_hyperparam) %>%
  mutate(
    cv = "3x10folds",
    tuning_method = "bayesian",
    model_type = "ranger")

Joining with `by = join_by(response)`

Code

kableExtra::kable(model_perf)

response	rmse	rsq	mtry	min_n	trees	cv	tuning_method	model_type
Jaccard_dissim	0.0626736	0.9562854	5	5	1141	3x10folds	bayesian	ranger
log_R2_1	0.4674220	0.0001605	2	15	1208	3x10folds	bayesian	ranger
log_R2_1_per_year	0.0922329	0.0034971	2	15	1513	3x10folds	bayesian	ranger

Code

interactions[[1]] %>% plot() + ggthemes::theme_few()

Code

interactions_res <- list()
for (resp_i in seq_along(interactions)){

    if(resp_i == 1) this_response <- "Jaccard_dissim" else
      if(resp_i == 2) this_response <- "log_R2_1" else
        if(resp_i == 3) this_response <- "log_R2_1_per_year"

        print(this_response)

        response_res <- interactions[[resp_i]]

        H2_all_df <-
          data.frame(
            row.names = NULL,
            response = this_response,
            variable = "all",
            test = "total_interaction_strength",
            H2 = summary(response_res)$h2[[1]])

        H2_overall_df <-
          data.frame(
            row.names = NULL,
            response = this_response,
            variable = row.names(summary(response_res)$h2_overall[[1]]),
            test = "overall",
            H2 = summary(response_res)$h2_overall[[1]])

        H2_pairwise_df <-
          data.frame(
            row.names = NULL,
            response = this_response,
            variable = row.names(summary(response_res)$h2_pairwise[[1]]),
            test = "pairwise",
            H2 = summary(response_res)$h2_pairwise[[1]]
          )

        H2_threeway_df <-
          data.frame(
            row.names = NULL,
            response = this_response,
            variable = row.names(summary(response_res)$h2_threeway[[1]]),
            test = "threeway",
            H2 = summary(response_res)$h2_threeway[[1]]
          )

        temp <-
          full_join(H2_all_df, H2_overall_df) %>%
          full_join(H2_pairwise_df) %>%
          full_join(H2_threeway_df)

        interactions_res[[resp_i]] <- temp


  }

[1] "Jaccard_dissim"

Joining with `by = join_by(response, variable, test, H2)`
Joining with `by = join_by(response, variable, test, H2)`
Joining with `by = join_by(response, variable, test, H2)`

[1] "log_R2_1"

Joining with `by = join_by(response, variable, test, H2)`
Joining with `by = join_by(response, variable, test, H2)`
Joining with `by = join_by(response, variable, test, H2)`

[1] "log_R2_1_per_year"

Joining with `by = join_by(response, variable, test, H2)`
Joining with `by = join_by(response, variable, test, H2)`
Joining with `by = join_by(response, variable, test, H2)`

Code

interactions_res <- 
  interactions_res %>%
  bind_rows()

Plots:

Model performance

Code

p_1 <-
  model_perf %>%
  pivot_longer(cols = c(rmse, rsq)) %>%
  ggplot() +
  geom_point(aes(x = response, y = value, shape = name), cex = 2, col = "black", bg = "black")+
  ggthemes::theme_few()+
  scale_shape_manual(values = c(rmse = 21, rsq = 1))

p_1

Variable Importance

Code

library(tidytext)

Warning: Paket 'tidytext' wurde unter R Version 4.4.3 erstellt

Code

vip_full <- importances %>%
  group_by(variable) %>%
  mutate(mean_importance_jaccard = mean(importance[response == "Jaccard_dissim"], na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(variable = forcats::fct_reorder(variable, mean_importance_jaccard)) %>%
  mutate(hypo = case_when(variable %in% H1 ~ "H1",
                          variable %in% H2 ~ "H2"))

p_3 <-
  vip_full %>%
  ggplot() +
  geom_col(aes(x = importance, y = variable, fill = hypo)) +
  facet_wrap(~response, scales = "free_x") +  # free_x so y-axis stays fixed
  labs(y = "Variable", x = "Importance")+
  ggthemes::theme_few()+
  scale_fill_manual(values = c("#f1a340","#7fbf7b", "#998ec3"))


p_3

Partial Plots

Code

#----------------------------------------------------------#
# c) Partial dependencies:
#----------------------------------------------------------#

# Set variables:
H1 <- 
  c("Mass", "GlobRangeSize_km2", 
    "Migration", "Habitat_5", 
    "Generalism", "Threatened", "pd")
H2 <- 
  c("D_AOO_a", "mean_lnLac", 
    "AOO", "joincount_delta", 
    "circNorm", "minDist_toBorder_centr")

predictors <- 
  c(H1, H2)

# Categorical variables:
cat_vars <- c("Migration",
              "Habitat_5",
              "Generalism",
              "Threatened",
              "datasetID")


pdp_list_num <- 
  replicate(3, list)

pdp_list_cat <- 
  replicate(3, list)

for (resp_i in seq_along(responses)){
  this_model <- 
    model_results[[resp_i]]$final_fit %>% 
    extract_fit_parsnip()
  
  pdp_list_num[[resp_i]] <- 
    list()
  
  pdp_list_cat[[resp_i]] <- 
    list()
  
  for (var_i in seq_along(predictors)){
    this_variable <- 
      predictors[var_i]
    
      
      pdp_df <- 
      pdp::partial(
        object = this_model,
        train = train,
        pred.var = this_variable,
        plot = FALSE) %>%
      pivot_longer(cols = c(this_variable), 
                   names_to = "variable") %>%
      mutate(response = responses[resp_i])
      
      if (this_variable %in% cat_vars) {
        pdp_list_cat[[resp_i]][[this_variable]] <- pdp_df
      } else {
        pdp_list_num[[resp_i]][[this_variable]] <- pdp_df
      }
      
  }
  
}

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(this_variable)

  # Now:
  data %>% select(all_of(this_variable))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

Code

num <- lapply(pdp_list_num,bind_rows)
cat <- lapply(pdp_list_cat,bind_rows)

Code

lookup <- 
  c("Mass" = "Body mass",
    "GlobRangeSize_km2" = "Global range size",
    "Migration" = "Migration",
    "Habitat_5" = "Habitat type",
    "Generalism" = "Generalism",
    "Threatened" = "Threatened",
    "pd" = "Phylogenetic distinctiveness",
    "D_AOO_a" = "Fractal dimension",
    "mean_lnLac" = "Mean log lacunarity",
    "AOO" = "Area of occupancy",
    "joincount_delta" = "Spatial autocorrelation",
    "circNorm" = "Normalized circularity",
    "minDist_toBorder_centr" = "Smallest distance centroid-border") 


 for (resp_i in seq_along(num)){
        
        this_resp <- responses[resp_i]

        response_dd <- num[[resp_i]] %>% 
          filter(response == this_resp)
        
        p0 <- response_dd %>%
          ggplot()+
          geom_line(
            aes(
            y = yhat,
            x = value,
            group = variable),
            linewidth = 0.8, 
            alpha = 0.8)+
          labs(x = NULL,
               y = paste0(this_resp, " (Partial)"))+
          facet_wrap(~ variable , 
                     scales = "free_x", 
                     labeller = labeller(variable = lookup))+
          ggthemes::theme_par()+
          ggtitle(paste(this_resp))+
          scale_color_brewer(palette = "PuOr")
        
        
        # adjust axes and plot hline
        
        if (this_resp == "Jaccard_dissim") {
          p1 <-
            p0 +
            ylim(0, 1) +
            geom_hline(
              yintercept = 0.5,
              linewidth = 0.3,
              col = "darkgrey"
            )
        } else if (this_resp == "log_R2_1") {
          p1 <-
            p0 +
            geom_hline(
              yintercept = 0,
              linewidth = 0.3,
              col = "darkgrey"
            )
          
        } else if (this_resp == "log_R2_1_per_year") {
          p1 <-
            p0 +
            geom_hline(
              yintercept = 0,
              linewidth = 0.3,
              col = "darkgrey"
            )
        }
        
        print(p1)
        
      } # responses loop closing

Code

 for (resp_i in seq_along(cat)){
        
        this_resp <- responses[resp_i]

        response_dd <- cat[[resp_i]] %>% 
          filter(response == this_resp)
        
        p0_2 <- response_dd %>%
          ggplot()+
         geom_point(
            aes(y = yhat,
                x = value),
            alpha = 0.8,
            size = 3)+
          labs(x = NULL,
               y = paste0(this_resp, " (Partial)"))+
          facet_wrap(~ variable , 
                     scales = "free_x", 
                     labeller = labeller(variable = lookup))+
          ggthemes::theme_par()+
          ggtitle(paste(this_resp))+
          scale_color_brewer(palette = "PuOr")
        
        
        # adjust axes and plot hline
        
        if (this_resp == "Jaccard_dissim") {
          p2 <-
            p0_2 +
            ylim(0, 1) +
            geom_hline(
              yintercept = 0.5,
              linewidth = 0.3,
              col = "darkgrey"
            )
        } else if (this_resp == "log_R2_1") {
          p2 <-
            p0_2 +
            geom_hline(
              yintercept = 0,
              linewidth = 0.3,
              col = "darkgrey"
            )
          
        } else if (this_resp == "log_R2_1_per_year") {
          p2 <-
            p0_2 +
            geom_hline(
              yintercept = 0,
              linewidth = 0.3,
              col = "darkgrey"
            )
        }
        
        print(p2)
        
      } # responses loop closing

Interactions

Code

p_5 <- 
  interactions_res %>%
  filter(!c(test == "total_interaction_strength")) %>%
  ggplot()+
  geom_col(aes(x = H2, y = reorder_within(variable, H2, response)))+
  facet_wrap(~test+response,scales = "free")+
  scale_y_reordered()+
  ggthemes::theme_few()+
  theme(axis.text.y = element_text(size = 15))

p_5